In [2]:
import glob
import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeCV
from sklearn.metrics import make_scorer, mean_squared_error # We'll use MSE on log-values
from sklearn.model_selection import KFold


# --- Configuration: where the inputs live and where the result is written ---
PATH_MODEL_OUTPUTS = "/kaggle/input/........./"
PATH_TRAIN_DATA = "/kaggle/input/playground-series-s5e5/train.csv"
PATH_SAMPLE_SUB = "/kaggle/input/playground-series-s5e5/sample_submission.csv"
OUTPUT_FILENAME = "ridge_ensemble.csv"

# --- 1. Load the training target and move it to log1p space ---
# The Ridge meta-model is fitted against log1p(Calories), so the raw target
# is kept alongside its log-transformed copy.
train_df = pd.read_csv(PATH_TRAIN_DATA)
y_train_original = train_df["Calories"].values
y_train_log = np.log1p(y_train_original)

# Observed target range; used further down to clip the final predictions.
CALORIES_MIN, CALORIES_MAX = y_train_original.min(), y_train_original.max()
print(f"Target 'Calories' original min: {CALORIES_MIN}, max: {CALORIES_MAX}")

# --- 2. Load and Prepare OOF Predictions (for Ridge training) ---
# Every `oof_*.npy` file contributes one feature column of X_train for Ridge.
# Sorting makes the column order deterministic.
oof_files = sorted(glob.glob(f"{PATH_MODEL_OUTPUTS}oof_*.npy"))
if len(oof_files) == 0:
    raise ValueError(f"No OOF files found in {PATH_MODEL_OUTPUTS} matching 'oof_*.npy'")

print("\nProcessing OOF files for Ridge training set:")
oof_stack_list_for_ridge = []
for f_path in oof_files:
    oof_array = np.load(f_path)
    # Flatten anything that is not already a vector.
    if oof_array.ndim > 1:
        oof_array = oof_array.reshape(-1)

    raw_mean = oof_array.mean()
    print(f"  Loaded {f_path}, original mean: {raw_mean:.4f}, shape: {oof_array.shape}")

    # Scale heuristic: a mean above 10 is treated as original-scale values
    # that still need log1p; anything else is used unchanged.
    if not raw_mean > 10:
        oof_array_transformed = oof_array
        print("    Mean <= 10, already log-scaled.")
    else:
        oof_array_transformed = np.log1p(oof_array)
        print(f"    Mean > 10, applied log1p. New mean: {oof_array_transformed.mean():.4f}")

    oof_stack_list_for_ridge.append(oof_array_transformed)

# One row per training sample, one column per base model.
X_train_ridge = np.column_stack(oof_stack_list_for_ridge)
print(f"Shape of X_train_ridge (OOF stack): {X_train_ridge.shape}")



# --- 3. Load and Prepare Submission Predictions (for Ridge prediction) ---
# This will be X_test for Ridge: one column per `submission_*.csv` file.
# We follow the Hill Climber's logic: pred = np.log1p(df.Calories.values)
def _model_tag(path, prefix, extension):
    """Return the part of a prediction file name between `prefix` and `extension`.

    Example: ("dir/oof_cat_1.npy", "oof_", ".npy") -> "cat_1".
    """
    file_name = path.replace("\\", "/").rsplit("/", 1)[-1]
    return file_name[len(prefix):len(file_name) - len(extension)]


sub_files = sorted(glob.glob(f"{PATH_MODEL_OUTPUTS}submission_*.csv"))
if not sub_files:
    raise ValueError(f"No submission files found in {PATH_MODEL_OUTPUTS} matching 'submission_*.csv'")

# Fail fast: the Ridge weights are learned per OOF column, so a submission
# stack with a different number of columns cannot be scored. Raising here
# avoids running the whole RidgeCV fit only to fail at prediction time.
if len(oof_files) != len(sub_files):
    raise ValueError(
        f"Mismatch in count of OOF files ({len(oof_files)}) and submission files ({len(sub_files)}). Ensure they correspond."
    )

# Column i of the OOF stack and column i of the submission stack must come
# from the same base model. Both lists are sorted by file name, so they line
# up only when the part after `oof_` / `submission_` is identical.
oof_tags = [_model_tag(p, "oof_", ".npy") for p in oof_files]
sub_tags = [_model_tag(p, "submission_", ".csv") for p in sub_files]
if oof_tags != sub_tags:
    print(
        "WARNING: OOF and submission file names do not pair up by model tag "
        f"(OOF: {oof_tags}; submissions: {sub_tags}). "
        "Ridge weights may be applied to the wrong model's predictions."
    )

sub_stack_list_for_ridge = []
print("\nProcessing submission files for Ridge prediction set:")
for f_path in sub_files:
    sub_df = pd.read_csv(f_path)
    # A single DataFrame column is already one-dimensional.
    sub_preds_original_scale = sub_df['Calories'].values
    print(f"  Loaded {f_path}, original 'Calories' mean: {sub_preds_original_scale.mean():.4f}, shape: {sub_preds_original_scale.shape}")

    # Crucially, transform these submission predictions to log-scale for Ridge,
    # which is fitted on log1p targets.
    sub_preds_log_scale = np.log1p(sub_preds_original_scale)
    print(f"    Applied log1p. New mean: {sub_preds_log_scale.mean():.4f}")
    sub_stack_list_for_ridge.append(sub_preds_log_scale)

X_test_ridge = np.column_stack(sub_stack_list_for_ridge)
print(f"Shape of X_test_ridge (Submission stack): {X_test_ridge.shape}")


# --- 4. RidgeCV Model ---
# Candidate regularisation strengths: 100 values log-spaced from 1e-5 to 1e5.
alphas = np.logspace(start=-5, stop=5, num=100)
# Shuffled 5-fold split with a fixed seed so the alpha search is repeatable.
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

ridge_settings = {
    "alphas": alphas,
    "scoring": 'neg_mean_squared_error',  # MSE on the log1p targets
    "cv": cv_strategy,
    "fit_intercept": True,
}
ridge = RidgeCV(**ridge_settings)

print("\nFitting RidgeCV model...")
# The stacked OOF matrix and the log target must describe the same rows.
n_stack_rows = X_train_ridge.shape[0]
n_target_rows = y_train_log.shape[0]
if n_stack_rows != n_target_rows:
    raise ValueError(f"Sample number mismatch: X_train_ridge has {n_stack_rows} samples, y_train_log has {n_target_rows} samples.")

ridge.fit(X_train_ridge, y_train_log)

# --- 5. Model Info ---
# Report the selected alpha and the weight learned for each base model.
# Weights are labelled with the OOF file name (directory part stripped).
model_names_for_print = [oof_path.rsplit('/', 1)[-1] for oof_path in oof_files]

print(f"\nBest alpha: {ridge.alpha_:.6f}")
print("Model weights:")
for model_name, weight in zip(model_names_for_print, ridge.coef_):
    print(f"  {model_name}: {weight:.4f}")
print(f"Intercept: {ridge.intercept_:.4f}")

# --- 6. Final Ensemble Prediction ---
# The meta-model was fitted on log1p targets, so its output is in log space.
log_ensemble_preds = ridge.predict(X_test_ridge)
log_mean, log_std = log_ensemble_preds.mean(), log_ensemble_preds.std()
print(f"\nEnsemble predictions (log-scale) mean: {log_mean:.4f}, std: {log_std:.4f}")

# Undo log1p to get back to the original Calories scale.
final_ensemble_preds_original_scale = np.expm1(log_ensemble_preds)
raw_mean = final_ensemble_preds_original_scale.mean()
raw_std = final_ensemble_preds_original_scale.std()
print(f"Ensemble predictions (original scale, before clip) mean: {raw_mean:.4f}, std: {raw_std:.4f}")
raw_min = final_ensemble_preds_original_scale.min()
raw_max = final_ensemble_preds_original_scale.max()
print(f"  Min pred: {raw_min:.4f}, Max pred: {raw_max:.4f}")

# Restrict predictions to the target range seen in training, as done in the
# Hill Climbing script's submission part. (A separate np.maximum(0, ...) step
# before this clip is a possible alternative; it is not applied here.)
final_ensemble_preds_clipped = np.clip(
    final_ensemble_preds_original_scale,
    CALORIES_MIN,
    CALORIES_MAX,
)

clipped_mean = final_ensemble_preds_clipped.mean()
clipped_std = final_ensemble_preds_clipped.std()
print(f"Ensemble predictions (original scale, after clip to [{CALORIES_MIN:.2f}, {CALORIES_MAX:.2f}]) mean: {clipped_mean:.4f}, std: {clipped_std:.4f}")
clipped_min = final_ensemble_preds_clipped.min()
clipped_max = final_ensemble_preds_clipped.max()
print(f"  Min clipped pred: {clipped_min:.4f}, Max clipped pred: {clipped_max:.4f}")

# --- 7. Create Submission File ---
# Use the sample submission as the template and overwrite its target column.
sample_sub_df = pd.read_csv(PATH_SAMPLE_SUB)

# Predictions are assigned by position, so the row counts have to agree.
# NOTE(review): this assumes the base submission files list rows in the same
# order as the sample submission - confirm, no id alignment is done here.
n_predictions = X_test_ridge.shape[0]
n_expected = len(sample_sub_df)
if n_predictions != n_expected:
    raise ValueError(f"Sample number mismatch: X_test_ridge has {n_predictions} predictions, sample submission expects {n_expected}.")

sample_sub_df['Calories'] = final_ensemble_preds_clipped
sample_sub_df.to_csv(OUTPUT_FILENAME, index=False)

print(f"\n✅ {OUTPUT_FILENAME} created successfully.")
print("Submission head:")
print(sample_sub_df.head())

# --- 8. Sanity Check: Calculate OOF RMSLE for the Ridge ensemble ---
# Run the fitted meta-model over the OOF stack it was trained on, then apply
# the same post-processing used for the submission (expm1 followed by a clip).
# NOTE(review): the Ridge itself is evaluated on its own training rows here,
# so the resulting score may be slightly optimistic.
oof_ridge_log_preds = ridge.predict(X_train_ridge)
oof_ridge_original_preds = np.expm1(oof_ridge_log_preds)
oof_ridge_original_preds_clipped = np.clip(
    oof_ridge_original_preds,
    CALORIES_MIN,
    CALORIES_MAX,
)

# RMSLE calculation function for original scale values
def rmsle_metric_original_scale(y_true_orig, y_pred_orig):
    """Root mean squared log error for values given on the original scale.

    Args:
        y_true_orig: True target values (original scale).
        y_pred_orig: Predicted values (original scale). Negative predictions
            are raised to 0 before the log transform.

    Returns:
        The square root of the mean squared error between log1p(y_true_orig)
        and log1p of the non-negative predictions.
    """
    log_truth = np.log1p(y_true_orig)
    # log1p is applied only after flooring predictions at zero.
    non_negative_preds = np.maximum(0, y_pred_orig)
    log_preds = np.log1p(non_negative_preds)
    mse_in_log_space = mean_squared_error(log_truth, log_preds)
    return np.sqrt(mse_in_log_space)

# Score the clipped ensemble predictions against the raw training target.
oof_rmsle_ridge = rmsle_metric_original_scale(
    y_train_original,
    oof_ridge_original_preds_clipped,
)
print(f"\nEstimated OOF RMSLE for Ridge ensemble: {oof_rmsle_ridge:.5f}")

Target 'Calories' original min: 1.0, max: 314.0

Processing OOF files for Ridge training set:
  Loaded /kaggle/input/22222222222/oof_cat_05990004.npy, original mean: 4.1412, shape: (750000,)
    Mean <= 10, using as is (potentially original scale but low values, or already log-scale).
  Loaded /kaggle/input/22222222222/oof_catboost_domain_5714.npy, original mean: 4.1411, shape: (750000,)
    Mean <= 10, using as is (potentially original scale but low values, or already log-scale).
  Loaded /kaggle/input/22222222222/oof_xgb_ftr_multiple_05960005.npy, original mean: 4.1410, shape: (750000,)
    Mean <= 10, using as is (potentially original scale but low values, or already log-scale).
Shape of X_train_ridge (OOF stack): (750000, 3)

Processing submission files for Ridge prediction set:
  Loaded /kaggle/input/22222222222/submission_cat_05990004.csv, original 'Calories' mean: 88.1855, shape: (250000,)
    Applied log1p. New mean: 4.1413
  Loaded /kaggle/input/22222222222/submission_catboost