In [None]:
# SCRIPT 2: train_meta_model.py
import pandas as pd
import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold
import lightgbm as lgb
import joblib

# Banner marking the start of this script's console output.
print("--- Starting Meta-Model Training and Evaluation ---")

# --- SMAPE Metric ---
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_pred - y_true| / ((|y_true| + |y_pred|) / 2)``;
    elements where both values are zero contribute 0 instead of 0/0.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predictions, broadcastable against ``y_true``.

    Returns:
        The mean of the per-element ratios multiplied by 100.
    """
    # Accept plain lists/tuples as well as arrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    # Divide only where the denominator is non-zero. np.where would evaluate
    # the full division first and emit a RuntimeWarning on 0/0 pairs.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

# --- ⚙️ Configuration ---
# Destination of the fitted meta-model (written with joblib.dump).
FINAL_META_MODEL_PATH = "final_meta_model.joblib"

# Training table (read with pd.read_csv) that supplies the target column.
TRAIN_CSV_PATH = "train_final.csv"

# Base-model name -> .npy file of its out-of-fold predictions.
# Each name becomes the '<name>_pred' column of the meta-feature frame.
OOF_PATHS = dict(
    lgbm="oof_lgbm_preds.npy",
    xgb="oof_xgb_preds.npy",
    cat="oof_cat_preds.npy",
    mlp="oof_mlp_preds.npy",
)

# --- 1. Load Data ---
print("Loading OOF predictions and true target values...")
# One column of out-of-fold predictions per base model, named '<model>_pred'.
meta_features = pd.DataFrame()
for name, path in OOF_PATHS.items():
    try:
        meta_features[f'{name}_pred'] = np.load(path)
    except FileNotFoundError:
        # SystemExit with a message exits with a non-zero status and does not
        # rely on the site-provided exit() helper, which may be absent and
        # would otherwise report success (status 0) on this error path.
        raise SystemExit(
            f"Error: Prediction file not found at '{path}'. Please run the base model script first."
        ) from None

train_df = pd.read_csv(TRAIN_CSV_PATH)

# The target column may be spelled 'price_log' or 'log_price'; normalise
# to 'log_price' so the rest of the script uses a single name.
if 'log_price' not in train_df.columns and 'price_log' in train_df.columns:
    train_df.rename(columns={'price_log': 'log_price'}, inplace=True)
if 'log_price' not in train_df.columns:
    raise KeyError(
        f"Neither 'log_price' nor 'price_log' found in '{TRAIN_CSV_PATH}'."
    )

# Keep only rows that have a target value.
# NOTE(review): this presumes the OOF arrays were produced for exactly these
# rows, in the same order; only the lengths are checked below.
is_valid_mask = train_df['log_price'].notna()
meta_target = train_df.loc[is_valid_mask, 'log_price'].to_numpy()
# Invert the log transform (expm1) to get targets on the price scale for SMAPE.
y_true_price = np.expm1(meta_target)
print(f"Meta-features created with shape: {meta_features.shape}")
# Explicit raise instead of assert so the check survives `python -O`.
if len(meta_features) != len(meta_target):
    raise ValueError(
        "Mismatch between OOF predictions and target length "
        f"({len(meta_features)} vs {len(meta_target)})."
    )

# --- 2. Evaluate Blending and Stacking ---
print("\n--- Evaluating Simple Blending ---")
# Unweighted row-wise average of the base-model prediction columns.
blend_preds_log = meta_features.mean(axis=1).to_numpy()
# Apply expm1 to the averaged predictions before scoring, the same transform
# used to derive y_true_price from the target.
blend_preds_price = np.expm1(blend_preds_log)
blend_score = smape(y_true_price, blend_preds_price)

print("\n--- Evaluating Stacking with RidgeCV ---")
# RidgeCV selects alpha from 100 log-spaced candidates (1e-4 .. 1e2) using
# its own inner 5-fold CV each time it is fitted.
alpha_grid = np.logspace(-4, 2, 100)
meta_model_ridge = RidgeCV(alphas=alpha_grid, cv=5)
# Outer shuffled 5-fold split used to produce out-of-fold meta predictions.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_ridge_preds = np.zeros(meta_features.shape[0])
for fold, (train_idx, val_idx) in enumerate(kf.split(meta_features)):
    # Fit on the training rows of this fold, predict the held-out rows.
    meta_model_ridge.fit(meta_features.iloc[train_idx], meta_target[train_idx])
    oof_ridge_preds[val_idx] = meta_model_ridge.predict(meta_features.iloc[val_idx])
# Score after applying expm1 to the out-of-fold predictions.
ridge_price_preds = np.expm1(oof_ridge_preds)
ridge_score = smape(y_true_price, ridge_price_preds)

print("\n--- Evaluating Stacking with LightGBM ---")
# L1 objective with a small learning rate; n_estimators is an upper bound that
# early stopping trims during cross-validation.
lgbm_meta_params = {
    'objective': 'regression_l1', 'metric': 'mae', 'n_estimators': 1000,
    'learning_rate': 0.01, 'num_leaves': 16, 'verbose': -1, 'n_jobs': -1, 'seed': 42
}
meta_model_lgbm = lgb.LGBMRegressor(**lgbm_meta_params)
oof_lgbm_meta_preds = np.zeros(len(meta_features))
# Number of boosting rounds early stopping settled on in each fold; reused
# below so the saved model matches the configuration that was evaluated.
lgbm_best_iterations = []
for fold, (train_idx, val_idx) in enumerate(kf.split(meta_features)):
    X_train_meta, X_val_meta = meta_features.iloc[train_idx], meta_features.iloc[val_idx]
    y_train_meta, y_val_meta = meta_target[train_idx], meta_target[val_idx]
    # NOTE: early stopping monitors the same fold that is scored afterwards,
    # so the reported LightGBM score is somewhat optimistic.
    meta_model_lgbm.fit(X_train_meta, y_train_meta,
                        eval_set=[(X_val_meta, y_val_meta)],
                        callbacks=[lgb.early_stopping(50, verbose=False)])
    oof_lgbm_meta_preds[val_idx] = meta_model_lgbm.predict(X_val_meta)
    # Fall back to the configured cap if no best iteration was recorded.
    lgbm_best_iterations.append(
        meta_model_lgbm.best_iteration_ or lgbm_meta_params['n_estimators']
    )
lgbm_stack_score = smape(y_true_price, np.expm1(oof_lgbm_meta_preds))

# --- 3. Train and Save the Final (Best) Meta-Model ---
# The LightGBM stacker is saved unconditionally; compare the scores printed at
# the end of the script to confirm it is actually the best of the three.
print("\n--- Training the Final LightGBM Meta-Model on ALL OOF Data ---")
# There is no held-out fold to early-stop on here, so use the average number
# of rounds chosen during CV instead of always training the full 1000 rounds.
final_n_estimators = max(1, int(round(np.mean(lgbm_best_iterations))))
print(f"Using n_estimators={final_n_estimators} (mean best iteration across CV folds)")
final_meta_model = lgb.LGBMRegressor(
    **{**lgbm_meta_params, 'n_estimators': final_n_estimators}
)
final_meta_model.fit(meta_features, meta_target)
joblib.dump(final_meta_model, FINAL_META_MODEL_PATH)
print(f"Final meta-model (LightGBM) saved successfully to '{FINAL_META_MODEL_PATH}'")

# --- 4. Final Results ---
# Summary of the three ensemble strategies, each scored with SMAPE (an error
# metric, so lower is better).
print("\n" + "="*50)
# The emoji in this heading had been corrupted by a wrong-encoding round trip;
# restored to the intended rocket character.
print("Final Ensemble Performance 🚀")
print(f"Simple Blending SMAPE Score:      {blend_score:.4f}")
print(f"Stacking (RidgeCV) SMAPE Score:   {ridge_score:.4f}")
print(f"Stacking (LightGBM) SMAPE Score:  {lgbm_stack_score:.4f}")
print("="*50)