In [12]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor

In [13]:
def rmse(a, b): 
    return np.sqrt(mean_squared_error(a, b))

In [14]:
def build_reduced_features(df):
    eps = 1e-6
    X_raw = df.copy()
    if 'Id' in X_raw.columns:
        X_raw = X_raw.drop(columns=['Id'])
    required = ['Therapy Hours', 'Initial Health Score', 'Follow-Up Sessions', 'Average Sleep Hours', 'Lifestyle Activities']
    
    def safe_float_series(s):
        return pd.to_numeric(s, errors='coerce').fillna(0.0).astype(float)
    
    if X_raw['Lifestyle Activities'].dtype == object:
        X_raw['Lifestyle Activities'] = X_raw['Lifestyle Activities'].map({'Yes': 1, 'No': 0})
    X_raw['Lifestyle Activities'] = safe_float_series(X_raw['Lifestyle Activities'])
    
    for c in ['Therapy Hours', 'Initial Health Score', 'Follow-Up Sessions', 'Average Sleep Hours']:
        X_raw[c] = safe_float_series(X_raw[c])
    
    X_eng = pd.DataFrame(index=X_raw.index)
    X_eng['Initial Health Score_p1'] = X_raw['Initial Health Score']
    X_eng['Therapy Hours_p1'] = X_raw['Therapy Hours']
    X_eng['Follow-Up Sessions_p1'] = X_raw['Follow-Up Sessions']
    X_eng['Average Sleep Hours_p1'] = X_raw['Average Sleep Hours']
    X_eng['Lifestyle Activities_p1'] = X_raw['Lifestyle Activities']
    X_eng['Average Sleep Hours_p2'] = X_raw['Average Sleep Hours'] ** 2
    X_eng['Average Sleep Hours_p3'] = X_raw['Average Sleep Hours'] ** 3
    X_eng['InitialHealthxSleep'] = X_raw['Initial Health Score'] * X_raw['Average Sleep Hours']
    X_eng['TherapyxSleep'] = X_raw['Therapy Hours'] * X_raw['Average Sleep Hours']
    X_eng['SleepxFollowUp'] = X_raw['Average Sleep Hours'] * X_raw['Follow-Up Sessions']
    X_eng['TherapyxHealth'] = X_raw['Therapy Hours'] * X_raw['Initial Health Score']
    X_eng['Therapy2xLifestyle'] = (X_raw['Therapy Hours'] ** 2) * X_raw['Lifestyle Activities']
    X_eng['TherapyxFollowUp'] = X_raw['Therapy Hours'] * X_raw['Follow-Up Sessions']
    X_eng['TherapyxLifestyle'] = X_raw['Therapy Hours'] * X_raw['Lifestyle Activities']
    X_eng['LifestylexSleep'] = X_raw['Lifestyle Activities'] * X_raw['Average Sleep Hours']
    X_eng['Therapy3xLifestyle'] = (X_raw['Therapy Hours'] ** 3) * X_raw['Lifestyle Activities']
    X_eng['HealthxLifestyle'] = X_raw['Initial Health Score'] * X_raw['Lifestyle Activities']
    X_eng['FollowUpxSleep'] = X_raw['Follow-Up Sessions'] * X_raw['Average Sleep Hours']
    X_eng['Sleep3_over_FollowUp'] = (X_raw['Average Sleep Hours'] ** 3) / (X_raw['Follow-Up Sessions'] + eps)
    X_eng['TherapyxLifestyle_sq'] = X_raw['Therapy Hours'] * (X_raw['Lifestyle Activities'] ** 2)
    X_eng['Therapy3_over_Health2'] = (X_raw['Therapy Hours'] ** 3) / ((X_raw['Initial Health Score'] ** 2) + eps)
    X_eng['LifestylexFollowUp2'] = X_raw['Lifestyle Activities'] * (X_raw['Follow-Up Sessions'] ** 2)
    
    return X_eng

In [15]:
SEEDS = [0, 11, 22, 33, 44]   # try 5 seeds (extend to 8-10 if you have time)
NFOLDS = 5
LGB_PARAMS_BASE = dict(n_estimators=1000, learning_rate=0.03, max_depth=4, random_state=42)

In [16]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
ids = test['Id'].values if 'Id' in test.columns else np.arange(len(test))

In [17]:
Xtr = build_reduced_features(train)
Xte = build_reduced_features(test)

In [18]:
# Clip extremes using train quantile
flat_abs = Xtr.replace([np.inf, -np.inf], np.nan).abs().stack().dropna()
clip_val = flat_abs.quantile(0.995) if len(flat_abs) else 1e6
Xtr = Xtr.replace([np.inf, -np.inf], np.nan).fillna(0.0).clip(-clip_val, clip_val)
Xte = Xte.replace([np.inf, -np.inf], np.nan).fillna(0.0).clip(-clip_val, clip_val)

y = train['Recovery Index'].values

# Storage for bagging
oof_agg = np.zeros(len(y))
test_agg = np.zeros(len(Xte))
n_seeds = len(SEEDS)

for i, seed in enumerate(SEEDS):
    print(f"\n--- Seed {seed} ({i+1}/{n_seeds}) ---")
    # define base estimators using seed for LGB randomness
    base_estimators = [
        ('ridge', Pipeline([('scaler', StandardScaler()), ('ridge', Ridge(alpha=1.0))])),
        ('lasso', Pipeline([('scaler', StandardScaler()), ('lasso', Lasso(alpha=0.01, random_state=seed))])),
        ('lgb', Pipeline([('scaler', StandardScaler()), ('lgb', LGBMRegressor({**LGB_PARAMS_BASE, 'random_state': seed}))]))
    ]
    meta = Ridge(alpha=0.5)
    stack = StackingRegressor(estimators=base_estimators, final_estimator=meta, cv=NFOLDS, passthrough=True, n_jobs=1)
    # OOF predictions for this seed (use KFold with the seed for consistent folds per seed)
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=seed)
    oof_seed = cross_val_predict(stack, Xtr, y, cv=kf, n_jobs=1)
    print(f"Seed {seed} OOF RMSE: {rmse(y, oof_seed):.6f}")
    oof_agg += oof_seed / n_seeds
    # fit on full train and predict test
    stack.fit(Xtr, y)
    test_pred_seed = stack.predict(Xte)
    test_agg += test_pred_seed / n_seeds

# Final bagged OOF RMSE
print("\nBagged stacking OOF RMSE:", rmse(y, oof_agg))
# Save submission
final_preds = np.clip(test_agg, 0, None)
submission = pd.DataFrame({"Id": ids, "Recovery Index": final_preds})
submission.to_csv("submission_bagged_stack.csv", index=False)
print("Saved submission_bagged_stack.csv")


--- Seed 0 (1/5) ---


TypeError: LGBMRegressor.__init__() takes 1 positional argument but 2 were given