In [3]:
import numpy as np, pandas as pd
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor

In [4]:
def build_reduced_features(df):
    eps = 1e-6
    X_raw = df.copy()
    if 'Id' in X_raw.columns:
        X_raw = X_raw.drop(columns=['Id'])
    required = ['Therapy Hours', 'Initial Health Score', 'Follow-Up Sessions', 'Average Sleep Hours', 'Lifestyle Activities']
    missing = [c for c in required if c not in X_raw.columns]
    
    def safe_float_series(s):
        return pd.to_numeric(s, errors='coerce').fillna(0.0).astype(float)
    
    
    if X_raw['Lifestyle Activities'].dtype == object:
        X_raw['Lifestyle Activities'] = X_raw['Lifestyle Activities'].map({'Yes': 1, 'No': 0})
    X_raw['Lifestyle Activities'] = safe_float_series(X_raw['Lifestyle Activities'])
    
    for c in ['Therapy Hours', 'Initial Health Score', 'Follow-Up Sessions', 'Average Sleep Hours']:
        X_raw[c] = safe_float_series(X_raw[c])
    
    X_eng = pd.DataFrame(index=X_raw.index)
    X_eng['Initial Health Score_p1'] = X_raw['Initial Health Score']
    X_eng['Therapy Hours_p1'] = X_raw['Therapy Hours']
    X_eng['Follow-Up Sessions_p1'] = X_raw['Follow-Up Sessions']
    X_eng['Average Sleep Hours_p1'] = X_raw['Average Sleep Hours']
    X_eng['Lifestyle Activities_p1'] = X_raw['Lifestyle Activities']
    X_eng['Average Sleep Hours_p2'] = X_raw['Average Sleep Hours'] ** 2
    X_eng['Average Sleep Hours_p3'] = X_raw['Average Sleep Hours'] ** 3
    X_eng['InitialHealthxSleep'] = X_raw['Initial Health Score'] * X_raw['Average Sleep Hours']
    X_eng['TherapyxSleep'] = X_raw['Therapy Hours'] * X_raw['Average Sleep Hours']
    X_eng['SleepxFollowUp'] = X_raw['Average Sleep Hours'] * X_raw['Follow-Up Sessions']
    X_eng['TherapyxHealth'] = X_raw['Therapy Hours'] * X_raw['Initial Health Score']
    X_eng['Therapy2xLifestyle'] = (X_raw['Therapy Hours'] ** 2) * X_raw['Lifestyle Activities']
    X_eng['TherapyxFollowUp'] = X_raw['Therapy Hours'] * X_raw['Follow-Up Sessions']
    X_eng['TherapyxLifestyle'] = X_raw['Therapy Hours'] * X_raw['Lifestyle Activities']
    X_eng['LifestylexSleep'] = X_raw['Lifestyle Activities'] * X_raw['Average Sleep Hours']
    X_eng['Therapy3xLifestyle'] = (X_raw['Therapy Hours'] ** 3) * X_raw['Lifestyle Activities']
    X_eng['HealthxLifestyle'] = X_raw['Initial Health Score'] * X_raw['Lifestyle Activities']
    X_eng['FollowUpxSleep'] = X_raw['Follow-Up Sessions'] * X_raw['Average Sleep Hours']
    X_eng['Sleep3_over_FollowUp'] = (X_raw['Average Sleep Hours'] ** 3) / (X_raw['Follow-Up Sessions'] + eps)
    X_eng['TherapyxLifestyle_sq'] = X_raw['Therapy Hours'] * (X_raw['Lifestyle Activities'] ** 2)
    X_eng['Therapy3_over_Health2'] = (X_raw['Therapy Hours'] ** 3) / ((X_raw['Initial Health Score'] ** 2) + eps)
    X_eng['LifestylexFollowUp2'] = X_raw['Lifestyle Activities'] * (X_raw['Follow-Up Sessions'] ** 2)
    
    return X_eng

In [5]:
def rmse(a,b):
    return np.sqrt(mean_squared_error(a,b))

In [6]:
train = pd.read_csv('train.csv'); test = pd.read_csv('test.csv')
ids = test['Id'].values if 'Id' in test.columns else np.arange(len(test))
Xtr = build_reduced_features(train); Xte = build_reduced_features(test)
flat_abs = Xtr.replace([np.inf, -np.inf], np.nan).abs().stack().dropna()
clip_val = flat_abs.quantile(0.995) if len(flat_abs) else 1e6
Xtr = Xtr.replace([np.inf, -np.inf], np.nan).fillna(0.0).clip(-clip_val, clip_val)
Xte = Xte.replace([np.inf, -np.inf], np.nan).fillna(0.0).clip(-clip_val, clip_val)
y = train['Recovery Index'].values


In [7]:
# Base estimators (pipelines) - scale inside pipelines
base_estimators = [
    ('ridge', Pipeline([('scaler', StandardScaler()), ('ridge', Ridge(alpha=1.0))])),
    ('lasso', Pipeline([('scaler', StandardScaler()), ('lasso', Lasso(alpha=0.01))])),
    ('lgb', Pipeline([('scaler', StandardScaler()), ('lgb', LGBMRegressor(n_estimators=1000, learning_rate=0.03, max_depth=4, random_state=42))]))
]
meta = Ridge(alpha=0.5)
stack = StackingRegressor(estimators=base_estimators, final_estimator=meta, cv=5, passthrough=True, n_jobs=1)


In [8]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = -cross_val_score(stack, Xtr, y, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=1)
print("Stacking CV RMSE (5-fold):", scores.mean())


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000774 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1180
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 22
[LightGBM] [Info] Start training from score 55.329375
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000453 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1177
[LightGBM] [Info] Number of data points in the train set: 5120, number of used features: 22
[LightGBM] [Info] Start training from score 55.417383
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000506 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1176
[LightGBM] [Info] Number of data points in the train set: 5120, number of used features: 22
[LightGBM] [Info] Start trai

In [9]:
stack.fit(Xtr, y)
pred_test = stack.predict(Xte)
pd.DataFrame({'Id': ids, 'Recovery Index': np.clip(pred_test, 0, None)}).to_csv('submission_C.csv', index=False)
print("Saved submission_C.csv")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000980 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1178
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 22
[LightGBM] [Info] Start training from score 55.311500
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000569 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1175
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 22
[LightGBM] [Info] Start training from score 55.437344
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000433 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1179
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 22
[LightGBM] [Info] Start trai