# CPE342 - Karena Task3 V6: Full Stacking with Optuna & IterativeImputer

## Upgrade Highlights
- **Full Stacking**: Stacking applied to BOTH Classification (Stage 1) and Regression (Stage 2).
- **Dual Optuna Tuning**: Automated hyperparameter optimization for both stages.
- **Advanced Imputation**: Switched to `IterativeImputer` (MICE) for better accuracy.
- **Robustness**: Separate preprocessors for each stage to prevent data leakage/errors.

In [None]:
import pandas as pd
import numpy as np
import warnings
import optuna
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler
from sklearn.experimental import enable_iterative_imputer  # Explicitly enable
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.ensemble import StackingRegressor, StackingClassifier, GradientBoostingRegressor
from sklearn.linear_model import LassoCV, ElasticNetCV, Lasso, LogisticRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.base import clone

from lightgbm import LGBMRegressor, LGBMClassifier
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier

warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)
pd.set_option('display.max_columns', None)

# FIX: Explicitly set XGBoost global verbosity to 0
xgb.set_config(verbosity=0)

## 1. Load Data & Feature Engineering

In [2]:
try:
    train_df = pd.read_csv("Dataset/task3/train.csv")
    test_df = pd.read_csv("Dataset/task3/test.csv")
except FileNotFoundError:
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")

test_ids = test_df['id']

def create_features(df):
    df = df.copy()
    df['spending_per_day'] = df['historical_spending'] / (df['account_age_days'] + 1e-6)
    df['spending_per_transaction'] = df['historical_spending'] / (df['total_transactions'] + 1e-6)
    df['prev_month_ratio'] = df['prev_month_spending'] / (df['historical_spending'] + 1e-6)
    df['playtime_per_day'] = df['total_playtime_hours'] / (df['account_age_days'] + 1e-6)
    df['playtime_per_session'] = df['total_playtime_hours'] / (df['sessions_per_week'] * 4.33 + 1e-6)
    df['interaction_per_friend'] = df['social_interactions'] / (df['friend_count'] + 1e-6)
    df['discount_purchase_ratio'] = df['purchases_on_discount'] / (df['total_transactions'] + 1e-6)
    df['avg_discount_value'] = (df['discount_rate_used'] * df['purchases_on_discount']) / (df['total_transactions'] + 1e-6)
    df['is_whale'] = (df['historical_spending'] > df['historical_spending'].quantile(0.95)).astype(int)
    df['high_activity'] = (df['total_playtime_hours'] > df['total_playtime_hours'].quantile(0.90)).astype(int)
    
    skewed_cols = ['historical_spending', 'total_playtime_hours', 'friend_count']
    for col in skewed_cols:
        df[f'log_{col}'] = np.log1p(df[col])

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    return df

train_df = create_features(train_df)
test_df = create_features(test_df)

TARGET = 'spending_30d'
CATEGORICAL_FEATURES = [
    'guild_membership', 'vip_status', 'is_premium_member', 'primary_game',
    'games_played', 'cross_game_activity', 'platform', 'seasonal_spending_pattern',
    'owns_limited_edition', 'tournament_participation', 'segment', 'is_whale', 'high_activity'
]
NUMERICAL_FEATURES = [
    col for col in train_df.columns 
    if col not in [TARGET, 'id', 'player_id'] + CATEGORICAL_FEATURES
]

# Preprocessing with IterativeImputer
numerical_transformer = Pipeline(steps=[
    ('imputer', IterativeImputer(max_iter=10, random_state=0)), # MICE Imputation
    ('scaler', RobustScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, NUMERICAL_FEATURES),
        ('cat', categorical_transformer, CATEGORICAL_FEATURES)
    ],
    remainder='passthrough'
)

## 2. Stage 1: Classification Tuning (Optuna)
Optimizing LGBM and XGBoost for classification (Will Spend?).

In [3]:
y_class = (train_df[TARGET] > 0).astype(int)
X = train_df[NUMERICAL_FEATURES + CATEGORICAL_FEATURES]

# Clone preprocessor for Classification Tuning
preprocessor_clf_tune = clone(preprocessor)
X_processed = preprocessor_clf_tune.fit_transform(X)

def objective_lgbm_clf(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'class_weight': 'balanced',
        'n_jobs': -1,
        'verbose': -1,
        'random_state': 42
    }
    model = LGBMClassifier(**params)
    return cross_val_score(model, X_processed, y_class, cv=3, scoring='roc_auc').mean()

def objective_xgb_clf(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'scale_pos_weight': (len(y_class) - sum(y_class)) / sum(y_class),
        'n_jobs': 4,
        'verbosity': 0,
        'random_state': 42
    }
    model = XGBClassifier(**params)
    return cross_val_score(model, X_processed, y_class, cv=3, scoring='roc_auc').mean()

print("Tuning Classification Models...")
study_lgbm_clf = optuna.create_study(direction='maximize')
study_lgbm_clf.optimize(objective_lgbm_clf, n_trials=20)
print("Best LGBM Clf Params:", study_lgbm_clf.best_params)

study_xgb_clf = optuna.create_study(direction='maximize')
study_xgb_clf.optimize(objective_xgb_clf, n_trials=20)
print("Best XGB Clf Params:", study_xgb_clf.best_params)

Tuning Classification Models...
Best LGBM Clf Params: {'n_estimators': 377, 'learning_rate': 0.04716650770151098, 'max_depth': 3, 'num_leaves': 91, 'subsample': 0.7858238978905429, 'colsample_bytree': 0.6814656253052074}
Best XGB Clf Params: {'n_estimators': 450, 'learning_rate': 0.03656346779074918, 'max_depth': 4, 'subsample': 0.9474764455162192, 'colsample_bytree': 0.853773046065345}


## 3. Stage 1: Classification Stacking
Combining optimized classifiers.

In [4]:
lgbm_clf_best = LGBMClassifier(**study_lgbm_clf.best_params, class_weight='balanced', n_jobs=-1, verbose=-1, random_state=42)
xgb_clf_best = XGBClassifier(**study_xgb_clf.best_params, scale_pos_weight=(len(y_class) - sum(y_class)) / sum(y_class), n_jobs=4, verbosity=0, random_state=42)

stack_clf = StackingClassifier(
    estimators=[
        ('lgbm', lgbm_clf_best),
        ('xgb', xgb_clf_best),
        ('lr', LogisticRegression(class_weight='balanced', max_iter=1000))
    ],
    final_estimator=LogisticRegression(),
    cv=5,
    n_jobs=-1
)

# Clone preprocessor for Final Classification Pipeline
preprocessor_clf_final = clone(preprocessor)

clf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_clf_final),
    ('stacking_clf', stack_clf)
])

print("Training Classification Stacking...")
clf_pipeline.fit(X, y_class)
print("Stage 1 Complete.")

Training Classification Stacking...
Stage 1 Complete.


## 4. Stage 2: Regression Tuning (Optuna)
Optimizing Regressors for Spenders.

In [5]:
mask_spenders = train_df[TARGET] > 0
X_spenders = train_df.loc[mask_spenders, NUMERICAL_FEATURES + CATEGORICAL_FEATURES]
y_spenders_log = np.log1p(train_df.loc[mask_spenders, TARGET])

# Clone preprocessor for Regression Tuning
preprocessor_reg_tune = clone(preprocessor)
X_spenders_processed = preprocessor_reg_tune.fit_transform(X_spenders)

def objective_lgbm_reg(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'n_jobs': -1,
        'verbose': -1,
        'random_state': 42
    }
    model = LGBMRegressor(**params)
    scores = cross_val_score(model, X_spenders_processed, y_spenders_log, cv=3, scoring='neg_root_mean_squared_error')
    return -scores.mean()

def objective_xgb_reg(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'n_jobs': 4,
        'verbosity': 0,
        'random_state': 42
    }
    model = XGBRegressor(**params)
    scores = cross_val_score(model, X_spenders_processed, y_spenders_log, cv=3, scoring='neg_root_mean_squared_error')
    return -scores.mean()

print("Tuning Regression Models...")
study_lgbm_reg = optuna.create_study(direction='minimize')
study_lgbm_reg.optimize(objective_lgbm_reg, n_trials=20)
print("Best LGBM Reg Params:", study_lgbm_reg.best_params)

study_xgb_reg = optuna.create_study(direction='minimize')
study_xgb_reg.optimize(objective_xgb_reg, n_trials=20)
print("Best XGB Reg Params:", study_xgb_reg.best_params)

Tuning Regression Models...
Best LGBM Reg Params: {'n_estimators': 230, 'learning_rate': 0.058772804989323815, 'max_depth': 7, 'num_leaves': 50, 'subsample': 0.6376909960035413, 'colsample_bytree': 0.9098169065185332, 'reg_alpha': 5.833461145808751e-08, 'reg_lambda': 1.5943068065572767e-05}
Best XGB Reg Params: {'n_estimators': 963, 'learning_rate': 0.054452797162615856, 'max_depth': 4, 'subsample': 0.589008253388284, 'colsample_bytree': 0.7933927699859022, 'reg_alpha': 0.0006349588798568599, 'reg_lambda': 0.6668490876906188}


## 5. Stage 2: Regression Stacking
Combining optimized regressors.

In [6]:
lgbm_reg_best = LGBMRegressor(**study_lgbm_reg.best_params, n_jobs=-1, verbose=-1, random_state=42)
xgb_reg_best = XGBRegressor(**study_xgb_reg.best_params, n_jobs=4, verbosity=0, random_state=42)

stack_reg = StackingRegressor(
    estimators=[
        ('lasso', make_pipeline(RobustScaler(), LassoCV(cv=5, random_state=1))),
        ('enet', make_pipeline(RobustScaler(), ElasticNetCV(cv=5, l1_ratio=[.1, .5, .7, .9, .95, .99, 1], random_state=3))),
        ('gboost', GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, max_depth=4, random_state=5)),
        ('xgb', xgb_reg_best),
        ('lgbm', lgbm_reg_best)
    ],
    final_estimator=Lasso(alpha=0.0005, random_state=1),
    n_jobs=-1,
    passthrough=False
)

# Clone preprocessor for Final Regression Pipeline
preprocessor_reg_final = clone(preprocessor)

reg_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_reg_final),
    ('stacking_reg', stack_reg)
])

print("Training Regression Stacking...")
reg_pipeline.fit(X_spenders, y_spenders_log)
print("Stage 2 Complete.")

Training Regression Stacking...
Stage 2 Complete.


## 6. Final Prediction

In [7]:
X_test = test_df[NUMERICAL_FEATURES + CATEGORICAL_FEATURES]

prob_spend = clf_pipeline.predict_proba(X_test)[:, 1]
pred_log = reg_pipeline.predict(X_test)
pred_amount = np.expm1(pred_log)

final_predictions = prob_spend * pred_amount
final_predictions[final_predictions < 0] = 0

In [8]:
final_predictions

array([   425.45942107,   1071.41500554, 170594.40368699, ...,
         4796.6913707 ,    407.19019557,   1587.82907701], shape=(25889,))

In [9]:
submission_df = pd.read_csv("final_submission.csv")
target_column = 'task3'
print(f"Filling '{target_column}' column with predictions...")
submission_df[target_column] = final_predictions
submission_df.to_csv("final_submission_task3_upgrade_fullstacking.csv", index=False)
print(submission_df.head(20))

Filling 'task3' column with predictions...
          id  task1  task2          task3  task4  task5
0   ANS00001    1.0      2     425.459421      1      0
1   ANS00002    0.0      0    1071.415006      3      0
2   ANS00003    1.0      0  170594.403687      3      1
3   ANS00004    0.0      0      81.560468      0      0
4   ANS00005    0.0      0     380.555185      3      0
5   ANS00006    1.0      2      80.807074      2      0
6   ANS00007    0.0      1     242.535729      1      0
7   ANS00008    0.0      0    9258.924874      3      0
8   ANS00009    1.0      0       6.733944      0      0
9   ANS00010    0.0      1      41.017704      3      0
10  ANS00011    0.0      2      10.504390      1      0
11  ANS00012    0.0      0     225.085861      3      0
12  ANS00013    0.0      0       0.804839      1      0
13  ANS00014    0.0      1    1737.155089      2      0
14  ANS00015    1.0      2      48.950250      4      0
15  ANS00016    0.0      0      65.034160      0      0
16  A