# CPE342 - Karena Task3 V8: Hybrid (Voting Classifier + Stacked Regressor)

In [1]:
import pandas as pd
import numpy as np
import warnings
import optuna
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import StackingRegressor, GradientBoostingRegressor, VotingClassifier
from sklearn.linear_model import LassoCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.cluster import KMeans
from sklearn.base import clone

from lightgbm import LGBMRegressor, LGBMClassifier
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor
from category_encoders import TargetEncoder
from sklearn.linear_model import LassoCV, ElasticNetCV 
from sklearn.kernel_ridge import KernelRidge

# Keep cell output short: silence library warnings and Optuna's per-trial logs,
# and let pandas show every column when a frame is displayed.
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)
pd.set_option('display.max_columns', None)

# Load Data
# Prefer the project layout (Dataset/task3/). Fall back to the working directory
# only when those files are missing; any other failure (malformed CSV, manual
# interrupt, ...) is allowed to surface instead of being silently swallowed.
try:
    train_df = pd.read_csv("Dataset/task3/train.csv")
    test_df = pd.read_csv("Dataset/task3/test.csv")
except FileNotFoundError:
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")

print("Environment Ready.")

  from .autonotebook import tqdm as notebook_tqdm


Environment Ready.


## 1. Load Data & Feature Engineering

In [2]:
def create_features_full(df_train, df_test):
    """Engineer ratio, flag, log, interaction and cluster features for train and test.

    The two frames are stacked so that the quantile thresholds and the K-Means
    clustering are computed over train and test rows together; the stacked frame
    is then split back. Note that this means test rows influence the thresholds
    and clusters used for train rows.

    The input frames are NOT modified: the temporary ``is_train`` marker is added
    to copies only.

    Args:
        df_train: training frame containing the raw columns referenced below.
        df_test: test frame with the same raw feature columns.

    Returns:
        (train, test): two new frames with a fresh RangeIndex, the engineered
        columns appended, and the ``is_train`` marker removed.
    """
    # 1. Stack train and test (tagged on copies so the caller's frames stay untouched)
    df_all = pd.concat(
        [df_train.assign(is_train=1), df_test.assign(is_train=0)],
        ignore_index=True,
    )

    # --- [PART A] Original Features from V7 (Restored) ---
    # Spending derived features (1e-6 guards against division by zero)
    df_all['spending_per_day'] = df_all['historical_spending'] / (df_all['account_age_days'] + 1e-6)
    df_all['spending_per_transaction'] = df_all['historical_spending'] / (df_all['total_transactions'] + 1e-6)
    df_all['prev_month_ratio'] = df_all['prev_month_spending'] / (df_all['historical_spending'] + 1e-6)

    # Playtime derived features
    df_all['playtime_per_day'] = df_all['total_playtime_hours'] / (df_all['account_age_days'] + 1e-6)
    # Original V7 formula: sessions_per_week * 4.33
    df_all['playtime_per_session'] = df_all['total_playtime_hours'] / (df_all['sessions_per_week'] * 4.33 + 1e-6)

    # Social & Discount features
    df_all['interaction_per_friend'] = df_all['social_interactions'] / (df_all['friend_count'] + 1e-6)
    df_all['discount_purchase_ratio'] = df_all['purchases_on_discount'] / (df_all['total_transactions'] + 1e-6)
    df_all['avg_discount_value'] = (df_all['discount_rate_used'] * df_all['purchases_on_discount']) / (df_all['total_transactions'] + 1e-6)

    # Quantile-based Flags (V7 logic): top 5% spenders, top 10% by playtime
    df_all['is_whale'] = (df_all['historical_spending'] > df_all['historical_spending'].quantile(0.95)).astype(int)
    df_all['high_activity'] = (df_all['total_playtime_hours'] > df_all['total_playtime_hours'].quantile(0.90)).astype(int)

    # --- [PART B] New Enhancements (V11) ---
    # Interaction feature (pairs playtime with friend count)
    df_all['engagement_score'] = df_all['total_playtime_hours'] * (df_all['friend_count'] + 1)

    # Log transform (reduces skew)
    skewed_cols = ['historical_spending', 'total_playtime_hours', 'friend_count', 'total_transactions']
    for col in skewed_cols:
        if col in df_all.columns:
            df_all[f'log_{col}'] = np.log1p(df_all[col])

    # Clean the data before clustering: infinities become NaN
    df_all = df_all.replace([np.inf, -np.inf], np.nan)

    # K-Means clustering (behavioural groups); NaNs are filled with 0 for the fit only
    print("Generating Clusters...")
    cluster_cols = ['log_historical_spending', 'log_total_playtime_hours', 'spending_per_day', 'prev_month_ratio']
    X_cluster = df_all[cluster_cols].fillna(0)

    kmeans = KMeans(n_clusters=7, random_state=42, n_init=10)
    # Stored as strings so the label is handled as a category downstream
    df_all['cluster_group'] = kmeans.fit_predict(X_cluster).astype(str)

    # Split back
    train = df_all[df_all['is_train'] == 1].drop('is_train', axis=1).reset_index(drop=True)
    test = df_all[df_all['is_train'] == 0].drop('is_train', axis=1).reset_index(drop=True)

    return train, test

# Replace the raw frames with their feature-engineered versions and report the column count.
print("Engineering Features (Restoring V7 + Adding New)...")
engineered_frames = create_features_full(train_df, test_df)
train_df, test_df = engineered_frames
print(f"Done. Total Features: {train_df.shape[1]}")

Engineering Features (Restoring V7 + Adding New)...
Generating Clusters...
Done. Total Features: 51


In [3]:
# Preview the first rows of the engineered training frame.
train_df.head()

Unnamed: 0,id,player_id,friend_count,social_interactions,guild_membership,event_participation_rate,daily_login_streak,avg_session_length,sessions_per_week,total_playtime_hours,days_since_last_login,achievement_count,achievement_completion_rate,historical_spending,prev_month_spending,total_transactions,avg_transaction_value,account_age_days,vip_status,is_premium_member,primary_game,games_played,cross_game_activity,platform,days_since_last_purchase,purchase_frequency,payment_methods_used,purchases_on_discount,discount_rate_used,seasonal_spending_pattern,owns_limited_edition,competitive_rank,tournament_participation,segment,spending_30d,spending_per_day,spending_per_transaction,prev_month_ratio,playtime_per_day,playtime_per_session,interaction_per_friend,discount_purchase_ratio,avg_discount_value,is_whale,high_activity,engagement_score,log_historical_spending,log_total_playtime_hours,log_friend_count,log_total_transactions,cluster_group
0,PLY00001,P007453,140.0,39.0,,35.495321,86.0,94.186687,20.6604,793.043827,6.918242,375.0,66.677804,,960.560273,4.0,643.882786,76.0,0.0,0.0,0.0,4.0,27.0,0.0,11.375597,,1.0,1.0,0.25,4.0,,7.0,1.0,1.0,0.0,,,,10.434787,8.864833,0.278571,0.25,0.0625,0,0,111819.17958,,6.677139,4.94876,1.609438,0
1,PLY00002,P120776,126.0,12.0,1.0,35.222611,27.0,76.694036,15.112974,836.397667,3.514021,111.0,48.742068,193.279885,70.552503,43.0,,1037.0,0.0,0.0,3.0,1.0,10.0,1.0,38.325987,11.19359,1.0,25.0,0.581395,5.0,0.0,3.0,0.0,1.0,0.0,0.186384,4.494881,0.365028,0.806555,12.781298,0.095238,0.581395,0.338021,0,0,106222.503649,5.2693,6.730299,4.844187,3.78419,0
2,PLY00003,P083055,157.0,63.0,1.0,54.291374,1.0,66.680997,13.77048,561.212807,1.863368,82.0,35.275348,24.147713,8.402845,49.0,0.49281,418.0,0.0,0.0,,2.0,16.0,2.0,21.426924,5.175272,3.0,27.0,0.55102,7.0,0.0,5.0,0.0,1.0,0.0,0.05777,0.49281,0.347977,1.342614,9.412188,0.401274,0.55102,0.303623,0,0,88671.623485,3.224767,6.33188,5.062595,3.912023,0
3,PLY00004,P087827,89.0,96.0,1.0,52.990011,74.0,75.864572,11.94193,,5.973982,122.0,49.815387,2145.126609,654.006363,41.0,52.320161,129.0,3.0,1.0,4.0,1.0,7.0,0.0,28.915478,10.374656,2.0,22.0,0.536585,6.0,0.0,9.0,0.0,2.0,2223.7,16.628888,52.32016,0.30488,,,1.078652,0.536585,0.287924,0,0,,7.67142,,4.49981,3.73767,0
4,PLY00005,P045133,159.0,80.0,0.0,78.586509,84.0,33.774836,1.405901,617.041867,5.570188,189.0,89.271401,58696.454814,18294.69394,,1397.534638,478.0,3.0,1.0,4.0,2.0,,1.0,13.011954,,3.0,11.0,0.261905,0.0,1.0,4.0,0.0,3.0,156634.22,122.79593,,0.311683,1.290883,101.361223,0.503145,,,1,0,98726.698643,10.980152,6.426556,5.075174,,0


## Data Prep & Pipeline

In [4]:
TARGET = 'spending_30d'

# Columns handled as categorical: the full V7 list plus the new cluster label.
CATEGORICAL_FEATURES = [
    'guild_membership',
    'vip_status',
    'is_premium_member',
    'primary_game',
    'games_played',
    'cross_game_activity',
    'platform',
    'seasonal_spending_pattern',
    'owns_limited_edition',
    'tournament_participation',
    'segment',
    # the two V7 flags, restored
    'is_whale',
    'high_activity',
    # new in this version
    'cluster_group',
]

# Every remaining column except the target and the identifiers is numerical.
_non_numerical = [TARGET, 'id', 'player_id'] + CATEGORICAL_FEATURES
NUMERICAL_FEATURES = [col for col in train_df.columns if col not in _non_numerical]

# --- Prepare Datasets ---
mask_spenders = train_df[TARGET] > 0

# 1. Classification data (all rows): label is 1 when the player spent anything.
X_clf = train_df[NUMERICAL_FEATURES + CATEGORICAL_FEATURES]
y_clf = (train_df[TARGET] > 0).astype(int)

# 2. Regression data (spenders only): target is log1p of the amount spent.
X_reg = train_df.loc[mask_spenders, NUMERICAL_FEATURES + CATEGORICAL_FEATURES]
y_reg_log = np.log1p(train_df.loc[mask_spenders, TARGET])

# --- Pipeline Helper ---
# TargetEncoder is used because it suits both classification and regression.
def get_pipeline(model):
    """Wrap ``model`` in the shared preprocessing pipeline.

    Numerical columns: median imputation followed by RobustScaler.
    Categorical columns: most-frequent imputation followed by TargetEncoder.

    Args:
        model: estimator placed as the final ``'model'`` step.

    Returns:
        A Pipeline with steps ``'preprocessor'`` (ColumnTransformer) and ``'model'``.
    """
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('target', TargetEncoder()),
    ]
    column_prep = ColumnTransformer([
        ('num', Pipeline(numeric_steps), NUMERICAL_FEATURES),
        ('cat', Pipeline(categorical_steps), CATEGORICAL_FEATURES),
    ])
    return Pipeline([('preprocessor', column_prep), ('model', model)])

## Stage 1: Classification Tuning (Optuna)

In [5]:
# --- 1. LGBM Classifier ---
def objective_lgbm_clf(trial):
    """Optuna objective: mean ROC-AUC of an LGBMClassifier over 3 stratified folds.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean cross-validated ROC-AUC on (X_clf, y_clf); higher is better.
    """
    searched = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 800),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
    }
    # Settings held constant across trials
    fixed = {'class_weight': 'balanced', 'random_state': 42, 'verbose': -1, 'n_jobs': -1}

    estimator = get_pipeline(LGBMClassifier(**searched, **fixed))
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    fold_auc = cross_val_score(estimator, X_clf, y_clf, cv=folds, scoring='roc_auc')
    return fold_auc.mean()

# --- 2. XGB Classifier ---
def objective_xgb_clf(trial):
    """Optuna objective: mean ROC-AUC of an XGBClassifier over 3 stratified folds.

    The positive class is re-weighted by the negative/positive count ratio of y_clf.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean cross-validated ROC-AUC on (X_clf, y_clf); higher is better.
    """
    negatives = float(np.sum(y_clf == 0))
    positives = np.sum(y_clf == 1)
    ratio = negatives / positives

    searched = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 800),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
    }
    # Settings held constant across trials
    fixed = {'scale_pos_weight': ratio, 'random_state': 42, 'verbosity': 0, 'n_jobs': 4}

    estimator = get_pipeline(XGBClassifier(**searched, **fixed))
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    fold_auc = cross_val_score(estimator, X_clf, y_clf, cv=folds, scoring='roc_auc')
    return fold_auc.mean()

print("--- Tuning STAGE 1: Classification ---")
# Seed the TPE sampler so the sequence of proposed hyper-parameters is the same
# on every Restart & Run All; an unseeded study proposes different trials each run.
study_lgbm_clf = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study_lgbm_clf.optimize(objective_lgbm_clf, n_trials=20)  # raise n_trials when more time is available
print(f"LGBM Clf Best AUC: {study_lgbm_clf.best_value:.4f}")

study_xgb_clf = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study_xgb_clf.optimize(objective_xgb_clf, n_trials=20)
print(f"XGB Clf Best AUC: {study_xgb_clf.best_value:.4f}")

--- Tuning STAGE 1: Classification ---
LGBM Clf Best AUC: 0.7835
XGB Clf Best AUC: 0.7837


## Stage 2: Regression Tuning (Optuna)

In [6]:
# --- 1. LGBM Regressor ---
def objective_lgbm_reg(trial):
    """Optuna objective: mean 3-fold RMSE of an LGBMRegressor on the log1p target.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean cross-validated RMSE on (X_reg, y_reg_log); lower is better.
    """
    searched = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 80),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
    }
    # Settings held constant across trials
    fixed = {'random_state': 42, 'verbose': -1, 'n_jobs': -1}

    estimator = get_pipeline(LGBMRegressor(**searched, **fixed))
    neg_rmse = cross_val_score(estimator, X_reg, y_reg_log, cv=3, scoring='neg_root_mean_squared_error')
    # The scorer reports negated RMSE; flip the sign so the study can minimize it.
    return -neg_rmse.mean()

# --- 2. XGB Regressor ---
def objective_xgb_reg(trial):
    """Optuna objective: mean 3-fold RMSE of an XGBRegressor on the log1p target.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean cross-validated RMSE on (X_reg, y_reg_log); lower is better.
    """
    searched = {
        'n_estimators': trial.suggest_int('n_estimators', 300, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }
    # Settings held constant across trials
    fixed = {'random_state': 42, 'verbosity': 0, 'n_jobs': 4}

    estimator = get_pipeline(XGBRegressor(**searched, **fixed))
    neg_rmse = cross_val_score(estimator, X_reg, y_reg_log, cv=3, scoring='neg_root_mean_squared_error')
    # The scorer reports negated RMSE; flip the sign so the study can minimize it.
    return -neg_rmse.mean()

# --- 3. CatBoost Regressor ---
def objective_cat_reg(trial):
    """Optuna objective: mean 3-fold RMSE of a CatBoostRegressor on the log1p target.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean cross-validated RMSE on (X_reg, y_reg_log); lower is better.
    """
    searched = {
        'iterations': trial.suggest_int('iterations', 500, 1500),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
    }
    # Settings held constant across trials
    fixed = {'random_seed': 42, 'verbose': 0, 'allow_writing_files': False}

    estimator = get_pipeline(CatBoostRegressor(**searched, **fixed))
    neg_rmse = cross_val_score(estimator, X_reg, y_reg_log, cv=3, scoring='neg_root_mean_squared_error')
    # The scorer reports negated RMSE; flip the sign so the study can minimize it.
    return -neg_rmse.mean()

print("\n--- Tuning STAGE 2: Regression ---")
# Seed the TPE sampler so the sequence of proposed hyper-parameters is the same
# on every Restart & Run All; an unseeded study proposes different trials each run.
study_lgbm_reg = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study_lgbm_reg.optimize(objective_lgbm_reg, n_trials=20)
print(f"LGBM Best RMSE: {study_lgbm_reg.best_value:.4f}")

study_xgb_reg = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study_xgb_reg.optimize(objective_xgb_reg, n_trials=20)
print(f"XGB Best RMSE: {study_xgb_reg.best_value:.4f}")

# CatBoost gets fewer trials than the other two studies (15 vs 20).
study_cat_reg = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study_cat_reg.optimize(objective_cat_reg, n_trials=15)
print(f"CatBoost Best RMSE: {study_cat_reg.best_value:.4f}")


--- Tuning STAGE 2: Regression ---
LGBM Best RMSE: 0.2221


[W 2025-11-26 00:58:06,138] Trial 4 failed with parameters: {'n_estimators': 349, 'learning_rate': 0.029607363211508603, 'max_depth': 4, 'min_child_weight': 1} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\DELL\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\optuna\study\_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\DELL\AppData\Local\Temp\ipykernel_1980\838964874.py", line 23, in objective_xgb_reg
    return -cross_val_score(pipeline, X_reg, y_reg_log, cv=3, scoring='neg_root_mean_squared_error').mean()
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\DELL\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\utils\_param_validatio

KeyboardInterrupt: 

## Build Final Models

In [None]:
# --- Final classifier: soft-voting ensemble of the tuned LGBM and XGB models ---
print("Training Final Classifier...")
# Negative/positive count ratio, used to re-weight the positive class in XGBoost.
n_non_spenders = float(np.sum(y_clf == 0))
n_spenders = np.sum(y_clf == 1)
ratio = n_non_spenders / n_spenders

clf_lgbm = LGBMClassifier(
    **study_lgbm_clf.best_params,
    class_weight='balanced',
    random_state=42,
    verbose=-1,
)
clf_xgb = XGBClassifier(
    **study_xgb_clf.best_params,
    scale_pos_weight=ratio,
    random_state=42,
    verbosity=0,
)

voting_clf = VotingClassifier(
    estimators=[('lgbm', clf_lgbm), ('xgb', clf_xgb)],
    voting='soft',
    n_jobs=-1,
)
final_clf_pipeline = get_pipeline(voting_clf)
final_clf_pipeline.fit(X_clf, y_clf)

# --- Final regressor: stacked ensemble fitted on spenders only (log1p target) ---
print("Fitting Final Regressor...")

# Boosters configured with the best parameters found by the regression studies above.
reg_lgbm = LGBMRegressor(**study_lgbm_reg.best_params, random_state=42, verbose=-1)
reg_xgb = XGBRegressor(**study_xgb_reg.best_params, random_state=42, verbosity=0)
reg_cat = CatBoostRegressor(
    **study_cat_reg.best_params,
    random_seed=42,
    verbose=0,
    allow_writing_files=False,
)

base_regressors = [
    # The three tuned boosting models
    ('lgbm', reg_lgbm),
    ('xgb', reg_xgb),
    ('cat', reg_cat),
    # Untuned sklearn gradient boosting, added back for extra diversity
    ('gboost', GradientBoostingRegressor(n_estimators=300, learning_rate=0.05, max_depth=4, random_state=42)),
    # Linear models (help smooth the blend)
    ('lasso', make_pipeline(RobustScaler(), LassoCV(random_state=42))),
    ('enet', make_pipeline(RobustScaler(), ElasticNetCV(cv=5, l1_ratio=[.1, .5, .7, .9, .95, .99, 1], random_state=42))),
]

stack_reg = StackingRegressor(
    estimators=base_regressors,
    final_estimator=KernelRidge(alpha=0.5),
    n_jobs=-1,
    passthrough=False,
)

final_reg_pipeline = get_pipeline(stack_reg)
final_reg_pipeline.fit(X_reg, y_reg_log)

print("All Models Trained Successfully (Full Stack).")

Training Final Classifier...
Fitting Final Regressor...


## 6. Final Prediction

In [None]:
feature_columns = NUMERICAL_FEATURES + CATEGORICAL_FEATURES
X_test = test_df[feature_columns]

# 1. Probability that the player spends at all (positive-class column)
prob_spend = final_clf_pipeline.predict_proba(X_test)[:, 1]

# 2. Predicted amount: the regressor works on the log1p scale, so invert with expm1
pred_log = final_reg_pipeline.predict(X_test)
pred_amount = np.expm1(pred_log)

# 3. Hybrid: probability-weighted amount, with negative values floored at zero
weighted_amount = prob_spend * pred_amount
final_predictions = np.where(weighted_amount < 0, 0, weighted_amount)

## Submission

In [8]:
# Load the existing multi-task submission file, overwrite the task3 column with
# the hybrid predictions, and save the result under a new file name.
target_column = 'task3'
submission_df = pd.read_csv("final_submission.csv")
print(f"Filling '{target_column}' column with predictions...")
submission_df = submission_df.assign(**{target_column: final_predictions})
submission_df.to_csv("final_submission_task3_upgrade_Hybrid.csv", index=False)
print(submission_df.iloc[:20])

Filling 'task3' column with predictions...
          id  task1  task2          task3  task4  task5
0   ANS00001    1.0      2     467.766371      1      0
1   ANS00002    0.0      0    1344.248627      3      0
2   ANS00003    1.0      0  172439.899746      3      1
3   ANS00004    0.0      0      80.241973      0      0
4   ANS00005    0.0      0     351.336613      3      0
5   ANS00006    1.0      2      90.908155      2      0
6   ANS00007    0.0      1     218.502550      1      0
7   ANS00008    0.0      0   10508.420949      3      0
8   ANS00009    1.0      0       6.638823      0      0
9   ANS00010    0.0      1      42.069916      3      0
10  ANS00011    0.0      2      12.265593      1      0
11  ANS00012    0.0      0     222.355253      3      0
12  ANS00013    0.0      0       0.928631      1      0
13  ANS00014    0.0      1    1588.812869      2      0
14  ANS00015    1.0      2      50.094786      4      0
15  ANS00016    0.0      0      61.773818      0      0
16  A