In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score
from scipy import stats
import warnings

warnings.filterwarnings("ignore")

# ==========================================
# CONFIGURATION
# ==========================================
RANDOM_STATE = 42
TARGET = 'spending_30d'
N_FOLDS = 7  # Increased for better generalization

# GPU Configuration (Set to your available GPUs)
USE_MULTI_GPU = True  # Enable multi-GPU support
GPU_IDS = [0, 1]  # Your 2 GPUs

# ==========================================
# ADVANCED FEATURE ENGINEERING
# ==========================================
def create_advanced_features(df):
    """
    Comprehensive feature engineering focusing on:
    - Interaction features
    - Behavioral patterns
    - Statistical aggregations
    - Temporal features
    """
    df = df.copy()
    
    # === Base Preprocessing ===
    cat_cols = ['primary_game', 'platform', 'vip_status', 'segment']
    for col in cat_cols:
        df[col] = df[col].fillna(-1).astype(int)
    
    zero_cols = [
        'is_premium_member', 'guild_membership', 'owns_limited_edition', 
        'tournament_participation', 'friend_count', 'social_interactions', 
        'daily_login_streak', 'historical_spending', 'prev_month_spending',
        'total_transactions', 'avg_transaction_value', 'purchases_on_discount'
    ]
    for col in zero_cols:
        df[col] = df[col].fillna(0)
    
    df['days_since_last_purchase'] = df['days_since_last_purchase'].fillna(9999)
    
    # Fill remaining numeric columns
    filled_cols = cat_cols + zero_cols + ['days_since_last_purchase', 'id', 'player_id', TARGET]
    num_cols = [c for c in df.columns if c not in filled_cols]
    for col in num_cols:
        df[col] = df[col].fillna(df[col].median())
    
    # === 1. SPENDING BEHAVIOR FEATURES ===
    # Historical spending patterns
    df['has_spent_before'] = (df['historical_spending'] > 0).astype(int)
    df['spending_momentum'] = df['prev_month_spending'] / (df['historical_spending'] + 1)
    df['spending_acceleration'] = df['prev_month_spending'] - df['historical_spending'] / 12  # Monthly avg
    
    # Transaction patterns
    df['transactions_per_dollar'] = df['total_transactions'] / (df['historical_spending'] + 1)
    df['avg_transaction_growth'] = df['avg_transaction_value'] / (df['historical_spending'] / (df['total_transactions'] + 1) + 1)
    
    # Purchase recency score
    df['purchase_recency_score'] = 1 / (df['days_since_last_purchase'] + 1)
    df['is_recent_buyer'] = (df['days_since_last_purchase'] < 30).astype(int)
    df['is_dormant'] = (df['days_since_last_purchase'] > 180).astype(int)
    
    # Discount behavior
    df['discount_dependency'] = df['purchases_on_discount'] / (df['total_transactions'] + 1)
    df['full_price_purchases'] = df['total_transactions'] - df['purchases_on_discount']
    
    # === 2. ENGAGEMENT FEATURES ===
    # Social engagement
    df['social_engagement_ratio'] = df['social_interactions'] / (df['friend_count'] + 1)
    df['is_social_player'] = (df['friend_count'] > df['friend_count'].median()).astype(int)
    
    # Activity intensity
    df['activity_score'] = (
        df['daily_login_streak'] * 0.3 + 
        df['social_interactions'] * 0.3 + 
        df['tournament_participation'] * 0.4
    )
    
    # Commitment indicators
    df['commitment_score'] = (
        df['is_premium_member'] * 3 + 
        df['guild_membership'] * 2 + 
        df['owns_limited_edition'] * 2 +
        (df['daily_login_streak'] > 7).astype(int) * 1
    )
    
    # === 3. VALUE FEATURES ===
    # Lifetime value indicators
    df['ltv_score'] = df['historical_spending'] / (df['days_since_last_purchase'] + 1)
    df['monthly_value'] = df['historical_spending'] / 12
    df['value_consistency'] = df['avg_transaction_value'] / (df['monthly_value'] + 1)
    
    # Spending capacity
    df['spending_capacity'] = df['avg_transaction_value'] * df['daily_login_streak']
    df['whale_indicator'] = (df['avg_transaction_value'] > df['avg_transaction_value'].quantile(0.9)).astype(int)
    
    # === 4. INTERACTION FEATURES ===
    # VIP interactions
    df['vip_spending_ratio'] = df['historical_spending'] * (df['vip_status'] + 1)
    df['vip_activity'] = df['activity_score'] * (df['vip_status'] + 1)
    
    # Premium features interaction
    df['premium_features'] = (
        df['is_premium_member'] + 
        df['guild_membership'] + 
        df['owns_limited_edition']
    )
    df['premium_spending'] = df['historical_spending'] * df['premium_features']
    
    # Segment interactions
    df['segment_value'] = df['segment'] * df['avg_transaction_value']
    df['segment_activity'] = df['segment'] * df['daily_login_streak']
    
    # === 5. STATISTICAL FEATURES ===
    # Log transformations for skewed features
    log_features = [
        'historical_spending', 'prev_month_spending', 'total_transactions',
        'avg_transaction_value', 'friend_count', 'social_interactions'
    ]
    for col in log_features:
        if col in df.columns:
            df[f'{col}_log1p'] = np.log1p(df[col])
            df[f'{col}_sqrt'] = np.sqrt(df[col])
    
    # Binned features for categorical interactions
    df['spending_tier'] = pd.qcut(df['historical_spending'], q=5, labels=False, duplicates='drop').fillna(-1).astype(int)
    df['activity_tier'] = pd.qcut(df['daily_login_streak'], q=5, labels=False, duplicates='drop').fillna(-1).astype(int)
    df['transaction_tier'] = pd.qcut(df['total_transactions'], q=5, labels=False, duplicates='drop').fillna(-1).astype(int)
    
    # === 6. RISK/CHURN FEATURES ===
    df['churn_risk'] = (
        (df['days_since_last_purchase'] > 90).astype(int) * 3 +
        (df['daily_login_streak'] < 3).astype(int) * 2 +
        (df['prev_month_spending'] == 0).astype(int) * 2
    )
    
    # Engagement decay
    df['engagement_decay'] = df['daily_login_streak'] / (df['days_since_last_purchase'] + 1)
    
    return df

# ==========================================
# OPTIMIZED MODEL PARAMETERS
# ==========================================

# Stage 1: Classification (Spender vs Non-Spender)
PARAMS_S1_CAT = {
    'iterations': 2000,
    'learning_rate': 0.03,
    'depth': 8,
    'l2_leaf_reg': 8.0,
    'border_count': 254,
    'bagging_temperature': 0.8,
    'random_strength': 1.5,
    'scale_pos_weight': 1.0,  # Will be set dynamically
    'eval_metric': 'Logloss',
    'random_state': RANDOM_STATE,
    'verbose': False,
    'allow_writing_files': False,
    'task_type': 'GPU',
    'devices': '0:1' if USE_MULTI_GPU else '0'  # Use both GPUs
}

PARAMS_S1_LGB = {
    'n_estimators': 1500,
    'learning_rate': 0.02,
    'num_leaves': 40,
    'max_depth': 10,
    'min_child_samples': 25,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_lambda': 3.0,
    'reg_alpha': 1.5,
    'scale_pos_weight': 1.0,  # Will be set dynamically
    'objective': 'binary',
    'metric': 'auc',
    'random_state': RANDOM_STATE,
    'verbosity': -1,
    'device': 'gpu',  # GPU acceleration
    'gpu_platform_id': 0,
    'gpu_device_id': 0
}

# Stage 2: Regression (Amount Prediction)
PARAMS_S2_CAT = {
    'iterations': 5000,
    'learning_rate': 0.008,
    'depth': 7,
    'l2_leaf_reg': 3.0,
    'border_count': 200,
    'bagging_temperature': 0.6,
    'random_strength': 1.2,
    'min_data_in_leaf': 10,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_state': RANDOM_STATE,
    'verbose': False,
    'allow_writing_files': False,
    'task_type': 'GPU',  # GPU acceleration
    'devices': '0'
}

PARAMS_S2_LGB_MAIN = {
    'n_estimators': 4500,
    'learning_rate': 0.01,
    'num_leaves': 64,
    'max_depth': 10,
    'min_child_samples': 15,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'reg_lambda': 2.0,
    'reg_alpha': 0.8,
    'min_split_gain': 0.01,
    'objective': 'regression',
    'metric': 'rmse',
    'random_state': RANDOM_STATE,
    'verbosity': -1,
    'device': 'gpu',  # GPU acceleration
    'gpu_platform_id': 0,
    'gpu_device_id': 0
}

PARAMS_S2_LGB_DEEP = {
    'n_estimators': 4000,
    'learning_rate': 0.006,
    'num_leaves': 150,
    'max_depth': 15,
    'min_child_samples': 8,
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'reg_lambda': 4.0,
    'reg_alpha': 0.5,
    'min_split_gain': 0.005,
    'objective': 'regression',
    'metric': 'rmse',
    'random_state': RANDOM_STATE,
    'verbosity': -1,
    'device': 'gpu',  # GPU acceleration
    'gpu_platform_id': 0,
    'gpu_device_id': 0
}

# Ensemble weights (tuned based on typical performance)
W_S1 = {'cat': 0.55, 'lgb': 0.45}
W_S2 = {'cat': 0.50, 'lgb_main': 0.30, 'lgb_deep': 0.20}

# ==========================================
# DATA LOADING & PREPROCESSING
# ==========================================
print("=" * 60)
print("ADVANCED TWO-STAGE PREDICTION PIPELINE (GPU ACCELERATED)")
print("=" * 60)

# Check GPU availability
print("\nüîç Checking GPU availability...")
try:
    import subprocess
    gpu_info = subprocess.check_output(['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader']).decode()
    print(f"‚úì GPU Detected: {gpu_info.strip()}")
except:
    print("‚ö† GPU not detected. Models will fallback to CPU if GPU unavailable.")

print("\n[1/5] Loading data...")
train = pd.read_csv('/kaggle/input/cpe342-karena/public_dataset/task3/train.csv')
test = pd.read_csv('/kaggle/input/cpe342-karena/public_dataset/task3/test.csv')

print(f"Train shape: {train.shape}, Test shape: {test.shape}")

print("\n[2/5] Creating advanced features...")
train_fe = create_advanced_features(train)
test_fe = create_advanced_features(test)

# Feature selection
exclude_cols = ['id', 'player_id', TARGET, 'days_since_last_purchase']  # days_since excluded as it's often leaky
feature_cols = [c for c in train_fe.columns if c not in exclude_cols]

# Identify categorical features
cat_features = [
    'primary_game', 'platform', 'vip_status', 'segment',
    'spending_tier', 'activity_tier', 'transaction_tier'
]
cat_features = [c for c in cat_features if c in feature_cols]

print(f"Total features: {len(feature_cols)}")
print(f"Categorical features: {len(cat_features)}")

X = train_fe[feature_cols].copy()
y_binary = (train_fe[TARGET] > 0).astype(int)
X_test = test_fe[feature_cols].copy()

# Convert categoricals
for c in cat_features:
    X[c] = X[c].astype('category')
    X_test[c] = X_test[c].astype('category')

# ==========================================
# STAGE 1: CLASSIFICATION
# ==========================================
print("\n" + "=" * 60)
print("[3/5] STAGE 1: Spender Classification")
print("=" * 60)

# Calculate class imbalance
pos_rate = y_binary.mean()
scale_pos_weight = (1 - pos_rate) / pos_rate
print(f"Positive class rate: {pos_rate:.4f}")
print(f"Scale pos weight: {scale_pos_weight:.4f}")

PARAMS_S1_CAT['scale_pos_weight'] = scale_pos_weight
PARAMS_S1_LGB['scale_pos_weight'] = scale_pos_weight

kf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
oof_prob_s1 = np.zeros(len(X))
test_prob_s1 = np.zeros(len(X_test))
fold_aucs = []

for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y_binary), 1):
    print(f"\n  Fold {fold}/{N_FOLDS}")
    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_val = y_binary.iloc[tr_idx], y_binary.iloc[val_idx]
    
    # CatBoost
    m1 = CatBoostClassifier(**PARAMS_S1_CAT, cat_features=cat_features)
    m1.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=100)
    p1_val = m1.predict_proba(X_val)[:, 1]
    p1_test = m1.predict_proba(X_test)[:, 1]
    
    # LightGBM
    m2 = lgb.LGBMClassifier(**PARAMS_S1_LGB)
    m2.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(100, verbose=False)])
    p2_val = m2.predict_proba(X_val)[:, 1]
    p2_test = m2.predict_proba(X_test)[:, 1]
    
    # Ensemble
    fold_pred = W_S1['cat'] * p1_val + W_S1['lgb'] * p2_val
    oof_prob_s1[val_idx] = fold_pred
    test_prob_s1 += (W_S1['cat'] * p1_test + W_S1['lgb'] * p2_test) / N_FOLDS
    
    fold_auc = roc_auc_score(y_val, fold_pred)
    fold_aucs.append(fold_auc)
    print(f"    AUC: {fold_auc:.5f}")

# Overall metrics
overall_auc = roc_auc_score(y_binary, oof_prob_s1)
print(f"\n  Overall OOF AUC: {overall_auc:.5f} ¬± {np.std(fold_aucs):.5f}")

# Optimize threshold
best_acc = 0
best_thresh = 0.5
for t in np.arange(0.2, 0.8, 0.005):
    acc = accuracy_score(y_binary, (oof_prob_s1 > t).astype(int))
    if acc > best_acc:
        best_acc, best_thresh = acc, t

print(f"  Optimal Threshold: {best_thresh:.4f} (Accuracy: {best_acc:.4f})")

# ==========================================
# STAGE 2: REGRESSION
# ==========================================
print("\n" + "=" * 60)
print("[4/5] STAGE 2: Amount Prediction (Spenders Only)")
print("=" * 60)

mask_spenders = train_fe[TARGET] > 0
X_reg = train_fe[mask_spenders][feature_cols].reset_index(drop=True)
y_reg = train_fe[mask_spenders][TARGET].reset_index(drop=True)
y_reg_log = np.log1p(y_reg)

print(f"Training on {len(X_reg)} spenders")
print(f"Mean spending: ‡∏ø{y_reg.mean():.2f}, Median: ‡∏ø{y_reg.median():.2f}")

# Convert categoricals
for c in cat_features:
    X_reg[c] = X_reg[c].astype('category')

oof_amount_s2 = np.zeros(len(X_reg))
test_amount_s2 = np.zeros(len(X_test))
kf_reg = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
fold_rmses = []

for fold, (tr_idx, val_idx) in enumerate(kf_reg.split(X_reg), 1):
    print(f"\n  Fold {fold}/{N_FOLDS}")
    X_tr, X_val = X_reg.iloc[tr_idx], X_reg.iloc[val_idx]
    y_tr, y_val = y_reg_log.iloc[tr_idx], y_reg_log.iloc[val_idx]
    
    # CatBoost
    r1 = CatBoostRegressor(**PARAMS_S2_CAT, cat_features=cat_features)
    r1.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=150)
    p1_val = r1.predict(X_val)
    p1_test = r1.predict(X_test)
    
    # LightGBM Main
    r2 = lgb.LGBMRegressor(**PARAMS_S2_LGB_MAIN)
    r2.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(150, verbose=False)])
    p2_val = r2.predict(X_val)
    p2_test = r2.predict(X_test)
    
    # LightGBM Deep
    r3 = lgb.LGBMRegressor(**PARAMS_S2_LGB_DEEP)
    r3.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(150, verbose=False)])
    p3_val = r3.predict(X_val)
    p3_test = r3.predict(X_test)
    
    # Ensemble (log space)
    fold_pred = (
        W_S2['cat'] * p1_val + 
        W_S2['lgb_main'] * p2_val + 
        W_S2['lgb_deep'] * p3_val
    )
    oof_amount_s2[val_idx] = fold_pred
    test_amount_s2 += (
        W_S2['cat'] * p1_test + 
        W_S2['lgb_main'] * p2_test + 
        W_S2['lgb_deep'] * p3_test
    ) / N_FOLDS
    
    fold_rmse = np.sqrt(mean_squared_error(y_val, fold_pred))
    fold_rmses.append(fold_rmse)
    print(f"    RMSE (log): {fold_rmse:.5f}")

# Evaluation
rmse_log = np.sqrt(mean_squared_error(y_reg_log, oof_amount_s2))
oof_amount_raw = np.expm1(oof_amount_s2)
rmse_raw = np.sqrt(mean_squared_error(y_reg, oof_amount_raw))

print(f"\n  Overall OOF RMSE (log): {rmse_log:.5f} ¬± {np.std(fold_rmses):.5f}")
print(f"  Overall OOF RMSE (THB): ‡∏ø{rmse_raw:.2f}")

# ==========================================
# FINAL PREDICTIONS
# ==========================================
print("\n" + "=" * 60)
print("[5/5] Generating Final Predictions")
print("=" * 60)

# Convert test predictions from log to raw
test_amount_raw = np.expm1(test_amount_s2)

# Combine stages with threshold
final_preds = np.where(test_prob_s1 > best_thresh, test_amount_raw, 0)

# Safety clip
final_preds = np.clip(final_preds, 0, 500000)

# Statistics
print(f"\nPrediction Statistics:")
print(f"  Threshold: {best_thresh:.4f}")
print(f"  Predicted spenders: {np.sum(final_preds > 0):,} / {len(final_preds):,} ({100*np.sum(final_preds > 0)/len(final_preds):.2f}%)")
print(f"  Mean prediction: ‡∏ø{final_preds[final_preds > 0].mean():.2f}")
print(f"  Median prediction: ‡∏ø{np.median(final_preds[final_preds > 0]):.2f}")
print(f"  Max prediction: ‡∏ø{final_preds.max():.2f}")
print(f"  Total predicted revenue: ‡∏ø{final_preds.sum():,.2f}")

# Save submission
submission = pd.DataFrame({
    'id': test['id'],
    'spending_30d': final_preds
})
submission.to_csv('submission_enhanced.csv', index=False)

print("\n" + "=" * 60)
print("‚úì Pipeline Complete! Submission saved as 'submission_enhanced.csv'")
print("=" * 60)

ADVANCED TWO-STAGE PREDICTION PIPELINE (GPU ACCELERATED)

üîç Checking GPU availability...
‚úì GPU Detected: Tesla T4, 15360 MiB
Tesla T4, 15360 MiB

[1/5] Loading data...
Train shape: (104000, 35), Test shape: (25889, 34)

[2/5] Creating advanced features...
Total features: 73
Categorical features: 7

[3/5] STAGE 1: Spender Classification
Positive class rate: 0.5180
Scale pos weight: 0.9306

  Fold 1/7
    AUC: 0.78826

  Fold 2/7
    AUC: 0.78667

  Fold 3/7
    AUC: 0.78298

  Fold 4/7
    AUC: 0.78354

  Fold 5/7
    AUC: 0.78035

  Fold 6/7
    AUC: 0.78393

  Fold 7/7
    AUC: 0.78375

  Overall OOF AUC: 0.78384 ¬± 0.00238
  Optimal Threshold: 0.4700 (Accuracy: 0.7198)

[4/5] STAGE 2: Amount Prediction (Spenders Only)
Training on 53868 spenders
Mean spending: ‡∏ø20019.98, Median: ‡∏ø2297.29

  Fold 1/7
    RMSE (log): 0.22014

  Fold 2/7
    RMSE (log): 0.21886

  Fold 3/7
    RMSE (log): 0.22397

  Fold 4/7
    RMSE (log): 0.20172

  Fold 5/7
    RMSE (log): 0.22872

  Fold 6/7

In [3]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score
from scipy import stats
import warnings

warnings.filterwarnings("ignore")

# ==========================================
# CONFIGURATION
# ==========================================
RANDOM_STATE = 42
TARGET = 'spending_30d'
N_FOLDS = 7  # Increased for better generalization

# GPU Configuration (Set to your available GPUs)
USE_MULTI_GPU = True  # Enable multi-GPU support
GPU_IDS = [0, 1]  # Your 2 GPUs

# ==========================================
# ADVANCED FEATURE ENGINEERING
# ==========================================
def create_advanced_features(df):
    """
    Comprehensive feature engineering focusing on:
    - Interaction features
    - Behavioral patterns
    - Statistical aggregations
    - Temporal features
    """
    df = df.copy()
    
    # === Base Preprocessing ===
    cat_cols = ['primary_game', 'platform', 'vip_status', 'segment']
    for col in cat_cols:
        df[col] = df[col].fillna(-1).astype(int)
    
    zero_cols = [
        'is_premium_member', 'guild_membership', 'owns_limited_edition', 
        'tournament_participation', 'friend_count', 'social_interactions', 
        'daily_login_streak', 'historical_spending', 'prev_month_spending',
        'total_transactions', 'avg_transaction_value', 'purchases_on_discount'
    ]
    for col in zero_cols:
        df[col] = df[col].fillna(0)
    
    df['days_since_last_purchase'] = df['days_since_last_purchase'].fillna(9999)
    
    # Fill remaining numeric columns
    filled_cols = cat_cols + zero_cols + ['days_since_last_purchase', 'id', 'player_id', TARGET]
    num_cols = [c for c in df.columns if c not in filled_cols]
    for col in num_cols:
        df[col] = df[col].fillna(df[col].median())
    
    # === 1. SPENDING BEHAVIOR FEATURES ===
    # Historical spending patterns
    df['has_spent_before'] = (df['historical_spending'] > 0).astype(int)
    df['spending_momentum'] = df['prev_month_spending'] / (df['historical_spending'] + 1)
    df['spending_acceleration'] = df['prev_month_spending'] - df['historical_spending'] / 12  # Monthly avg
    
    # Transaction patterns
    df['transactions_per_dollar'] = df['total_transactions'] / (df['historical_spending'] + 1)
    df['avg_transaction_growth'] = df['avg_transaction_value'] / (df['historical_spending'] / (df['total_transactions'] + 1) + 1)
    
    # Purchase recency score
    df['purchase_recency_score'] = 1 / (df['days_since_last_purchase'] + 1)
    df['is_recent_buyer'] = (df['days_since_last_purchase'] < 30).astype(int)
    df['is_dormant'] = (df['days_since_last_purchase'] > 180).astype(int)
    
    # Discount behavior
    df['discount_dependency'] = df['purchases_on_discount'] / (df['total_transactions'] + 1)
    df['full_price_purchases'] = df['total_transactions'] - df['purchases_on_discount']
    
    # === 2. ENGAGEMENT FEATURES ===
    # Social engagement
    df['social_engagement_ratio'] = df['social_interactions'] / (df['friend_count'] + 1)
    df['is_social_player'] = (df['friend_count'] > df['friend_count'].median()).astype(int)
    
    # Activity intensity
    df['activity_score'] = (
        df['daily_login_streak'] * 0.3 + 
        df['social_interactions'] * 0.3 + 
        df['tournament_participation'] * 0.4
    )
    
    # Commitment indicators
    df['commitment_score'] = (
        df['is_premium_member'] * 3 + 
        df['guild_membership'] * 2 + 
        df['owns_limited_edition'] * 2 +
        (df['daily_login_streak'] > 7).astype(int) * 1
    )
    
    # === 3. VALUE FEATURES ===
    # Lifetime value indicators
    df['ltv_score'] = df['historical_spending'] / (df['days_since_last_purchase'] + 1)
    df['monthly_value'] = df['historical_spending'] / 12
    df['value_consistency'] = df['avg_transaction_value'] / (df['monthly_value'] + 1)
    
    # Spending capacity
    df['spending_capacity'] = df['avg_transaction_value'] * df['daily_login_streak']
    df['whale_indicator'] = (df['avg_transaction_value'] > df['avg_transaction_value'].quantile(0.9)).astype(int)
    
    # === 4. INTERACTION FEATURES ===
    # VIP interactions
    df['vip_spending_ratio'] = df['historical_spending'] * (df['vip_status'] + 1)
    df['vip_activity'] = df['activity_score'] * (df['vip_status'] + 1)
    
    # Premium features interaction
    df['premium_features'] = (
        df['is_premium_member'] + 
        df['guild_membership'] + 
        df['owns_limited_edition']
    )
    df['premium_spending'] = df['historical_spending'] * df['premium_features']
    
    # Segment interactions
    df['segment_value'] = df['segment'] * df['avg_transaction_value']
    df['segment_activity'] = df['segment'] * df['daily_login_streak']
    
    # === 5. STATISTICAL FEATURES ===
    # Log transformations for skewed features
    log_features = [
        'historical_spending', 'prev_month_spending', 'total_transactions',
        'avg_transaction_value', 'friend_count', 'social_interactions'
    ]
    for col in log_features:
        if col in df.columns:
            df[f'{col}_log1p'] = np.log1p(df[col])
            df[f'{col}_sqrt'] = np.sqrt(df[col])
    
    # Binned features for categorical interactions
    df['spending_tier'] = pd.qcut(df['historical_spending'], q=5, labels=False, duplicates='drop').fillna(-1).astype(int)
    df['activity_tier'] = pd.qcut(df['daily_login_streak'], q=5, labels=False, duplicates='drop').fillna(-1).astype(int)
    df['transaction_tier'] = pd.qcut(df['total_transactions'], q=5, labels=False, duplicates='drop').fillna(-1).astype(int)
    
    # === 6. RISK/CHURN FEATURES ===
    df['churn_risk'] = (
        (df['days_since_last_purchase'] > 90).astype(int) * 3 +
        (df['daily_login_streak'] < 3).astype(int) * 2 +
        (df['prev_month_spending'] == 0).astype(int) * 2
    )
    
    # Engagement decay
    df['engagement_decay'] = df['daily_login_streak'] / (df['days_since_last_purchase'] + 1)
    
    return df

# ==========================================
# OPTIMIZED MODEL PARAMETERS
# ==========================================

# Stage 1: Classification (Spender vs Non-Spender)
PARAMS_S1_CAT = {
    'iterations': 2000,
    'learning_rate': 0.03,
    'depth': 8,
    'l2_leaf_reg': 8.0,
    'border_count': 254,
    'bagging_temperature': 0.8,
    'random_strength': 1.5,
    'scale_pos_weight': 1.0,  # Will be set dynamically
    'eval_metric': 'Logloss',  # GPU-compatible metric (AUC not supported on GPU)
    'random_state': RANDOM_STATE,
    'verbose': False,
    'allow_writing_files': False,
    'task_type': 'GPU',
    'devices': '0:1' if USE_MULTI_GPU else '0'  # Use both GPUs
}

PARAMS_S1_LGB = {
    'n_estimators': 1500,
    'learning_rate': 0.02,
    'num_leaves': 40,
    'max_depth': 10,
    'min_child_samples': 25,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_lambda': 3.0,
    'reg_alpha': 1.5,
    'scale_pos_weight': 1.0,  # Will be set dynamically
    'objective': 'binary',
    'metric': 'auc',
    'random_state': RANDOM_STATE,
    'verbosity': -1,
    'device': 'gpu',  # GPU acceleration
    'gpu_platform_id': 0,
    'gpu_device_id': 0
}

# Stage 2: Regression (Amount Prediction)
PARAMS_S2_CAT = {
    'iterations': 5000,
    'learning_rate': 0.007,  # Slightly lower for more stable learning
    'depth': 6,  # Reduced from 7 to prevent overfitting
    'l2_leaf_reg': 5.0,  # Increased regularization
    'border_count': 200,
    'bagging_temperature': 0.7,  # Increased for more randomness
    'random_strength': 1.5,  # Increased for better generalization
    'min_data_in_leaf': 15,  # Increased from 10
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_state': RANDOM_STATE,
    'verbose': False,
    'allow_writing_files': False,
    'task_type': 'GPU',
    'devices': '0:1' if USE_MULTI_GPU else '0'  # Use both GPUs
}

PARAMS_S2_LGB_MAIN = {
    'n_estimators': 4500,
    'learning_rate': 0.009,  # Slightly lower
    'num_leaves': 50,  # Reduced from 64
    'max_depth': 9,  # Reduced from 10
    'min_child_samples': 20,  # Increased from 15
    'subsample': 0.82,  # Slightly reduced
    'colsample_bytree': 0.82,  # Slightly reduced
    'reg_lambda': 3.0,  # Increased regularization
    'reg_alpha': 1.2,  # Increased regularization
    'min_split_gain': 0.015,  # Increased
    'objective': 'regression',
    'metric': 'rmse',
    'random_state': RANDOM_STATE,
    'verbosity': -1,
    'device': 'gpu',  # GPU acceleration
    'gpu_platform_id': 0,
    'gpu_device_id': 0
}

PARAMS_S2_LGB_DEEP = {
    'n_estimators': 4000,
    'learning_rate': 0.006,
    'num_leaves': 150,
    'max_depth': 15,
    'min_child_samples': 8,
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'reg_lambda': 4.0,
    'reg_alpha': 0.5,
    'min_split_gain': 0.005,
    'objective': 'regression',
    'metric': 'rmse',
    'random_state': RANDOM_STATE,
    'verbosity': -1,
    'device': 'gpu',  # GPU acceleration
    'gpu_platform_id': 0,
    'gpu_device_id': 0
}

# Ensemble weights (tuned based on typical performance)
W_S1 = {'cat': 0.55, 'lgb': 0.45}
W_S2 = {'cat': 0.50, 'lgb_main': 0.30, 'lgb_deep': 0.20}

# ==========================================
# DATA LOADING & PREPROCESSING
# ==========================================
print("=" * 60)
print("ADVANCED TWO-STAGE PREDICTION PIPELINE (MULTI-GPU)")
print("=" * 60)

# Check GPU availability
print("\nüîç Checking GPU availability...")
try:
    import subprocess
    gpu_info = subprocess.check_output(['nvidia-smi', '--query-gpu=index,name,memory.total', '--format=csv,noheader']).decode()
    gpu_lines = gpu_info.strip().split('\n')
    print(f"‚úì Detected {len(gpu_lines)} GPU(s):")
    for line in gpu_lines:
        print(f"  - GPU {line}")
    
    if USE_MULTI_GPU and len(gpu_lines) >= 2:
        print(f"\nüöÄ Multi-GPU Mode ENABLED: Using GPU {GPU_IDS}")
    else:
        print(f"\n‚ö° Single-GPU Mode: Using GPU {GPU_IDS[0]}")
except:
    print("‚ö† GPU not detected. Models will fallback to CPU if GPU unavailable.")

print("\n[1/5] Loading data...")
train = pd.read_csv('/kaggle/input/cpe342-karena/public_dataset/task3/train.csv')
test = pd.read_csv('/kaggle/input/cpe342-karena/public_dataset/task3/test.csv')

print(f"Train shape: {train.shape}, Test shape: {test.shape}")

print("\n[2/5] Creating advanced features...")
train_fe = create_advanced_features(train)
test_fe = create_advanced_features(test)

# Feature selection
exclude_cols = ['id', 'player_id', TARGET, 'days_since_last_purchase']  # days_since excluded as it's often leaky
feature_cols = [c for c in train_fe.columns if c not in exclude_cols]

# Identify categorical features
cat_features = [
    'primary_game', 'platform', 'vip_status', 'segment',
    'spending_tier', 'activity_tier', 'transaction_tier'
]
cat_features = [c for c in cat_features if c in feature_cols]

print(f"Total features: {len(feature_cols)}")
print(f"Categorical features: {len(cat_features)}")

X = train_fe[feature_cols].copy()
y_binary = (train_fe[TARGET] > 0).astype(int)
X_test = test_fe[feature_cols].copy()

# Convert categoricals
for c in cat_features:
    X[c] = X[c].astype('category')
    X_test[c] = X_test[c].astype('category')

# ==========================================
# STAGE 1: CLASSIFICATION
# ==========================================
print("\n" + "=" * 60)
print("[3/5] STAGE 1: Spender Classification")
print("=" * 60)

# Calculate class imbalance
pos_rate = y_binary.mean()
scale_pos_weight = (1 - pos_rate) / pos_rate
print(f"Positive class rate: {pos_rate:.4f}")
print(f"Scale pos weight: {scale_pos_weight:.4f}")

PARAMS_S1_CAT['scale_pos_weight'] = scale_pos_weight
PARAMS_S1_LGB['scale_pos_weight'] = scale_pos_weight

kf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
oof_prob_s1 = np.zeros(len(X))
test_prob_s1 = np.zeros(len(X_test))
fold_aucs = []

for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y_binary), 1):
    print(f"\n  Fold {fold}/{N_FOLDS}")
    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_val = y_binary.iloc[tr_idx], y_binary.iloc[val_idx]
    
    # Dynamically assign GPU for LightGBM models (alternate between GPUs)
    current_gpu = GPU_IDS[fold % len(GPU_IDS)] if USE_MULTI_GPU else GPU_IDS[0]
    
    # CatBoost (uses both GPUs automatically with '0:1')
    m1 = CatBoostClassifier(**PARAMS_S1_CAT, cat_features=cat_features)
    m1.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=100)
    p1_val = m1.predict_proba(X_val)[:, 1]
    p1_test = m1.predict_proba(X_test)[:, 1]
    
    # LightGBM (assign specific GPU per fold for load balancing)
    params_lgb_fold = PARAMS_S1_LGB.copy()
    params_lgb_fold['gpu_device_id'] = current_gpu
    m2 = lgb.LGBMClassifier(**params_lgb_fold)
    m2.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(100, verbose=False)])
    p2_val = m2.predict_proba(X_val)[:, 1]
    p2_test = m2.predict_proba(X_test)[:, 1]
    
    # Ensemble
    fold_pred = W_S1['cat'] * p1_val + W_S1['lgb'] * p2_val
    oof_prob_s1[val_idx] = fold_pred
    test_prob_s1 += (W_S1['cat'] * p1_test + W_S1['lgb'] * p2_test) / N_FOLDS
    
    fold_auc = roc_auc_score(y_val, fold_pred)
    fold_aucs.append(fold_auc)
    print(f"    AUC: {fold_auc:.5f} [GPU {current_gpu if 'LGB' in str(type(m2)) else '0:1'}]")

# Overall metrics
overall_auc = roc_auc_score(y_binary, oof_prob_s1)
print(f"\n  Overall OOF AUC: {overall_auc:.5f} ¬± {np.std(fold_aucs):.5f}")

# Optimize threshold
best_acc = 0
best_thresh = 0.5
for t in np.arange(0.2, 0.8, 0.005):
    acc = accuracy_score(y_binary, (oof_prob_s1 > t).astype(int))
    if acc > best_acc:
        best_acc, best_thresh = acc, t

print(f"  Optimal Threshold: {best_thresh:.4f} (Accuracy: {best_acc:.4f})")

# ==========================================
# STAGE 2: REGRESSION
# ==========================================
print("\n" + "=" * 60)
print("[4/5] STAGE 2: Amount Prediction (Spenders Only)")
print("=" * 60)

mask_spenders = train_fe[TARGET] > 0
X_reg = train_fe[mask_spenders][feature_cols].reset_index(drop=True)
y_reg = train_fe[mask_spenders][TARGET].reset_index(drop=True)
y_reg_log = np.log1p(y_reg)

print(f"Training on {len(X_reg)} spenders")
print(f"Mean spending: ‡∏ø{y_reg.mean():.2f}, Median: ‡∏ø{y_reg.median():.2f}")

# Convert categoricals
for c in cat_features:
    X_reg[c] = X_reg[c].astype('category')

oof_amount_s2 = np.zeros(len(X_reg))
test_amount_s2 = np.zeros(len(X_test))
kf_reg = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
fold_rmses = []

for fold, (tr_idx, val_idx) in enumerate(kf_reg.split(X_reg), 1):
    print(f"\n  Fold {fold}/{N_FOLDS}")
    X_tr, X_val = X_reg.iloc[tr_idx], X_reg.iloc[val_idx]
    y_tr, y_val = y_reg_log.iloc[tr_idx], y_reg_log.iloc[val_idx]
    
    # Dynamically assign GPU for LightGBM models
    current_gpu = GPU_IDS[fold % len(GPU_IDS)] if USE_MULTI_GPU else GPU_IDS[0]
    
    # CatBoost (uses both GPUs)
    r1 = CatBoostRegressor(**PARAMS_S2_CAT, cat_features=cat_features)
    r1.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=150)
    p1_val = r1.predict(X_val)
    p1_test = r1.predict(X_test)
    
    # LightGBM Main (GPU rotation)
    params_lgb_main_fold = PARAMS_S2_LGB_MAIN.copy()
    params_lgb_main_fold['gpu_device_id'] = current_gpu
    r2 = lgb.LGBMRegressor(**params_lgb_main_fold)
    r2.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(150, verbose=False)])
    p2_val = r2.predict(X_val)
    p2_test = r2.predict(X_test)
    
    # LightGBM Deep (alternate GPU for parallel processing)
    alternate_gpu = GPU_IDS[(fold + 1) % len(GPU_IDS)] if USE_MULTI_GPU else GPU_IDS[0]
    params_lgb_deep_fold = PARAMS_S2_LGB_DEEP.copy()
    params_lgb_deep_fold['gpu_device_id'] = alternate_gpu
    r3 = lgb.LGBMRegressor(**params_lgb_deep_fold)
    r3.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(150, verbose=False)])
    p3_val = r3.predict(X_val)
    p3_test = r3.predict(X_test)
    
    # Ensemble (log space)
    fold_pred = (
        W_S2['cat'] * p1_val + 
        W_S2['lgb_main'] * p2_val + 
        W_S2['lgb_deep'] * p3_val
    )
    oof_amount_s2[val_idx] = fold_pred
    test_amount_s2 += (
        W_S2['cat'] * p1_test + 
        W_S2['lgb_main'] * p2_test + 
        W_S2['lgb_deep'] * p3_test
    ) / N_FOLDS
    
    fold_rmse = np.sqrt(mean_squared_error(y_val, fold_pred))
    fold_rmses.append(fold_rmse)
    print(f"    RMSE (log): {fold_rmse:.5f} [GPU {current_gpu}/{alternate_gpu}]")

# Evaluation
rmse_log = np.sqrt(mean_squared_error(y_reg_log, oof_amount_s2))
oof_amount_raw = np.expm1(oof_amount_s2)
rmse_raw = np.sqrt(mean_squared_error(y_reg, oof_amount_raw))

print(f"\n  Overall OOF RMSE (log): {rmse_log:.5f} ¬± {np.std(fold_rmses):.5f}")
print(f"  Overall OOF RMSE (THB): ‡∏ø{rmse_raw:.2f}")

# ==========================================
# FINAL PREDICTIONS
# ==========================================
print("\n" + "=" * 60)
print("[5/5] Generating Final Predictions")
print("=" * 60)

# Convert test predictions from log to raw
test_amount_raw = np.expm1(test_amount_s2)

# Combine stages with threshold
final_preds = np.where(test_prob_s1 > best_thresh, test_amount_raw, 0)

# Safety clip
final_preds = np.clip(final_preds, 0, 500000)

# Statistics
print(f"\nPrediction Statistics:")
print(f"  Threshold: {best_thresh:.4f}")
print(f"  Predicted spenders: {np.sum(final_preds > 0):,} / {len(final_preds):,} ({100*np.sum(final_preds > 0)/len(final_preds):.2f}%)")
print(f"  Mean prediction: ‡∏ø{final_preds[final_preds > 0].mean():.2f}")
print(f"  Median prediction: ‡∏ø{np.median(final_preds[final_preds > 0]):.2f}")
print(f"  Max prediction: ‡∏ø{final_preds.max():.2f}")
print(f"  Total predicted revenue: ‡∏ø{final_preds.sum():,.2f}")

# Save submission
submission = pd.DataFrame({
    'id': test['id'],
    'spending_30d': final_preds
})
submission.to_csv('submission_enhanced.csv', index=False)

print("\n" + "=" * 60)
print("‚úì Pipeline Complete! Submission saved as 'submission_enhanced.csv'")
print("=" * 60)

ADVANCED TWO-STAGE PREDICTION PIPELINE (MULTI-GPU)

üîç Checking GPU availability...
‚úì Detected 2 GPU(s):
  - GPU 0, Tesla T4, 15360 MiB
  - GPU 1, Tesla T4, 15360 MiB

üöÄ Multi-GPU Mode ENABLED: Using GPU [0, 1]

[1/5] Loading data...
Train shape: (104000, 35), Test shape: (25889, 34)

[2/5] Creating advanced features...
Total features: 73
Categorical features: 7

[3/5] STAGE 1: Spender Classification
Positive class rate: 0.5180
Scale pos weight: 0.9306

  Fold 1/7
    AUC: 0.78825 [GPU 1]

  Fold 2/7
    AUC: 0.78668 [GPU 0]

  Fold 3/7
    AUC: 0.78297 [GPU 1]

  Fold 4/7
    AUC: 0.78354 [GPU 0]

  Fold 5/7
    AUC: 0.78035 [GPU 1]

  Fold 6/7
    AUC: 0.78393 [GPU 0]

  Fold 7/7
    AUC: 0.78375 [GPU 1]

  Overall OOF AUC: 0.78386 ¬± 0.00238
  Optimal Threshold: 0.4700 (Accuracy: 0.7198)

[4/5] STAGE 2: Amount Prediction (Spenders Only)
Training on 53868 spenders
Mean spending: ‡∏ø20019.98, Median: ‡∏ø2297.29

  Fold 1/7
    RMSE (log): 0.22103 [GPU 1/0]

  Fold 2/7
    RMSE 

KeyboardInterrupt: 

In [4]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error, accuracy_score, roc_auc_score
from scipy import stats
import warnings

warnings.filterwarnings("ignore")

# ==========================================
# CONFIGURATION
# ==========================================
RANDOM_STATE = 42
TARGET = 'spending_30d'
N_FOLDS = 7  # Increased for better generalization

# GPU Configuration (Set to your available GPUs)
USE_MULTI_GPU = True  # Enable multi-GPU support
GPU_IDS = [0, 1]  # Your 2 GPUs

# ==========================================
# ADVANCED FEATURE ENGINEERING
# ==========================================
def create_advanced_features(df):
    """
    Comprehensive feature engineering focusing on:
    - Interaction features
    - Behavioral patterns
    - Statistical aggregations
    - Temporal features
    """
    df = df.copy()
    
    # === Base Preprocessing ===
    cat_cols = ['primary_game', 'platform', 'vip_status', 'segment']
    for col in cat_cols:
        df[col] = df[col].fillna(-1).astype(int)
    
    zero_cols = [
        'is_premium_member', 'guild_membership', 'owns_limited_edition', 
        'tournament_participation', 'friend_count', 'social_interactions', 
        'daily_login_streak', 'historical_spending', 'prev_month_spending',
        'total_transactions', 'avg_transaction_value', 'purchases_on_discount'
    ]
    for col in zero_cols:
        df[col] = df[col].fillna(0)
    
    df['days_since_last_purchase'] = df['days_since_last_purchase'].fillna(9999)
    
    # Fill remaining numeric columns
    filled_cols = cat_cols + zero_cols + ['days_since_last_purchase', 'id', 'player_id', TARGET]
    num_cols = [c for c in df.columns if c not in filled_cols]
    for col in num_cols:
        df[col] = df[col].fillna(df[col].median())
    
    # === 1. SPENDING BEHAVIOR FEATURES ===
    # Historical spending patterns
    df['has_spent_before'] = (df['historical_spending'] > 0).astype(int)
    df['spending_momentum'] = df['prev_month_spending'] / (df['historical_spending'] + 1)
    df['spending_acceleration'] = df['prev_month_spending'] - df['historical_spending'] / 12  # Monthly avg
    
    # Transaction patterns
    df['transactions_per_dollar'] = df['total_transactions'] / (df['historical_spending'] + 1)
    df['avg_transaction_growth'] = df['avg_transaction_value'] / (df['historical_spending'] / (df['total_transactions'] + 1) + 1)
    
    # Purchase recency score
    df['purchase_recency_score'] = 1 / (df['days_since_last_purchase'] + 1)
    df['is_recent_buyer'] = (df['days_since_last_purchase'] < 30).astype(int)
    df['is_dormant'] = (df['days_since_last_purchase'] > 180).astype(int)
    
    # Discount behavior
    df['discount_dependency'] = df['purchases_on_discount'] / (df['total_transactions'] + 1)
    df['full_price_purchases'] = df['total_transactions'] - df['purchases_on_discount']
    
    # === 2. ENGAGEMENT FEATURES ===
    # Social engagement
    df['social_engagement_ratio'] = df['social_interactions'] / (df['friend_count'] + 1)
    df['is_social_player'] = (df['friend_count'] > df['friend_count'].median()).astype(int)
    
    # Activity intensity
    df['activity_score'] = (
        df['daily_login_streak'] * 0.3 + 
        df['social_interactions'] * 0.3 + 
        df['tournament_participation'] * 0.4
    )
    
    # Commitment indicators
    df['commitment_score'] = (
        df['is_premium_member'] * 3 + 
        df['guild_membership'] * 2 + 
        df['owns_limited_edition'] * 2 +
        (df['daily_login_streak'] > 7).astype(int) * 1
    )
    
    # === 3. VALUE FEATURES ===
    # Lifetime value indicators
    df['ltv_score'] = df['historical_spending'] / (df['days_since_last_purchase'] + 1)
    df['monthly_value'] = df['historical_spending'] / 12
    df['value_consistency'] = df['avg_transaction_value'] / (df['monthly_value'] + 1)
    
    # Spending capacity
    df['spending_capacity'] = df['avg_transaction_value'] * df['daily_login_streak']
    df['whale_indicator'] = (df['avg_transaction_value'] > df['avg_transaction_value'].quantile(0.9)).astype(int)
    
    # === 4. INTERACTION FEATURES ===
    # VIP interactions
    df['vip_spending_ratio'] = df['historical_spending'] * (df['vip_status'] + 1)
    df['vip_activity'] = df['activity_score'] * (df['vip_status'] + 1)
    
    # Premium features interaction
    df['premium_features'] = (
        df['is_premium_member'] + 
        df['guild_membership'] + 
        df['owns_limited_edition']
    )
    df['premium_spending'] = df['historical_spending'] * df['premium_features']
    
    # Segment interactions
    df['segment_value'] = df['segment'] * df['avg_transaction_value']
    df['segment_activity'] = df['segment'] * df['daily_login_streak']
    
    # === 5. STATISTICAL FEATURES ===
    # Log transformations for skewed features
    log_features = [
        'historical_spending', 'prev_month_spending', 'total_transactions',
        'avg_transaction_value', 'friend_count', 'social_interactions'
    ]
    for col in log_features:
        if col in df.columns:
            df[f'{col}_log1p'] = np.log1p(df[col])
            df[f'{col}_sqrt'] = np.sqrt(df[col])
    
    # Binned features for categorical interactions
    df['spending_tier'] = pd.qcut(df['historical_spending'], q=5, labels=False, duplicates='drop').fillna(-1).astype(int)
    df['activity_tier'] = pd.qcut(df['daily_login_streak'], q=5, labels=False, duplicates='drop').fillna(-1).astype(int)
    df['transaction_tier'] = pd.qcut(df['total_transactions'], q=5, labels=False, duplicates='drop').fillna(-1).astype(int)
    
    # === 6. RISK/CHURN FEATURES ===
    df['churn_risk'] = (
        (df['days_since_last_purchase'] > 90).astype(int) * 3 +
        (df['daily_login_streak'] < 3).astype(int) * 2 +
        (df['prev_month_spending'] == 0).astype(int) * 2
    )
    
    # Engagement decay
    df['engagement_decay'] = df['daily_login_streak'] / (df['days_since_last_purchase'] + 1)
    
    return df

# ==========================================
# OPTIMIZED MODEL PARAMETERS
# ==========================================

# Stage 1: Classification (Spender vs Non-Spender)
PARAMS_S1_CAT = {
    'iterations': 2000,
    'learning_rate': 0.03,
    'depth': 8,
    'l2_leaf_reg': 8.0,
    'border_count': 254,
    'bagging_temperature': 0.8,
    'random_strength': 1.5,
    'scale_pos_weight': 1.0,  # Will be set dynamically
    'eval_metric': 'Logloss',
    'random_state': RANDOM_STATE,
    'verbose': False,
    'allow_writing_files': False,
    'task_type': 'GPU',
    'devices': '0:1' if USE_MULTI_GPU else '0'  # Use both GPUs
}

PARAMS_S1_LGB = {
    'n_estimators': 1500,
    'learning_rate': 0.02,
    'num_leaves': 40,
    'max_depth': 10,
    'min_child_samples': 25,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_lambda': 3.0,
    'reg_alpha': 1.5,
    'scale_pos_weight': 1.0,  # Will be set dynamically
    'objective': 'binary',
    'metric': 'auc',
    'random_state': RANDOM_STATE,
    'verbosity': -1,
    'device': 'gpu',  # GPU acceleration
    'gpu_platform_id': 0,
    'gpu_device_id': 0
}

# Stage 2: Regression (Amount Prediction)
PARAMS_S2_CAT = {
    'iterations': 5000,
    'learning_rate': 0.008,
    'depth': 7,
    'l2_leaf_reg': 3.0,
    'border_count': 200,
    'bagging_temperature': 0.6,
    'random_strength': 1.2,
    'min_data_in_leaf': 10,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_state': RANDOM_STATE,
    'verbose': False,
    'allow_writing_files': False,
    'task_type': 'GPU',  # GPU acceleration
    'devices': '0'
}

PARAMS_S2_LGB_MAIN = {
    'n_estimators': 4500,
    'learning_rate': 0.01,
    'num_leaves': 64,
    'max_depth': 10,
    'min_child_samples': 15,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'reg_lambda': 2.0,
    'reg_alpha': 0.8,
    'min_split_gain': 0.01,
    'objective': 'regression',
    'metric': 'rmse',
    'random_state': RANDOM_STATE,
    'verbosity': -1,
    'device': 'gpu',  # GPU acceleration
    'gpu_platform_id': 0,
    'gpu_device_id': 0
}

PARAMS_S2_LGB_DEEP = {
    'n_estimators': 4000,
    'learning_rate': 0.006,
    'num_leaves': 150,
    'max_depth': 15,
    'min_child_samples': 8,
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'reg_lambda': 4.0,
    'reg_alpha': 0.5,
    'min_split_gain': 0.005,
    'objective': 'regression',
    'metric': 'rmse',
    'random_state': RANDOM_STATE,
    'verbosity': -1,
    'device': 'gpu',  # GPU acceleration
    'gpu_platform_id': 0,
    'gpu_device_id': 0
}

# Ensemble weights (tuned based on typical performance)
W_S1 = {'cat': 0.55, 'lgb': 0.45}
W_S2 = {'cat': 0.50, 'lgb_main': 0.30, 'lgb_deep': 0.20}

# ==========================================
# DATA LOADING & PREPROCESSING
# ==========================================
print("=" * 60)
print("ADVANCED TWO-STAGE PREDICTION PIPELINE (GPU ACCELERATED)")
print("=" * 60)

# Check GPU availability
print("\nüîç Checking GPU availability...")
try:
    import subprocess
    gpu_info = subprocess.check_output(['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader']).decode()
    print(f"‚úì GPU Detected: {gpu_info.strip()}")
except:
    print("‚ö† GPU not detected. Models will fallback to CPU if GPU unavailable.")

print("\n[1/5] Loading data...")
train = pd.read_csv('/kaggle/input/cpe342-karena/public_dataset/task3/train.csv')
test = pd.read_csv('/kaggle/input/cpe342-karena/public_dataset/task3/test.csv')

print(f"Train shape: {train.shape}, Test shape: {test.shape}")

print("\n[2/5] Creating advanced features...")
train_fe = create_advanced_features(train)
test_fe = create_advanced_features(test)

# Feature selection
exclude_cols = ['id', 'player_id', TARGET, 'days_since_last_purchase']  # days_since excluded as it's often leaky
feature_cols = [c for c in train_fe.columns if c not in exclude_cols]

# Identify categorical features
cat_features = [
    'primary_game', 'platform', 'vip_status', 'segment',
    'spending_tier', 'activity_tier', 'transaction_tier'
]
cat_features = [c for c in cat_features if c in feature_cols]

print(f"Total features: {len(feature_cols)}")
print(f"Categorical features: {len(cat_features)}")

X = train_fe[feature_cols].copy()
y_binary = (train_fe[TARGET] > 0).astype(int)
X_test = test_fe[feature_cols].copy()

# Convert categoricals
for c in cat_features:
    X[c] = X[c].astype('category')
    X_test[c] = X_test[c].astype('category')

# ==========================================
# STAGE 1: CLASSIFICATION
# ==========================================
print("\n" + "=" * 60)
print("[3/5] STAGE 1: Spender Classification")
print("=" * 60)

# Calculate class imbalance
pos_rate = y_binary.mean()
scale_pos_weight = (1 - pos_rate) / pos_rate
print(f"Positive class rate: {pos_rate:.4f}")
print(f"Scale pos weight: {scale_pos_weight:.4f}")

PARAMS_S1_CAT['scale_pos_weight'] = scale_pos_weight
PARAMS_S1_LGB['scale_pos_weight'] = scale_pos_weight

kf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
oof_prob_s1 = np.zeros(len(X))
test_prob_s1 = np.zeros(len(X_test))
fold_aucs = []

for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y_binary), 1):
    print(f"\n  Fold {fold}/{N_FOLDS}")
    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_val = y_binary.iloc[tr_idx], y_binary.iloc[val_idx]
    
    # CatBoost
    m1 = CatBoostClassifier(**PARAMS_S1_CAT, cat_features=cat_features)
    m1.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=100)
    p1_val = m1.predict_proba(X_val)[:, 1]
    p1_test = m1.predict_proba(X_test)[:, 1]
    
    # LightGBM
    m2 = lgb.LGBMClassifier(**PARAMS_S1_LGB)
    m2.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(100, verbose=False)])
    p2_val = m2.predict_proba(X_val)[:, 1]
    p2_test = m2.predict_proba(X_test)[:, 1]
    
    # Ensemble
    fold_pred = W_S1['cat'] * p1_val + W_S1['lgb'] * p2_val
    oof_prob_s1[val_idx] = fold_pred
    test_prob_s1 += (W_S1['cat'] * p1_test + W_S1['lgb'] * p2_test) / N_FOLDS
    
    fold_auc = roc_auc_score(y_val, fold_pred)
    fold_aucs.append(fold_auc)
    print(f"    AUC: {fold_auc:.5f}")

# Overall metrics
overall_auc = roc_auc_score(y_binary, oof_prob_s1)
print(f"\n  Overall OOF AUC: {overall_auc:.5f} ¬± {np.std(fold_aucs):.5f}")

# Optimize threshold (by accuracy, for reference only)
best_acc = 0
best_thresh_acc = 0.5
for t in np.arange(0.2, 0.8, 0.005):
    acc = accuracy_score(y_binary, (oof_prob_s1 > t).astype(int))
    if acc > best_acc:
        best_acc, best_thresh_acc = acc, t

print(f"  Optimal Threshold (by ACC): {best_thresh_acc:.4f} (Accuracy: {best_acc:.4f})")

# ‡∏ï‡∏±‡πâ‡∏á‡∏Ñ‡πà‡∏≤ default ‡πÄ‡∏ú‡∏∑‡πà‡∏≠‡πÑ‡∏°‡πà‡πÑ‡∏î‡πâ tune RMSE ‡∏†‡∏≤‡∏¢‡∏´‡∏•‡∏±‡∏á
best_thresh = best_thresh_acc


# ==========================================
# STAGE 2: REGRESSION
# ==========================================
print("\n" + "=" * 60)
print("[4/5] STAGE 2: Amount Prediction (Spenders Only)")
print("=" * 60)

mask_spenders = train_fe[TARGET] > 0
X_reg = train_fe[mask_spenders][feature_cols].reset_index(drop=True)
y_reg = train_fe[mask_spenders][TARGET].reset_index(drop=True)
y_reg_log = np.log1p(y_reg)

print(f"Training on {len(X_reg)} spenders")
print(f"Mean spending: ‡∏ø{y_reg.mean():.2f}, Median: ‡∏ø{y_reg.median():.2f}")

# Convert categoricals
for c in cat_features:
    X_reg[c] = X_reg[c].astype('category')

oof_amount_s2 = np.zeros(len(X_reg))
test_amount_s2 = np.zeros(len(X_test))
kf_reg = KFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
fold_rmses = []

for fold, (tr_idx, val_idx) in enumerate(kf_reg.split(X_reg), 1):
    print(f"\n  Fold {fold}/{N_FOLDS}")
    X_tr, X_val = X_reg.iloc[tr_idx], X_reg.iloc[val_idx]
    y_tr, y_val = y_reg_log.iloc[tr_idx], y_reg_log.iloc[val_idx]
    
    # CatBoost
    r1 = CatBoostRegressor(**PARAMS_S2_CAT, cat_features=cat_features)
    r1.fit(X_tr, y_tr, eval_set=(X_val, y_val), early_stopping_rounds=150)
    p1_val = r1.predict(X_val)
    p1_test = r1.predict(X_test)
    
    # LightGBM Main
    r2 = lgb.LGBMRegressor(**PARAMS_S2_LGB_MAIN)
    r2.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(150, verbose=False)])
    p2_val = r2.predict(X_val)
    p2_test = r2.predict(X_test)
    
    # LightGBM Deep
    r3 = lgb.LGBMRegressor(**PARAMS_S2_LGB_DEEP)
    r3.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(150, verbose=False)])
    p3_val = r3.predict(X_val)
    p3_test = r3.predict(X_test)
    
    # Ensemble (log space)
    fold_pred = (
        W_S2['cat'] * p1_val + 
        W_S2['lgb_main'] * p2_val + 
        W_S2['lgb_deep'] * p3_val
    )
    oof_amount_s2[val_idx] = fold_pred
    test_amount_s2 += (
        W_S2['cat'] * p1_test + 
        W_S2['lgb_main'] * p2_test + 
        W_S2['lgb_deep'] * p3_test
    ) / N_FOLDS
    
    fold_rmse = np.sqrt(mean_squared_error(y_val, fold_pred))
    fold_rmses.append(fold_rmse)
    print(f"    RMSE (log): {fold_rmse:.5f}")

# Evaluation
rmse_log = np.sqrt(mean_squared_error(y_reg_log, oof_amount_s2))
oof_amount_raw = np.expm1(oof_amount_s2)
rmse_raw = np.sqrt(mean_squared_error(y_reg, oof_amount_raw))

print(f"\n  Overall OOF RMSE (log): {rmse_log:.5f} ¬± {np.std(fold_rmses):.5f}")
print(f"  Overall OOF RMSE (THB): ‡∏ø{rmse_raw:.2f}")
# ==========================================
# [4.5] Tuning threshold by RMSE on spending_30d
# ==========================================
print("\n" + "=" * 60)
print("[4.5] Tuning classification threshold by RMSE on full train")
print("=" * 60)

# y_true = ‡∏Ñ‡πà‡∏≤ spending ‡∏à‡∏£‡∏¥‡∏á‡∏ó‡∏±‡πâ‡∏á train
y_true = train_fe[TARGET].values  # shape (104000,)

# ‡∏™‡∏£‡πâ‡∏≤‡∏á amount prediction ‡πÄ‡∏ï‡πá‡∏°‡∏ó‡∏±‡πâ‡∏á train (‡πÉ‡∏ô‡∏´‡∏ô‡πà‡∏ß‡∏¢‡πÄ‡∏á‡∏¥‡∏ô‡∏à‡∏£‡∏¥‡∏á)
amount_raw_full = np.zeros_like(y_true, dtype=float)
amount_raw_full[mask_spenders.values] = np.expm1(oof_amount_s2)

best_t = None
best_rmse = 1e18

for t in np.linspace(0.10, 0.90, 81):  # 0.10, 0.11, ..., 0.90
    y_pred = np.where(oof_prob_s1 > t, amount_raw_full, 0.0)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    if rmse < best_rmse:
        best_rmse = rmse
        best_t = t

print(f"  Best threshold by RMSE: {best_t:.4f}  (Train RMSE: {best_rmse:.2f})")
print(f"  Previous threshold by ACC: {best_thresh_acc:.4f}")

# ‡πÉ‡∏ä‡πâ threshold ‡πÅ‡∏ö‡∏ö RMSE ‡πÄ‡∏õ‡πá‡∏ô‡∏ï‡∏±‡∏ß‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö test
best_thresh = best_t


# ==========================================
# FINAL PREDICTIONS
# ==========================================
print("\n" + "=" * 60)
print("[5/5] Generating Final Predictions")
print("=" * 60)

# Convert test predictions from log to raw
test_amount_raw = np.expm1(test_amount_s2)

# Combine stages with threshold
final_preds = np.where(test_prob_s1 > best_thresh, test_amount_raw, 0)

# Safety clip
final_preds = np.clip(final_preds, 0, 500000)

# Statistics
print(f"\nPrediction Statistics:")
print(f"  Threshold: {best_thresh:.4f}")
print(f"  Predicted spenders: {np.sum(final_preds > 0):,} / {len(final_preds):,} ({100*np.sum(final_preds > 0)/len(final_preds):.2f}%)")
print(f"  Mean prediction: ‡∏ø{final_preds[final_preds > 0].mean():.2f}")
print(f"  Median prediction: ‡∏ø{np.median(final_preds[final_preds > 0]):.2f}")
print(f"  Max prediction: ‡∏ø{final_preds.max():.2f}")
print(f"  Total predicted revenue: ‡∏ø{final_preds.sum():,.2f}")

# Save submission
submission = pd.DataFrame({
    'id': test['id'],
    'spending_30d': final_preds
})
submission.to_csv('submission_enhanced.csv', index=False)

print("\n" + "=" * 60)
print("‚úì Pipeline Complete! Submission saved as 'submission_enhanced.csv'")
print("=" * 60)

ADVANCED TWO-STAGE PREDICTION PIPELINE (GPU ACCELERATED)

üîç Checking GPU availability...
‚úì GPU Detected: Tesla T4, 15360 MiB
Tesla T4, 15360 MiB

[1/5] Loading data...
Train shape: (104000, 35), Test shape: (25889, 34)

[2/5] Creating advanced features...
Total features: 73
Categorical features: 7

[3/5] STAGE 1: Spender Classification
Positive class rate: 0.5180
Scale pos weight: 0.9306

  Fold 1/7
    AUC: 0.78826

  Fold 2/7
    AUC: 0.78667

  Fold 3/7
    AUC: 0.78297

  Fold 4/7
    AUC: 0.78354

  Fold 5/7
    AUC: 0.78035

  Fold 6/7
    AUC: 0.78393

  Fold 7/7
    AUC: 0.78375

  Overall OOF AUC: 0.78384 ¬± 0.00238
  Optimal Threshold (by ACC): 0.4700 (Accuracy: 0.7198)

[4/5] STAGE 2: Amount Prediction (Spenders Only)
Training on 53868 spenders
Mean spending: ‡∏ø20019.98, Median: ‡∏ø2297.29

  Fold 1/7
    RMSE (log): 0.22017

  Fold 2/7
    RMSE (log): 0.21894

  Fold 3/7
    RMSE (log): 0.22386

  Fold 4/7
    RMSE (log): 0.20170

  Fold 5/7
    RMSE (log): 0.22873

 