In [None]:
import os
import sys

# CRITICAL: Fix CuPy circular import BEFORE any imports
# These environment variables are set before the ML libraries further down are
# imported, so that those libraries see them at import time.
# '-1' is not a valid device index, so no CUDA device is visible to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'

# Disable CuPy in Dask (which LightGBM tries to import)
# NOTE(review): confirm that Dask actually reads this configuration key.
os.environ['DASK_ARRAY__BACKEND__CUPY'] = '0'

# Mock cupy to prevent import
# A None entry in sys.modules makes every later `import cupy` raise ImportError
# instead of loading the real package.
sys.modules['cupy'] = None

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, QuantileTransformer
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from functools import lru_cache

# Try importing with error handling
try:
    from imblearn.over_sampling import BorderlineSMOTE
    SMOTE_AVAILABLE = True
except ImportError:
    SMOTE_AVAILABLE = False
    print("‚ö†Ô∏è  BorderlineSMOTE not available, will skip oversampling")

try:
    from xgboost import XGBClassifier
    XGB_AVAILABLE = True
except ImportError:
    XGB_AVAILABLE = False
    print("‚ö†Ô∏è  XGBoost not available")

try:
    from lightgbm import LGBMClassifier
    LGBM_AVAILABLE = True
except ImportError:
    LGBM_AVAILABLE = False
    print("‚ö†Ô∏è  LightGBM not available")

try:
    from catboost import CatBoostClassifier
    CAT_AVAILABLE = True
except ImportError:
    CAT_AVAILABLE = False
    print("‚ö†Ô∏è  CatBoost not available")

import joblib

print("="*80)
print("üöÄ OPTIMIZED ML PIPELINE - PLAYER SEGMENTATION (Segment-Specific FE)")
print("="*80)

# ===========================
# 1. DATA LOADING (Optimized)
# ===========================
# NOTE: these are absolute Kaggle input paths; change them when running elsewhere.
TRAIN_PATH = '/kaggle/input/datasettask2/train.csv'
TEST_PATH = '/kaggle/input/datasettask2/test.csv'
SUBMISSION_TEMPLATE = '/kaggle/input/datasettask2/sample_submission.csv'

# Each CSV is read exactly once.
df_train = pd.read_csv(TRAIN_PATH, low_memory=False)
df_test = pd.read_csv(TEST_PATH, low_memory=False)

if os.path.exists(SUBMISSION_TEMPLATE):
    # A template is already on disk: use it and never overwrite it.
    df_sub = pd.read_csv(SUBMISSION_TEMPLATE)
else:
    # No template available: build a placeholder from the test ids. It is kept
    # in memory, so the pipeline still works when the directory is read-only.
    df_sub = pd.DataFrame({'id': df_test['id'], 'task2': 'Segment 0'})
    try:
        df_sub.to_csv(SUBMISSION_TEMPLATE, index=False)
    except OSError as e:
        # Best effort only: persisting the placeholder is optional.
        print(f"Could not create dummy submission file: {e}")
print("‚úÖ Data loaded successfully.")

print(f"\nüìä Train shape: {df_train.shape}")
print(f"üìä Test shape: {df_test.shape}")

# ===========================
# 2. INITIAL SETUP
# ===========================
TARGET_VAR = 'segment'
NON_FEATURE_COLS = ['id', 'player_id']

# Target first: rows whose label is missing are dropped from both X and y.
y = df_train[TARGET_VAR]
valid_idx = y.notna()
y = y.loc[valid_idx].reset_index(drop=True)

# Feature matrices: strip the target and the identifier columns
# (identifier columns that are absent are silently skipped).
X = (
    df_train
    .drop(columns=[TARGET_VAR, *NON_FEATURE_COLS], errors='ignore')
    .loc[valid_idx]
    .reset_index(drop=True)
)
X_test = df_test.drop(columns=NON_FEATURE_COLS, errors='ignore')

# Integer-encode the target labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Per-class row counts, ordered by encoded label
class_dist = pd.Series(y_encoded).value_counts().sort_index()
print(f"\nüìä Class Distribution:")
for i, count in class_dist.items():
    print(f"    Segment {i}: {count:,} ({count/len(X)*100:.1f}%)")

# ===========================
# 3. OPTIMIZED DATA CLEANING
# ===========================
print("\nüßπ Cleaning data (optimized)...")

# Column groups are derived from the training frame only
numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Medians come from the training data and are reused for the test data
numerical_medians = X[numerical_cols].median()

# Same imputation for both frames: train medians for numeric columns,
# a sentinel string for categorical columns.
for frame in (X, X_test):
    frame[numerical_cols] = frame[numerical_cols].fillna(numerical_medians)
    frame[categorical_cols] = frame[categorical_cols].fillna('Missing_Value')

print("‚úÖ Data cleaned with vectorized operations")

# ===========================
# 4. OPTIMIZED FEATURE ENGINEERING (Segment-Specific Decoupling)
# ===========================

def create_ultimate_features(df):
    """
    Add the segment-specific engineered features to a copy of ``df``.

    Twenty-five columns are appended, grouped by name prefix: ``S0_`` (casual),
    ``S1_`` (grinder), ``S2_`` (social) and ``S3_`` (whale). Ratio denominators
    are offset by a small epsilon. Any ``inf``, ``-inf`` or ``NaN`` that still
    reaches an engineered column (e.g. from missing or non-finite inputs) is
    replaced with 0. Columns that were already in ``df`` are not cleaned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain these numeric columns: ``account_age_days``,
        ``total_playtime_hours``, ``days_since_last_login``,
        ``avg_session_duration``, ``login_streak``,
        ``ranked_participation_rate``, ``total_spending_thb``,
        ``play_frequency``, ``win_rate_ranked``,
        ``achievement_completion_rate``, ``speed_of_progression``,
        ``tournament_entries``, ``friend_count``, ``chat_activity_score``,
        ``gifts_sent_received``, ``team_play_percentage``,
        ``friend_invites_sent``, ``avg_monthly_spending``,
        ``spending_frequency``, ``vip_tier``, ``responds_to_discounts``,
        ``rare_items_count``, ``collection_progress``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the engineered columns added; ``df`` itself is
        not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    result = df.copy()

    # Precompute common terms
    eps = 1e-6
    account_age_safe = result['account_age_days'] + eps
    playtime_safe = result['total_playtime_hours'] + eps

    new_features = {}

    # --- S0: Casual Segment Features (Focus: Low Commitment, Recency) ---

    # 1. Recency Index (Recency / Age)
    new_features['S0_Recency_Index'] = result['days_since_last_login'] / account_age_safe

    # 2. Low Commitment Ratio (Session Duration / Login Streak)
    new_features['S0_Low_Commitment_Ratio'] = result['avg_session_duration'] / (result['login_streak'] + eps)

    # 3. Non-Competitive Focus
    new_features['S0_Non_Competitive_Focus'] = 1 - result['ranked_participation_rate']

    # 4. Low Spending Binary (Binary flag for non-payers)
    new_features['S0_Low_Spending_Binary'] = (result['total_spending_thb'] == 0).astype(np.int8)

    # 5. Play Intermittence Score
    new_features['S0_Play_Intermittence'] = result['days_since_last_login'] * (1 / (result['play_frequency'] + eps))

    # 6. Low Playtime Per Day
    new_features['S0_Playtime_Per_Day_Inv'] = 1.0 / (result['total_playtime_hours'] / account_age_safe + eps)

    # --- S1: Grinder Segment Features (Focus: Dedication, Competitive, Progress) ---

    # 1. Grinder Intensity (Login/Playtime relative to Age)
    new_features['S1_Grinder_Intensity'] = (result['login_streak'] * result['play_frequency']) / account_age_safe

    # 2. Ranked Dedication Index (Participation * Win Rate)
    new_features['S1_Ranked_Dedication_Index'] = result['ranked_participation_rate'] * result['win_rate_ranked']

    # 3. Progression Pace (Achievement * Speed of Progression)
    new_features['S1_Progression_Pace'] = result['achievement_completion_rate'] * result['speed_of_progression']

    # 4. Competitive Ranked Hours (Total Playtime * Ranked %)
    new_features['S1_Competitive_Ranked_Hours'] = result['total_playtime_hours'] * result['ranked_participation_rate']

    # 5. Consistent Play Hours (Total Playtime / Streak)
    new_features['S1_Consistent_Play_Hours'] = result['total_playtime_hours'] / (result['login_streak'] + eps)

    # 6. Tournament Engagement
    new_features['S1_Tournament_Engagement'] = result['tournament_entries'] / playtime_safe

    # --- S2: Social Segment Features (Focus: Friends, Chat, Team Play) ---

    # 1. Network Reach Score (Friend Count * Chat Score)
    new_features['S2_Network_Reach_Score'] = result['friend_count'] * result['chat_activity_score']

    # 2. Gifting Generosity (Gifts / Friend Count)
    new_features['S2_Gifting_Generosity'] = result['gifts_sent_received'] / (result['friend_count'] + eps)

    # 3. Team Play Ratio (Team Play % / Ranked %)
    new_features['S2_Team_Play_Ratio'] = result['team_play_percentage'] / (result['ranked_participation_rate'] + eps)

    # 4. Chat Per Playtime
    new_features['S2_Chat_Per_Playtime'] = result['chat_activity_score'] / playtime_safe

    # 5. Social Dominance (Invites/Gifts * Team Play)
    new_features['S2_Social_Dominance'] = (result['friend_invites_sent'] + result['gifts_sent_received']) * result['team_play_percentage']

    # 6. Non-Monetary Social Score
    new_features['S2_Non_Monetary_Social'] = result['friend_count'] + result['gifts_sent_received'] + result['friend_invites_sent']

    # --- S3: Whale Segment Features (Focus: Spending, VIP, Value) ---

    # 1. Log Spending (Key Transformation)
    new_features['S3_Log_Spending'] = np.log1p(result['total_spending_thb'])

    # 2. Monthly Spending Power
    new_features['S3_Monthly_Spending_Power'] = result['avg_monthly_spending'] * result['spending_frequency']

    # 3. VIP Spending Ratio
    new_features['S3_VIP_Spending_Ratio'] = result['vip_tier'] * result['total_spending_thb']

    # 4. Investment Return Rate (Spending / Playtime)
    new_features['S3_Investment_Return_Rate'] = result['total_spending_thb'] / playtime_safe

    # 5. Discount Responsiveness
    new_features['S3_Discount_Responsiveness'] = result['responds_to_discounts'] * result['spending_frequency']

    # 6. High Value Collection (Rare Items * Collection Progress)
    new_features['S3_High_Value_Collection'] = result['rare_items_count'] * result['collection_progress']

    # 7. Whale Per Day (Spending / Age)
    new_features['S3_Whale_Per_Day'] = result['total_spending_thb'] / account_age_safe

    # Attach the engineered columns to the copy
    for col, values in new_features.items():
        result[col] = values

    # --- Cleanup for new ratio features ---
    # The cleaned frame must be assigned back: `result[cols]` returns a new
    # object, so calling replace/fillna on it with inplace=True would leave
    # `result` untouched.
    cols_to_clean = list(new_features.keys())
    result[cols_to_clean] = (
        result[cols_to_clean]
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0)
    )

    return result

# Engineer the same feature set for the train and test frames
X = create_ultimate_features(X)
X_test = create_ultimate_features(X_test)

print(f"‚úÖ Features created. Total numerical: {X.select_dtypes(include=np.number).shape[1]}")

# ===========================
# 5. OPTIMIZED ENCODING
# ===========================
print("\nüîÑ Encoding categorical features (optimized)...")

categorical_cols_final = X.select_dtypes(include='object').columns.tolist()

# Number of training rows: the boundary used to split the stacked frame again
split_idx = len(X)

# Stack train on top of test so both receive the same dummy columns
combined_df = pd.concat([X, X_test], axis=0, ignore_index=True, copy=False)

# One-hot encode; int8 dummies keep the memory footprint small
combined_df = pd.get_dummies(
    combined_df,
    columns=categorical_cols_final,
    drop_first=True,
    dummy_na=False,
    dtype=np.int8,
)

# Undo the stacking
X = combined_df.iloc[:split_idx].copy()
X_test_temp = combined_df.iloc[split_idx:].copy()

# Give the test frame every train column, in train order
missing_cols = set(X.columns) - set(X_test_temp.columns)
for c in missing_cols:
    X_test_temp[c] = 0

X_test = X_test_temp.loc[:, X.columns].copy()

print(f"‚úÖ Final dataset: {X.shape[1]} features")

# Plain numpy arrays are what the models are trained on
X_array = X.to_numpy()
X_test_array = X_test.to_numpy()

# ===========================
# 6. OPTIMIZED CLASS WEIGHTS
# ===========================
print(f"\n{'='*80}")
print("‚öñÔ∏è COMPUTING OPTIMAL CLASS WEIGHTS")
print(f"{'='*80}")

# Balanced weighting: total / (number of classes * rows in the class),
# computed for all classes in one array expression.
class_counts = np.bincount(y_encoded)
total = len(y_encoded)
num_classes = len(class_counts)

class_weights = dict(enumerate(total / (num_classes * class_counts)))

# Manual multipliers, applied only when at least four classes exist
if num_classes >= 4:
    # Adjusted weights to slightly boost S1 and S2 (Grinder, Social)
    adjustments = [1.00, 1.08, 1.20, 1.12]
    for seg, factor in zip(range(len(adjustments)), adjustments):
        class_weights[seg] *= factor

print("\nüìä Optimized class weights:")
for i in class_weights:
    w = class_weights[i]
    print(f"    Segment {i}: {w:.3f}")

# ===========================
# 7. OPTIMIZED MODEL ENSEMBLE
# ===========================
print(f"\n{'='*80}")
print("üèóÔ∏è BUILDING OPTIMIZED ENSEMBLE")
print(f"{'='*80}")

# Level-1 models as (name, estimator) pairs. The optional boosters are added
# only when their import succeeded.
base_models = []

if XGB_AVAILABLE:
    # Settings shared by both XGBoost variants. `use_label_encoder` is no
    # longer passed: it is obsolete in the XGBoost releases that accept
    # `device`, and the target is already integer-encoded.
    xgb_common = dict(
        n_estimators=550, learning_rate=0.028,
        tree_method='hist', n_jobs=-1, eval_metric='mlogloss',
        device='cpu',
    )
    base_models.extend([
        ('xgb_deep', XGBClassifier(max_depth=9, random_state=42, **xgb_common)),
        ('xgb_wide', XGBClassifier(max_depth=6, random_state=43, **xgb_common)),
    ])
    print("‚úì XGBoost models added")

if LGBM_AVAILABLE:
    base_models.append(
        ('lgbm', LGBMClassifier(
            n_estimators=550, num_leaves=50, learning_rate=0.028,
            class_weight=class_weights, random_state=42,
            n_jobs=-1, verbose=-1, force_col_wise=True,
            device='cpu'
        ))
    )
    print("‚úì LightGBM model added")

if CAT_AVAILABLE:
    base_models.append(
        ('cat', CatBoostClassifier(
            n_estimators=550, depth=9, learning_rate=0.028,
            # CatBoost takes the weights as a list ordered by class index
            class_weights=list(class_weights.values()),
            random_seed=42, verbose=0, thread_count=-1,
            task_type='CPU', bootstrap_type='Bernoulli'
        ))
    )
    print("‚úì CatBoost model added")

# Always add sklearn models (most stable). Because these two are added
# unconditionally the ensemble can never be empty, so no emptiness check is
# needed afterwards.
forest_common = dict(
    n_estimators=450, max_depth=19,
    class_weight=class_weights,
    n_jobs=-1, max_features='sqrt',
)
base_models.extend([
    ('rf', RandomForestClassifier(random_state=42, **forest_common)),
    ('et', ExtraTreesClassifier(random_state=44, **forest_common)),
])
print("‚úì RandomForest & ExtraTrees models added")

print(f"\nüìä Total models in ensemble: {len(base_models)}")

# Level-2 (meta) estimator: RandomForest is the fallback, LightGBM is used
# whenever it could be imported.
if not LGBM_AVAILABLE:
    meta_base = RandomForestClassifier(
        n_estimators=300,
        max_depth=15,
        class_weight=class_weights,
        random_state=42,
        n_jobs=-1,
    )
    print("‚úì Meta-model: RandomForest (fallback)")
else:
    meta_base = LGBMClassifier(
        n_estimators=300,
        num_leaves=35,
        learning_rate=0.035,
        random_state=42,
        n_jobs=-1,
        verbose=-1,
        force_col_wise=True,
        device='cpu',
    )
    print("‚úì Meta-model: LightGBM")

# ===========================
# 8. OPTIMIZED TRAINING
# ===========================
print(f"\n{'='*80}")
print("üéØ TRAINING PHASE - Optimized CV")
print(f"{'='*80}")

N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

n_classes = len(le.classes_)

# Level-1 outputs: one block of `n_classes` probability columns per base model.
# Out-of-fold rows are written once; test rows accumulate the fold average.
oof_preds_L1 = np.zeros((len(X_array), len(base_models) * n_classes), dtype=np.float32)
test_preds_L1 = np.zeros((len(X_test_array), len(base_models) * n_classes), dtype=np.float32)

fold_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_array, y_encoded)):
    print(f"\n{'‚îÄ'*80}")
    print(f"üìç Fold {fold+1}/{N_SPLITS}")
    print(f"{'‚îÄ'*80}")

    X_train, X_val = X_array[train_idx], X_array[val_idx]
    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]

    # Oversample the training part of the fold only (never the validation part).
    if SMOTE_AVAILABLE:
        try:
            # `n_jobs` is deliberately not passed: imbalanced-learn deprecated
            # it in 0.10 and removed it in 0.12, where it raises a TypeError
            # that the handler below would turn into a silent "skipped".
            smote = BorderlineSMOTE(random_state=42, k_neighbors=6, kind='borderline-1')
            X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
            print(f"  ‚úì BorderlineSMOTE: {len(X_train):,} -> {len(X_train_res):,}")
        except Exception as e:
            # Best effort: if resampling fails, train on the original fold data.
            X_train_res, y_train_res = X_train, y_train
            print(f"  ‚ö† BorderlineSMOTE skipped: {str(e)[:50]}")
    else:
        X_train_res, y_train_res = X_train, y_train
        print(f"  ‚ö† SMOTE not available, using original data")

    # Train base models with progress
    print(f"\n  Training base models:")
    for i, (name, model) in enumerate(base_models):
        model.fit(X_train_res, y_train_res)

        # Probability columns owned by this model in the level-1 matrices
        block = slice(i * n_classes, (i + 1) * n_classes)

        val_probs = model.predict_proba(X_val)
        oof_preds_L1[val_idx, block] = val_probs

        test_probs = model.predict_proba(X_test_array)
        test_preds_L1[:, block] += test_probs / N_SPLITS

        print(f"    ‚Ä¢ {name:12s} ‚úì")

    # Fold score of the plain average over all base models:
    # (rows, models, classes) -> mean over models -> most probable class
    fold_preds = oof_preds_L1[val_idx].reshape(len(val_idx), len(base_models), -1).mean(axis=1).argmax(axis=1)
    fold_f1 = f1_score(y_val, fold_preds, average='macro')
    fold_scores.append(fold_f1)

    print(f"\n  üìä Fold F1: {fold_f1:.4f}")

# ===========================
# 9. OPTIMIZED META-MODEL
# ===========================
from sklearn.model_selection import cross_val_predict

print(f"\n{'='*80}")
print("üéì TRAINING CALIBRATED META-MODEL")
print(f"{'='*80}")

# Map the level-1 probabilities to a normal distribution; the transform is
# fitted on the out-of-fold matrix and reused for the test matrix.
scaler = QuantileTransformer(output_distribution='normal', random_state=42)
X_meta_scaled = scaler.fit_transform(oof_preds_L1)
X_test_meta_scaled = scaler.transform(test_preds_L1)

# Meta-model with isotonic probability calibration
meta_model = CalibratedClassifierCV(meta_base, method='isotonic', cv=3, n_jobs=-1)

# Out-of-fold meta predictions for evaluation. Predicting on the rows the
# meta-model was fitted on would report an in-sample score as "CV", so every
# row is predicted by a clone that never saw it. The level-1 folds (`skf`) are
# reused.
oof_preds_final = cross_val_predict(
    meta_model, X_meta_scaled, y_encoded, cv=skf, method='predict'
)

# Final fit on all rows, used only for the test predictions
meta_model.fit(X_meta_scaled, y_encoded)
final_predictions = meta_model.predict(X_test_meta_scaled)

# ===========================
# 10. EVALUATION
# ===========================
# Macro-averaged F1 of the meta-model predictions against the encoded target
cv_f1 = f1_score(y_encoded, oof_preds_final, average='macro')

print(f"\n{'='*80}")
print(f"‚ú® FINAL RESULTS")
print(f"{'='*80}")
print(f"\nüéØ Final CV F1-Macro: {cv_f1:.4f}")

# Per-class precision / recall / F1 table
segment_names = [f"Segment {c}" for c in le.classes_]
report_text = classification_report(
    y_encoded,
    oof_preds_final,
    target_names=segment_names,
    digits=4,
)
print(f"\n{'='*80}")
print(report_text)

# Confusion matrix with every row divided by its own total (true-class share)
cm = confusion_matrix(y_encoded, oof_preds_final)
cm_norm = cm.astype('float') / cm.sum(axis=1, keepdims=True)

print("\nüîç Normalized Confusion Matrix:\n")
header_cells = " ".join(f"S{k}" for k in range(n_classes))
print("    \t\t Predicted:", header_cells)
for i, row in enumerate(cm_norm):
    row_cells = " ".join(f"{v:.2f}" for v in row)
    print(f"  True S{i}:\t\t", row_cells)

# ===========================
# 11. SUBMISSION
# ===========================
# Turn the encoded predictions back into the original labels and write them
# into the submission frame, row by row in its existing order.
final_predictions_decoded = le.inverse_transform(final_predictions)
df_sub['task2'] = final_predictions_decoded

# Only the id and the prediction are written out
submission = df_sub.loc[:, ['id', 'task2']]
submission.to_csv('submission_task2_optimized_v2.csv', index=False)

rule = '=' * 80
print(f"\n{rule}")
print("‚úÖ SUBMISSION SAVED: 'submission_task2_optimized_v2.csv'")
print(rule)