# CPE342 - Karena Task2

In [74]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import warnings

In [75]:
warnings.filterwarnings('ignore')

print("Starting Player Segmentation Task...")

Starting Player Segmentation Task...


In [76]:
train_df = pd.read_csv("Dataset/task2/train.csv")
test_df = pd.read_csv("Dataset/task2/test.csv")

In [77]:
train_df.head()

Unnamed: 0,id,player_id,play_frequency,avg_session_duration,total_playtime_hours,login_streak,days_since_last_login,total_spending_thb,avg_monthly_spending,spending_frequency,...,platform,device_type,payment_method,language,account_status,player_type_tag,engagement_level,loyalty_tier,skill_tier,segment
0,PLY00001,P050236,5.495437,24.837349,2740.945124,60.0,56.034052,58219.91566,434.038311,17.79097,...,PC,Phone,Wallet,DE,Active,Social,Low,Silver,,0
1,PLY00002,P108696,9.991089,88.376322,,22.0,75.036888,28966.163953,4233.532935,28.862134,...,Console,Desktop,Card,ES,Dormant,Collector,Low,Platinum,Gold,2
2,PLY00003,P113532,14.234225,101.712292,2828.479467,66.0,,44478.823835,4849.491895,22.536406,...,PC,Laptop,,EN,Dormant,Casual,High,Diamond,Gold,3
3,PLY00004,P123930,3.373683,191.975841,1915.08295,80.0,0.12791,57074.116993,6863.516123,10.565666,...,Console,Tablet,,ES,Dormant,Competitive,Veteran,Silver,Gold,3
4,PLY00005,P068623,22.469353,28.042509,517.921948,,45.07846,,4408.101247,11.226504,...,PC,Tablet,Gift,EN,Active,Casual,Mid,Silver,,1


In [78]:
test_df.head()

Unnamed: 0,id,player_id,play_frequency,avg_session_duration,total_playtime_hours,login_streak,days_since_last_login,total_spending_thb,avg_monthly_spending,spending_frequency,...,region,platform,device_type,payment_method,language,account_status,player_type_tag,engagement_level,loyalty_tier,skill_tier
0,ANS00001,P106074,4.917599,50.951821,1610.535142,3.0,40.047516,61031.190124,5403.618682,21.099955,...,,PC,Tablet,,EN,Active,Competitive,Mid,Diamond,Gold
1,ANS00002,P024878,8.060471,81.376671,543.088681,104.0,37.296412,10862.656232,,17.909547,...,EU,Mobile,Laptop,Card,ES,Active,,Mid,Silver,Bronze
2,ANS00003,P033678,,27.707037,721.533684,14.0,52.484579,27515.141077,1098.230071,26.183854,...,,PC,Phone,Wallet,ES,Dormant,Collector,Low,Bronze,Platinum
3,ANS00004,P020935,27.002787,26.859972,1442.810933,62.0,43.851594,5299.499711,,3.228531,...,APAC,PC,Phone,Wallet,ES,Dormant,Competitive,,Bronze,Silver
4,ANS00005,P049711,6.188164,49.545383,2039.185739,8.0,15.519366,7491.446985,1645.853549,27.231039,...,LATAM,Console,Desktop,Wallet,ES,Dormant,Competitive,Low,Silver,Platinum


In [79]:
print(f"Train data loaded: {train_df.shape}")
print(f"Test data loaded: {test_df.shape}")

Train data loaded: (101658, 47)
Test data loaded: (25889, 46)


In [80]:
# Define target and features to drop
TARGET = 'segment'
# Drop IDs and the explicitly random/noise columns
USELESS_COLS = ['id', 'player_id', 'random_metric_1', 'random_metric_2', 'random_metric_3']

In [81]:
# Separate target variable from training data
y_train = train_df[TARGET]
# Drop target from training set
train_df = train_df.drop(columns=[TARGET])

# Combine train and test data for consistent preprocessing
combined_df = pd.concat([train_df, test_df], ignore_index=True)

# Drop the useless columns from the combined dataframe
combined_df = combined_df.drop(columns=USELESS_COLS)

In [82]:
# Identify categorical features automatically
# LightGBM can handle categorical features natively and efficiently
categorical_features = combined_df.select_dtypes(include=['object']).columns.tolist()

In [83]:
# Convert categorical columns to 'category' dtype for LightGBM
print(f"Identified {len(categorical_features)} categorical features: {categorical_features}")
for col in categorical_features:
    combined_df[col] = combined_df[col].astype('category')

# Separate back into processed X_train and X_test
X_train = combined_df.iloc[:len(y_train)]
X_test = combined_df.iloc[len(y_train):]

print(f"Processed X_train shape: {X_train.shape}")
print(f"Processed X_test shape: {X_test.shape}")

Identified 10 categorical features: ['region', 'platform', 'device_type', 'payment_method', 'language', 'account_status', 'player_type_tag', 'engagement_level', 'loyalty_tier', 'skill_tier']
Processed X_train shape: (101658, 41)
Processed X_test shape: (25889, 41)


In [84]:
N_SPLITS = 10
RANDOM_STATE = 42

In [85]:
# Setup StratifiedKFold for cross-validation
# Stratified is crucial for imbalanced classification tasks
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

In [86]:
# LightGBM Model Parameters
# Tuned for a good balance of speed and accuracy (multi-class F1)
lgb_params = {
    'objective': 'multiclass',
    'num_class': 4,
    'metric': 'multi_logloss',
    'boosting_type': 'gbdt',
    'n_estimators': 2000,
    'learning_rate': 0.02,
    'num_leaves': 31,
    'max_depth': -1,
    'seed': RANDOM_STATE,
    'n_jobs': -1,
    'verbose': -1,
    'colsample_bytree': 0.7,
    'subsample': 0.7,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
}

In [87]:
oof_f1_scores = []
test_predictions = np.zeros((len(X_test), 4)) # (n_samples, n_classes)
models = []

In [88]:
print(f"\n--- Starting {N_SPLITS}-Fold LightGBM Ensemble Training ---")

for fold, (train_index, val_index) in enumerate(skf.split(X_train, y_train)):
    print(f"--- Fold {fold+1}/{N_SPLITS} ---")
    
    # Create train and validation splits
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Initialize the model
    model = lgb.LGBMClassifier(**lgb_params)
    
    # Train the model
    model.fit(X_train_fold, y_train_fold,
              eval_set=[(X_val_fold, y_val_fold)],
              eval_metric='multi_logloss',
              callbacks=[lgb.early_stopping(100, verbose=False)],
              categorical_feature=categorical_features
             )
    
    # --- 4. Validation ---
    # Predict on the validation set
    val_preds = model.predict(X_val_fold)
    
    # Calculate weighted F1-score for this fold
    fold_f1 = f1_score(y_val_fold, val_preds, average='weighted')
    print(f"Fold {fold+1} Weighted F1: {fold_f1:.4f}")
    oof_f1_scores.append(fold_f1)
    
    # Add this fold's predictions (probabilities) to the test set predictions
    # We average the probabilities from all 10 models
    test_predictions += model.predict_proba(X_test) / N_SPLITS
    models.append(model)

print("-----------------------------------")
print(f"Mean OOF Weighted F1: {np.mean(oof_f1_scores):.4f}")
print("Ensemble training complete.")


--- Starting 10-Fold LightGBM Ensemble Training ---
--- Fold 1/10 ---
Fold 1 Weighted F1: 0.7653
--- Fold 2/10 ---
Fold 2 Weighted F1: 0.7687
--- Fold 3/10 ---
Fold 3 Weighted F1: 0.7636
--- Fold 4/10 ---
Fold 4 Weighted F1: 0.7649
--- Fold 5/10 ---
Fold 5 Weighted F1: 0.7582
--- Fold 6/10 ---
Fold 6 Weighted F1: 0.7649
--- Fold 7/10 ---
Fold 7 Weighted F1: 0.7561
--- Fold 8/10 ---
Fold 8 Weighted F1: 0.7666
--- Fold 9/10 ---
Fold 9 Weighted F1: 0.7631
--- Fold 10/10 ---
Fold 10 Weighted F1: 0.7577
-----------------------------------
Mean OOF Weighted F1: 0.7629
Ensemble training complete.


In [89]:
# --- [This code would run right after the training loop] ---

print("-----------------------------------")
print(f"Mean OOF Weighted F1: {np.mean(oof_f1_scores):.4f}")
print("Ensemble training complete.")

# --- 5. Feature Importance (Ensemble Average) ---
print("\nCalculating feature importances...")

try:
    # Get feature names from the processed X_train dataframe
    features = X_train.columns.tolist()
    
    # Initialize an array to store the sum of importances from all models
    total_importances = np.zeros(len(features))
    
    # Sum importances from all models in the ensemble
    for model in models:
        total_importances += model.feature_importances_
        
    # Calculate the average importance
    average_importances = total_importances / len(models)
    
    # Create the feature importance DataFrame
    feature_importance_df = pd.DataFrame({
        'feature': features,
        'importance': average_importances
    }).sort_values(by='importance', ascending=False)

    print("\nTop 15 Most Important Features (Averaged across 10-fold ensemble):")
    print(feature_importance_df.head(15))
    
except Exception as e:
    print(f"\nAn error occurred while calculating feature importances: {e}")


# --- 6. Generate Submission File ---
# (This part is from the original script, it uses the ensemble predictions)

print("\nMaking predictions on the test dataset (using ensemble)...")
# Get the final class prediction by finding the class with the highest probability
final_predictions = np.argmax(test_predictions, axis=1)

-----------------------------------
Mean OOF Weighted F1: 0.7629
Ensemble training complete.

Calculating feature importances...

Top 15 Most Important Features (Averaged across 10-fold ensemble):
                  feature  importance
4   days_since_last_login      4968.1
2    total_playtime_hours      4756.8
7      spending_frequency      4749.9
6    avg_monthly_spending      4653.2
8            friend_count      4646.8
0          play_frequency      4506.7
20       rare_items_count      4451.7
12    gifts_sent_received      4427.0
30  peak_concurrent_hours      4415.1
1    avg_session_duration      4404.2
21   speed_of_progression      4376.1
16        win_rate_ranked      4371.3
5      total_spending_thb      4346.5
10    chat_activity_score      4238.2
11    friend_invites_sent      4219.5

Making predictions on the test dataset (using ensemble)...


In [90]:
import joblib
print(f"Saving the best model to 'Player_segment_classification'...")
model_filename = 'Model/Player_segment_classification_V1'
joblib.dump(model, model_filename)
print(f"Model saved successfully to {model_filename}.")

Saving the best model to 'Player_segment_classification'...
Model saved successfully to Model/Player_segment_classification_V1.


In [91]:
final_predictions = np.argmax(test_predictions, axis=1)
final_predictions

array([2, 0, 0, ..., 1, 0, 3], shape=(25889,))