<a href="https://colab.research.google.com/github/zakirangwala/esrb-wizard/blob/main/notebooks/classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Do Players Prefer Games with Mature Content?**

# Data Cleanse & Preprocessing

In [None]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import sklearn
import ast

In [5]:
from google.colab import drive
drive.mount('/content/drive')

#load data
clf_df = pd.read_csv("/content/drive/MyDrive/CP322/classification.csv") # save to github: notebooks/classification.ipynb

print(clf_df.shape)
clf_df.head()

(113132, 96)


Unnamed: 0,Peak CCU,Price,Discount,DLC count,Windows,Mac,Linux,Metacritic score,User score,Achievements,...,Steam Workshop,SteamVR Collectibles,Tracked Controller Support,Tracked Motion Controller Support,VR Only,VR Support,VR Supported,Valve Anti-Cheat enabled,played_before,esrb_available
0,0,19.99,0,0,1,0,0,0,0,30,...,0,0,0,0,0,0,0,0,0,0
1,0,0.99,0,0,1,1,0,0,0,12,...,0,0,0,0,0,0,0,0,0,0
2,0,4.99,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,5.99,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0.0,0,0,1,1,0,0,0,17,...,0,0,0,0,0,0,0,0,0,0


In [33]:
#define target and features
clf_df = clf_df.rename(columns={'played_before': 'played_past_2weeks'})
TARGET_COL = "played_past_2weeks"

#columns that leak "Average playtime two weeks"

leak_cols = [
    "Average playtime two weeks",
]

cols_to_drop = [TARGET_COL] + [c for c in leak_cols if c in clf_df.columns]

#drop target, IDs, and leakage columns
X = clf_df.drop(columns=cols_to_drop, errors="ignore")
y = clf_df[TARGET_COL].astype(int)

#keep only numeric columns
numeric_cols = X.select_dtypes(include=[np.number]).columns
X = X[numeric_cols]

print("Final feature columns:", list(numeric_cols))
print("X shape:", X.shape)
print("y value counts:")
print(y.value_counts())

clf_df.head()

Final feature columns: ['Peak CCU', 'Price', 'Discount', 'DLC count', 'Windows', 'Mac', 'Linux', 'Metacritic score', 'User score', 'Achievements', 'Recommendations', 'release_year', 'midpoint_estimated_owners', 'esrb_age', 'voice_acting', '%_positive_reviews', '360 Video', 'Accounting', 'Action', 'Adventure', 'Animation & Modeling', 'Audio Production', 'Casual', 'Design & Illustration', 'Documentary', 'Early Access', 'Education', 'Episodic', 'Game Development', 'Gore', 'Indie', 'Massively Multiplayer', 'Movie', 'Nudity', 'Photo Editing', 'RPG', 'Racing', 'Sexual Content', 'Short', 'Simulation', 'Software Training', 'Sports', 'Strategy', 'Tutorial', 'Utilities', 'Video Production', 'Violent', 'Web Publishing', 'Captions available', 'Co-op', 'Commentary available', 'Cross-Platform Multiplayer', 'Family Sharing', 'Full controller support', 'HDR available', 'In-App Purchases', 'Includes Source SDK', 'Includes level editor', 'LAN Co-op', 'LAN PvP', 'MMO', 'Mods', 'Mods (require HL2)', 'Mult

Unnamed: 0,Peak CCU,Price,Discount,DLC count,Windows,Mac,Linux,Metacritic score,User score,Achievements,...,Steam Workshop,SteamVR Collectibles,Tracked Controller Support,Tracked Motion Controller Support,VR Only,VR Support,VR Supported,Valve Anti-Cheat enabled,played_past_2weeks,esrb_available
0,0,19.99,0,0,1,0,0,0,0,30,...,0,0,0,0,0,0,0,0,0,0
1,0,0.99,0,0,1,1,0,0,0,12,...,0,0,0,0,0,0,0,0,0,0
2,0,4.99,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,5.99,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0.0,0,0,1,1,0,0,0,17,...,0,0,0,0,0,0,0,0,0,0


In [50]:
from sklearn.model_selection import train_test_split

#train+val vs test
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=0.10,
    stratify=y,
    random_state=42,
)

#split train+val into train and val
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.1/0.9,       #0.11 of 0.90 = 0.10. final 80/10/10 split
    stratify=y_temp,
    random_state=42,
)

print("Train size:", X_train.shape[0])
print("Val size:",   X_val.shape[0])
print("Test size:",  X_test.shape[0])
print("")
print("Train class balance:")
print(y_train.value_counts(normalize=True))

Train size: 90504
Val size: 11314
Test size: 11314

Train class balance:
played_past_2weeks
0    0.975791
1    0.024209
Name: proportion, dtype: float64


In [69]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    average_precision_score,
    precision_recall_curve,
)

# Dataframe to compare performance metrics on classifier models
classifier_performance_data = {'Metric': ['Average Precision', 'Accuracy', 'AUC_ROC', 'Precision', 'Recall', 'F1 Score']}

clf_performance = pd.DataFrame(classifier_performance_data)
clf_performance = clf_performance.set_index('Metric')
clf_performance.head(7)

Average Precision
Accuracy
AUC_ROC
Precision
Recall
F1 Score


# Random Forest Classifier

In [61]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def randomForestObjective(trial):
    # paramaters to hypertune
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    max_depth = trial.suggest_int("max_depth", 15, 45)
    min_samples_split = trial.suggest_int("min_samples_split", 5, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 2, 10)
    #max_features = trial.suggest_int("max_features", 1, X_temp.shape[1]) #50
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2", None])
    class_weight = trial.suggest_categorical("class_weight", ["balanced", "balanced_subsample"])
    bootstrap = trial.suggest_categorical("bootstrap", [True, False])
    criterion = trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"])


    # random forest classifier
    clf_rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
        max_features=max_features,
        class_weight=class_weight,
        bootstrap=bootstrap,
        criterion=criterion,
        n_jobs=-1
    )

    # precision is the best metric for such an imbalanced dataset
    prec_score = cross_val_score(clf_rf, X_train, y_train, cv=3, scoring="average_precision").mean()

    return prec_score

# Utilize Optuna to obtain the best parameters
study_rf = optuna.create_study(direction="maximize")
study_rf.optimize(randomForestObjective, n_trials=25)
best_params = study_rf.best_params

# print best parameters
print("Best hyperparameters:", best_params)
print("Best average precision:", study_rf.best_value)

[I 2025-11-22 03:13:16,941] A new study created in memory with name: no-name-0e13fa49-af3d-40d7-8956-5913fa636015
[I 2025-11-22 03:15:26,582] Trial 0 finished with value: 0.775589764094445 and parameters: {'n_estimators': 456, 'max_depth': 29, 'min_samples_split': 14, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'class_weight': 'balanced_subsample', 'bootstrap': False, 'criterion': 'gini'}. Best is trial 0 with value: 0.775589764094445.
[I 2025-11-22 03:16:36,209] Trial 1 finished with value: 0.791265573841598 and parameters: {'n_estimators': 84, 'max_depth': 44, 'min_samples_split': 18, 'min_samples_leaf': 10, 'max_features': None, 'class_weight': 'balanced', 'bootstrap': True, 'criterion': 'log_loss'}. Best is trial 1 with value: 0.791265573841598.
[I 2025-11-22 03:18:49,581] Trial 2 finished with value: 0.7818411268740246 and parameters: {'n_estimators': 389, 'max_depth': 30, 'min_samples_split': 12, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'class_weight': 'balanced_subsampl

KeyboardInterrupt: 

balanced sub_sample as a class weight consistently makes the score low. sqrt is better than log2. More features leads to lower scores

Trial 13 finished with value: 0.765587526567404 and parameters: {'n_estimators': 122, 'max_depth': 6, 'min_samples_split': 4, 'min_samples_leaf': 6, 'max_features': 40}. Best is trial 13 with value: 0.765587526567404.

{'n_estimators': 311, 'max_depth': 28, 'min_samples_split': 6, 'min_samples_leaf': 6, 'max_features': 40, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.7885331937288078.

In [62]:
from sklearn.model_selection import StratifiedKFold, cross_val_predict

# refit model based on best parameters
clf_rf = RandomForestClassifier(
    **best_params,
    random_state=42,
    n_jobs=-1
)

# check model fit based on cross-validation
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

y_train_proba = cross_val_predict(
    clf_rf,
    X_train,
    y_train,
    cv=skf,
    method='predict_proba'
)[:, 1]

y_train_class = (y_train_proba >= 0.5).astype(int)

print("\nTRAIN METRICS:")
print("Average Precision:", average_precision_score(y_train, y_train_proba))
print("Accuracy:", accuracy_score(y_train, y_train_class))
print("ROC AUC:", roc_auc_score(y_train, y_train_proba))
print("F1:", f1_score(y_train, y_train_class))
print("Precision:", precision_score(y_train, y_train_class))
print("Recall:", recall_score(y_train, y_train_class))


# threshold tuning based on validation set
clf_rf.fit(X_train, y_train)
y_val_proba = clf_rf.predict_proba(X_val)[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_val, y_val_proba)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-9)

best_thresh = thresholds[f1_scores[:-1].argmax()]
print("\nBest threshold for F1 (val set):", best_thresh)

# retrain final model based on training and validation sets
clf_rf_final = RandomForestClassifier(
    **best_params,
    random_state=42,
    n_jobs=-1
)

clf_rf_final.fit(X_temp, y_temp)

# performance of model against the test dataset
y_test_proba = clf_rf_final.predict_proba(X_test)[:, 1]
y_test_class = (y_test_proba >= best_thresh).astype(int)

print("\nTEST METRICS:")
print("Average Precision:", average_precision_score(y_test, y_test_proba))
print("Accuracy:", accuracy_score(y_test, y_test_class))
print("ROC AUC:", roc_auc_score(y_test, y_test_proba))
print("F1 score:", f1_score(y_test, y_test_class))
print("Precision:", precision_score(y_test, y_test_class))
print("Recall:", recall_score(y_test, y_test_class))

# add metrics to table for comparison later
clf_performance['Random Forest'] = [average_precision_score(y_test, y_test_proba),
                                    accuracy_score(y_test, y_test_class),
                                    roc_auc_score(y_test, y_test_proba),
                                    precision_score(y_test, y_test_class),
                                    recall_score(y_test, y_test_class),
                                    f1_score(y_test, y_test_class),
                                    ]


TRAIN METRICS (3-fold CV):
Average Precision: 0.7922235699569227
Accuracy: 0.9815698753646248
ROC AUC: 0.9862429275053245
F1: 0.6712652739456051
Precision: 0.5907041276448144
Recall: 0.7772706526700137

Best threshold for F1 (val set): 0.6570397437002733

TEST METRICS:
Average Precision: 0.7702027818165552
Accuracy: 0.9843556655471097
ROC AUC: 0.9810814027292923
F1 score: 0.6833631484794276
Precision: 0.6701754385964912
Recall: 0.6970802919708029


ValueError: Classification metrics can't handle a mix of binary and continuous targets

# XGBoost Classifier

In [None]:
# https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html
# https://www.google.com/search?q=Zero-Inflated+Bernoulli+%28ZIBer%29+Model+python&sca_esv=16a5b3c49c9a2e73&sxsrf=AE3TifMOdqtXCBqe2iRojqJlBYye0wsDBw%3A1763759245463&ei=jdQgadSGHNKe0PEPm_qyqAw&ved=0ahUKEwiUga6ak4SRAxVSDzQIHRu9DMUQ4dUDCBE&uact=5&oq=Zero-Inflated+Bernoulli+%28ZIBer%29+Model+python&gs_lp=Egxnd3Mtd2l6LXNlcnAiLFplcm8tSW5mbGF0ZWQgQmVybm91bGxpIChaSUJlcikgTW9kZWwgcHl0aG9uMgUQABjvBTIFEAAY7wVIlhRQ2wVY7BJwBHgAkAEAmAHXAaABtgiqAQUxLjUuMbgBA8gBAPgBAZgCC6AC2gjCAggQABiwAxjvBcICCxAAGIAEGLADGKIEwgIFECEYnwXCAggQABiABBiiBMICBBAhGBXCAgcQIRigARgKmAMAiAYBkAYEkgcFNS41LjGgB_EMsgcFMS41LjG4B84IwgcGMC4xMC4xyAcV&sclient=gws-wiz-serp

In [64]:
from xgboost import XGBClassifier

#compute scale_pos_weight for imbalance (pos / neg)
pos = y_train.sum()
neg = len(y_train) - pos
scale_pos_weight = neg / pos if pos > 0 else 1.0
print("scale_pos_weight:", scale_pos_weight)

def xgboostObjective(trial):
    # paramaters to hypertune
    eval_metric = trial.suggest_categorical("eval_metric", ["logloss", "aucpr"])
    n_estimators = trial.suggest_int("n_estimators", 50, 500)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 10)
    gamma = trial.suggest_int("gamma", 0, 5)
    max_delta_step = trial.suggest_int("max_delta_step", 0, 10)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
    subsample = trial.suggest_float("subsample", 0.1, 1)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1)
    reg_alpha = trial.suggest_float("reg_alpha", 0, 1)
    reg_lambda = trial.suggest_float("reg_lambda", 0, 10)

    # xgboost classifier
    clf = XGBClassifier(
        objective="binary:logistic",
        eval_metric=eval_metric,
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        gamma=gamma,
        max_delta_step=max_delta_step,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        random_state=42,
        n_jobs=-1,
        scale_pos_weight=scale_pos_weight,
    )

    # precision is the best metric for such an imbalanced dataset
    prec_score = cross_val_score(clf, X_train, y_train, cv=3, scoring="average_precision").mean()

    return prec_score

# Utilize Optuna to obtain the best parameters
study = optuna.create_study(direction="maximize")
study.optimize(xgboostObjective, n_trials=25)
best_params = study.best_params

# print best parameters
print("Best hyperparameters:", best_params)
print("Best average precision:", study.best_value)

[I 2025-11-22 03:48:40,153] A new study created in memory with name: no-name-6f9762e0-e3b1-471c-b720-c64dbf7b6484


scale_pos_weight: 40.30716567777271


[I 2025-11-22 03:49:11,743] Trial 0 finished with value: 0.778014997375767 and parameters: {'eval_metric': 'aucpr', 'n_estimators': 358, 'max_depth': 6, 'min_child_weight': 2, 'gamma': 3, 'max_delta_step': 8, 'learning_rate': 0.23644272134512706, 'subsample': 0.7733959057386347, 'colsample_bytree': 0.8014534063263812, 'reg_alpha': 0.8209756376755918, 'reg_lambda': 3.3368864869992976}. Best is trial 0 with value: 0.778014997375767.
[I 2025-11-22 03:49:28,861] Trial 1 finished with value: 0.7781325558554301 and parameters: {'eval_metric': 'aucpr', 'n_estimators': 149, 'max_depth': 7, 'min_child_weight': 8, 'gamma': 3, 'max_delta_step': 6, 'learning_rate': 0.07859907478153513, 'subsample': 0.2506885350608607, 'colsample_bytree': 0.9355596501555814, 'reg_alpha': 0.502642503578705, 'reg_lambda': 4.930131417342091}. Best is trial 1 with value: 0.7781325558554301.
[I 2025-11-22 03:49:59,505] Trial 2 finished with value: 0.7705387820059517 and parameters: {'eval_metric': 'logloss', 'n_estimato

KeyboardInterrupt: 

In [70]:
# refit model based on best parameters
clf = XGBClassifier(
    objective="binary:logistic",
    **best_params,
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight
)

# check model fit based on cross-validation
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

y_train_proba = cross_val_predict(
    clf,
    X_train,
    y_train,
    cv=skf,
    method='predict_proba'
)[:, 1]

y_train_class = (y_train_proba >= 0.5).astype(int)

print("\nTRAIN METRICS:")
print("Average Precision:", average_precision_score(y_train, y_train_proba))
print("Accuracy:", accuracy_score(y_train, y_train_class))
print("ROC AUC:", roc_auc_score(y_train, y_train_proba))
print("F1:", f1_score(y_train, y_train_class))
print("Precision:", precision_score(y_train, y_train_class))
print("Recall:", recall_score(y_train, y_train_class))


# threshold tuning based on validation set
clf.fit(X_train, y_train)
y_val_proba = clf.predict_proba(X_val)[:, 1]

precisions, recalls, thresholds = precision_recall_curve(y_val, y_val_proba)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-9)

best_thresh = thresholds[f1_scores[:-1].argmax()]
print("\nBest threshold for F1 (val set):", best_thresh)

# retrain final model based on training and validation sets
clf_final = XGBClassifier(
    objective="binary:logistic",
    **best_params,
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight
)

clf_final.fit(X_temp, y_temp)

# performance of model against the test dataset
y_test_proba = clf_final.predict_proba(X_test)[:, 1]
y_test_class = (y_test_proba >= best_thresh).astype(int)

print("\nTEST METRICS:")
print("Average Precision:", average_precision_score(y_test, y_test_proba))
print("Accuracy:", accuracy_score(y_test, y_test_class))
print("ROC AUC:", roc_auc_score(y_test, y_test_proba))
print("F1 score:", f1_score(y_test, y_test_class))
print("Precision:", precision_score(y_test, y_test_class))
print("Recall:", recall_score(y_test, y_test_class))

# add metrics to table for comparison later
clf_performance['XGBoost'] = [average_precision_score(y_test, y_test_proba),
                                    accuracy_score(y_test, y_test_class),
                                    roc_auc_score(y_test, y_test_proba),
                                    precision_score(y_test, y_test_class),
                                    recall_score(y_test, y_test_class),
                                    f1_score(y_test, y_test_class),
                                    ]

Parameters: { "class_weight", "max_features", "min_samples_leaf", "min_samples_split" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "class_weight", "max_features", "min_samples_leaf", "min_samples_split" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "class_weight", "max_features", "min_samples_leaf", "min_samples_split" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



TRAIN METRICS:
Average Precision: 0.7760397576003065
Accuracy: 0.9853487138690002
ROC AUC: 0.9830914489898623
F1: 0.6909090909090909
Precision: 0.7060505002382087
Recall: 0.6764034687357371


Parameters: { "class_weight", "max_features", "min_samples_leaf", "min_samples_split" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Best threshold for F1 (val set): 0.74380964


Parameters: { "class_weight", "max_features", "min_samples_leaf", "min_samples_split" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



TEST METRICS:
Average Precision: 0.756463498565578
Accuracy: 0.9853279123210182
ROC AUC: 0.979478406326034
F1 score: 0.6719367588932806
Precision: 0.7327586206896551
Recall: 0.6204379562043796


# LightGBM

In [None]:
import lightgbm as lgb
from sklearn.metrics import make_scorer

# Custom scorer for cross_val_score
ap_scorer = make_scorer(average_precision_score, needs_proba=True)

def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'num_leaves': trial.suggest_int('num_leaves', 15, 255),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 5.0),
        'class_weight': 'balanced',  # handles imbalance
        'random_state': 42,
        'n_jobs': -1,
    }

    clf = lgb.LGBMClassifier(**param)

    # Stratified KFold for imbalanced data
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    score = cross_val_score(
        clf, X_temp, y_temp,
        cv=skf,
        scoring=ap_scorer
    ).mean()

    return score

# Create Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # increase trials for better search

print("Best hyperparameters:", study.best_params)
print("Best average precision:", study.best_value)


# Performance Metric Comparison

In [71]:
clf_performance.head(7)

Unnamed: 0_level_0,XGBoost
Metric,Unnamed: 1_level_1
Average Precision,0.756463
Accuracy,0.985328
AUC_ROC,0.979478
Precision,0.732759
Recall,0.620438
F1 Score,0.671937


In [None]:
from sklearn.metrics import confusion_matrix

y_val_pred = best_clf.predict(X_val)
cm = confusion_matrix(y_val, y_val_pred)
cm

array([[19814,  2265],
       [  211,   337]], dtype=int64)