In [11]:
# Cell 1 – Imports & path setup
from pathlib import Path
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    balanced_accuracy_score,
    classification_report,
    f1_score,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier

sns.set_theme(style="whitegrid")

SEED = 42

PROJ_ROOT = Path.cwd()
while PROJ_ROOT != PROJ_ROOT.parent and not (PROJ_ROOT / "data").exists():
    PROJ_ROOT = PROJ_ROOT.parent

DATA_DIR = PROJ_ROOT / "data" / "processed"
REPORTS_DIR = PROJ_ROOT / "reports"
TABLES_DIR = REPORTS_DIR / "tables"
FIGURES_DIR = REPORTS_DIR / "figures"

for path in [REPORTS_DIR, TABLES_DIR, FIGURES_DIR]:
    path.mkdir(parents=True, exist_ok=True)

DATA_PATH = DATA_DIR / "clean_data.csv"

In [12]:
# Cell 2 – Load processed dataset
df = pd.read_csv(DATA_PATH, low_memory=False)
print(f"Loaded {len(df):,} rows from {DATA_PATH}")
df.head()

Loaded 90,189 rows from c:\Users\umyana\Documents\mobile_game_analytics_pipeline\data\processed\clean_data.csv


Unnamed: 0,userid,version,session_count,retention_1,retention_7,acquisition_channel,country,platform,purchase,CAC,revenue,ROI
0,116,gate_30,3,0,0,Facebook,USA,Google Play,0,2.8,0.023765,-0.991512
1,337,gate_30,38,0,0,TikTok,USA,Google Play,0,1.7,0.062804,-0.963057
2,377,gate_40,165,1,0,Facebook,USA,Google Play,0,2.8,0.087634,-0.968702
3,483,gate_40,1,0,0,Facebook,Mexico,Google Play,0,2.8,0.011883,-0.995756
4,488,gate_40,179,0,1,TikTok,USA,App Store,0,1.7,0.771525,-0.546162


In [13]:
# Cell 3 – Target & feature lists
# Define churn target: 1 = churn by day 7, 0 = retained
df["churn_flag"] = (~df["retention_7"].astype(bool)).astype(int)

numeric_features = [
    "session_count",
    "purchase",
    "CAC",
    "revenue",
    "ROI",
    "retention_1",
]

categorical_features = [
    "acquisition_channel",
    "country",
    "platform",
    "version",
]

feature_cols = numeric_features + categorical_features
target_col = "churn_flag"

missing = [col for col in feature_cols + [target_col] if col not in df.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

df[feature_cols + [target_col]].describe(include="all")

Unnamed: 0,session_count,purchase,CAC,revenue,ROI,retention_1,acquisition_channel,country,platform,version,churn_flag
count,90189.0,90189.0,90189.0,90189.0,90189.0,90189.0,90189,90189,90189,90189,90189.0
unique,,,,,,,4,3,2,2,
top,,,,,,,Instagram,USA,Google Play,gate_40,
freq,,,,,,,36281,69543,67690,45489,
mean,51.872457,0.055772,2.125861,0.546676,-0.593476,0.454867,,,,,0.667055
std,195.050858,0.229482,0.72223,3.315148,3.92046,0.497962,,,,,0.47127
min,0.0,0.0,0.3,0.0,-1.0,0.0,,,,,0.0
25%,5.0,0.0,1.7,0.035648,-0.983623,0.0,,,,,0.0
50%,16.0,0.0,2.3,0.062359,-0.970268,0.0,,,,,1.0
75%,51.0,0.0,2.8,0.205964,-0.858005,1.0,,,,,1.0


In [14]:
# Cell 4 – Train/test split and preprocessing pipeline
X = df[feature_cols]
y = df[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=SEED
)

numeric_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

categorical_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)

In [15]:
# Cell 5 – Define candidate models
models = {
    "log_reg": Pipeline(
        steps=[
            ("prep", preprocessor),
            (
                "model",
                LogisticRegression(
                    max_iter=200,
                    class_weight="balanced",
                    random_state=SEED,
                ),
            ),
        ]
    ),
    "xgb": Pipeline(
        steps=[
            ("prep", preprocessor),
            (
                "model",
                XGBClassifier(
                    objective="binary:logistic",
                    eval_metric="logloss",
                    subsample=0.8,
                    colsample_bytree=0.8,
                    max_depth=4,
                    learning_rate=0.1,
                    n_estimators=300,
                    reg_lambda=1.0,
                    random_state=SEED,
                    n_jobs=-1,
                    use_label_encoder=False,
                ),
            ),
        ]
    ),
}

models

{'log_reg': Pipeline(steps=[('prep',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('scaler',
                                                                    StandardScaler())]),
                                                   ['session_count', 'purchase',
                                                    'CAC', 'revenue', 'ROI',
                                                    'retention_1']),
                                                  ('cat',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='most_frequent')),
                                                                   ('encoder',
   

In [16]:
# Cell 6 – Cross-validation scores
scoring = {
    "roc_auc": "roc_auc",
    "pr_auc": "average_precision",
    "balanced_accuracy": "balanced_accuracy",
    "accuracy": "accuracy",
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

records = []
for name, pipeline in models.items():
    cv_results = cross_validate(
        pipeline,
        X_train,
        y_train,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        return_train_score=False,
    )
    for fold_idx in range(cv.n_splits):
        records.append(
            {
                "model": name,
                "fold": fold_idx + 1,
                "roc_auc": cv_results["test_roc_auc"][fold_idx],
                "pr_auc": cv_results["test_pr_auc"][fold_idx],
                "balanced_accuracy": cv_results["test_balanced_accuracy"][fold_idx],
                "accuracy": cv_results["test_accuracy"][fold_idx],
            }
        )

backtest_df = pd.DataFrame(records)
backtest_path = TABLES_DIR / "backtest_scores.csv"
backtest_df.to_csv(backtest_path, index=False)
print(f"Saved CV scores to {backtest_path}")
backtest_df.groupby("model").mean()

Saved CV scores to c:\Users\umyana\Documents\mobile_game_analytics_pipeline\reports\tables\backtest_scores.csv


Unnamed: 0_level_0,fold,roc_auc,pr_auc,balanced_accuracy,accuracy
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
log_reg,3.0,0.621893,0.746809,0.614358,0.616866
xgb,3.0,0.615817,0.742376,0.514276,0.661075


In [17]:
# Cell 7 – Fit models and evaluate on test set
metrics_summary = {}

for name, pipeline in models.items():
    pipeline.fit(X_train, y_train)
    y_proba = pipeline.predict_proba(X_test)[:, 1]
    y_pred = (y_proba >= 0.5).astype(int)

    metrics_summary[name] = {
        "roc_auc": roc_auc_score(y_test, y_proba),
        "pr_auc": average_precision_score(y_test, y_proba),
        "accuracy": accuracy_score(y_test, y_pred),
        "balanced_accuracy": balanced_accuracy_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
    }

metrics_path = TABLES_DIR / "model_metrics.json"
with metrics_path.open("w", encoding="utf-8") as f:
    json.dump(metrics_summary, f, indent=2)

print(f"Saved test metrics to {metrics_path}")
metrics_summary

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Saved test metrics to c:\Users\umyana\Documents\mobile_game_analytics_pipeline\reports\tables\model_metrics.json


{'log_reg': {'roc_auc': 0.6214952724658067,
  'pr_auc': 0.7461229903529246,
  'accuracy': 0.6153305983664117,
  'balanced_accuracy': 0.6135817909151395,
  'f1': 0.6821596530874},
 'xgb': {'roc_auc': 0.6210055821005125,
  'pr_auc': 0.7434859252372938,
  'accuracy': 0.6641534538197139,
  'balanced_accuracy': 0.5153557430300996,
  'f1': 0.7923683308579915}}

In [18]:
# Cell 8 – Plot ROC and PR curves for top model (choose by ROC-AUC)
best_model_name = max(metrics_summary, key=lambda m: metrics_summary[m]["roc_auc"])
best_pipeline = models[best_model_name]
best_pipeline.fit(X_train, y_train)

y_test_proba = best_pipeline.predict_proba(X_test)[:, 1]

fpr, tpr, _ = roc_curve(y_test, y_test_proba)
precision, recall, _ = precision_recall_curve(y_test, y_test_proba)

fig, ax = plt.subplots(1, 2, figsize=(12, 5))

# ROC
ax[0].plot(fpr, tpr, label=f"{best_model_name} (AUC={metrics_summary[best_model_name]['roc_auc']:.3f})")
ax[0].plot([0, 1], [0, 1], linestyle="--", color="gray")
ax[0].set_xlabel("False Positive Rate")
ax[0].set_ylabel("True Positive Rate")
ax[0].set_title("ROC Curve")
ax[0].legend(loc="lower right")

# PR
ax[1].plot(recall, precision, label=f"{best_model_name} (AP={metrics_summary[best_model_name]['pr_auc']:.3f})")
baseline = y_test.mean()
ax[1].hlines(baseline, 0, 1, linestyle="--", color="gray", label=f"Baseline (churn rate={baseline:.3f})")
ax[1].set_xlabel("Recall")
ax[1].set_ylabel("Precision")
ax[1].set_title("Precision-Recall Curve")
ax[1].legend(loc="upper right")

fig.tight_layout()
roc_pr_path = FIGURES_DIR / "roc_pr_curves.png"
fig.savefig(roc_pr_path, dpi=150, bbox_inches="tight")
plt.close(fig)

print(f"Saved ROC/PR plot to {roc_pr_path}")

Saved ROC/PR plot to c:\Users\umyana\Documents\mobile_game_analytics_pipeline\reports\figures\roc_pr_curves.png


In [19]:
# Cell 9 – Identify high-risk segments on test set
test_preds = best_pipeline.predict_proba(X_test)[:, 1]
test_results = X_test.copy()
test_results["churn_prob"] = test_preds
test_results["retained"] = 1 - y_test.values

segment_summary = (
    test_results.groupby(["acquisition_channel", "platform"])
    .agg(
        n_users=("churn_prob", "size"),
        avg_churn_prob=("churn_prob", "mean"),
        median_churn_prob=("churn_prob", "median"),
    )
    .reset_index()
    .sort_values("avg_churn_prob", ascending=False)
)

risk_path = TABLES_DIR / "churn_risk_segments.csv"
segment_summary.to_csv(risk_path, index=False)
print(f"Saved segment risk table to {risk_path}")
segment_summary.head(10)

Saved segment risk table to c:\Users\umyana\Documents\mobile_game_analytics_pipeline\reports\tables\churn_risk_segments.csv


Unnamed: 0,acquisition_channel,platform,n_users,avg_churn_prob,median_churn_prob
7,TikTok,Google Play,3994,0.52509,0.619004
1,Facebook,Google Play,5974,0.518362,0.613544
6,TikTok,App Store,1345,0.51549,0.609315
0,Facebook,App Store,2006,0.507324,0.604405
3,Instagram,Google Play,8236,0.503371,0.599991
5,Organic,Google Play,2062,0.49395,0.590272
2,Instagram,App Store,2702,0.490219,0.585706
4,Organic,App Store,738,0.472635,0.370864


In [20]:
# Cell 10 – Top-decile lift & narrative
eval_df = pd.DataFrame(
    {
        "churn_flag": y_test.values,
        "churn_prob": test_preds,
        "acquisition_channel": X_test["acquisition_channel"].values,
        "platform": X_test["platform"].values,
    }
)

eval_df = eval_df.sort_values("churn_prob", ascending=False)
top_decile = eval_df.head(max(1, int(0.1 * len(eval_df))))

overall_churn_rate = eval_df["churn_flag"].mean()
top_decile_churn_rate = top_decile["churn_flag"].mean()
lift = top_decile_churn_rate / overall_churn_rate if overall_churn_rate else np.nan

print(f"Overall churn rate (test): {overall_churn_rate:.2%}")
print(f"Top 10% predicted churn rate: {top_decile_churn_rate:.2%}")
print(f"Top-decile lift: {lift:.2f}×")

top_decile.groupby(["acquisition_channel", "platform"]).size().sort_values(ascending=False).head()

Overall churn rate (test): 66.71%
Top 10% predicted churn rate: 78.30%
Top-decile lift: 1.17×


acquisition_channel  platform   
TikTok               Google Play    1520
Facebook             Google Play    1185
dtype: int64