In [9]:
# Cell 1 – Imports & path setup
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

sns.set_theme(style="whitegrid")

# Locate the project root: the closest directory, starting at the current
# working directory and walking upwards, that contains a "data" entry.
start_dir = Path.cwd()
PROJ_ROOT = next(
    (
        candidate
        for candidate in (start_dir, *start_dir.parents)
        if (candidate / "data").exists()
    ),
    None,
)
if PROJ_ROOT is None:
    # Stop here instead of silently settling on the filesystem root, which
    # would make the mkdir calls below create "reports/" at the top of the drive.
    raise FileNotFoundError(
        f"Could not find a 'data' directory in {start_dir} or any of its parents; "
        "run this notebook from inside the project tree."
    )

# Input location and report output locations, all relative to the project root.
DATA_DIR = PROJ_ROOT / "data" / "processed"
REPORTS_DIR = PROJ_ROOT / "reports"
TABLES_DIR = REPORTS_DIR / "tables"
FIGURES_DIR = REPORTS_DIR / "figures"

# Create the output folders up front so later cells can write without checks.
for path in [REPORTS_DIR, TABLES_DIR, FIGURES_DIR]:
    path.mkdir(parents=True, exist_ok=True)

# Candidate input files; Cell 2 reads the Parquet file when it exists and
# falls back to the CSV otherwise.
PARQUET_PATH = DATA_DIR / "events.parquet"
CSV_PATH = DATA_DIR / "clean_data.csv"


In [10]:
# Cell 2 – Load processed dataset
# The Parquet file is the preferred source; the CSV is read only when the
# Parquet file is not present on disk.
use_parquet = PARQUET_PATH.exists()
source_path = PARQUET_PATH if use_parquet else CSV_PATH
df = (
    pd.read_parquet(source_path)
    if use_parquet
    else pd.read_csv(source_path, low_memory=False)
)

print(f"Loaded {len(df):,} rows from {source_path}")
# Last expression of the cell: show the first rows as a rich table.
df.head()


Loaded 90,189 rows from c:\Users\umyana\Documents\mobile_game_analytics_pipeline\data\processed\events.parquet


Unnamed: 0,userid,version,session_count,retention_1,retention_7,acquisition_channel,country,platform,purchase,CAC,revenue,ROI
0,116,gate_30,3,False,False,Facebook,USA,Google Play,0,2.8,0.038024,-0.98642
1,337,gate_30,38,True,False,TikTok,USA,Google Play,0,1.7,0.100486,-0.94089
2,377,gate_40,165,True,False,Facebook,USA,Google Play,0,2.8,0.140215,-0.949923
3,483,gate_40,1,False,False,Facebook,Mexico,Google Play,0,2.8,0.019012,-0.99321
4,488,gate_40,179,True,True,TikTok,USA,App Store,0,1.7,1.23444,-0.273859


In [11]:
# Cell 3 – Sanity checks
# Columns that the remaining cells read from `df`; fail fast if any is absent.
required_cols = set(
    [
        "userid",
        "acquisition_channel",
        "platform",
        "version",
        "retention_1",
        "retention_7",
    ]
)
missing = required_cols.difference(df.columns)
if len(missing) > 0:
    raise ValueError(f"Missing columns: {sorted(missing)}")

# Column-wise mean of the two retention flags, i.e. the overall retention rates.
retention_flags = df[["retention_1", "retention_7"]]
summary = retention_flags.mean().rename("rate")
print(f"Average D1 retention: {summary['retention_1']:.2%}")
print(f"Average D7 retention: {summary['retention_7']:.2%}")


Average D1 retention: 44.52%
Average D7 retention: 18.61%


In [12]:
# Cell 4 – Retention by acquisition channel
# One row per channel: distinct user count plus mean D1 / D7 retention.
channel_groups = df.groupby("acquisition_channel", as_index=False)
channel_totals = channel_groups.agg(
    users=("userid", "nunique"),
    d1_rate=("retention_1", "mean"),
    d7_rate=("retention_7", "mean"),
)

# d7_from_d1 is the D7 rate divided by the D1 rate. A D1 rate of exactly 0 is
# replaced by NaN first, so the ratio comes out as NaN for such a channel.
channel_retention = (
    channel_totals
    .assign(
        d7_from_d1=lambda table: table["d7_rate"] / table["d1_rate"].replace(0, np.nan)
    )
    .sort_values("d7_rate", ascending=False)
)

channel_retention


Unnamed: 0,acquisition_channel,users,d1_rate,d7_rate,d7_from_d1
2,Organic,9153,0.445865,0.187807,0.42122
1,Instagram,36281,0.448637,0.187178,0.417214
3,TikTok,17979,0.441404,0.185772,0.420867
0,Facebook,26776,0.442897,0.184157,0.415802


In [13]:
# Cell 5 – Export channel retention table
# Write the per-channel table (rounded to 6 decimals, index column omitted)
# into the tables folder.
retention_csv_path = TABLES_DIR.joinpath("retention_by_channel.csv")
channel_retention_rounded = channel_retention.round(6)
channel_retention_rounded.to_csv(retention_csv_path, index=False)
print(f"Saved channel retention table to {retention_csv_path}")


Saved channel retention table to c:\Users\umyana\Documents\mobile_game_analytics_pipeline\reports\tables\retention_by_channel.csv


In [14]:
# Cell 6 – Cohort table by version
# Same metrics as the channel table, but split by (version, acquisition channel).
version_channel_groups = df.groupby(["version", "acquisition_channel"], as_index=False)
cohort_version = version_channel_groups.agg(
    users=("userid", "nunique"),
    d1_rate=("retention_1", "mean"),
    d7_rate=("retention_7", "mean"),
)

# Persist the table rounded to 6 decimals, without the index column.
cohort_version_path = TABLES_DIR.joinpath("retention_cohort_by_version.csv")
cohort_version_rounded = cohort_version.round(6)
cohort_version_rounded.to_csv(cohort_version_path, index=False)
print(f"Saved version-level cohort table to {cohort_version_path}")
cohort_version.head()


Saved version-level cohort table to c:\Users\umyana\Documents\mobile_game_analytics_pipeline\reports\tables\retention_cohort_by_version.csv


Unnamed: 0,version,acquisition_channel,users,d1_rate,d7_rate
0,gate_30,Facebook,13288,0.446267,0.184753
1,gate_30,Instagram,17960,0.451058,0.191704
2,gate_30,Organic,4572,0.451881,0.190507
3,gate_30,TikTok,8880,0.443356,0.195158
4,gate_40,Facebook,13488,0.439576,0.183571


In [15]:
# Cell 7 – Heatmap for D7 retention
# Reshape the cohort table into a channel x version grid of D7 rates.
heatmap_data = cohort_version.pivot_table(
    values="d7_rate",
    index="acquisition_channel",
    columns="version",
)

# Figure height grows with the number of channel rows.
figure_height = 4 + len(heatmap_data) * 0.3
fig = plt.figure(figsize=(10, figure_height))

# sns.heatmap draws on the current axes and returns them; keep the handle so
# the labels below are set on that exact axes object.
ax = sns.heatmap(
    heatmap_data,
    annot=True,
    fmt=".1%",
    cmap="YlGnBu",
    cbar_kws={"label": "D7 retention"},
)

ax.set_title("D7 Retention by Version and Channel")
ax.set_xlabel("Version")
ax.set_ylabel("Acquisition channel")
fig.tight_layout()

heatmap_path = FIGURES_DIR / "retention_heatmap.png"
fig.savefig(heatmap_path, dpi=150, bbox_inches="tight")
# Close the figure: the image is written to disk rather than shown inline.
plt.close(fig)
print(f"Saved heatmap to {heatmap_path}")


Saved heatmap to c:\Users\umyana\Documents\mobile_game_analytics_pipeline\reports\figures\retention_heatmap.png


In [16]:
# Cell 8 – Text summary
# Select the best and worst channels by their D7 value rather than by row
# position, so the summary does not depend on how (or whether) an earlier
# cell sorted `channel_retention`. idxmax/idxmin also skip NaN rates instead
# of reporting a NaN row as the "lowest" channel.
d7_rates = channel_retention["d7_rate"]
top_d7 = channel_retention.loc[d7_rates.idxmax()]
worst_d7 = channel_retention.loc[d7_rates.idxmin()]

print(
    f"Highest D7 channel: {top_d7['acquisition_channel']} -> {top_d7['d7_rate']:.2%}. "
    f"Lowest D7 channel: {worst_d7['acquisition_channel']} -> {worst_d7['d7_rate']:.2%}."
)


Highest D7 channel: Organic -> 18.78%. Lowest D7 channel: Facebook -> 18.42%.
