In [1]:
# Cell 1 – Imports & directory setup
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

sns.set_theme(style="whitegrid")

# Locate the project root: the nearest directory, starting at the current
# working directory and walking upward, that contains a "data" folder.
_start_dir = Path.cwd()
PROJ_ROOT = next(
    (
        candidate
        for candidate in (_start_dir, *_start_dir.parents)
        if (candidate / "data").exists()
    ),
    None,
)
if PROJ_ROOT is None:
    # Fail here, before any output folders are created: otherwise the search
    # would end at the filesystem root and reports/ would be created there.
    raise FileNotFoundError(
        f"Could not find a 'data' directory in {_start_dir} or any of its parents"
    )

# Input and output locations, all derived from the project root.
DATA_DIR = PROJ_ROOT / "data" / "processed"
REPORTS_DIR = PROJ_ROOT / "reports"
TABLES_DIR = REPORTS_DIR / "tables"
FIGURES_DIR = REPORTS_DIR / "figures"

# Create the output folders up front so later cells can write into them.
for path in [REPORTS_DIR, TABLES_DIR, FIGURES_DIR]:
    path.mkdir(parents=True, exist_ok=True)

# Candidate input files inside DATA_DIR.
PARQUET_PATH = DATA_DIR / "events.parquet"
CSV_PATH = DATA_DIR / "clean_data.csv"

In [2]:
# Cell 2 – Load processed dataset
# Use the Parquet file when it exists; otherwise read the CSV file.
source_path = PARQUET_PATH if PARQUET_PATH.exists() else CSV_PATH
if source_path is PARQUET_PATH:
    df = pd.read_parquet(source_path)
else:
    df = pd.read_csv(source_path, low_memory=False)

print(f"Loaded {len(df):,} rows from {source_path}")
df.head()

Loaded 90,189 rows from c:\Users\umyana\Documents\mobile_game_analytics_pipeline\data\processed\events.parquet


Unnamed: 0,userid,version,session_count,retention_1,retention_7,acquisition_channel,country,platform,purchase,CAC,revenue,ROI
0,116,gate_30,3,0,0,Facebook,USA,Google Play,0,2.8,0.023765,-0.991512
1,337,gate_30,38,0,0,TikTok,USA,Google Play,0,1.7,0.062804,-0.963057
2,377,gate_40,165,1,0,Facebook,USA,Google Play,0,2.8,0.087634,-0.968702
3,483,gate_40,1,0,0,Facebook,Mexico,Google Play,0,2.8,0.011883,-0.995756
4,488,gate_40,179,0,1,TikTok,USA,App Store,0,1.7,0.771525,-0.546162


In [3]:
# Cell 3 – Sanity checks on required columns
# Columns that must be present in the loaded frame; abort early otherwise.
required_cols = {
    "userid", "session_count", "retention_1", "retention_7",
    "purchase", "acquisition_channel", "CAC", "revenue",
}

missing = required_cols.difference(df.columns)
if len(missing) > 0:
    raise ValueError(f"Missing required columns: {sorted(missing)}")

# Columns summarised with describe() for a quick look at their distributions.
summary_cols = [
    "session_count", "retention_1", "retention_7",
    "purchase", "revenue", "CAC",
]
summary_stats = df[summary_cols].describe(include="all")
display(summary_stats)

Unnamed: 0,session_count,retention_1,retention_7,purchase,revenue,CAC
count,90189.0,90189.0,90189.0,90189.0,90189.0,90189.0
mean,51.872457,0.454867,0.332945,0.055772,0.546676,2.125861
std,195.050858,0.497962,0.47127,0.229482,3.315148,0.72223
min,0.0,0.0,0.0,0.0,0.0,0.3
25%,5.0,0.0,0.0,0.0,0.035648,1.7
50%,16.0,0.0,0.0,0.0,0.062359,2.3
75%,51.0,1.0,1.0,0.0,0.205964,2.8
max,49854.0,1.0,1.0,1.0,121.653118,2.8


In [4]:
# Cell 4 – Define funnel stages
# Source column for each stage flag that is copied straight from df
# (missing values are treated as "did not happen").
flag_sources = {
    "d1": "retention_1",
    "d7": "retention_7",
    "purchase": "purchase",
}

# One 0/1 column per stage, aligned to df's index.
# NOTE(review): every flag is derived independently from its own column, so
# the stages are not guaranteed to be nested (a user can have d7=1 with d1=0).
funnel_flags = pd.DataFrame(
    {
        # Every row counts as an install.
        "install": np.ones(len(df), dtype=int),
        # Onboarded means at least one recorded session.
        "onboarding": (df["session_count"].fillna(0) > 0).astype(int),
        **{
            stage: df[column].fillna(False).astype(int)
            for stage, column in flag_sources.items()
        },
    },
    index=df.index,
)

# Human-readable label for each stage key.
stage_labels = dict(
    install="Install",
    onboarding="Onboarding",
    d1="D1 Return",
    d7="D7 Return",
    purchase="Purchase",
)

funnel_flags.head()

Unnamed: 0,install,onboarding,d1,d7,purchase
0,1,1,0,0,0
1,1,1,0,0,0
2,1,1,1,0,0
3,1,1,0,0,0
4,1,1,0,1,0


In [5]:
# Cell 5 – Aggregate counts and conversion rates
# Every row of funnel_flags is one user.
total_users = len(funnel_flags)

# Number of users flagged for each stage, keyed by the stage's display label.
stage_counts = dict(
    (stage_labels[key], funnel_flags[key].sum())
    for key in ("install", "onboarding", "d1", "d7", "purchase")
)

def safe_div(num, denom):
    """Return ``num / denom`` as a float, or NaN when ``denom`` is falsy (e.g. 0)."""
    if not denom:
        return np.nan
    return float(num) / denom

# Single-row summary: total users plus stage-to-stage and overall rates.
# NOTE(review): each step rate is a ratio of two independently computed stage
# counts, not a conditional rate over the same users — confirm this is intended.
funnel_summary = pd.DataFrame(
    [
        {
            "n_users": total_users,
            "rate_install": safe_div(stage_counts["Install"], total_users),
            "rate_onboarding_from_install": safe_div(
                stage_counts["Onboarding"], stage_counts["Install"]
            ),
            "rate_d1_from_onboarding": safe_div(
                stage_counts["D1 Return"], stage_counts["Onboarding"]
            ),
            "rate_d7_from_d1": safe_div(
                stage_counts["D7 Return"], stage_counts["D1 Return"]
            ),
            "rate_purchase_overall": safe_div(
                stage_counts["Purchase"], total_users
            ),
            "rate_purchase_from_d7": safe_div(
                stage_counts["Purchase"], stage_counts["D7 Return"]
            ),
        }
    ]
)

# Persist the table, rounded to six decimals, without the index column.
summary_path = TABLES_DIR / "funnel.csv"
funnel_summary.round(6).to_csv(summary_path, index=False)
print(f"Saved summary table to {summary_path}")
funnel_summary

Saved summary table to c:\Users\umyana\Documents\mobile_game_analytics_pipeline\reports\tables\funnel.csv


Unnamed: 0,n_users,rate_install,rate_onboarding_from_install,rate_d1_from_onboarding,rate_d7_from_d1,rate_purchase_overall,rate_purchase_from_d7
0,90189,1.0,0.955715,0.475944,0.731962,0.055772,0.16751


In [6]:
# Cell 6 – Long format for Tableau / visualization
# Display order of the stages, first to last.
stage_order = ["Install", "Onboarding", "D1 Return", "D7 Return", "Purchase"]

# One row per stage: user count, share of all users, and the ratio to the
# preceding stage's count (the first stage is compared against total_users).
funnel_long = pd.DataFrame(
    {
        "stage": stage_order,
        "users": [stage_counts[name] for name in stage_order],
    }
).assign(
    pct_of_installs=lambda table: table["users"] / total_users,
    conversion_from_previous=lambda table: (
        table["users"] / table["users"].shift(fill_value=total_users)
    ),
)

# Persist the table, rounded to six decimals, without the index column.
long_path = TABLES_DIR / "funnel_long.csv"
funnel_long.round(6).to_csv(long_path, index=False)
print(f"Saved long-format table to {long_path}")
funnel_long

Saved long-format table to c:\Users\umyana\Documents\mobile_game_analytics_pipeline\reports\tables\funnel_long.csv


Unnamed: 0,stage,users,pct_of_installs,conversion_from_previous
0,Install,90189,1.0,1.0
1,Onboarding,86195,0.955715,0.955715
2,D1 Return,41024,0.454867,0.475944
3,D7 Return,30028,0.332945,0.731962
4,Purchase,5030,0.055772,0.16751


In [7]:
# Cell 7 – Funnel visualization
fig, ax = plt.subplots(figsize=(8, 5))

# Horizontal bars, one per stage; seaborn receives stage_order reversed.
sns.barplot(
    data=funnel_long,
    y="stage",
    x="pct_of_installs",
    order=stage_order[::-1],
    color="#1f77b4",
    ax=ax,
)

ax.set_title("User Funnel Conversion")
ax.set_xlabel("Share of installs")
ax.set_ylabel("")
ax.set_xlim(0, 1)

# Write each stage's share just right of its bar. The bar order is reversed,
# so row i of funnel_long sits at y position (number of stages - 1 - i).
last_slot = len(stage_order) - 1
for idx, share in zip(funnel_long.index, funnel_long["pct_of_installs"]):
    ax.text(share + 0.01, last_slot - idx, f"{share:.1%}", va="center")

fig.tight_layout()
figure_path = FIGURES_DIR / "funnel.png"
fig.savefig(figure_path, dpi=150, bbox_inches="tight")
print(f"Saved funnel visualization to {figure_path}")
# Close the figure once it has been written to disk.
plt.close(fig)

Saved funnel visualization to c:\Users\umyana\Documents\mobile_game_analytics_pipeline\reports\figures\funnel.png


In [8]:
# Cell 8 – Narrative summary
# Headline figures read from the single row of funnel_summary; each drop is
# the complement of the corresponding step rate.
d7_drop = 1 - funnel_summary.at[0, "rate_d7_from_d1"]
d1_drop = 1 - funnel_summary.at[0, "rate_d1_from_onboarding"]
overall_purchase = funnel_summary.at[0, "rate_purchase_overall"]

narrative = (
    f"Overall purchase rate: {overall_purchase:.2%}\n"
    f"Drop after onboarding → D1: {d1_drop:.2%}\n"
    f"Drop after D1 → D7: {d7_drop:.2%}\n"
)
print(narrative)

Overall purchase rate: 5.58%
Drop after onboarding → D1: 52.41%
Drop after D1 → D7: 26.80%

