## Retention Cohort Analysis

This notebook focuses on retention cohorts using the processed mobile game dataset. It provides:

- Project-aware path handling that loads `data/processed/events.parquet` and falls back to CSV (`data/processed/clean_data.csv`) when the Parquet file cannot be read.
- Transformation of day-based retention flags (e.g., `retention_1`, `retention_7`) into a long format suitable for cohort analytics.
- Cohort summaries grouped by acquisition channel, platform, and their combinations.
- Visual heatmap of channel-level retention to reveal standout cohorts.

The analysis assumes retention columns are boolean indicators for whether a user returned on the corresponding day (e.g., `retention_7 == True`).

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Plot styling: prefer seaborn's theme; otherwise fall back to a Matplotlib
# style sheet so the notebook still runs without seaborn installed.
try:
    import seaborn as sns
    sns.set_theme(style='whitegrid')
except Exception:
    # Styling is best-effort, so any seaborn failure drops to Matplotlib.
    # Matplotlib 3.6 renamed its bundled seaborn sheets to 'seaborn-v0_8-*'
    # and 3.8 removed the old names, so try the new name before the legacy one.
    for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
        try:
            plt.style.use(style_name)
            break
        except OSError:
            continue
    else:
        # Neither whitegrid variant is available in this Matplotlib build.
        plt.style.use('ggplot')

# Locate the project root: the nearest directory, starting at the working
# directory and walking upwards, that contains a `data` folder.
_start_dir = Path.cwd()
PROJ_ROOT = next(
    (
        candidate
        for candidate in (_start_dir, *_start_dir.parents)
        if (candidate / 'data').exists()
    ),
    None,
)
if PROJ_ROOT is None:
    # Without this fallback the search would end at the filesystem root and
    # the mkdir calls below would create `reports/` there.
    PROJ_ROOT = _start_dir
    print(f'No data/ directory found at or above {_start_dir}; using it as the project root')

# Inputs: Parquet is the primary source, CSV is the fallback.
DATA_DIR = PROJ_ROOT / 'data' / 'processed'
PARQUET_PATH = DATA_DIR / 'events.parquet'
CSV_PATH = DATA_DIR / 'clean_data.csv'

# Outputs: created eagerly so later cells can write without checking.
REPORTS_DIR = PROJ_ROOT / 'reports'
FIGURES_DIR = REPORTS_DIR / 'figures'
REPORTS_DIR.mkdir(parents=True, exist_ok=True)
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

COHORT_SUMMARY_PATH = REPORTS_DIR / 'retention_cohort_summary.csv'
COHORT_HEATMAP_PATH = FIGURES_DIR / 'retention_heatmap.png'

print(f'Project root: {PROJ_ROOT}')
print(f'Parquet path exists: {PARQUET_PATH.exists()}')
print(f'CSV fallback exists: {CSV_PATH.exists()}')
print(f'Reports directory: {REPORTS_DIR}')
print(f'Figures directory: {FIGURES_DIR}')

Project root: c:\Users\umyana\Documents\mobile_game_analytics_pipeline
Parquet path exists: True
CSV fallback exists: True
Reports directory: c:\Users\umyana\Documents\mobile_game_analytics_pipeline\reports
Figures directory: c:\Users\umyana\Documents\mobile_game_analytics_pipeline\reports\figures


In [2]:
# Load the processed dataset. Parquet is tried first; if the Parquet engine is
# not installed, the file is missing, or the read raises ValueError, the CSV
# export is read instead.
try:
    df, source_used = pd.read_parquet(PARQUET_PATH), PARQUET_PATH
except (ImportError, ValueError, FileNotFoundError) as err:
    print(f'Parquet unavailable ({err}); falling back to CSV')
    df, source_used = pd.read_csv(CSV_PATH), CSV_PATH

print(f'Loaded {len(df):,} rows from {source_used}')
df.head()

Loaded 90,189 rows from c:\Users\umyana\Documents\mobile_game_analytics_pipeline\data\processed\events.parquet


Unnamed: 0,userid,version,session_count,retention_1,retention_7,acquisition_channel,country,platform,purchase,CAC,revenue,ROI
0,116,gate_30,3,False,False,Facebook,USA,Google Play,0,2.8,0.038024,-0.98642
1,337,gate_30,38,True,False,TikTok,USA,Google Play,0,1.7,0.100486,-0.94089
2,377,gate_40,165,True,False,Facebook,USA,Google Play,0,2.8,0.140215,-0.949923
3,483,gate_40,1,False,False,Facebook,Mexico,Google Play,0,2.8,0.019012,-0.99321
4,488,gate_40,179,True,True,TikTok,USA,App Store,0,1.7,1.23444,-0.273859


In [3]:
# Reshape the wide retention flags (one column per day) into long format:
# one row per user per retention day.
retention_cols = [col for col in df.columns if col.startswith('retention_')]
if not retention_cols:
    raise ValueError('No retention_* columns found; cannot compute cohorts.')

# Descriptive columns carried through the melt; declared once so the column
# selection and the melt's id_vars cannot drift apart.
id_cols = ['userid', 'acquisition_channel', 'platform', 'country', 'version']

cohort_base = df[id_cols + retention_cols].copy()
retention_long = cohort_base.melt(
    id_vars=id_cols,
    value_vars=retention_cols,
    var_name='retention_metric',
    value_name='retained'
)

# Raw string: '\d' in a normal string literal is an invalid escape sequence and
# triggers a warning. expand=False returns a Series rather than a one-column
# DataFrame.
day_numbers = (
    retention_long['retention_metric']
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
)

# Drop metrics whose name carries no day number, then store the day as an int.
# The explicit copy keeps the assignment from targeting a slice of the melted frame.
retention_long = retention_long.loc[day_numbers.notna()].copy()
retention_long['day'] = day_numbers.loc[retention_long.index].astype(int)

print(f'Retention metrics detected: {retention_cols}')
retention_long.head()

  .str.extract('(\d+)')


Retention metrics detected: ['retention_1', 'retention_7']


Unnamed: 0,userid,acquisition_channel,platform,country,version,retention_metric,retained,day
0,116,Facebook,Google Play,USA,gate_30,retention_1,False,1
1,337,TikTok,Google Play,USA,gate_30,retention_1,True,1
2,377,Facebook,Google Play,USA,gate_40,retention_1,True,1
3,483,Facebook,Google Play,Mexico,gate_40,retention_1,False,1
4,488,TikTok,App Store,USA,gate_40,retention_1,True,1


In [4]:
def compute_retention(table, cohort_fields):
    """Aggregate long-format retention flags into per-cohort, per-day rates.

    Parameters
    ----------
    table : pandas.DataFrame
        Long-format frame with a ``day`` column, a ``retained`` column and
        every column named in ``cohort_fields``.
    cohort_fields : str, iterable of str, or None
        Column name(s) that define a cohort. A single name may be passed as a
        plain string. An empty iterable (or ``None``) groups by ``day`` only.

    Returns
    -------
    pandas.DataFrame
        One row per cohort/day combination with columns:
        ``installs`` (count of non-null ``retained`` values),
        ``retained`` (sum of ``retained``) and
        ``retention_rate`` (``retained / installs``, NaN when ``installs`` is 0).

    Raises
    ------
    KeyError
        If ``table`` lacks ``day``, ``retained`` or any cohort column.
    """
    # Normalise the cohort spec so lists, tuples, a bare string and None all work;
    # `cohort_fields + ['day']` only worked for lists.
    if cohort_fields is None:
        cohort_fields = []
    elif isinstance(cohort_fields, str):
        cohort_fields = [cohort_fields]
    group_cols = [*cohort_fields, 'day']

    # Fail early with every missing column listed, not just the first one.
    missing = [col for col in [*group_cols, 'retained'] if col not in table.columns]
    if missing:
        raise KeyError(f'compute_retention: missing column(s) {missing}')

    summary = (
        table.groupby(group_cols)
        .agg(installs=('retained', 'count'), retained=('retained', 'sum'))
        .reset_index()
    )
    # Guard the ratio so groups with no non-null observations report NaN.
    summary['retention_rate'] = np.where(
        summary['installs'] > 0,
        summary['retained'] / summary['installs'],
        np.nan
    )
    return summary

# Build the four cohort cuts in one pass: overall, by channel, by platform,
# and by channel x platform.
overall_retention, channel_retention, platform_retention, channel_platform_retention = [
    compute_retention(retention_long, cohort_fields)
    for cohort_fields in (
        [],
        ['acquisition_channel'],
        ['platform'],
        ['acquisition_channel', 'platform'],
    )
]

print('Overall retention by day:')
overall_retention.loc[:, ['day', 'installs', 'retained', 'retention_rate']]


Overall retention by day:


Unnamed: 0,day,installs,retained,retention_rate
0,1,90189,40153,0.44521
1,7,90189,16781,0.186065


In [5]:
# Wide view of channel retention: one row per acquisition channel, one column
# per retention day, values are retention rates as fractions.
channel_by_day = channel_retention.pivot(
    index='acquisition_channel', columns='day', values='retention_rate'
)
channel_retention_matrix = channel_by_day.sort_index()

print('Channel retention rates by day (fractional form):')
channel_retention_matrix


Channel retention rates by day (fractional form):


day,1,7
acquisition_channel,Unnamed: 1_level_1,Unnamed: 2_level_1
Facebook,0.442897,0.184157
Instagram,0.448637,0.187178
Organic,0.445865,0.187807
TikTok,0.441404,0.185772


In [6]:
# Guard: there is nothing to draw if the channel matrix has no rows or columns.
if channel_retention_matrix.empty:
    raise ValueError('Channel retention matrix is empty; verify data.')

# Plot percentages rather than fractions.
heatmap_data = channel_retention_matrix * 100
fig, ax = plt.subplots(figsize=(8, 5))

if 'sns' not in globals():
    # Plain Matplotlib rendering, used when seaborn was never imported.
    im = ax.imshow(heatmap_data.values, cmap='Blues', aspect='auto')
    n_rows, n_cols = heatmap_data.shape
    ax.set_xticks(range(n_cols))
    ax.set_xticklabels(heatmap_data.columns)
    ax.set_yticks(range(n_rows))
    ax.set_yticklabels(heatmap_data.index)
    # Write each cell's value on top of its coloured tile.
    for row_pos in range(n_rows):
        for col_pos in range(n_cols):
            cell_label = f"{heatmap_data.iloc[row_pos, col_pos]:.1f}"
            ax.text(col_pos, row_pos, cell_label, ha='center', va='center', color='black')
    cbar = fig.colorbar(im, ax=ax)
    cbar.set_label('Retention %')
else:
    sns.heatmap(
        heatmap_data,
        annot=True,
        fmt='.1f',
        cmap='Blues',
        cbar_kws={'label': 'Retention %'},
        ax=ax
    )

# Labels are set after drawing so they apply to either rendering path.
ax.set_xlabel('Retention day')
ax.set_ylabel('Acquisition channel')
ax.set_title('Channel retention heatmap')
fig.tight_layout()

# Save the figure to disk, then close it.
fig.savefig(COHORT_HEATMAP_PATH, dpi=120, bbox_inches='tight')
plt.close(fig)

print(f'Saved channel retention heatmap to {COHORT_HEATMAP_PATH}')
heatmap_data

Saved channel retention heatmap to c:\Users\umyana\Documents\mobile_game_analytics_pipeline\reports\figures\retention_heatmap.png


day,1,7
acquisition_channel,Unnamed: 1_level_1,Unnamed: 2_level_1
Facebook,44.289662,18.415745
Instagram,44.863703,18.717786
Organic,44.586474,18.780728
TikTok,44.140386,18.577229


In [7]:
# Combine channel and platform into a single cohort label, e.g.
# "Facebook | App Store", stored as a new column on the long table.
channel_part = retention_long['acquisition_channel']
platform_part = retention_long['platform']
retention_long['channel_platform'] = channel_part + ' | ' + platform_part

# Summarise by the combined label, then pivot to one row per label and one
# column per retention day.
channel_platform_summary = compute_retention(retention_long, ['channel_platform'])
channel_platform_wide = channel_platform_summary.pivot(
    index='channel_platform', columns='day', values='retention_rate'
)
channel_platform_matrix = channel_platform_wide.sort_index()

channel_platform_matrix.head()

day,1,7
channel_platform,Unnamed: 1_level_1,Unnamed: 2_level_1
Facebook | App Store,0.440612,0.180114
Facebook | Google Play,0.443654,0.185498
Instagram | App Store,0.445527,0.190348
Instagram | Google Play,0.449653,0.186143
Organic | App Store,0.452391,0.188743


In [8]:
# Persist the channel-by-day retention rates, written with four decimal places.
channel_retention_matrix.to_csv(
    COHORT_SUMMARY_PATH,
    float_format='%.4f',
)
print(f'Saved channel retention summary to {COHORT_SUMMARY_PATH}')
channel_retention_matrix

Saved channel retention summary to c:\Users\umyana\Documents\mobile_game_analytics_pipeline\reports\retention_cohort_summary.csv


day,1,7
acquisition_channel,Unnamed: 1_level_1,Unnamed: 2_level_1
Facebook,0.442897,0.184157
Instagram,0.448637,0.187178
Organic,0.445865,0.187807
TikTok,0.441404,0.185772


Generated artifacts:
- `reports/retention_cohort_summary.csv`
- `reports/figures/retention_heatmap.png`