In [20]:
# Cell 1 - Setup
from pathlib import Path

import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

sns.set_theme(style="whitegrid")

PROJ_ROOT = Path.cwd()
while PROJ_ROOT != PROJ_ROOT.parent and not (PROJ_ROOT / "data").exists():
    PROJ_ROOT = PROJ_ROOT.parent

DATA_DIR = PROJ_ROOT / "data" / "processed"
REPORTS_DIR = PROJ_ROOT / "reports"
TABLES_DIR = REPORTS_DIR / "tables"
FIGURES_DIR = REPORTS_DIR / "figures"

for path in [REPORTS_DIR, TABLES_DIR, FIGURES_DIR]:
    path.mkdir(parents=True, exist_ok=True)

PARQUET_PATH = DATA_DIR / "events.parquet"
CSV_PATH = DATA_DIR / "clean_data.csv"

In [21]:
# Cell 2 – Read Data
if PARQUET_PATH.exists():
    df = pd.read_parquet(PARQUET_PATH)
    source_path = PARQUET_PATH
else:
    df = pd.read_csv(CSV_PATH, low_memory=False)
    source_path = CSV_PATH

print(f"Loaded {len(df):,} rows from {source_path} location.")
df.head()

Loaded 90,189 rows from c:\Users\umyana\Documents\mobile_game_analytics_pipeline\data\processed\events.parquet location.


Unnamed: 0,userid,version,session_count,retention_1,retention_7,acquisition_channel,country,platform,purchase,CAC,revenue,ROI
0,116,gate_30,3,0,0,Facebook,USA,Google Play,0,2.8,0.023765,-0.991512
1,337,gate_30,38,0,0,TikTok,USA,Google Play,0,1.7,0.062804,-0.963057
2,377,gate_40,165,1,0,Facebook,USA,Google Play,0,2.8,0.087634,-0.968702
3,483,gate_40,1,0,0,Facebook,Mexico,Google Play,0,2.8,0.011883,-0.995756
4,488,gate_40,179,0,1,TikTok,USA,App Store,0,1.7,0.771525,-0.546162


In [22]:
# Cell 3 - Check
required_cols = {"acquisition_channel", "revenue", "CAC", "platform"}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"Missing Columns: {sorted(missing)}")

summary = df[["revenue", "CAC", "ROI"]].describe()
display(summary)


Unnamed: 0,revenue,CAC,ROI
count,90189.0,90189.0,90189.0
mean,0.546676,2.125861,-0.593476
std,3.315148,0.72223,3.92046
min,0.0,0.3,-1.0
25%,0.035648,1.7,-0.983623
50%,0.062359,2.3,-0.970268
75%,0.205964,2.8,-0.858005
max,121.653118,2.8,319.650079


In [23]:
# Cell 4 – Channel based Aggregation
roi_by_channel = (
    df.groupby("acquisition_channel", as_index=False)
    .agg(
        users=("userid", "nunique"),
        revenue=("revenue", "sum"),
        ad_spend=("CAC", "sum"),
    )
)

roi_by_channel["roi"] = (
    (roi_by_channel["revenue"] - roi_by_channel["ad_spend"])
    / roi_by_channel["ad_spend"].replace(0, pd.NA)
)
roi_by_channel["roas"] = roi_by_channel["revenue"] / roi_by_channel["ad_spend"].replace(
    0, pd.NA
)

roi_by_channel = roi_by_channel.sort_values("roas", ascending=False)
roi_by_channel


Unnamed: 0,acquisition_channel,users,revenue,ad_spend,roi,roas
2,Organic,9153,5091.083966,2745.9,0.854068,1.854068
3,TikTok,17979,10415.219151,30564.3,-0.659236,0.340764
1,Instagram,36281,19274.536869,83446.3,-0.769019,0.230981
0,Facebook,26776,14523.34201,74972.8,-0.806285,0.193715


In [24]:
# Cell 5 – Save Channel Table
roi_csv_path = TABLES_DIR / "roi_by_channel.csv"
roi_by_channel.round(6).to_csv(roi_csv_path, index=False)
print(f"Channel based ROI/ROAS table saved into {roi_csv_path}.")


Channel based ROI/ROAS table saved into c:\Users\umyana\Documents\mobile_game_analytics_pipeline\reports\tables\roi_by_channel.csv.


In [25]:
# Cell 6 – Long format for Tableau
roi_long = roi_by_channel.melt(
    id_vars=["acquisition_channel", "users"],
    value_vars=["revenue", "ad_spend", "roi", "roas"],
    var_name="metric",
    value_name="value",
)
roi_long_path = TABLES_DIR / "roi_by_channel_long.csv"
roi_long.round(6).to_csv(roi_long_path, index=False)
print(f"Long formatted table saved into {roi_long_path}.")
roi_long.head()


Long formatted table saved into c:\Users\umyana\Documents\mobile_game_analytics_pipeline\reports\tables\roi_by_channel_long.csv.


Unnamed: 0,acquisition_channel,users,metric,value
0,Organic,9153,revenue,5091.083966
1,TikTok,17979,revenue,10415.219151
2,Instagram,36281,revenue,19274.536869
3,Facebook,26776,revenue,14523.34201
4,Organic,9153,ad_spend,2745.9


In [26]:
# Cell 7 – ROAS graphic
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    data=roi_by_channel,
    x="roas",
    y="acquisition_channel",
    palette="viridis",
    ax=ax,
)

ax.set_xlabel("ROAS")
ax.set_ylabel("Kanal")
ax.set_title("Channel based ROAS")
ax.axvline(1.0, color="red", linestyle="--", linewidth=1, label="Break-even ROAS")
ax.legend(loc="lower right")

for idx, row in roi_by_channel.iterrows():
    ax.text(
        row["roas"] + 0.05,
        idx,
        f"{row['roas']:.2f}",
        va="center",
    )

fig.tight_layout()
roas_fig_path = FIGURES_DIR / "roi_by_channel.png"
fig.savefig(roas_fig_path, dpi=150, bbox_inches="tight")
plt.close(fig)
print(f"ROAS graphic saved into {roas_fig_path}")



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(


ROAS graphic saved into c:\Users\umyana\Documents\mobile_game_analytics_pipeline\reports\figures\roi_by_channel.png


In [27]:
# Cell 8 – Short Summary
top_channel = roi_by_channel.iloc[0]
worst_channel = roi_by_channel.iloc[-1]

print(
    f"Highest ROAS: {top_channel['acquisition_channel']} -> ROAS {top_channel['roas']:.2f}, ROI {top_channel['roi']:.2%}\n"
    f"Lowest ROAS: {worst_channel['acquisition_channel']} -> ROAS {worst_channel['roas']:.2f}, ROI {worst_channel['roi']:.2%}"
)


Highest ROAS: Organic -> ROAS 1.85, ROI 85.41%
Lowest ROAS: Facebook -> ROAS 0.19, ROI -80.63%
