In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, timezone
import uuid
import os

# Simulation parameters. Per the author's tuning notes, the churn settings were
# lowered so the generated data keeps a healthy mix of positive outcomes.
CONFIG = {
    # --- Volume: how many users, and the Poisson means for sessions and ads ---
    "NUM_USERS": 10_000,
    "SESSIONS_PER_USER_MEAN": 4,
    "ADS_PER_SESSION_MEAN": 3,
    # --- Session length, expressed in minutes (mean / standard deviation) ---
    "SESSION_DUR_MEAN_MIN": 10,
    "SESSION_DUR_STD_MIN": 5,
    # --- Output location and RNG seed ---
    "OUTPUT_PATH": "D:/CAREER/INTERVIEWS/Play Simple Games/Case Study/PSG_ML Solution/Model Input/GamingData_Input_v6.csv",
    "SEED": 2025,
    # --- Revenue per ad impression (mean / standard deviation) ---
    "AD_REVENUE_MEAN": 0.02,
    "AD_REVENUE_STD": 0.01,
    # --- Churn model ---
    # Base churn probability; lowered by the author aiming for a mean of ~0.25.
    "P_CHURN_BASE": 0.2,
    # Extra churn probability added per ad already shown in the session (mild).
    "P_CHURN_UPLIFT_PER_AD": 0.01,
    # Spread of the per-session base churn; reduced to yield more positives.
    "P_CHURN_STD": 0.05,
    # --- Lifetime-value estimate (mean / standard deviation) ---
    "LTV_EST_MEAN": 5.0,
    "LTV_EST_STD": 2.0,
}

# Seed NumPy's global RNG so the np.random draws that follow are repeatable.
np.random.seed(CONFIG["SEED"])

# Categorical vocabularies the generator samples from (the author notes these
# are meant to match the original dataset).
COUNTRIES = "US IN BR TR ID PH EG VN TH PK CO MY RU BD MX".split()
DEVICE_TYPES = [
    "Android",
    "iOS",
]
AD_TYPES = [
    "banner",
    "interstitial",
    "rewarded",
    "native",
]
AD_PLACEMENTS = [
    "start",
    "mid",
    "end",
    "idle",
]
# Start of the simulated window: 00:00 on the UTC calendar date 30 days before
# today, stored as a naive (tz-less) datetime.
SIM_START = datetime.combine(datetime.now(timezone.utc).date(), datetime.min.time()) - timedelta(days=30)

# Build one output row per ad event, nested as user -> session -> ad.
# The order of the np.random calls below determines the seeded output, so
# reordering any draw changes the generated dataset.
rows = []
for _ in range(CONFIG["NUM_USERS"]):
    # NOTE(review): uuid4 values are not driven by np.random.seed, so IDs (and
    # anything ordered by them later) presumably differ between runs - confirm
    # that is acceptable.
    user_id = str(uuid.uuid4())
    country = np.random.choice(COUNTRIES)  # uniform over the country list
    device_type = np.random.choice(DEVICE_TYPES, p=[0.65, 0.35])
    # Poisson draw plus one, so every user has at least one session.
    sessions_per_user = np.random.poisson(CONFIG["SESSIONS_PER_USER_MEAN"]) + 1
    # Session starts fall on whole hours: day offset 0..30 and hour 0..23 from SIM_START.
    # NOTE(review): SIM_START is midnight 30 days ago, so a day offset of 30 is
    # "today" and the hour offset can push the start past the current time.
    session_starts = [SIM_START + timedelta(days=np.random.randint(0, 31), hours=np.random.randint(0, 24)) for _ in range(sessions_per_user)]
    session_starts.sort()

    for s_idx, session_start in enumerate(session_starts):
        session_id = str(uuid.uuid4())
        # Normal draw in minutes, clipped to [1, 30], then converted to seconds.
        session_duration = np.clip(np.random.normal(CONFIG["SESSION_DUR_MEAN_MIN"], CONFIG["SESSION_DUR_STD_MIN"]), 1, 30) * 60  # Seconds
        level_reached = np.random.randint(1, 1001)  # integer in 1..1000, redrawn per session
        # Poisson draw plus one, so every session shows at least one ad.
        ads_in_session = np.random.poisson(CONFIG["ADS_PER_SESSION_MEAN"]) + 1
        # Ad timestamps are uniform offsets within the session, in chronological order.
        ad_times = [session_start + timedelta(seconds=np.random.uniform(0, session_duration)) for _ in range(ads_in_session)]
        ad_times.sort()

        # Per-session base churn: normal draw plus 0.005 for each earlier session
        # of this user. The value is not clipped here, so it can be negative.
        session_churn_base = np.random.normal(CONFIG["P_CHURN_BASE"], CONFIG["P_CHURN_STD"]) + (s_idx * 0.005)  # Variable base
        # Retention is decided once per session from the base churn only; it does
        # not depend on the ad types or ad count drawn below.
        session_retention = 1 if np.random.rand() > session_churn_base else 0
        # 5% of sessions carry a purchase, exponentially distributed with scale 0.5.
        session_spend = np.random.exponential(0.5) if np.random.rand() < 0.05 else 0.0

        for a_idx, event_time in enumerate(ad_times):
            ad_type = np.random.choice(AD_TYPES, p=[0.25, 0.25, 0.35, 0.15])
            ad_placement = np.random.choice(AD_PLACEMENTS, p=[0.1, 0.6, 0.2, 0.1])
            time_of_day = event_time.hour
            frequency = a_idx + 1  # 1-based position of this ad within the session
            impressions = 1  # each row is a single impression
            clicks = 1 if np.random.rand() < 0.03 else 0  # 3% click chance
            game_time_post_ad = np.random.uniform(0, session_duration / 2)
            # Click-driven spend: exponential with scale 0.1, only when the ad was clicked.
            spend_after_ad = np.random.exponential(0.1) if clicks else 0.0
            churn_uplift = frequency * CONFIG["P_CHURN_UPLIFT_PER_AD"]
            # Base + per-ad uplift, minus 0.05 for rewarded ads, clipped to [0.01, 0.4].
            p_churn = np.clip(session_churn_base + churn_uplift - (0.05 if ad_type == "rewarded" else 0), 0.01, 0.4)  # Capped max for positives
            ad_revenue = np.clip(np.random.normal(CONFIG["AD_REVENUE_MEAN"], CONFIG["AD_REVENUE_STD"]), 0.001, 0.1)
            # Drawn independently for every ad row, so it varies within a single user.
            lifetime_value_estimate = np.random.normal(CONFIG["LTV_EST_MEAN"], CONFIG["LTV_EST_STD"])

            rows.append({
                "user_id": user_id,
                "country": country,
                "device_type": device_type,
                "session_id": session_id,
                "session_duration": round(session_duration, 2),
                "level_reached": level_reached,
                "ad_type": ad_type,
                "ad_placement": ad_placement,
                "time_of_day": time_of_day,
                "frequency": frequency,
                "impressions": impressions,
                "clicks": clicks,
                "game_time_post_ad": round(game_time_post_ad, 2),
                "retention_next_day": session_retention,
                # Click-driven spend plus an equal share of the session-level spend.
                "spend_after_ad": round(spend_after_ad + session_spend / ads_in_session, 2),
                "churn_probability": round(p_churn, 4),
                "ad_revenue_per_impression": round(ad_revenue, 4),
                "lifetime_value_estimate": round(lifetime_value_estimate, 2),
                "event_time": event_time.isoformat(timespec="seconds"),
            })

# Materialise the event rows, parse the ISO timestamp strings into datetimes,
# and order the events chronologically within each user.
df = (
    pd.DataFrame(rows)
    .assign(event_time=lambda frame: pd.to_datetime(frame["event_time"]))
    .sort_values(["user_id", "event_time"])
    .reset_index(drop=True)
)

# Top up under-represented categories: for each (column, minimum share) rule,
# any value whose share is below the minimum gets resampled copies of its own
# rows appended to the end of the frame.
for col, min_share in [("ad_type", 0.15), ("ad_placement", 0.10), ("country", 0.05)]:
    shares = df[col].value_counts(normalize=True)
    row_total = len(df)
    top_ups = []
    for category, share in shares.items():
        if share >= min_share:
            continue
        # Shortfall is measured against the current row count (before top-up).
        shortfall = int((min_share - share) * row_total)
        if shortfall <= 0:
            continue
        pool = df[df[col] == category]
        top_ups.append(
            pool.sample(
                n=min(shortfall, len(pool)),
                replace=True,
                random_state=CONFIG["SEED"],
            )
        )
    if top_ups:
        df = pd.concat([df, *top_ups], ignore_index=True)

# Save
# Create the destination folder if needed. The guard matters when OUTPUT_PATH
# is a bare filename: os.path.dirname then returns "" and os.makedirs("")
# raises FileNotFoundError instead of being a no-op.
output_dir = os.path.dirname(CONFIG["OUTPUT_PATH"])
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(CONFIG["OUTPUT_PATH"], index=False)
print(f"✅ Saved enhanced synthetic gaming telemetry → {CONFIG['OUTPUT_PATH']}")

# Validation (same as before)
def pct(x):
    """Format the fraction *x* as a percentage string with one decimal place."""
    return format(100 * x, ".1f") + "%"

# Print a quick sanity report on the generated frame: shape, category mixes,
# summary statistics and a few realism checks.
print("\n=== Shape & Head ===")
print(df.shape)
print(df.head(5))

print("\n=== Category mixes ===")
# One line per categorical column, formatted as "value=share%" pairs.
for mix_col in ("ad_type", "ad_placement", "device_type"):
    mix = df[mix_col].value_counts(normalize=True)
    print(f"{mix_col}:", ", ".join(f"{k}={pct(v)}" for k, v in mix.items()))

print("\n=== Key stats ===")
print(df[[
    "session_duration","frequency","clicks","ad_revenue_per_impression",
    "game_time_post_ad","spend_after_ad","churn_probability"
]].describe(percentiles=[.1,.5,.9]).to_string())

print("\n=== Realism checks (spearman) ===")
corr = df[["frequency","churn_probability"]].corr(method="spearman").iloc[0,1]
print(f"freq ↗ vs churn_probability ↗ : ρ={corr:.3f} (target >0.3)")
# NOTE(review): the generator draws retention_next_day once per session without
# looking at ad_type, so a rewarded/interstitial gap here would presumably be
# sampling noise rather than a modelled effect.
rwd = df.loc[df["ad_type"]=="rewarded","retention_next_day"].mean()
intst = df.loc[df["ad_type"]=="interstitial","retention_next_day"].mean()
print(f"rewarded retention={rwd:.3f} vs interstitial={intst:.3f} (rewarded higher)")
# Click-through rate per session-duration quartile. observed=False is passed
# explicitly: it keeps the current behaviour and avoids the pandas FutureWarning
# about the changing default when grouping by a categorical key.
duration_quartile = pd.qcut(df["session_duration"], 4)
long_sess_ctr = (df["clicks"] / df["impressions"]).groupby(duration_quartile, observed=False).mean()
print("CTR by session_duration quartile:\n", long_sess_ctr)
print(f"Overall retention mean: {df['retention_next_day'].mean():.3f} (target 0.5-0.6)")
print(f"Overall churn prob mean: {df['churn_probability'].mean():.3f} (target 0.2-0.3 for mix)")
# NOTE(review): both means are taken over ad-event rows, so this "ARPU" figure
# is a per-impression average rather than a per-user one.
print(f"ARPU mean: {df['spend_after_ad'].mean() + df['ad_revenue_per_impression'].mean():.4f} (target 0.02-0.05)")



✅ Saved enhanced synthetic gaming telemetry → D:/CAREER/INTERVIEWS/Play Simple Games/Case Study/PSG_ML Solution/Model Input/GamingData_Input_v6.csv

=== Shape & Head ===
(201506, 19)
                                user_id country device_type  \
0  0001304c-5894-4de4-a822-953425026c01      RU     Android   
1  0001304c-5894-4de4-a822-953425026c01      RU     Android   
2  0001304c-5894-4de4-a822-953425026c01      RU     Android   
3  0001304c-5894-4de4-a822-953425026c01      RU     Android   
4  0001304c-5894-4de4-a822-953425026c01      RU     Android   

                             session_id  session_duration  level_reached  \
0  795c6c34-7d87-407e-a2ea-4da8ba17a436            457.58            459   
1  795c6c34-7d87-407e-a2ea-4da8ba17a436            457.58            459   
2  795c6c34-7d87-407e-a2ea-4da8ba17a436            457.58            459   
3  795c6c34-7d87-407e-a2ea-4da8ba17a436            457.58            459   
4  795c6c34-7d87-407e-a2ea-4da8ba17a436            457.58 

  long_sess_ctr = (df["clicks"]/df["impressions"]).groupby(pd.qcut(df["session_duration"], 4)).mean()
