Improvements from First Try:
- Look for leakage
- Train with network

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error

In [None]:
from datetime import datetime
from mlflow.models import infer_signature

In [None]:
from dotenv import load_dotenv
import os

In [None]:
# Load key/value pairs from a local .env file into the process environment,
# then read the MLflow tracking URI (None when the variable is not set).
load_dotenv()
MLFLOW_URI = os.getenv("MLFLOW_URI")

In [None]:
# Print the installed versions of the main libraries this notebook relies on
# (IPython shell escapes; each line greps the `pip3 list` output for one name).
!pip3 list | grep mlflow 
!pip3 list | grep pandas 
!pip3 list | grep scipy 
!pip3 list | grep numpy 
!pip3 list | grep statsmodels 

In [None]:
import mlflow 

# Point the MLflow client at the tracking server read from the environment.
mlflow.set_tracking_uri(MLFLOW_URI)

EXPERIMENT_NAME = "T2P Ensemble"

# Look the experiment up by name; create it on first use and fetch it again so
# `experiment` always holds the stored experiment entity.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if not experiment:
    mlflow.create_experiment(name=EXPERIMENT_NAME)
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

## Read Data

In [None]:
# Load the merged CSV export into the raw working frame.
# NOTE(review): absolute, machine-specific path — confirm before running elsewhere.
df = pd.read_csv(
    filepath_or_buffer="/Users/emulie/Documents/poc/T2PArima/data/merged_20250804.csv",
)

## Cleaning Data

In [None]:
# Column names of the trial and paid conversion counts used throughout.
TRIAL_COL = 'trials_hauutm'
PAID_COL = 'paid_hauutm'

# A geography field is considered valid only when it holds an actual string
# (this rejects NaN and any other non-string value).
valid_country_mask = df['country'].map(lambda value: isinstance(value, str))
valid_continent_mask = df['continent'].map(lambda value: isinstance(value, str))
valid_subcontinent_mask = df['sub_continent'].map(lambda value: isinstance(value, str))

# Rows whose country is the literal string '0' are dropped as well.
zero_country_mask = df['country'].eq('0')

df = df.loc[
    valid_country_mask
    & ~zero_country_mask
    & valid_continent_mask
    & valid_subcontinent_mask
]

In [None]:
# --- minimum conversions required
# Trial-to-paid ratio per row. `assign` builds a new frame instead of writing a
# column into `df`, which at this point is a boolean-mask selection of the raw
# data (an in-place write on such a selection is chained assignment and
# triggers pandas' SettingWithCopyWarning).
df = df.assign(t2p=df[PAID_COL] / df[TRIAL_COL])

# Minimum spend / volume thresholds a row must exceed to be modelled.
min_cost_mask = df['cost_usd'] > 5.0
min_paid_mask = df[PAID_COL] > 2.0
min_trial_mask = df[TRIAL_COL] > 5.0
min_t2p_mask = df['t2p'] > 0

# Explicit copy so df_overall owns its data and later cells can add columns to
# it without writing through to (or warning about) `df`.
df_overall = df[min_cost_mask & min_paid_mask & min_trial_mask & min_t2p_mask].copy()

In [None]:
# Country codes grouped into market tiers; any code not listed here is treated
# as 'T4' by the tier-assignment step.
# NOTE(review): "NO", "DK" and "FI" are listed in both T1 and T2. Because later
# tiers overwrite earlier ones when the map is built, they resolve to 'T2' —
# confirm which tier is intended.
T1_countries = [
    "GU", "PR", "DK", "JE", "NO", "BE", "FR", "US", "IL", "GB", "UK",
    "CA", "AU", "IE", "NL", "SE", "ES", "IT", "TW", "DE", "FI",
    "NZ", "JP", "KR", "SG", "HK",
]
T2_countries = ["ZA", "MT", "AE", "SA", "PL", "AT", "NO", "DK", "IS", "FI"]
T3_countries = [
    "IN", "PH", "MY", "NG", "TH", "VN",
    "EG", "MN", "RO", "HU", "RS", "TR",
]

# country code -> tier label; tiers are applied in T1, T2, T3 order so a code
# that appears in several lists keeps the label of the last one.
country_tier_map = {
    country: tier
    for tier, members in (
        ('T1', T1_countries),
        ('T2', T2_countries),
        ('T3', T3_countries),
    )
    for country in members
}
# Label every row with its country tier; countries missing from
# country_tier_map fall into 'T4'. `assign` returns a new frame rather than
# writing a column into df_overall, which is a filtered selection of `df`
# (the in-place write is chained assignment and raises SettingWithCopyWarning).
df_overall = df_overall.assign(
    country_tier=df_overall['country'].map(
        lambda code: country_tier_map.get(code, 'T4')
    )
)

## Feature Encoding

In [None]:
# # --- encode categorical columns
# from sklearn.preprocessing import LabelEncoder
# import joblib

# run_name = f"LABEL_ENCODER_{SEGMENTATION}_{datetime.now().strftime('%Y-%m-%d_%H:%M')}"
# experiment_tags = {
#     "project_name": EXPERIMENT_NAME, 
#     "date": datetime.now().strftime('%Y-%m-%d %H:%M:%S'), # need to be a string
#     "model": "Label Encoder", 
#     "mlflow.note.content": experiment_description,
# }


# with mlflow.start_run(experiment_id=experiment.experiment_id, 
#                                       run_name=run_name, tags=experiment_tags):
#     for col in ['network', 'platform', 'country', 'continent', 'sub_continent', 'country_tier']:
#         le = LabelEncoder()
#         df_overall[f'{col}_encoded'] = le.fit_transform(df_overall[col])
#         label_encoder_path = f"labelencoder_{col}.pkl"
#         joblib.dump(le, label_encoder_path)
#         mlflow.log_artifact(label_encoder_path)

In [None]:
# Show the distinct network labels left after filtering; the hand-written
# network_map in the encoding step is keyed by these labels.
df_overall['network'].unique()

In [None]:
# Fixed integer codes for the categorical features. The position in each
# sequence is the code, so the order below must not change.
network_map = {
    label: code
    for code, label in enumerate((
        'Apple Search Ads',
        'Facebook Ads',
        'googleadwords_int',
        'tiktokglobal_int',
        'tatari_streaming',
        'snapchat_int',
        'other',
    ))
}

platform_map = {
    label: code for code, label in enumerate(('android', 'ios', 'web'))
}

# NOTE: this rebinds `country_tier_map`, which earlier in the notebook held the
# country -> tier lookup; from here on it maps tier label -> integer code.
country_tier_map = {
    label: code for code, label in enumerate(('T1', 'T2', 'T3', 'T4'))
}

In [None]:
# Integer-encode the categorical features on a copy so df_overall keeps the
# original string labels.
df_encoded = df_overall.copy()

# network_map defines an explicit 'other' bucket; route any network label that
# is not listed there into it instead of failing with KeyError.
other_network_code = network_map['other']
df_encoded['network'] = df_encoded['network'].map(
    lambda label: network_map.get(label, other_network_code)
)

# platform and country_tier have no catch-all bucket, so an unknown label
# still raises KeyError and surfaces the unexpected value.
df_encoded['platform'] = df_encoded['platform'].map(lambda label: platform_map[label])
df_encoded['country_tier'] = df_encoded['country_tier'].map(lambda label: country_tier_map[label])

In [None]:
# Preview the first rows of the encoded frame to eyeball the integer codes.
df_encoded.head()

## Training

In [None]:
# Independent working copy for the modelling sections, so df_encoded itself
# stays untouched.
dff = df_encoded.copy(deep=True)

### 1. GLM

#### Transform

In [None]:
# Target source columns (trial and paid conversion counts).
TRIAL_COL = 'trials_hauutm'
PAID_COL = 'paid_hauutm'

# Predictors for the GLM, and the subset of them that gets log-transformed.
X_cols = [
    'network',
    'platform',
    'week_of_year',
    'day_of_week',
    'cost_usd',
]
cols_to_log_transform = ['cost_usd']

# Formula string: t2p regressed on every predictor, joined with " + ".
formula_t2p = "t2p ~ " + ' + '.join(X_cols)

In [None]:
# Display the assembled formula string as a sanity check.
formula_t2p

In [None]:
# Feature matrix and candidate targets. X is an explicit copy: the log
# transform below assigns into it, and assigning into a bare `dff[X_cols]`
# selection is chained assignment (SettingWithCopyWarning).
X = dff[X_cols].copy()
y_trial, y_paid, y_t2p = dff[TRIAL_COL], dff[PAID_COL], dff['t2p']

# Natural-log transform of the listed columns (rows were filtered upstream to
# cost_usd > 5.0, so the log is finite for that column).
for col in cols_to_log_transform:
    X[col] = np.log(X[col])

# Positional 80/20 hold-out: the first 80% of rows train, the rest test.
# NOTE(review): no shuffling — the split quality depends on the row order of
# the source file; confirm that ordering is what is wanted here.
N = int(len(dff) * 0.8)
X_train, X_test = X[:N], X[N:]
y_trial_train, y_trial_test = y_trial[:N], y_trial[N:]
y_paid_train, y_paid_test = y_paid[:N], y_paid[N:]
y_t2p_train, y_t2p_test = y_t2p[:N], y_t2p[N:]

#### Model

In [None]:
import statsmodels.api as sm

# Fit through the formula interface so `formula_t2p` is actually applied.
# The array constructor `sm.GLM(endog, exog, ...)` does not parse a `formula`
# keyword, so the previous call fitted the raw columns without the intercept
# that the formula implies.
# NOTE(review): network/platform enter as numeric codes, and a Poisson family
# is used on a ratio target — confirm both are intended (e.g. C(network)).
glm_train_frame = X_train.assign(t2p=y_t2p_train)
model_t2p = sm.GLM.from_formula(
    formula_t2p,
    data=glm_train_frame,
    family=sm.families.Poisson(),
).fit()

# A formula-based results object re-applies the formula to the frame passed to
# predict, so the raw feature frame can be handed over directly.
glm_predicted = model_t2p.predict(X_test)

In [None]:
# Hold-out mean squared error of the GLM's t2p predictions.
mse_glm_t2p = mean_squared_error(y_true=y_t2p_test, y_pred=glm_predicted)
print(mse_glm_t2p)

In [None]:
# Per-row absolute relative error |pred - actual| / actual on the hold-out set,
# paired by position and kept as a plain list.
glm_relative_errors = (
    np.abs(np.asarray(glm_predicted) - np.asarray(y_t2p_test))
    / np.asarray(y_t2p_test)
).tolist()

In [None]:
# Mean absolute relative error of the GLM on the hold-out rows.
np.mean(glm_relative_errors)

### 2. XGB

#### Transform

In [None]:
# Predictors for the gradient-boosting model (replaces the GLM's X_cols).
X_cols = [
    'week_of_year',
    'day_of_week',
    'cost_usd',
    'impressions',
    'clicks',
    'installs',
]

# Target source columns (trial and paid conversion counts).
TRIAL_COL = 'trials_hauutm'
PAID_COL = 'paid_hauutm'

In [None]:
# Feature matrix and raw count targets for the boosted model.
X = dff[X_cols]
y_trial = dff[TRIAL_COL]
y_paid = dff[PAID_COL]

# The model is trained on log(t2p); predictions are exponentiated back later.
# Rows were filtered upstream to t2p > 0, so the log is defined.
y_t2p = np.log(dff['t2p'])

# N = int(len(dff) * 0.8)
# X_train, X_test, y_trial_train, y_trial_test, y_paid_train, y_paid_test = X[:N], X[N:], y_trial[:N], y_trial[N:], y_paid[:N], y_paid[N:]
# y_t2p_train, y_t2p_test = y_t2p[:N], y_t2p[N:]

#### Model

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Hyper-parameters for the boosted-tree regressor.
params = dict(
    n_estimators=200,
    max_depth=4,
    min_samples_split=5,
    learning_rate=0.01,
    loss="squared_error",
)

# Despite the `xgb_` prefix this is scikit-learn's GradientBoostingRegressor,
# not an XGBoost model.
xgb_t2p = GradientBoostingRegressor(**params)


In [None]:
# def create_stratified_bins(y, n_bins=10):
#     return pd.qcut(y, q=n_bins, labels=False, duplicates="drop")
# y_strat = create_stratified_bins(y_t2p_train)

from sklearn.model_selection import KFold

# 5-fold cross-validation with shuffling and a fixed seed for repeatable folds.
# The name `skf` notwithstanding, this is a plain (non-stratified) KFold.
n_splits = 5
skf = KFold(n_splits, shuffle=True, random_state=420)

In [None]:
# Cross-validate the boosted model. Targets are log(t2p), so both predictions
# and held-out targets are exponentiated back to the t2p scale before scoring.
for i, (train_index, test_index) in enumerate(skf.split(X, y_t2p)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_t2p.iloc[train_index], y_t2p.iloc[test_index]

    xgb_t2p.fit(X_train, y_train)
    xgb_pred = np.exp(xgb_t2p.predict(X_test))

    # Score against THIS fold's held-out targets. The relative error used to be
    # computed against `y_t2p_test`, the GLM section's hold-out split, which is
    # a different set of rows, of a different length, and not log-scaled.
    fold_actual = np.exp(y_test).to_numpy()

    mse_xgb = mean_squared_error(fold_actual, xgb_pred)
    xgb_relative_errors = np.mean(np.abs(xgb_pred - fold_actual) / fold_actual)
    print(f"---- Fold {i} ----")
    print(f"MSE: {mse_xgb}; Relative Error: {xgb_relative_errors}\n")

In [None]:
# Display mse_xgb; the loop reassigns it each fold, so this is the value from
# the final fold only, not an average across folds.
mse_xgb

In [None]:
# Re-display the GLM's mean relative error for comparison with the per-fold
# figures printed by the boosted model.
np.mean(glm_relative_errors)