Improvements from First Try:
- Look for leakage => no leakage found
- Train with network
- Evaluate if model is underfitting/overfitting => it's underfitting
- Add additional features => polynomial, interaction effects, dimension reduction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error

In [2]:
from datetime import datetime
from mlflow.models import infer_signature

In [3]:
from dotenv import load_dotenv
import os

In [4]:
load_dotenv()
MLFLOW_URI = os.environ.get("MLFLOW_URI")

In [5]:
!pip3 list | grep mlflow 
!pip3 list | grep pandas 
!pip3 list | grep scipy 
!pip3 list | grep numpy 
!pip3 list | grep statsmodels 

mlflow                                   2.22.0
mlflow-skinny                            2.22.0
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10 -m pip install --upgrade pip' command.[0m
pandas                                   1.3.4
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10 -m pip install --upgrade pip' command.[0m
scipy                                    1.10.0
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10 -m pip install --upgrade pip' command.[0m
numpy                                    1.23.3
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10 -m pip install --upgrade pip' command.[0m
statsmodels                              0.13.5
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10 -m pip install -

In [6]:
import mlflow 

mlflow.set_tracking_uri(MLFLOW_URI)

EXPERIMENT_NAME = "T2P Ensemble"
if not mlflow.get_experiment_by_name(name=EXPERIMENT_NAME):
    mlflow.create_experiment(name=EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

## Read Data

In [8]:
df = pd.read_csv("/Users/emulie/Documents/poc/T2PArima/data/merged_20250804.csv")

## Cleaning Data

In [9]:
TRIAL_COL = 'trials_hauutm'
PAID_COL = 'paid_hauutm'

valid_country_mask = df['country'].apply(lambda x: isinstance(x, str))
zero_country_mask = df['country'] == '0'
valid_continent_mask = df['continent'].apply(lambda x: isinstance(x, str))
valid_subcontinent_mask = df['sub_continent'].apply(lambda x: isinstance(x, str))

df = df[valid_country_mask & ~zero_country_mask & valid_continent_mask & valid_subcontinent_mask]

In [10]:
# --- minimum conversions required
df['t2p'] = df[PAID_COL] / df[TRIAL_COL]

min_cost_mask = df['cost_usd'] > 5.0
min_paid_mask = df[PAID_COL] > 2.0
min_trial_mask = df[TRIAL_COL] > 5.0
min_t2p_mask = df['t2p'] > 0
df_overall = df[min_cost_mask & min_paid_mask & min_trial_mask & min_t2p_mask]

In [22]:
T1_countries = [
    "GU", "PR", "DK", "JE", "NO", "BE", "FR", "US", "IL", "GB", "UK",
    "CA", "AU", "IE", "NL", "SE", "ES", "IT", "TW", "DE", "FI",
    "NZ", "JP", "KR", "SG", "HK"
]
T2_countries = [
    "ZA", "MT", "AE", "SA", "PL", "AT", "NO", "DK", "IS", "FI"
]
T3_countries = [
    "IN", "PH", "MY", "NG", "TH", "VN", "EG", "MN", "RO", "HU", "RS", "TR"
]


country_tier_map = {country: 'T1' for country in T1_countries} | {country: 'T2' for country in T2_countries} | {country: 'T3' for country in T3_countries}
df_overall['country_tier'] = df_overall['country'].apply(lambda x: country_tier_map[x] if x in country_tier_map else 'T4')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_overall['country_tier'] = df_overall['country'].apply(lambda x: country_tier_map[x] if x in country_tier_map else 'T4')


## Feature Encoding

In [23]:
# # --- encode categorical columns
# from sklearn.preprocessing import LabelEncoder
# import joblib

# run_name = f"LABEL_ENCODER_{SEGMENTATION}_{datetime.now().strftime('%Y-%m-%d_%H:%M')}"
# experiment_tags = {
#     "project_name": EXPERIMENT_NAME, 
#     "date": datetime.now().strftime('%Y-%m-%d %H:%M:%S'), # need to be a string
#     "model": "Label Encoder", 
#     "mlflow.note.content": experiment_description,
# }


# with mlflow.start_run(experiment_id=experiment.experiment_id, 
#                                       run_name=run_name, tags=experiment_tags):
#     for col in ['network', 'platform', 'country', 'continent', 'sub_continent', 'country_tier']:
#         le = LabelEncoder()
#         df_overall[f'{col}_encoded'] = le.fit_transform(df_overall[col])
#         label_encoder_path = f"labelencoder_{col}.pkl"
#         joblib.dump(le, label_encoder_path)
#         mlflow.log_artifact(label_encoder_path)

In [24]:
df_overall['network'].unique()

array(['Apple Search Ads', 'Facebook Ads', 'googleadwords_int',
       'tiktokglobal_int', 'tatari_streaming', 'snapchat_int'],
      dtype=object)

In [25]:
network_map = {
    'Apple Search Ads': 0, 
    'Facebook Ads': 1, 
    'googleadwords_int': 2, 
    'tiktokglobal_int': 3, 
    'tatari_streaming': 4, 
    'snapchat_int': 5,
    'other': 6,
}

platform_map = {
    'android': 0, 
    'ios': 1, 
    'web': 2
}

country_tier_map = {
    'T1': 0, 
    'T2': 1, 
    'T3': 2, 
    'T4': 3
}

In [27]:
df_encoded = df_overall.copy()
df_encoded['network'] = df_encoded['network'].apply(lambda x: network_map[x])
df_encoded['platform'] = df_encoded['platform'].apply(lambda x: platform_map[x])
df_encoded['country_tier'] = df_encoded['country_tier'].apply(lambda x: country_tier_map[x])

In [28]:
df_encoded.head()

Unnamed: 0,date,network,platform,country,cost_cad,cost_usd,clicks,impressions,installs,trials_prefinal,...,trials_hauutm,paid_hauutm,revenue,refund,refunded_amount,is_holiday,week_of_year,day_of_week,t2p,country_tier
7,2023-01-01,0,1,US,14.775428,10.9084,17.0,4134.0,68.0,7.0,...,177.0,73.0,2666.489,6.0,-230.958,True,52,7,0.412429,0
306,2023-01-01,1,0,DE,295.725005,218.3278,516.0,77445.0,155.0,11.0,...,20.0,6.0,232.686699,0.0,0.0,True,52,7,0.3,0
337,2023-01-01,1,0,UK,1279.944256,944.957,1547.0,256323.0,376.0,30.0,...,54.0,13.0,719.58712,0.0,0.0,True,52,7,0.240741,0
338,2023-01-01,1,0,US,3170.89642,2341.0088,1882.0,312940.0,86.0,8.0,...,78.0,23.0,722.3045,1.0,-50.9915,True,52,7,0.294872,0
362,2023-01-01,1,1,US,6608.95022,4879.2545,21845.0,1822490.0,2435.0,472.0,...,370.0,194.0,6737.512989,16.0,-601.888,True,52,7,0.524324,0


## Training - 

In [37]:
dff = df_encoded.copy()

### 1. GLM

#### Transform

In [38]:
X_cols = ['network', 'platform', 'week_of_year', 'day_of_week', 'cost_usd']

TRIAL_COL = 'trials_hauutm'
PAID_COL = 'paid_hauutm'

formula_t2p = f"t2p ~ {' + '.join(X_cols)}"
cols_to_log_transform = ['cost_usd']

In [39]:
formula_t2p

't2p ~ network + platform + week_of_year + day_of_week + cost_usd'

In [40]:
X, y_trial, y_paid, y_t2p = dff[X_cols], dff[TRIAL_COL], dff[PAID_COL], dff['t2p']

for col in cols_to_log_transform:
    X[col] = np.log(X[col])

N = int(len(dff) * 0.8)
X_train, X_test, y_trial_train, y_trial_test, y_paid_train, y_paid_test = X[:N], X[N:], y_trial[:N], y_trial[N:], y_paid[:N], y_paid[N:]
y_t2p_train, y_t2p_test = y_t2p[:N], y_t2p[N:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = np.log(X[col])


#### Model

In [48]:
import statsmodels.api as sm

model_t2p = sm.GLM(y_t2p_train, X_train, formula=formula_t2p, family=sm.families.Poisson()).fit()
glm_predicted = model_t2p.predict(X_test)

In [49]:
mse_glm_t2p = mean_squared_error(glm_predicted, y_t2p_test)
print(mse_glm_t2p)

0.017731860001266107


In [58]:
glm_relative_errors = [abs(pred - actual)/actual for pred, actual in zip(glm_predicted, y_t2p_test)]

In [59]:
np.mean(glm_relative_errors)

0.42457746189508927

### 2. XGB

#### Transform

In [141]:
X_cols = ['network', 'platform', 'week_of_year', 'day_of_week', 'cost_usd', 'impressions', 'clicks', 'installs']
TRIAL_COL = 'trials_hauutm'
PAID_COL = 'paid_hauutm'

In [142]:
X, y_trial, y_paid, y_t2p = dff[X_cols], dff[TRIAL_COL], dff[PAID_COL], dff['t2p']

# N = int(len(dff) * 0.8)
# X_train, X_test, y_trial_train, y_trial_test, y_paid_train, y_paid_test = X[:N], X[N:], y_trial[:N], y_trial[N:], y_paid[:N], y_paid[N:]
# y_t2p_train, y_t2p_test = y_t2p[:N], y_t2p[N:]

#### Model

In [143]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# params = {
#     "n_estimators": 200,
#     "max_depth": 4,
#     "min_samples_split": 5,
#     "learning_rate": 0.05,
#     "loss": "squared_error",
# }

param_dist = {
    "n_estimators": randint(50, 300),
    "max_depth": randint(2, 8),
    "min_samples_split": randint(2, 10),
    "learning_rate": uniform(0.005, 0.1),
    "loss": ["squared_error", "absolute_error"],
}
xgb_t2p = GradientBoostingRegressor(random_state=42)


In [144]:
random_search = RandomizedSearchCV(
    xgb_t2p,
    param_distributions=param_dist,
    n_iter=20,
    scoring="neg_mean_squared_error",
    cv=skf,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)

random_search.fit(X, y_t2p)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [145]:
xgb_best_model = random_search.best_estimator_
print("Best parameters:", random_search.best_params_)

Best parameters: {'learning_rate': 0.05092488919658671, 'loss': 'squared_error', 'max_depth': 5, 'min_samples_split': 9, 'n_estimators': 201}


In [146]:
from sklearn.model_selection import KFold

n_splits = 5
skf = KFold(n_splits=n_splits, shuffle=True, random_state=420)

In [147]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Evaluate best XGBoost model with cross-validation
for i, (train_index, test_index) in enumerate(skf.split(X, y_t2p)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_t2p.iloc[train_index], y_t2p.iloc[test_index]

    # Train model
    xgb_best_model.fit(X_train, y_train)

    # Predict
    y_pred_train = xgb_best_model.predict(X_train)
    y_pred_test = xgb_best_model.predict(X_test)

    # Compute metrics
    xgb_mse_train = mean_squared_error(y_train, y_pred_train)
    xgb_mse_test = mean_squared_error(y_test, y_pred_test)

    xgb_rel_error_train = np.mean(np.abs(y_pred_train - y_train) / y_train)
    xgb_rel_error_test = np.mean(np.abs(y_pred_test - y_test) / y_test)

    # Output
    print(f"---- Fold {i} ----")
    print(f"Training => MSE: {xgb_mse_train:.4f}; Mean Relative Error: {xgb_rel_error_train:.4f}")
    print(f"Testing  => MSE: {xgb_mse_test:.4f}; Mean Relative Error: {xgb_rel_error


---- Fold 0 ----
MSE: 0.013004506060438046; Relative Error: 0.3420953895539513

---- Fold 1 ----
MSE: 0.01334213424127205; Relative Error: 0.31120264046052365

---- Fold 2 ----
MSE: 0.013992439476378623; Relative Error: 0.3383812371573845

---- Fold 3 ----
MSE: 0.01350726401617012; Relative Error: 0.32829631039851365

---- Fold 4 ----
MSE: 0.012536254617171134; Relative Error: 0.3380013381834185



### 3. Random Forest

#### Transform

In [148]:
X_cols = ['network', 'platform', 'week_of_year', 'day_of_week', 'cost_usd', 'impressions', 'clicks', 'installs']
TRIAL_COL = 'trials_hauutm'
PAID_COL = 'paid_hauutm'

X, y_trial, y_paid, y_t2p = dff[X_cols], dff[TRIAL_COL], dff[PAID_COL], dff['t2p']

#### Model

In [159]:
from sklearn.ensemble import RandomForestRegressor

param_dist = {
    'n_estimators': randint(50, 300),
    'max_depth': randint(3, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
}
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

# note: increasing search space to 3000 and 200 doesn't improve training nor testing

In [160]:
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,
    scoring='neg_mean_squared_error',
    cv=skf,
    verbose=1,
    random_state=42,
    n_jobs=-1
)
random_search.fit(X, y_t2p)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [161]:
best_rf = random_search.best_estimator_
print("Best parameters found:\n", random_search.best_params_)

Best parameters found:
 {'bootstrap': True, 'max_depth': 169, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 15, 'n_estimators': 291}


In [162]:
n_splits = 5
skf = KFold(n_splits=n_splits, shuffle=True, random_state=420)

In [163]:
# Evaluate best model with cross-validation
for i, (train_index, test_index) in enumerate(skf.split(X, y_t2p)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_t2p.iloc[train_index], y_t2p.iloc[test_index]

    best_rf.fit(X_train, y_train)
    y_pred_train = best_rf.predict(X_train)
    y_pred_test = best_rf.predict(X_test)

    rf_mse_test = mean_squared_error(y_test, y_pred_test)
    rf_mse_train = mean_squared_error(y_train, y_pred_train)
    rf_rel_error_test = np.mean(np.abs(y_pred_test - y_test) / (y_test))
    rf_rel_error_train = np.mean(np.abs(y_pred_train - y_train) / (y_train))

    print(f"---- Fold {i} ----")
    print(f"Training => MSE: {rf_mse_train:.4f}; Mean Relative Error: {rf_rel_error_train:.4f}")
    print(f"Training => MSE: {rf_mse_test:.4f}; Mean Relative Error: {rf_rel_error_test:.4f}\n")


---- Fold 0 ----
Training => MSE: 0.0078; Mean Relative Error: 0.2501
Training => MSE: 0.0124; Mean Relative Error: 0.3330

---- Fold 1 ----
Training => MSE: 0.0077; Mean Relative Error: 0.2514
Training => MSE: 0.0129; Mean Relative Error: 0.3070

---- Fold 2 ----
Training => MSE: 0.0075; Mean Relative Error: 0.2476
Training => MSE: 0.0138; Mean Relative Error: 0.3341

---- Fold 3 ----
Training => MSE: 0.0076; Mean Relative Error: 0.2490
Training => MSE: 0.0130; Mean Relative Error: 0.3235

---- Fold 4 ----
Training => MSE: 0.0078; Mean Relative Error: 0.2491
Training => MSE: 0.0121; Mean Relative Error: 0.3309



Model is underfitting

### 4. LightGBM

### 5. CatBoost

### 6. MLP

In [178]:
df_weekly.columns

Index(['date', 'network', 'platform', 'country', 'cost_cad', 'cost_usd',
       'clicks', 'impressions', 'installs', 'trials_prefinal', 'paid_prefinal',
       'revenues', 'install_date', 'continent', 'sub_continent',
       'trials_hauutm', 'paid_hauutm', 'revenue', 'refund', 'refunded_amount',
       'is_holiday', 'week_of_year', 'day_of_week', 't2p', 'country_tier'],
      dtype='object')

## --- Weekly Evaluation ---

In [179]:
df_weekly = dff.copy()
df_weekly['date'] = pd.to_datetime(df['date'])
df_weekly = df_weekly.groupby([pd.Grouper(key='date', freq='W-MON')]).agg({
    'cost_usd': 'sum', 
    'impressions': 'sum', 
    'clicks': 'sum', 
    'installs': 'sum',
    'trials_hauutm': 'sum', 
    'paid_hauutm': 'sum', 
}).reset_index()

In [180]:
df_weekly

Unnamed: 0,date,cost_usd,impressions,clicks,installs,trials_hauutm,paid_hauutm
0,2023-01-02,45152.681891,9.006742e+06,105806.000000,15915.000000,2508.0,1076.0
1,2023-01-09,203995.209782,2.403994e+07,312050.000000,77441.280731,11376.0,4911.0
2,2023-01-16,346760.606197,3.665180e+07,397455.000000,94020.726051,13984.0,5975.0
3,2023-01-23,310384.235154,3.351956e+07,354789.000000,89196.177301,13324.0,5814.0
4,2023-01-30,191421.817307,1.933858e+07,258146.000000,58654.276916,9032.0,3903.0
...,...,...,...,...,...,...,...
96,2024-11-04,30144.028474,5.547419e+06,39045.000000,9109.530384,1113.0,381.0
97,2024-11-11,21762.954158,3.709424e+06,27179.175904,7223.791015,960.0,304.0
98,2024-11-18,23789.226376,2.977140e+06,27076.000000,6502.539477,765.0,297.0
99,2024-11-25,21904.834363,1.945136e+06,18129.139576,6306.028311,728.0,213.0
