In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from dotenv import load_dotenv
import os

In [None]:
# Load variables from a local .env file into the process environment so the
# os.environ lookup in the next cell can see them.
load_dotenv()

In [None]:
# MLflow tracking URI from the environment; None when the variable is unset.
MLFLOW_URI = os.getenv("MLFLOW_URI")

In [None]:
# MLFLOW_URI

### Read Data

In [None]:
# Location of the merged input CSV. Set T2P_DATA_PATH (e.g. in .env) to run the
# notebook on another machine; the default keeps the original hard-coded path.
DATA_PATH = os.environ.get(
    "T2P_DATA_PATH",
    "/Users/emulie/Documents/poc/T2PArima/data/merged_20250804.csv",
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# List the available column names before cleaning.
df.columns

### Cleaning

In [None]:
# Names of the trial-count and paid-count columns used as modelling targets.
TRIAL_COL, PAID_COL = 'trials_hauutm', 'paid_hauutm'

In [None]:
def _is_text(value):
    """Return True when `value` is a str (missing values such as NaN are not)."""
    return isinstance(value, str)


# Geography fields must hold real strings, and the placeholder country '0' is dropped.
valid_country_mask = df['country'].apply(_is_text)
valid_continent_mask = df['continent'].apply(_is_text)
valid_subcontinent_mask = df['sub_continent'].apply(_is_text)
zero_country_mask = df['country'] == '0'

_rows_to_keep = (
    valid_country_mask
    & ~zero_country_mask
    & valid_continent_mask
    & valid_subcontinent_mask
)
df = df[_rows_to_keep]

In [None]:
# --- minimum conversions required
# Trial-to-paid ratio per row. `assign` returns a new frame, so the column is not
# written onto the filtered slice produced by the cleaning cell (which would
# trigger SettingWithCopyWarning).
df = df.assign(t2p=df[PAID_COL] / df[TRIAL_COL])

# Rows must clear every minimum-volume threshold to be modelled.
min_cost_mask = df['cost_usd'] > 5.0
min_paid_mask = df[PAID_COL] > 2.0
min_trial_mask = df[TRIAL_COL] > 5.0
min_t2p_mask = df['t2p'] > 0

# Independent copy: later cells add columns to df_overall, which must not be a
# slice of df.
df_overall = df[min_cost_mask & min_paid_mask & min_trial_mask & min_t2p_mask].copy()

In [None]:
# Row/column count of `df` (cleaned frame, now including the t2p column).
# Note: this is not the shape of the threshold-filtered `df_overall`.
df.shape

### Transformation - Categorical Encoding

In [None]:
# --- encode categorical columns
from sklearn.preprocessing import LabelEncoder

# One fitted encoder per column. Refitting a single shared encoder keeps only the
# last column's classes, so earlier columns could not be decoded with
# inverse_transform.
label_encoders = {}
_encoded_columns = {}
for col in ['network', 'platform', 'country', 'continent', 'sub_continent']:
    le = LabelEncoder()
    _encoded_columns[f'{col}_encoded'] = le.fit_transform(df_overall[col])
    label_encoders[col] = le

# `assign` builds a new frame instead of writing columns onto a filtered slice.
df_overall = df_overall.assign(**_encoded_columns)

### Dataset Split

In [None]:
# X_cols = ['week_of_year', 'day_of_week', 'is_holiday', 'network_encoded', 'platform_encoded', 'country_encoded',
#        'continent_encoded', 'sub_continent_encoded']

In [None]:
# --- T1 countries - ANDROID
# `dff` is reassigned by the iOS and T2 cells that follow, so on a top-to-bottom
# run the last segment cell executed decides which rows are modelled.
# NOTE(review): the ISO 3166 code for the United Kingdom is 'GB' - confirm the
# dataset really uses 'UK'.
_t1_codes = ['US', 'CA', 'AU', 'UK']
T1_countries = df_overall['country'].isin(_t1_codes)
android_mask = df_overall['platform'] == 'android'

_t1_android_rows = T1_countries & android_mask
dff = df_overall[_t1_android_rows]

In [None]:
# --- T1 countries - iOS
# Tier-1 country filter restricted to the iOS platform; overwrites any `dff`
# selected by an earlier segment cell.
_t1_codes = ['US', 'CA', 'AU', 'UK']
T1_countries = df_overall['country'].isin(_t1_codes)
ios_mask = df_overall['platform'] == 'ios'

_t1_ios_rows = T1_countries & ios_mask
dff = df_overall[_t1_ios_rows]

In [None]:
# --- T2 countries
# Everything outside the tier-1 list, on all platforms; overwrites any `dff`
# selected by an earlier segment cell.
_t1_membership = df_overall['country'].isin(['US', 'CA', 'AU', 'UK'])
T2_countries = ~_t1_membership
dff = df_overall[T2_countries]

In [None]:
# X, y_trial, y_paid, y_t2p = dff[X_cols], dff[TRIAL_COL], dff[PAID_COL], dff['t2p']

# N = int(len(df) * 0.8)
# X_train, X_test, y_trial_train, y_trial_test, y_paid_train, y_paid_test = X[:N], X[N:], y_trial[:N], y_trial[N:], y_paid[:N], y_paid[N:]
# y_t2p_train, y_t2p_test = y_t2p[:N], y_t2p[N:]

### Ensemble Model - GLM + XGB

#### 1. GLM

##### Transform

In [None]:
# Feature columns selected from dff for the GLM stage.
X_cols = [
    'week_of_year',
    'day_of_week',
    'cost_usd',
    'clicks',
    'impressions',
    'installs',
]

# Target column names (same values as in the cleaning section).
TRIAL_COL, PAID_COL = 'trials_hauutm', 'paid_hauutm'

In [None]:
# Both count models share the same right-hand side; only the response differs.
_glm_rhs = "week_of_year + day_of_week + cost_usd"
formula_trial = f"{TRIAL_COL} ~ {_glm_rhs}"
formula_paid = f"{PAID_COL} ~ {_glm_rhs}"

In [None]:
# Feature columns that get a natural-log transform (np.log) before the GLM is fitted.
cols_to_log_transform = ['cost_usd']

In [None]:
# Copy the feature slice so the log transform below writes to an independent
# frame rather than a slice of dff (avoids SettingWithCopyWarning).
X = dff[X_cols].copy()
y_trial, y_paid, y_t2p = dff[TRIAL_COL], dff[PAID_COL], dff['t2p']

for col in cols_to_log_transform:
    X[col] = np.log(X[col])

# Ordered 80/20 split on the modelling frame itself. Using len(df) counted rows of
# the larger pre-filter frame, which pushed the cut beyond 80% of X and could
# leave the test portion empty.
N = int(len(X) * 0.8)
X_train, X_test = X.iloc[:N], X.iloc[N:]
y_trial_train, y_trial_test = y_trial.iloc[:N], y_trial.iloc[N:]
y_paid_train, y_paid_test = y_paid.iloc[:N], y_paid.iloc[N:]
y_t2p_train, y_t2p_test = y_t2p.iloc[:N], y_t2p.iloc[N:]

##### Model

In [None]:
import statsmodels.api as sm

In [None]:
# sm.GLM(endog, exog, ...) fits on every column of `exog` as given; a `formula`
# keyword passed to the constructor does not build a design matrix, so the
# formulas were silently unused. from_formula builds the design matrix (with
# intercept) from the formula and a frame holding both features and target.
model_trial = sm.GLM.from_formula(
    formula_trial,
    data=X_train.assign(**{TRIAL_COL: y_trial_train}),
    family=sm.families.Poisson(),
).fit()
model_paid = sm.GLM.from_formula(
    formula_paid,
    data=X_train.assign(**{PAID_COL: y_paid_train}),
    family=sm.families.Poisson(),
).fit()

In [None]:
# Predicted counts for every row of X (train and test portions alike).
y_trial_predicted, y_paid_predicted = (
    fitted.predict(X) for fitted in (model_trial, model_paid)
)

#### 2. XGB

##### Transform

In [None]:
# Feature columns for the gradient-boosting stage (replaces the GLM feature list).
X_cols = [
    'week_of_year',
    'day_of_week',
    'cost_usd',
    'installs',
]
TRIAL_COL, PAID_COL = 'trials_hauutm', 'paid_hauutm'

In [None]:
X, y_trial, y_paid, y_t2p = dff[X_cols], dff[TRIAL_COL], dff[PAID_COL], dff['t2p']

# The boosted model is trained on log(t2p); its predictions are exponentiated later.
y_t2p = np.log(y_t2p)

# Ordered 80/20 split on the modelling frame itself. Using len(df) counted rows of
# the larger pre-filter frame, which pushed the cut beyond 80% of X and could
# leave the test portion empty.
N = int(len(X) * 0.8)
X_train, X_test = X.iloc[:N], X.iloc[N:]
y_trial_train, y_trial_test = y_trial.iloc[:N], y_trial.iloc[N:]
y_paid_train, y_paid_test = y_paid.iloc[:N], y_paid.iloc[N:]
y_t2p_train, y_t2p_test = y_t2p.iloc[:N], y_t2p.iloc[N:]

##### Model

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Despite the "xgb" naming this is scikit-learn's gradient boosting regressor,
# fitted on the log-transformed t2p target.
params = {
    "n_estimators": 200,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.05,
    "loss": "squared_error",
    # Fixed seed so repeated runs build the same trees.
    "random_state": 42,
}
xgb_t2p = GradientBoostingRegressor(**params)
xgb_t2p.fit(X_train, y_t2p_train)

In [None]:
# The model was fitted on log(t2p); exponentiate to return to the ratio scale.
_log_t2p_hat = xgb_t2p.predict(X)
t2p_pred = np.exp(_log_t2p_hat)

#### 3. Ensemble 

In [None]:
# Attach both base-model ratio predictions. `assign` returns a new frame, so no
# column is written onto the filtered slice of df_overall that `dff` came from
# (avoids SettingWithCopyWarning) and re-running the cell is harmless.
dff = dff.assign(
    t2p_glm=y_paid_predicted / y_trial_predicted,  # GLM ratio: predicted paid / predicted trials
    t2p_xgb=t2p_pred,
)

In [None]:
# Overlay both model estimates and the observed ratio by row position.
_row_positions = range(len(dff))
for _column, _label in (('t2p_glm', 'glm'), ('t2p_xgb', 'xgb'), ('t2p', 'actual')):
    plt.plot(_row_positions, dff[_column], alpha=0.4, label=_label)
plt.legend()
plt.show()

In [None]:
# Print actual vs. GLM vs. boosted ratio for every row of dff.
_compared = (dff[name] for name in ('t2p', 't2p_glm', 't2p_xgb'))
for actual, glm_ratio, xgb_ratio in zip(*_compared):
    print(actual, glm_ratio, xgb_ratio)

#### 4. Meta Model

In [None]:
from sklearn.linear_model import LinearRegression

# Meta-model features are the two base-model ratio predictions; the observed
# t2p is the target only and is not included as a feature.
_base_prediction_cols = {"glm_pred": 't2p_glm', "xgb_pred": 't2p_xgb'}
meta_input = pd.DataFrame(
    {feature: dff[source] for feature, source in _base_prediction_cols.items()}
)

# NOTE(review): the blender is fitted and then evaluated on the same rows, so
# final_prediction is an in-sample fit rather than a held-out estimate.
meta_model = LinearRegression()
meta_model.fit(meta_input, dff['t2p'])
final_prediction = meta_model.predict(meta_input)

In [None]:
# Print observed ratio next to the blended prediction for every row.
_pairs = zip(dff['t2p'], final_prediction)
for actual, blended in _pairs:
    print(actual, blended)