In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error

In [None]:
from datetime import datetime
from mlflow.models import infer_signature

In [None]:
from dotenv import load_dotenv
import os

In [None]:
# Load variables from a local .env file (if present) so MLFLOW_URI can be read from the environment below.
load_dotenv()

In [None]:
# MLflow tracking-server address taken from the environment; None when the variable is unset.
MLFLOW_URI = os.getenv("MLFLOW_URI")

### Define MLFlow Metadata

In [None]:
# Free-text note attached to every MLflow run in this notebook through the
# "mlflow.note.content" tag.
experiment_description = """
This is a competing T2P method based on both prefinal data ie the data we get from the API and 
our internal data ie the trial/paid/refund we get from HAU/UTM attribution.
"""

In [None]:
# MLFLOW_URI

In [None]:
import mlflow 

# Point the MLflow client at the tracking server read from MLFLOW_URI.
mlflow.set_tracking_uri(MLFLOW_URI)

In [None]:
# Get-or-create the MLflow experiment; `experiment` supplies the experiment_id
# passed to every mlflow.start_run call in this notebook.
EXPERIMENT_NAME = "T2P Ensemble"

_existing_experiment = mlflow.get_experiment_by_name(name=EXPERIMENT_NAME)
if _existing_experiment is None:
    mlflow.create_experiment(name=EXPERIMENT_NAME)

experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

### Read Data

In [None]:
# Location of the merged dataset. The default is the original local path; set the
# T2P_DATA_PATH environment variable to run the notebook on another machine
# without editing this cell.
DATA_PATH = os.environ.get(
    "T2P_DATA_PATH",
    "/Users/emulie/Documents/poc/T2PArima/data/merged_20250804.csv",
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# List the columns available in the raw frame.
df.columns

### Cleaning

In [None]:
# Names of the trial-count and paid-count columns used as modelling targets.
TRIAL_COL, PAID_COL = 'trials_hauutm', 'paid_hauutm'

In [None]:
def _is_str(value):
    """Return True when `value` is a Python string."""
    return isinstance(value, str)


# Keep rows whose country, continent and sub-continent are all strings
# (presumably to drop missing values — confirm) and whose country is not the
# literal '0'.
valid_country_mask = df['country'].map(_is_str)
zero_country_mask = df['country'].eq('0')
valid_continent_mask = df['continent'].map(_is_str)
valid_subcontinent_mask = df['sub_continent'].map(_is_str)

_all_geo_valid = valid_country_mask & valid_continent_mask & valid_subcontinent_mask
df = df.loc[_all_geo_valid & ~zero_country_mask]

In [None]:
# --- minimum conversions required
# Trial-to-paid rate per row. `assign` builds a new frame instead of writing a
# column into the filtered selection produced by the previous cell, which is
# what triggers pandas' SettingWithCopyWarning.
df = df.assign(t2p=df[PAID_COL] / df[TRIAL_COL])

# A row is modelled only if it clears every minimum-volume threshold.
min_cost_mask = df['cost_usd'] > 5.0
min_paid_mask = df[PAID_COL] > 2.0
min_trial_mask = df[TRIAL_COL] > 5.0
min_t2p_mask = df['t2p'] > 0

# Explicit copy: later cells add columns (country_tier, *_encoded) to df_overall,
# so it must be an independent frame rather than a selection of df.
df_overall = df[min_cost_mask & min_paid_mask & min_trial_mask & min_t2p_mask].copy()

In [None]:
# NOTE(review): this is the shape of the geography-filtered frame; the thresholded
# frame built in the previous cell is df_overall — confirm which one was meant.
df.shape

### Transformation - Country Tiers

In [None]:
# Country codes grouped by market tier.
# NOTE(review): "NO", "DK" and "FI" appear in both the T1 and the T2 list. The map
# below is filled in T1 -> T2 -> T3 order, so these codes resolve to 'T2' in
# country_tier_map while still being members of T1_countries — confirm the
# intended tier.
T1_countries = [
    "GU", "PR", "DK", "JE", "NO", "BE", "FR", "US", "IL",
    "GB", "UK", "CA", "AU", "IE", "NL", "SE", "ES", "IT",
    "TW", "DE", "FI", "NZ", "JP", "KR", "SG", "HK",
]
T2_countries = ["ZA", "MT", "AE", "SA", "PL", "AT", "NO", "DK", "IS", "FI"]
T3_countries = ["IN", "PH", "MY", "NG", "TH", "VN", "EG", "MN", "RO", "HU", "RS", "TR"]

# country code -> tier label; a later tier overwrites an earlier one for a
# duplicated code, exactly as a left-to-right dict merge would.
country_tier_map = {}
for _tier, _codes in (("T1", T1_countries), ("T2", T2_countries), ("T3", T3_countries)):
    for _code in _codes:
        country_tier_map[_code] = _tier
# Tag each row with its tier; countries missing from the map fall back to 'T4'.
df_overall['country_tier'] = df_overall['country'].map(
    lambda code: country_tier_map.get(code, 'T4')
)

In [None]:
# --- encode categorical columns
from sklearn.preprocessing import LabelEncoder
import joblib

# SEGMENTATION is interpolated into the run name below, so it must exist before
# this cell runs. The next cell assigns the same value; defining it here keeps a
# fresh "Restart & Run All" from failing with NameError.
SEGMENTATION = 'TIER'

run_name = f"LABEL_ENCODER_{SEGMENTATION}_{datetime.now().strftime('%Y-%m-%d_%H:%M')}"
experiment_tags = {
    "project_name": EXPERIMENT_NAME,
    "date": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),  # need to be a string
    "model": "Label Encoder",
    "mlflow.note.content": experiment_description,
}

# Fit one LabelEncoder per categorical column, store the integer codes in a new
# "{col}_encoded" column, pickle the fitted encoder to the working directory and
# log that file as an artifact of the run.
with mlflow.start_run(experiment_id=experiment.experiment_id,
                      run_name=run_name, tags=experiment_tags):
    for col in ['network', 'platform', 'country', 'continent', 'sub_continent', 'country_tier']:
        le = LabelEncoder()
        df_overall[f'{col}_encoded'] = le.fit_transform(df_overall[col])
        label_encoder_path = f"labelencoder_{col}.pkl"
        joblib.dump(le, label_encoder_path)
        mlflow.log_artifact(label_encoder_path)

In [None]:
# Segment label interpolated into the MLflow run names; the platform segmentation cells further down overwrite it.
SEGMENTATION = 'TIER'

### Dataset Split

Models Aggregation:
- Network Level =>
    - Web (only 4)
- Network-Country Tier Models (to test; postponed for now, not enough data)
    - Android T1, T2, T3, T4
    - iOS T1, T2, T3, T4
    - Web



In [None]:
# X_cols = ['week_of_year', 'day_of_week', 'is_holiday', 'network_encoded', 'platform_encoded', 'country_encoded',
#        'continent_encoded', 'sub_continent_encoded']

In [None]:
# # --- T1 countries - ANDROID
# T1_countries = df_overall['country'].isin(['US', 'CA', 'AU', 'UK'])
# android_mask = df_overall['platform'] == 'android'

# dff = df_overall[T1_countries & android_mask]

# SEGMENTATION = "android_T1"

In [None]:
# # --- T1 countries - iOS
# T1_countries = df_overall['country'].isin(['US', 'CA', 'AU', 'UK'])
# ios_mask = df_overall['platform'] == 'ios'

# dff = df_overall[T1_countries & ios_mask]
# SEGMENTATION = "ios_T1"

In [None]:
# # --- T2 countries 
# T2_countries = ~df_overall['country'].isin(['US', 'CA', 'AU', 'UK'])
# dff = df_overall[T2_countries]
# SEGMENTATION = "mobile_T2"

In [None]:
# Default modelling frame: an independent copy of every row in df_overall
# (the platform segmentation cells below replace it).
dff = df_overall.copy()

In [None]:
# --- segmentation
# t1: rows whose country is listed in T1_countries.
# t2: every other row (all remaining countries, not only those in T2_countries).
t1_mask = df_overall['country'].isin(T1_countries)
t1, t2 = df_overall.loc[t1_mask], df_overall.loc[~t1_mask]

In [None]:
# Row/column counts of the two country segments, T1 first.
for _segment in (t1, t2):
    print(_segment.shape)

In [None]:
# --- SEGMENTATION: ANDROID
# NOTE: the iOS cell below assigns the same names, so when the notebook is run
# top to bottom the iOS segment is the one that ends up being modelled.
android_mask = df_overall['platform'] == 'android'
# Copy the selection: the ensemble section adds prediction columns to dff, and
# writing into a filtered selection triggers SettingWithCopyWarning.
dff = df_overall.loc[android_mask].copy()
SEGMENTATION = "android_overall"

In [None]:
# --- SEGMENTATION: iOS
ios_mask = df_overall['platform'] == 'ios'
# Copy the selection: the ensemble section adds prediction columns to dff, and
# writing into a filtered selection triggers SettingWithCopyWarning.
dff = df_overall.loc[ios_mask].copy()
SEGMENTATION = "ios_overall"

In [None]:
# # --- SEGMENTATION: WEB (postponed)
# web_mask = df_overall['platform'] == 'web'
# dff = df_overall[web_mask]
# SEGMENTATION = "web_OVERALL"

In [None]:
# X, y_trial, y_paid, y_t2p = dff[X_cols], dff[TRIAL_COL], dff[PAID_COL], dff['t2p']

# N = int(len(df) * 0.8)
# X_train, X_test, y_trial_train, y_trial_test, y_paid_train, y_paid_test = X[:N], X[N:], y_trial[:N], y_trial[N:], y_paid[:N], y_paid[N:]
# y_t2p_train, y_t2p_test = y_t2p[:N], y_t2p[N:]

In [None]:
# dff['country'].unique()

In [None]:
# dff['platform'].unique()

### Ensemble Model - GLM + XGB

#### 1. GLM

##### Transform

In [None]:
# Columns pulled from the modelling frame to build the GLM feature matrix.
X_cols = [
    'week_of_year',
    'day_of_week',
    'cost_usd',
    'clicks',
    'impressions',
    'installs',
]

# Target count columns (same values as in the cleaning section).
TRIAL_COL, PAID_COL = 'trials_hauutm', 'paid_hauutm'

In [None]:
# Formula strings for the trial, paid and t2p models; all three share the same
# right-hand side (week_of_year, day_of_week, cost_usd).
formula_trial = TRIAL_COL + " ~ week_of_year + day_of_week + cost_usd"
formula_paid = PAID_COL + " ~  week_of_year + day_of_week + cost_usd"
formula_t2p = "t2p ~ week_of_year + day_of_week + cost_usd"

In [None]:
# Feature columns replaced by their natural log when the GLM inputs are built.
cols_to_log_transform = ['cost_usd']

In [None]:
# Build the GLM inputs from the selected segment. `.copy()` makes X an
# independent frame, so the in-place log transform below does not raise
# SettingWithCopyWarning.
X = dff[X_cols].copy()
y_trial, y_paid, y_t2p = dff[TRIAL_COL], dff[PAID_COL], dff['t2p']

for col in cols_to_log_transform:
    X[col] = np.log(X[col])

# Positional 80/20 split in the frame's existing row order (no shuffling).
N = int(len(dff) * 0.8)
X_train, X_test = X.iloc[:N], X.iloc[N:]
y_trial_train, y_trial_test = y_trial.iloc[:N], y_trial.iloc[N:]
y_paid_train, y_paid_test = y_paid.iloc[:N], y_paid.iloc[N:]
y_t2p_train, y_t2p_test = y_t2p.iloc[:N], y_t2p.iloc[N:]

##### Model

In [None]:
import statsmodels.api as sm

In [None]:
def _fit_poisson_glm(formula, target, features):
    """Fit a Poisson GLM specified by a formula string.

    `sm.GLM(endog, exog, formula=...)` does not interpret a `formula` keyword:
    the model is fitted on every column of `exog`, with no intercept. Building
    the model through `GLM.from_formula` makes the formula the actual model
    specification (the listed predictors plus an intercept).

    Parameters
    ----------
    formula : str
        Formula whose left-hand side is the name of `target`.
    target : pandas.Series
        Training response; its `.name` must match the formula's left-hand side.
    features : pandas.DataFrame
        Training predictors, row-aligned with `target`.

    Returns
    -------
    The fitted statsmodels GLM results object.
    """
    # from_formula needs response and predictors in a single frame.
    training_frame = features.assign(**{target.name: target})
    glm = sm.GLM.from_formula(formula, data=training_frame, family=sm.families.Poisson())
    return glm.fit()


model_trial = _fit_poisson_glm(formula_trial, y_trial_train, X_train)
model_paid = _fit_poisson_glm(formula_paid, y_paid_train, X_train)
# NOTE(review): t2p is a ratio rather than a count; the Poisson family is kept
# from the original code — confirm it is the intended family for this target.
model_t2p = _fit_poisson_glm(formula_t2p, y_t2p_train, X_train)

In [None]:
# Predict over every row (train and test together); the evaluation cell slices
# off the hold-out part with [N:].
y_trial_predicted, y_paid_predicted, y_t2p_predicted = (
    _fitted.predict(X) for _fitted in (model_trial, model_paid, model_t2p)
)

In [None]:
# Hold-out MSE (rows from position N onward) of each GLM.
mse_glm_trial = mean_squared_error(y_trial_test, y_trial_predicted[N:])
mse_glm_paid = mean_squared_error(y_paid_test, y_paid_predicted[N:])
mse_glm_t2p = mean_squared_error(y_t2p_test, y_t2p_predicted[N:])

# t2p rebuilt as predicted paid / predicted trials, scored against the observed ratio.
_t2p_from_counts = y_paid_predicted[N:] / y_trial_predicted[N:]
mse_glm_trial_paid = mean_squared_error(y_t2p_test, _t2p_from_counts)

In [None]:
# Print the four GLM hold-out errors: trial, paid, direct t2p, paid/trial ratio.
for _mse in (mse_glm_trial, mse_glm_paid, mse_glm_t2p, mse_glm_trial_paid):
    print(_mse)

In [None]:
# Record the direct-t2p GLM and its hold-out MSE as one MLflow run.
# NOTE(review): the ensemble section builds its GLM estimate from
# y_paid_predicted / y_trial_predicted, while the model logged here is
# model_t2p — confirm which one should be tracked.
run_name = f"GLM_{SEGMENTATION}_{datetime.now().strftime('%Y-%m-%d_%H:%M')}"
experiment_tags = {
    "project_name": EXPERIMENT_NAME, 
    "date": datetime.now().strftime('%Y-%m-%d %H:%M:%S'), # need to be a string
    "model": "GLM T2P w/ week_of_year, day_of_week, log(cost_usd)", 
    "mlflow.note.content": experiment_description,
}

with mlflow.start_run(experiment_id=experiment.experiment_id, 
                                      run_name=run_name, tags=experiment_tags):
    # TODO: add signature
    # signature = infer_signature(X_train, predictions)
    mlflow.statsmodels.log_model(model_t2p, "glm_t2p")
    
    # Only the direct t2p error is logged; the trial, paid and ratio errors are not.
    mlflow.log_metric("mse", mse_glm_t2p)
    # mlflow.log_metric("r2_score", r2s)

    # mlflow.log_figure(fig, f"{y_col}_predictions.png")

#### 2. XGB

##### Transform

In [None]:
# Feature columns for the boosted model (replaces the GLM's X_cols).
X_cols = [
    'week_of_year',
    'day_of_week',
    'cost_usd',
    'installs',
]
TRIAL_COL, PAID_COL = 'trials_hauutm', 'paid_hauutm'

In [None]:
# Features and targets for the boosted model, taken from the same segment.
X = dff[X_cols]
y_trial, y_paid = dff[TRIAL_COL], dff[PAID_COL]

# The t2p target is modelled on the log scale, so y_t2p_train and y_t2p_test
# hold log(t2p), not the raw rate.
y_t2p = np.log(dff['t2p'])

# Positional 80/20 split in the frame's existing row order.
N = int(len(dff) * 0.8)
X_train, X_test = X.iloc[:N], X.iloc[N:]
y_trial_train, y_trial_test = y_trial.iloc[:N], y_trial.iloc[N:]
y_paid_train, y_paid_test = y_paid.iloc[:N], y_paid.iloc[N:]
y_t2p_train, y_t2p_test = y_t2p.iloc[:N], y_t2p.iloc[N:]

##### Model

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Hyper-parameters live in a dict because the logging cell records them as
# MLflow params. Despite the "xgb" name, the estimator is scikit-learn's
# GradientBoostingRegressor.
params = dict(
    n_estimators=200,
    max_depth=4,
    min_samples_split=5,
    learning_rate=0.05,
    loss="squared_error",
)

# Train on the log-scale t2p target.
xgb_t2p = GradientBoostingRegressor(**params).fit(X_train, y_t2p_train)

In [None]:
# The model was trained on log(t2p); exponentiate its output to get rates on the
# original scale for every row (train and test).
_log_t2p_pred = xgb_t2p.predict(X)
t2p_pred = np.exp(_log_t2p_pred)

In [None]:
# Score on the original t2p scale. t2p_pred has already been exponentiated,
# whereas y_t2p_test holds log(t2p); comparing the two mixes scales. The
# reference values are therefore the observed 't2p' of the same hold-out rows.
mse_xgb = mean_squared_error(dff['t2p'].iloc[N:], t2p_pred[N:])

In [None]:
# Record the boosted model, its hold-out MSE and its hyper-parameters as one MLflow run.
# NOTE(review): the "model" tag mentions log(cost_usd), but this model's X_cols
# are week_of_year, day_of_week, cost_usd and installs with no log applied to
# the features (only the target is logged) — confirm the tag text.
run_name = f"XGB_{SEGMENTATION}_{datetime.now().strftime('%Y-%m-%d_%H:%M')}"
experiment_tags = {
    "project_name": EXPERIMENT_NAME,
    "date": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),  # need to be a string
    "model": "XGB T2P w/ week_of_year, day_of_week, log(cost_usd)",
    "mlflow.note.content": experiment_description,
}

with mlflow.start_run(
    experiment_id=experiment.experiment_id,
    run_name=run_name,
    tags=experiment_tags,
):
    # Input schema from the training features, output schema from the predictions.
    signature = infer_signature(X_train, t2p_pred)
    mlflow.sklearn.log_model(xgb_t2p, "xgb_t2p", signature=signature)

    mlflow.log_metric("mse", mse_xgb)

    for _param_name, _param_value in params.items():
        mlflow.log_param(_param_name, _param_value)

    # mlflow.log_figure(fig, f"{y_col}_predictions.png")

#### 3. Ensemble 

In [None]:
# Attach both base-model t2p estimates to the modelling frame. `assign` returns a
# new frame, so nothing is written into the filtered selection dff was taken
# from (which would raise SettingWithCopyWarning).
# The GLM estimate is the ratio of the paid and trial count models.
dff = dff.assign(
    t2p_glm=y_paid_predicted / y_trial_predicted,
    t2p_xgb=t2p_pred,
)

In [None]:
# Overlay both model estimates and the observed t2p, plotted by row position.
_row_positions = range(len(dff))
for _column, _label in (('t2p_glm', 'glm'), ('t2p_xgb', 'xgb'), ('t2p', 'actual')):
    plt.plot(_row_positions, dff[_column], alpha=0.4, label=_label)
plt.legend()
plt.show()

In [None]:
# for t2p, t2p_glm, t2p_xgb in zip(dff['t2p'], dff['t2p_glm'], dff['t2p_xgb']):
#     print(t2p, t2p_glm, t2p_xgb)

#### 4.1. Meta Model - Linear Model

In [None]:
from sklearn.linear_model import LinearRegression

# Stack the two base-model estimates as the inputs of the meta model.
meta_input = pd.DataFrame({
    "glm_pred": dff['t2p_glm'],
    "xgb_pred": dff['t2p_xgb'],
})

# Fit on the first N rows only: rows [N:] are the hold-out set the meta model is
# scored on, so including them in the fit would leak the evaluation targets.
meta_model = LinearRegression().fit(meta_input.iloc[:N], dff['t2p'].iloc[:N])

# Predictions are still produced for every row so later cells can slice [N:].
final_prediction = meta_model.predict(meta_input)

In [None]:
# Weights the linear meta model assigns to glm_pred and xgb_pred (meta_input column order).
meta_model.coef_

In [None]:
# for t2p, pred in zip(dff['t2p'], final_prediction):
#     print(t2p, pred)

In [None]:
# Hold-out MSE of the linear meta model (rows from position N onward).
mse_meta = mean_squared_error(dff['t2p'].iloc[N:], final_prediction[N:])

In [None]:
# Record the linear meta model and its hold-out MSE as one MLflow run.
run_name = f"META_{SEGMENTATION}_{datetime.now().strftime('%Y-%m-%d_%H:%M')}"
experiment_tags = {
    "project_name": EXPERIMENT_NAME,
    "date": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),  # need to be a string
    "model": "META T2P w/ week_of_year, day_of_week, log(cost_usd)",
    "mlflow.note.content": experiment_description,
}

with mlflow.start_run(experiment_id=experiment.experiment_id,
                      run_name=run_name, tags=experiment_tags):
    # The meta model consumes the stacked base predictions (glm_pred, xgb_pred),
    # so its input schema is inferred from meta_input. Inferring it from the raw
    # feature frame would record columns the model never sees.
    signature = infer_signature(meta_input, final_prediction)
    mlflow.sklearn.log_model(meta_model, "meta_linear_t2p", signature=signature)

    mlflow.log_metric("mse", mse_meta)

#### 4.2. Meta Model - Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameters are kept in a dict because the logging cell records them as
# MLflow params. random_state pins the forest's bootstrap sampling so reruns
# reproduce the same model and metric.
params = {
    'n_estimators': 100,
    'criterion': 'squared_error',
    'max_depth': 10,
    'min_samples_split': 2,
    'random_state': 42,
}

# Fit on the first N rows only: rows [N:] are the hold-out set used to score the
# forest, so fitting on them would leak the evaluation targets.
rf_model = RandomForestRegressor(**params).fit(meta_input.iloc[:N], dff['t2p'].iloc[:N])

In [None]:
# Forest predictions for every row, then the hold-out MSE on rows from position N onward.
rf_predictions = rf_model.predict(meta_input)
mse_rf = mean_squared_error(dff['t2p'].iloc[N:], rf_predictions[N:])

In [None]:
# Record the random-forest meta model, its hold-out MSE and its hyper-parameters
# as one MLflow run.
run_name = f"META_RANDOMFOREST_{SEGMENTATION}_{datetime.now().strftime('%Y-%m-%d_%H:%M')}"
experiment_tags = {
    "project_name": EXPERIMENT_NAME,
    "date": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),  # need to be a string
    "model": "META Random Forest T2P w/ week_of_year, day_of_week, log(cost_usd)",
    "mlflow.note.content": experiment_description,
}

with mlflow.start_run(experiment_id=experiment.experiment_id,
                      run_name=run_name, tags=experiment_tags):
    # The forest consumes the stacked base predictions, so the signature is
    # inferred from meta_input and from the forest's own output (rf_predictions)
    # rather than from the raw features and the linear model's predictions.
    signature = infer_signature(meta_input, rf_predictions)
    mlflow.sklearn.log_model(rf_model, "meta_rf_t2p", signature=signature)

    mlflow.log_metric("mse", mse_rf)

    for metric, val in params.items():
        mlflow.log_param(metric, val)

In [None]:
# for pred, actual in zip(rf_predictions[N:], dff['t2p'][N:]):
#     print(pred, actual)