In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

import mlflow 
from mlflow.models import infer_signature

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import os
import configparser
import ssl

### Load MLflow tracking URI from config

In [None]:
# Disable HTTPS certificate verification for the standard-library HTTPS clients
# in this process.
# NOTE(review): presumably a workaround for reaching the MLflow server without a
# trusted certificate; this is insecure - prefer installing a proper CA bundle.
ssl._create_default_https_context = ssl._create_unverified_context

# Resolve config.ini relative to the working directory: abspath("") is the
# current working directory and dirname() steps up to its parent.
script_dir = os.path.dirname(os.path.abspath(""))
config_path = os.path.join(script_dir, "config.ini")
print(f"config_path: {config_path}")

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail loudly here instead of with a KeyError below.
config = configparser.ConfigParser()
if not config.read(config_path):
    raise FileNotFoundError(f"Could not read MLflow config file: {config_path}")

# MLflow tracking server URL, taken from the [mlflow-server] section
mlflow_uri = config["mlflow-server"]["Url"]
TRACKING_URI = mlflow_uri

### Import Dataset

In [None]:
# Location of the merged dataset. Set the T2P_DATA_PATH environment variable to
# run the notebook on another machine without editing this cell.
DATA_PATH = os.environ.get(
    "T2P_DATA_PATH",
    "/Users/emulie/Documents/poc/T2PArima/data/data_merged.csv",
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Keep an independent snapshot of the data as loaded: a plain assignment would
# only alias `df`, so any in-place change to `df` would also change `df_raw`.
df_raw = df.copy()

In [None]:
# Preview the first five rows of the loaded dataset.
df.head(n=5)

### Train model 

In [None]:
def filter_by_network_platform_country(df: pd.DataFrame, network: str, platform: str, country: str) -> pd.DataFrame:
    """Return the rows of `df` for one (network, platform, country), ordered by date.

    Args:
        df: Frame with 'network', 'platform', 'country' and 'date' columns.
        network: Value to match in the 'network' column.
        platform: Value to match in the 'platform' column.
        country: Value to match in the 'country' column.

    Returns:
        A new DataFrame holding the matching rows sorted by 'date'; `df` itself
        is left untouched.
    """
    selection = (
        (df['network'] == network)
        & (df['platform'] == platform)
        & (df['country'] == country)
    )
    # sort_values returns a new frame rather than sorting in place, so its
    # result has to be the value that is returned.
    return df[selection].sort_values('date')

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, ElasticNetCV
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
import numpy as np

def get_linear_regression_model(df: pd.DataFrame, x_cols: [str], y_col: str, plot_title: str):
    """Fit a linear regression on the earliest 80% of `df` and evaluate it on the rest.

    Args:
        df: Rows to model; must contain a 'date' column plus `x_cols` and `y_col`.
        x_cols: Names of the feature columns.
        y_col: Name of the target column.
        plot_title: Title of the true-vs-predicted plot.

    Returns:
        Tuple `(model, mse, r2s, fig)`: the fitted LinearRegression, the mean
        squared error and R2 score on the hold-out rows, and the matplotlib
        figure comparing true and predicted values over the hold-out dates.

    Raises:
        ValueError: If `df` has fewer than 2 rows, so the 80/20 split would
            leave no training rows.
    """
    # --- Split data
    # sort_values returns a new frame; keep it so the split below is chronological
    df_sorted = df.sort_values('date')
    n_train = int(len(df_sorted) * 0.8)
    if n_train == 0:
        raise ValueError(
            f"Need at least 2 rows for an 80/20 train-test split, got {len(df_sorted)}"
        )
    X, y = df_sorted[x_cols], df_sorted[[y_col]]
    X_train, X_test = X.iloc[:n_train], X.iloc[n_train:]
    y_train, y_test = y.iloc[:n_train], y.iloc[n_train:]

    # --- train model
    model = LinearRegression().fit(X_train, y_train)

    # --- predictions + model metrics
    predictions = model.predict(X_test)
    # sklearn metrics take (y_true, y_pred); R2 is not symmetric in its arguments
    mse = mean_squared_error(y_test, predictions)
    r2s = r2_score(y_test, predictions)
    print(f"MSE: {mse}; R2 score: {r2s}")

    # --- plot
    # copy the hold-out rows so adding a column does not write into a slice
    df_test = df_sorted.iloc[n_train:].copy()
    df_test['predicted'] = np.asarray(predictions).flatten()
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.plot(df_test['date'], df_test[y_col], label='true')
    ax.plot(df_test['date'], df_test['predicted'], label='predicted')
    ax.set_xlabel('date')
    ax.set_ylabel(y_col)
    ax.legend()
    ax.set_title(plot_title)

    return model, mse, r2s, fig



In [None]:
# --- Smoke test: fit one (network, platform, country) slice before the full sweep.
# The feature list is declared here because the sweep configuration is only
# defined further down; without it this cell fails on a fresh kernel.
x_cols = ['impressions', 'clicks', 'installs', 'trials_prefinal', 'cost_cad']
network, platform, country = 'Facebook Ads', 'android', 'US'
df_fb_android_us = filter_by_network_platform_country(
    df=df, network=network, platform=platform, country=country)
reg, mse, r2s, fig = get_linear_regression_model(
    df_fb_android_us,
    x_cols=x_cols,
    y_col='paid_prefinal',
    plot_title=f"{network} {platform} {country} paid_prefinal",
)

### Push models to MLflow

In [None]:
# Dimensions of the training sweep and the columns used for modelling.
platforms = [
    'android',
    'ios',
]
networks = [
    "Facebook Ads",
    "Apple Search Ads",
    "googleadwords_int",
    "tiktokglobal_int",
    "snapchat_int",
    # currently excluded:
    # "tatari_linear", "tatari_streaming", "tatari_programmatic"
]
countries = [
    'US',
]
# feature columns
x_cols = [
    'impressions',
    'clicks',
    'installs',
    'trials_prefinal',
    'cost_cad',
]
# target columns
y_cols = [
    'paid_hauutm',
    'paid_prefinal',
]

In [None]:
mlflow.set_tracking_uri(TRACKING_URI)

# --- create the experiment if it doesn't exist yet
EXPERIMENT_NAME = "Competing T2P"
if not mlflow.get_experiment_by_name(name=EXPERIMENT_NAME):
    mlflow.create_experiment(name=EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

# --- train and log one model per (network, platform, country, target)
for network in networks:
    for platform in platforms:
        for country in countries:
            # The subset depends only on (network, platform, country), so it is
            # filtered once here rather than once per target column.
            dff = filter_by_network_platform_country(df=df,
                network=network, platform=platform, country=country)
            if len(dff) < 2:
                # An 80/20 split needs at least 2 rows; skip the combination
                # instead of aborting the whole sweep.
                print(f"Skipping {network} {platform} {country}: only {len(dff)} row(s)")
                continue

            for y_col in y_cols:
                # model metadata (one timestamp shared by the run name and the tag)
                timestamp = datetime.now().strftime('%Y-%m-%d_%H:%M')
                run_name = f"{network}_{platform}_{country}_{y_col}_{timestamp}"
                tags = {
                    'env': 'test',
                    'data_date': timestamp,
                    'model_type': "LinearRegression",
                    'experiment_description': f"T2P Linear Regression for (network-platform) pair on {y_col}"
                }

                # train the model on THIS combination's rows + get prediction errors
                reg, mse, r2s, fig = get_linear_regression_model(
                    dff,
                    x_cols=x_cols,
                    y_col=y_col,
                    plot_title=f"{network} {platform} {country} {y_col}")

                # log to MLflow
                with mlflow.start_run(experiment_id=experiment.experiment_id,
                                      run_name=run_name, tags=tags):
                    # TODO: add signature
                    # signature = infer_signature(X_train, predictions)
                    mlflow.sklearn.log_model(reg, "model")

                    mlflow.log_metric("mse", mse)
                    mlflow.log_metric("r2_score", r2s)

                    mlflow.log_figure(fig, f"{y_col}_predictions.png")

                # release the figure so the sweep does not accumulate open figures
                plt.close(fig)
                    
                
