In [None]:
import yaml
import pandas as pd
import numpy as np
import statsmodels.api as sm
import plotly.express as px
from patsy import dmatrices

In [None]:
# Load the model configuration (YAML)
with open('../conf/model_config.yaml', 'r') as config_file:
    model_config = yaml.safe_load(config_file)

# Build the CSV location from the configured folder and file name, then load it.
# The two config values are concatenated as-is, so 'loc' has to carry its own
# trailing path separator.
model_section = model_config['model']
dataset_path = '../' + model_section['loc'] + model_section['file']
dataset = pd.read_csv(dataset_path)

# define predictors and target
# Only the features whitelisted here are used as predictors, and only when the
# config also lists them; the config's ordering is preserved.
ENGAGEMENT_FEATURES = ['outfit_liked', 'item_added', 'comment_posted', 'comment_liked']
predictor = [name for name in model_config['meta']['predictors'] if name in ENGAGEMENT_FEATURES]
# `target` is concatenated with a list right below and indexed with [0] when
# the formula is built, so the config has to provide it as a list.
target = model_config['meta']['target']
df = dataset[predictor + target]

# log(1 + x) transformation of every selected column.
# np.log1p is the dedicated ufunc for this: the same transform as
# np.log(df + 1), but accurate for values close to zero as well.
df = np.log1p(df)

In [None]:
# Random train/test split.
# Every row is assigned to the training set independently with probability
# TRAIN_FRACTION, so the split sizes are approximate rather than exact.
# The generator is seeded so that the split - and every coefficient, metric and
# figure derived from it - is identical after "Restart Kernel -> Run All".
SPLIT_SEED = 42
TRAIN_FRACTION = 0.7
split_rng = np.random.default_rng(SPLIT_SEED)
mask = split_rng.random(len(df)) < TRAIN_FRACTION
df_train = df[mask]
df_test = df[~mask]

In [None]:
# Build the patsy formula "<target> ~ <p1> + <p2> + ...".
# An empty predictor list would leave nothing on the right-hand side, so fail
# here with a clear message instead of handing patsy a malformed formula.
if not predictor:
    raise ValueError('no predictors selected: cannot build a model formula')
formula = target[0] + ' ~ ' + ' + '.join(str(name) for name in predictor)

In [None]:
# Build the response (y) and design (X) matrices for both splits from the same
# formula; return_type='dataframe' makes patsy hand them back as DataFrames.
train_design = dmatrices(formula, data=df_train, return_type='dataframe')
test_design = dmatrices(formula, data=df_test, return_type='dataframe')
y_train, X_train = train_design
y_test, X_test = test_design

In [None]:
# Gaussian GLM on the main-effects design matrix (no link is passed, so the
# family's default link applies), fitted on the training split only.
gauss_family = sm.families.Gaussian()
gauss_model = sm.GLM(endog=y_train, exog=X_train, family=gauss_family)
gauss_results = gauss_model.fit()

In [None]:
# Text report of the fitted Gaussian model
gauss_summary = gauss_results.summary()
print(gauss_summary)

In [None]:
# Predict on the held-out design matrix and tabulate the prediction results;
# the 'mean' column of this frame is used as the point prediction below.
gauss_predictions = gauss_results.get_prediction(exog=X_test)
predictions_summary_frame = gauss_predictions.summary_frame()

In [None]:
# Point predictions, explicitly labelled with the test rows' index.
# The rows of the summary frame are in the same order as X_test / y_test (both
# come from one dmatrices call), but whether the frame carries their index
# labels depends on the statsmodels version. Label-aligned arithmetic such as
# `actual_score - predicted_score` silently mis-pairs rows or yields NaN when
# the labels differ, so the pairing is made positional here once and for all.
predicted_score = pd.Series(
    predictions_summary_frame['mean'].values,
    index=y_test.index,
    name='mean',
)
# The response column is named after the formula's left-hand side, target[0].
actual_score = y_test[target[0]]
residuals = actual_score - predicted_score

In [None]:
# Flag every test row whose relative prediction error is below 10 %
relative_error = abs(actual_score - predicted_score) / actual_score
within_tolerance = (relative_error < 0.10).astype('int')

# Actual vs. predicted scatter
fig = px.scatter(
    x=actual_score,
    y=predicted_score,
    labels={'x': 'retention score', 'y': 'prediction'},
    title='Comparison between predictions and reality',
    template='plotly_dark',
)

# Colour the markers by the flag: scale endpoint 0 is '#FAED27', 1 is '#98FB98'
marker_style = dict(
    size=5,
    color=within_tolerance,
    colorscale=[[0, '#FAED27'], [1, '#98FB98']],
)
fig.update_traces(marker=marker_style)

# Dashed diagonal spanning the range of the actual scores (the line y = x)
score_low = actual_score.min()
score_high = actual_score.max()
fig.add_shape(
    type="line",
    line=dict(dash='dash'),
    x0=score_low,
    y0=score_low,
    x1=score_high,
    y1=score_high,
)
fig.write_html("../documentation/docs/assets/stat_gauss_fit.html")

In [None]:
# Residuals against predictions for the Gaussian model, exported as HTML
residual_axis_labels = {'x': 'prediction', 'y': 'residuals'}
fig = px.scatter(
    x=predicted_score,
    y=residuals,
    labels=residual_axis_labels,
    template='plotly_dark',
)
fig.write_html("../documentation/docs/assets/stat_gauss_residuals.html")

# Adding Interactions to the formula

In [None]:
# Extend the main-effects formula with all six pairwise interactions of the
# four engagement features (patsy's `a:b` syntax).
interaction_terms = [
    'item_added:outfit_liked',
    'item_added:comment_posted',
    'item_added:comment_liked',
    'outfit_liked:comment_posted',
    'outfit_liked:comment_liked',
    'comment_posted:comment_liked',
]
formula2 = formula + '+ ' + ' + '.join(interaction_terms)

In [None]:
# Response (y) and design (X) matrices for the interaction formula, built
# separately for the training and the test split.
train_design2 = dmatrices(formula2, data=df_train, return_type='dataframe')
test_design2 = dmatrices(formula2, data=df_test, return_type='dataframe')
y_train2, X_train2 = train_design2
y_test2, X_test2 = test_design2

In [None]:
# Gaussian GLM on the design matrix that includes the interaction terms
gauss_family2 = sm.families.Gaussian()
gauss_model2 = sm.GLM(endog=y_train2, exog=X_train2, family=gauss_family2)
gauss_results2 = gauss_model2.fit()

In [None]:
# Text report of the fitted Gaussian model with interactions
gauss_summary2 = gauss_results2.summary()
print(gauss_summary2)

In [None]:
# Predict on the held-out interaction design matrix and tabulate the results;
# the 'mean' column of this frame is used as the point prediction below.
gauss_predictions2 = gauss_results2.get_prediction(exog=X_test2)
predictions_summary_frame2 = gauss_predictions2.summary_frame()

In [None]:
# Point predictions, explicitly labelled with the test rows' index.
# The summary frame's rows follow the order of X_test2 / y_test2, but whether it
# carries their index labels depends on the statsmodels version. The plotting
# cell does label-aligned arithmetic (`actual_score2 - predicted_score2`), so
# the predictions are given y_test2's index to keep that pairing positional.
predicted_score2 = pd.Series(
    predictions_summary_frame2['mean'].values,
    index=y_test2.index,
    name='mean',
)
# The response column is named after the formula's left-hand side, target[0].
actual_score2 = y_test2[target[0]]
residuals2 = actual_score2.values - predicted_score2

In [None]:
# Flag every test row whose relative prediction error is below 10 %
relative_error2 = abs(actual_score2 - predicted_score2) / actual_score2
within_tolerance2 = (relative_error2 < 0.10).astype('int')

# Actual vs. predicted scatter for the interaction model
fig = px.scatter(
    x=actual_score2,
    y=predicted_score2,
    labels={'x': 'retention score', 'y': 'prediction'},
    title='Comparison between predictions and reality',
    template='plotly_dark',
)

# Colour the markers by the flag: scale endpoint 0 is '#FAED27', 1 is '#98FB98'
marker_style2 = dict(
    size=5,
    color=within_tolerance2,
    colorscale=[[0, '#FAED27'], [1, '#98FB98']],
)
fig.update_traces(marker=marker_style2)

# Dashed diagonal spanning the range of the actual scores (the line y = x)
score_low2 = actual_score2.min()
score_high2 = actual_score2.max()
fig.add_shape(
    type="line",
    line=dict(dash='dash'),
    x0=score_low2,
    y0=score_low2,
    x1=score_high2,
    y1=score_high2,
)
fig.write_html("../documentation/docs/assets/stat_gauss2_fit.html")

In [None]:
# Residuals against predictions for the interaction model, exported as HTML
residual_axis_labels2 = {'x': 'prediction', 'y': 'residuals'}
fig = px.scatter(
    x=predicted_score2,
    y=residuals2,
    labels=residual_axis_labels2,
    template='plotly_dark',
)
fig.write_html("../documentation/docs/assets/stat_gauss2_residual.html")

# Gamma GLM (same interaction formula)

In [None]:
# Gamma GLM fitted on the same interaction design matrix as the second
# Gaussian model. No link is passed, so the family's default link applies.
# NOTE(review): a Gamma response is expected to be strictly positive, while the
# log(x + 1) transform maps a raw score of 0 to exactly 0 - confirm the target
# contains no zeros.
gamma_family = sm.families.Gamma()
gamma_model = sm.GLM(endog=y_train2, exog=X_train2, family=gamma_family)
gamma_results = gamma_model.fit()

In [None]:
# Text report of the fitted Gamma model
gamma_summary = gamma_results.summary()
print(gamma_summary)

In [None]:
# Predict on the held-out interaction design matrix with the Gamma model and
# tabulate the results; the 'mean' column is used as the point prediction below.
gamma_predictions = gamma_results.get_prediction(exog=X_test2)
predictions_summary_frame3 = gamma_predictions.summary_frame()

In [None]:
# Gamma point predictions, explicitly labelled with the test rows' index.
# The summary frame's rows follow the order of X_test2, but whether it carries
# the test index labels depends on the statsmodels version. The plotting cell
# does label-aligned arithmetic (`actual_score2 - predicted_score3`), so the
# predictions are given actual_score2's index to keep that pairing positional.
predicted_score3 = pd.Series(
    predictions_summary_frame3['mean'].values,
    index=actual_score2.index,
    name='mean',
)
residuals3 = actual_score2.values - predicted_score3

In [None]:
# Flag every test row whose relative prediction error is below 10 %
relative_error3 = abs(actual_score2 - predicted_score3) / actual_score2
within_tolerance3 = (relative_error3 < 0.10).astype('int')

# Actual vs. predicted scatter for the Gamma model
fig = px.scatter(
    x=actual_score2,
    y=predicted_score3,
    labels={'x': 'retention score', 'y': 'prediction'},
    title='Comparison between predictions and reality',
    template='plotly_dark',
)

# Colour the markers by the flag: scale endpoint 0 is '#FAED27', 1 is '#98FB98'
marker_style3 = dict(
    size=5,
    color=within_tolerance3,
    colorscale=[[0, '#FAED27'], [1, '#98FB98']],
)
fig.update_traces(marker=marker_style3)

# Dashed diagonal spanning the range of the actual scores (the line y = x)
score_low3 = actual_score2.min()
score_high3 = actual_score2.max()
fig.add_shape(
    type="line",
    line=dict(dash='dash'),
    x0=score_low3,
    y0=score_low3,
    x1=score_high3,
    y1=score_high3,
)
fig.write_html("../documentation/docs/assets/stat_gamma_fit.html")