In [None]:
import pandas as pd
import yaml
import numpy as np
import dalex as dx
import shap
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Reading Dataset

In [None]:
# Load the YAML model configuration; it names the folder and file of the dataset
with open('../conf/model_config.yaml', 'r') as conf:
    model_config = yaml.safe_load(conf)

# Build the CSV path from the configured location and file name, then read it
data_settings = model_config['model']
dataset = pd.read_csv('../' + data_settings['loc'] + data_settings['file'])

## Reading Data Dict

In [None]:
# Path of the data dictionary workbook (relative to the notebook folder)
dict_dir = '../data/meta/data_dict.xlsx'

# Read the workbook with the openpyxl engine
dic = pd.read_excel(io=dict_dir, engine='openpyxl')

In [None]:
# Predictors: dictionary rows with role 'predictor' that are flagged for use ('Y')
is_used_predictor = dic['role'].eq('predictor') & dic['use'].eq('Y')
predictors = dic.loc[is_used_predictor, 'name'].tolist()

# Target: dictionary rows with role 'target' (kept as a list of column names)
target = dic.loc[dic['role'].eq('target'), 'name'].tolist()

In [None]:
# Feature frame and target frame, selected by the column-name lists
X, y = dataset[predictors], dataset[target]

# Hold out the configured share of rows; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=model_config['parameter']['test_size'],
    random_state=42,
)

# Initialize and Train Champion Models

In [None]:
# XGBoost champion: hyper-parameters collected in one place, then fitted on the training split
xgb_params = dict(random_state=42, n_estimators=300, max_depth=3, eta=.05)
model_xgb = xgb.XGBRegressor(**xgb_params)
model_xgb.fit(X_train, y_train)

# LightGBM champion: same pattern (max_depth=-1 is passed through unchanged)
lgbm_params = dict(random_state=42, n_estimators=500, max_depth=-1, learning_rate=.2)
model_lgbm = lightgbm.LGBMRegressor(**lgbm_params)
model_lgbm.fit(X_train, y_train)

# Model Evaluation

In [None]:
# Score the XGBoost model on the held-out rows
pred = model_xgb.predict(X_test)

# KPI: mean absolute error between predictions and the first (only selected) target column
holdout_actuals = y_test.iloc[:, 0].to_numpy()
errors = np.abs(pred - holdout_actuals)
avg_error = errors.mean()
print('MAE:', np.round(avg_error, 3))

In [None]:
y_true = y_test.iloc[:,0].to_numpy()
y_pred = pred

# Flag predictions whose absolute error is below 5% of the actual value's magnitude.
# Comparing against 0.05 * |y_true| instead of dividing by y_true keeps the flag
# meaningful for negative actuals (a signed denominator makes the ratio negative, so
# every such point would be marked as accurate) and avoids dividing by zero when an
# actual value is 0.
within_tolerance = np.abs(y_true - y_pred) < 0.05 * np.abs(y_true)

# Actual vs. predicted scatter plot
fig = px.scatter(x=y_true, y=y_pred, labels={'x': 'retention score', 'y': 'prediction'}, 
                 title = 'Comparison between predictions and reality',
                 template = 'plotly_dark')
# Two-step colour scale: 0 (outside tolerance) -> '#FAED27', 1 (within tolerance) -> '#98FB98'
fig.update_traces(marker=dict(size=5, 
                              color=within_tolerance.astype('int'),
                              colorscale=[[0, '#FAED27'],[1, '#98FB98']])
                             )
# Dashed identity line: points lying on it are perfect predictions
fig.add_shape(
    type="line", line=dict(dash='dash'),
    x0=y_true.min(), y0=y_true.min(),
    x1=y_true.max(), y1=y_true.max()
)
fig.write_html("../documentation/docs/assets/prediction_scatter.html")

In [None]:
# Wrap both fitted models in dalex explainers built on the full feature/target frames.
# NOTE(review): X and y include the rows the models were trained on, so downstream
# performance figures are not pure hold-out metrics - confirm this is intended.
exp_xgb = dx.Explainer(model=model_xgb, data=X, y=y, label='XGBoost')
exp_lgbm = dx.Explainer(model=model_lgbm, data=X, y=y, label='LightGBM')

# Model Performance

In [None]:
# Regression performance objects for both explainers (XGBoost first, then LightGBM)
mp_xgb, mp_lgbm = (
    explainer.model_performance(model_type='regression')
    for explainer in (exp_xgb, exp_lgbm)
)

## Performance Metrics

In [None]:
# Display the performance result table of the XGBoost explainer
mp_xgb.result

In [None]:
# Display the performance result table of the LightGBM explainer
mp_lgbm.result

## Residual 95% Confidence Intervals

In [None]:
# Normal-approximation 95% interval for the mean residual of each model:
#   mean +/- 1.96 * std / sqrt(n)
# n is taken from the residual series itself, so the half-width stays correct
# even if the explainers are built on something other than the full dataset.
xgb_residuals = mp_xgb.residuals.residuals
xgb_mean = xgb_residuals.mean()
xgb_ci = 1.96*(xgb_residuals.std())/(np.float_power(len(xgb_residuals), .5))
print('XGBoost 95% confidence interval for residuals: ' + str(tuple([xgb_mean-xgb_ci, xgb_mean+xgb_ci])) + '; where mean of residuals is ' + str(xgb_mean))

lgbm_residuals = mp_lgbm.residuals.residuals
lgbm_mean = lgbm_residuals.mean()
lgbm_ci = 1.96*(lgbm_residuals.std())/(np.float_power(len(lgbm_residuals), .5))
# Label fixed: the message previously read 'LigthGBM'
print('LightGBM 95% confidence interval for residuals: ' + str(tuple([lgbm_mean-lgbm_ci, lgbm_mean+lgbm_ci])) + '; where mean of residuals is ' + str(lgbm_mean))

## Absolute Residual Quantiles 

In [None]:
# Upper quantiles of the absolute residuals, printed per level for XGBoost then LightGBM
for i in [.95, .99, .995, .999]:
    for model_name, performance in (('XGBoost', mp_xgb), ('LightGBM', mp_lgbm)):
        abs_quantile = abs(performance.residuals.residuals).quantile(i)
        print(str(i*100) + ' percentile of the ' + model_name + ' residuals is ' + str(abs_quantile))

## Reverse Cumulative Distribution of Residuals

In [None]:
# Joint performance plot of both models, returned as a figure instead of being shown
fig = mp_lgbm.plot(mp_xgb, show=False)

# Dark theme and an x-axis limited to the 0-5 range, then export as standalone HTML
fig.update_layout(template='plotly_dark').update_xaxes(range=[0, 5])
fig.write_html("../documentation/docs/assets/reverse_residuals.html")

## Further Residual Analysis

In [None]:
# Diagnostics objects for both explainers (XGBoost first, then LightGBM)
md_xgb, md_lgbm = (
    explainer.model_diagnostics()
    for explainer in (exp_xgb, exp_lgbm)
)

In [None]:
# Joint diagnostics plot of both models, returned as a figure instead of being shown
fig = md_xgb.plot(md_lgbm, show=False)

# Dark theme; clamp the y-axis of the first subplot to [-50, 50], then export as HTML
fig.update_layout(template='plotly_dark').update_yaxes(range=[-50, 50], row=1, col=1)
fig.write_html("../documentation/docs/assets/residuals_scatter.html")


# Feature Importances

In [None]:
# Feature importance for both models via dalex model_parts.
# A fixed random_state is passed so repeated runs give the same importances,
# matching the seed (42) used for the split and the model fits.
mfe_xgb = exp_xgb.model_parts(random_state=42)
mfe_lgbm = exp_lgbm.model_parts(random_state=42)

In [None]:
# XGBoost feature-importance figure, restyled for the dark documentation theme
fig = mfe_xgb.plot(show=False)
(
    fig.update_layout(template='plotly_dark', font_color='aliceblue')
       .update_traces(marker_color='#46bac2', textfont_color='aliceblue')
       .update_yaxes(color='aliceblue')
       .update_xaxes(color='aliceblue', showgrid=False)
)
fig.write_html("../documentation/docs/assets/xgb_feature_imp.html")

In [None]:
# LightGBM feature-importance figure, restyled for the dark documentation theme
fig = mfe_lgbm.plot(show=False)
(
    fig.update_layout(template='plotly_dark', font_color='aliceblue')
       .update_traces(marker_color='#8bdcbe', textfont_color='aliceblue')
       .update_yaxes(color='aliceblue')
       .update_xaxes(color='aliceblue', showgrid=False)
)
fig.write_html("../documentation/docs/assets/lgbm_feature_imp.html")

In [None]:
# SHAP tree explainer for the XGBoost model, with X supplied as the background data.
# The keyword is spelled `feature_perturbation`; the earlier misspelling
# (`feature_pertubation`) did not match that parameter, so the setting was never
# applied by name.
# NOTE(review): depending on the installed shap version the misspelled keyword was
# either silently ignored or rejected - confirm against the pinned version.
explainer_xgb = shap.TreeExplainer(
    model_xgb,
    X,
    model_output="margin",
    feature_perturbation="interventional",
)

# SHAP values for every row of X; the additivity check is switched off
shap_values_xgb = explainer_xgb.shap_values(X, check_additivity=False)

In [None]:
# Draw the SHAP bar summary without showing it, so the figure can be adjusted first
shap.summary_plot(shap_values_xgb, X, plot_type="bar", color='#46bac2', show=False)

# Resize the current figure to 14 x 12 inches (width x height)
fig = plt.gcf()
fig.set_size_inches(14, 12)

# Title the current axes and enlarge the y-axis tick labels
ax = plt.gca()
ax.set_title('Shapley Feature Importance for XGBoost')
ax.tick_params(axis='y', labelsize=24)
plt.show()