In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool, CatBoostRegressor
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [None]:
# Load the feature table (X) and the target table (y) from the processed
# parquet files.
X = pd.read_parquet("../data/processed/train_data/train.gzip")
y = pd.read_parquet("../data/processed/train_data/train_target.gzip")

In [None]:
# For every column that contains at least one missing value, print its name,
# the number of nulls and the fraction of rows that are null.
n_rows = X.shape[0]
null_counts = X.isnull().sum()
for column, n_null in null_counts[null_counts > 0].items():
    print(column, n_null, n_null / n_rows)

## Modèle de classification

In [None]:
# Split the target table: the arrival-delay column is kept aside for the
# regression section, and `y` is narrowed to the RETARD label used by the
# classifier.
# NOTE: `y` is rebound to the single RETARD column, so re-running this cell
# without reloading `y` raises KeyError on the first line.
y_retard = y[["RETARD A L'ARRIVEE"]]
y = y[["RETARD"]]

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Positional indices of the columns treated as categorical; handed to
# CatBoost as `cat_features`.
cat_features = [0,1,6,10,12,13,14,15]

In [None]:
# Class balance of the training labels: number of rows with RETARD == 0
# (negatives) and RETARD == 1 (positives).
retard_train = y_train["RETARD"]
sum_neg = int((retard_train == 0).sum())
sum_pos = int((retard_train == 1).sum())

In [None]:
# Hyper-parameters of the delay classifier. Recall is the evaluation metric
# and class weights are balanced automatically.
classifier_params = dict(
    iterations=500,
    learning_rate=0.03,
    eval_metric="Recall",
    depth=10,
    random_seed=0,
    auto_class_weights="Balanced",
)
model = CatBoostClassifier(**classifier_params)

In [None]:
# Train on the training split, then score the held-out split both as hard
# class labels and as class probabilities.
model.fit(X_train, y_train, cat_features=cat_features)
preds_class, preds_proba = model.predict(X_test), model.predict_proba(X_test)

In [None]:
# Per-class precision / recall / F1 of the default class predictions on the
# test split.
print(classification_report(y_test, preds_class))

In [None]:
# Confusion matrix of the default class predictions on the test split.
print(confusion_matrix(y_test, preds_class))

In [None]:
# Predicted class labels for the test split.
preds_class

In [None]:
# Predicted class probabilities for the test split.
preds_proba

In [None]:
# ROC analysis needs a continuous score. Thresholded 0/1 labels would collapse
# the curve to a single operating point, so use the second column of
# predict_proba (the probability of the RETARD == 1 class) as the score.
fpr, tpr, thresholds = roc_curve(y_test, preds_proba[:, 1])
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.set_title('Receiver Operating Characteristic')
ax.plot(fpr, tpr, label='AUC = %0.4f'% roc_auc)
ax.plot([0,1],[0,1],'r--')  # chance diagonal
ax.legend(loc='lower right')
ax.set_xlim([-0.001, 1])
ax.set_ylim([0, 1.001])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show();

In [None]:
# Results frame; the following cells fill it with the test features and the
# model's predictions.
df = pd.DataFrame()

In [None]:
# Start the results frame from an independent copy of the test features
# (same index, columns and dtypes), rather than assigning many new columns
# into an empty frame at once.
df = X_test.copy()

In [None]:
# Probability of the positive class (second column of predict_proba) for each
# test row, taken with a single slice instead of a Python loop.
predictions = preds_proba[:, 1]
# Preview only the first values instead of dumping the whole vector.
predictions[:10]

In [None]:
# Attach the predicted positive-class probability to the results frame.
df["RETARD PREDIT"] = predictions
df.head()

In [None]:
# Decision threshold applied to the predicted probability.
thresh = 0.2


def calculate_prediction(x, thresh):
    """Turn a score into a 0/1 label.

    Returns 0 when ``x <= thresh`` and 1 in every other case (which includes
    a NaN score, since the comparison is then false).
    """
    return 0 if x <= thresh else 1
# calculate_prediction requires the threshold as its second positional
# argument, so forward `thresh` through `args`.
df["RETARD"] = df["RETARD PREDIT"].apply(calculate_prediction, args=(thresh,))
df.head()

In [None]:
# Same report as before, but for the labels obtained with the custom
# probability threshold.
print(classification_report(y_test, df["RETARD"]))

# INTERPRETATION WITH SHAP

In [None]:
import shap 

In [None]:
%time 
shap_values = shap.TreeExplainer(model).shap_values(X_test)

In [None]:
# First entry of the SHAP output (presumably the contributions for the first
# test row; confirm the return shape for the installed shap version).
shap_values[0]

In [None]:
# Global summary of the classifier's SHAP values over the test split.
shap.summary_plot(shap_values, X_test)

In [None]:
# SHAP dependence of the classifier output on the passenger-count feature.
shap.dependence_plot("NOMBRE DE PASSAGERS", shap_values, X_test)

In [None]:
# SHAP dependence of the classifier output on the "TEMPS PROGRAMME" feature.
shap.dependence_plot("TEMPS PROGRAMME", shap_values, X_test)

In [None]:
# SHAP dependence of the classifier output on the "DISTANCE" feature.
shap.dependence_plot("DISTANCE", shap_values, X_test)

In [None]:
# SHAP dependence of the classifier output on the "HEURE D'ARRIVEE" feature.
shap.dependence_plot("HEURE D'ARRIVEE", shap_values, X_test)

## Modèle de régression

In [None]:
# Keep only the rows whose arrival delay is strictly greater than 10; the
# regression model is trained on these rows only.
late_mask = y_retard["RETARD A L'ARRIVEE"] > 10
y_retard = y_retard.loc[late_mask]
y_retard.head()

In [None]:
# Row labels that survived the delay filter.
indexes = y_retard.index

In [None]:
# Row labels of X that are absent from the filtered target, i.e. the rows to
# remove from the feature table.
indexes_deleted = X.index.difference(indexes)
indexes_deleted

In [None]:
# Feature rows for the regression task: every row of X whose label is not
# flagged for deletion, in the original order.
rows_kept = ~X.index.isin(indexes_deleted)
X_retard = X.loc[rows_kept]
X_retard.head()

In [None]:
# Renumber both frames from 0 and discard the old row labels.
X_retard = X_retard.reset_index(drop=True)
y_retard = y_retard.reset_index(drop=True)

In [None]:
# Positional indices of the categorical columns.
cat_features = [0,1,6,10,12,13,14,15]

# Convert those columns to the `category` dtype by column label. Assigning
# through `X_retard.iloc[:, i] = ...` writes the values into the existing
# column and, on recent pandas versions, keeps the column's original dtype,
# so the conversion would silently not happen.
categorical_columns = X_retard.columns[cat_features]
X_retard = X_retard.astype({column: 'category' for column in categorical_columns})

In [None]:
# 80/20 train/test split of the delayed-flights subset, with a fixed seed so
# the split is reproducible.
X_train_retard, X_test_retard, y_train_retard, y_test_retard = train_test_split(
    X_retard,
    y_retard,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Hyper-parameters of the delay-duration regressor.
# NOTE(review): `silent` is believed to have been dropped from the LightGBM
# scikit-learn constructors in 4.0 (replaced by `verbose=-1`); confirm
# against the installed version.
lgbm_params = dict(
    num_leaves=50,
    max_depth=-1,
    random_state=314,
    silent=True,
    metric='None',
    n_jobs=4,
    n_estimators=2000,
    colsample_bytree=0.9,
    subsample=0.9,
    learning_rate=0.05,
)
clf = lgb.LGBMRegressor(**lgbm_params)

In [None]:
# Train the regressor and predict on the held-out delayed flights.
# NOTE: `preds_class` is reused here and now holds regression outputs,
# replacing the classifier's labels stored under the same name earlier.
clf.fit(X_train_retard, y_train_retard)
preds_class = clf.predict(X_test_retard)

In [None]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the regression predictions on the held-out set.
mse = mean_squared_error(y_test_retard, preds_class)
rmse = np.sqrt(mse)

In [None]:
# Mean squared error on the held-out set (the square of `rmse`).
mean_squared_error(y_test_retard, preds_class)

In [None]:
# Root-mean-squared error on the held-out set.
rmse

In [None]:
# Empty frame that the next cell fills with observed and predicted delays.
df_retard = pd.DataFrame()

In [None]:
# Observed arrival delay next to the regressor's prediction, indexed like the
# held-out target.
df_retard = pd.DataFrame(
    {
        "RETARD": y_test_retard["RETARD A L'ARRIVEE"],
        "RETARD PREDIT": preds_class,
    }
)
df_retard.head()

In [None]:
# CatBoost feature importances of the classifier, most important first.
data = Pool(data=X_train, cat_features=cat_features)
importance_table = pd.DataFrame(
    {
        'feature_importance': model.get_feature_importance(data),
        'feature_names': X_test.columns,
    }
)
importance_table.sort_values(by='feature_importance', ascending=False)

In [None]:
def crossvaltest(params, train_set, train_label, cat_dims, n_splits=3, random_state=None):
    """Estimate the recall of a CatBoostClassifier by shuffled K-fold CV.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``CatBoostClassifier``.
    train_set : pandas.DataFrame
        Feature table; folds are selected positionally with ``.iloc``.
    train_label : pandas.DataFrame or pandas.Series
        Labels aligned row-for-row with ``train_set``.
    cat_dims : list
        Categorical feature specification passed to ``fit`` as
        ``cat_features``.
    n_splits : int, default 3
        Number of folds.
    random_state : int or None, default None
        Seed for the fold shuffling. ``None`` draws a different split on
        each call; pass an int for reproducible folds.

    Returns
    -------
    float
        Mean of the per-fold recall scores.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold_recalls = []
    for train_index, test_index in kf.split(train_set):
        train = train_set.iloc[train_index, :]
        test = train_set.iloc[test_index, :]

        labels = train_label.iloc[train_index]
        test_labels = train_label.iloc[test_index]

        # A fresh model per fold so that no state leaks between folds.
        fold_model = CatBoostClassifier(**params)
        fold_model.fit(train, np.ravel(labels), cat_features=cat_dims)

        # Labels are flattened the same way as for fitting.
        fold_recalls.append(
            recall_score(np.ravel(test_labels), fold_model.predict(test))
        )
    return np.mean(fold_recalls)

In [None]:
# Single CatBoost configuration evaluated by cross-validation.
params = {
    'learning_rate': 0.03,
    'depth': 10,
    'iterations': 1000,
    'random_seed': 0,
    'auto_class_weights': "Balanced",
}

In [None]:
# Cross-validated recall of the configuration above, computed on the full
# X / y tables (not only the training split).
crossvaltest(params, X, y, cat_features)

In [None]:
# Candidate values per hyper-parameter (a search grid).
# NOTE(review): this rebinds `params` from a single configuration to a dict
# of lists, and no visible cell consumes the grid. Confirm whether a search
# was meant to follow.
params = {
    'learning_rate': [0.03, 0.05, 0.1],
    'depth': [5, 10, 20],
    'iterations': [50, 100, 200, 1000],
    'random_seed': [0],
    'auto_class_weights': ["Balanced"],
}

In [None]:
# Gain-based importance of the 15 strongest features of the regressor.
lgb.plot_importance(clf, height=0.5, importance_type='gain',
                    max_num_features=15, title="Feature Importance");

# Shap interpretation 

Nous utilisons un diagramme de dispersion de densité des valeurs SHAP pour chaque feature afin d'identifier l'impact de chaque feature sur la sortie du modèle pour les individus de l'ensemble de données de test. Les features sont triées en fonction de la somme des valeurs SHAP pour tous les échantillons.

In [None]:
# SHAP values of the LightGBM regressor on the held-out delayed flights.
explainer = shap.TreeExplainer(clf.booster_)
shap_values = explainer.shap_values(X_test_retard)
# Mean |SHAP| per feature. The slice is bounded by the number of feature
# columns rather than `[:-1]`: when the array has exactly one column per
# feature, `[:-1]` would discard the last feature, while this form keeps every
# feature and still drops any trailing extra column.
# NOTE(review): confirm the shape returned by the installed shap version.
global_importances = np.abs(shap_values).mean(0)[:X_test_retard.shape[1]]

In [None]:
# Horizontal bar chart of the mean |SHAP| value per feature, with the most
# important feature at the top.
inds = np.argsort(-global_importances)
nb_features = len(global_importances)
y_pos = np.arange(nb_features)
# Reverse the descending order because barh draws position 0 at the bottom.
inds2 = np.flip(inds[:nb_features], 0)

f, ax = plt.subplots(figsize=(5, 10))
ax.barh(y_pos, global_importances[inds2], align='center', color="#1E88E5")
ax.set_yticks(y_pos)
# Label the bars from the frame the SHAP values were computed on, not from
# the classifier's training frame.
ax.set_yticklabels(X_test_retard.columns[inds2], fontsize=13)
ax.set_xlabel('mean abs. SHAP value (impact on model output)', fontsize=13)
ax.xaxis.set_ticks_position('bottom')
ax.yaxis.set_ticks_position('none')
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

In [None]:
# Per-feature distribution of the regressor's SHAP values over the held-out
# delayed flights.
shap.summary_plot(shap_values, X_test_retard)

In [None]:
# Base value of the explainer, i.e. the reference the SHAP contributions are
# added to.
explainer.expected_value

In [None]:
# Shape of one feature row, for comparison with the length of a SHAP row.
X_test_retard.iloc[0,:].shape

In [None]:
# Load SHAP's JavaScript bundle so that interactive force plots render in the
# notebook output.
shap.initjs()
# Visualize the first prediction's explanation.
shap.force_plot(explainer.expected_value, shap_values[0], X_test_retard.iloc[0,:])

In [None]:
# With raw arrays, the force plot takes the base value first, then the SHAP
# row and the matching feature row.
shap.plots.force(explainer.expected_value, shap_values[0], X_test_retard.iloc[0, :])

### Visualize many predictions

In [None]:
# Stacked force plot for the first 1000 held-out rows. The regressor's base
# value is used as-is (the same way as for the single-row plot), and the
# feature values come from the frame the SHAP values were computed on.
shap.force_plot(explainer.expected_value, shap_values[:1000,:], X_test_retard.iloc[:1000,:])

## Plot the SHAP dependence plots for every feature

Les graphiques de dépendance SHAP montrent l'effet d'une seule feature sur l'ensemble des données. Ils représentent la valeur d'une feature en fonction de la valeur SHAP de cette feature sur de nombreux échantillons. Les diagrammes de dépendance SHAP sont similaires aux diagrammes de dépendance partielle, mais tiennent compte des effets d'interaction présents dans les features, et ne sont définis que dans les régions de l'espace d'entrée supportées par les données. La dispersion verticale des valeurs SHAP pour une seule valeur de feature est due aux effets d'interaction, et une autre feature est choisie pour être colorée afin de mettre en évidence les interactions possibles.

In [None]:
# One SHAP dependence plot per feature column of the regression test set.
for feature_name in X_test_retard.columns:
    shap.dependence_plot(feature_name, shap_values, X_test_retard)