# Bibliothèques

In [1]:
import pandas as pd
import numpy as np

import evidently
from evidently import ColumnMapping

from evidently.test_suite import TestSuite
from evidently.test_preset import NoTargetPerformanceTestPreset

import shap

import joblib

In [2]:
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")

# Chargement des données

Il s'agit du dataset déjà prétraité suivant le même processus que celui détaillé dans le notebook `entrainement_du_modèle.ipynb`.

Une dernière colonne **prediction** a déjà été ajoutée à la fin du dataset et correspond au score de la classe 1 retourné par notre modèle final.

In [3]:
# Load the preprocessed dataset, using the SK_ID_CURR column as the index.
data_path = "../input/data_cleaned.csv"
df = pd.read_csv(data_path, index_col="SK_ID_CURR")

# Sélection des variables les plus importantes

Nous n'évaluerons le Data Drift que sur les 30 features les plus importantes du modèle.

In [4]:
# NOTE(review): joblib.load unpickles arbitrary Python objects; only load a
# model file that comes from a trusted source.
model = joblib.load("model.pkl")

# The first 307505 rows are the application_train.csv clients; the last column
# ("prediction") is the model score, not a model input, so it is excluded.
background_data = df.iloc[:307505, :-1]
explainer = shap.TreeExplainer(model)

# SHAP values of the positive class (class 1).
# Depending on the SHAP version and model type, shap_values() may return a list
# with one array per class, a 3-D array whose last axis indexes the classes, or
# a single 2-D array. A plain `[1]` is only correct for the list form: on an
# array it would select the second sample instead of the second class.
raw_shap = explainer.shap_values(background_data)
if isinstance(raw_shap, list):
    class1_shap = raw_shap[1]
else:
    raw_shap = np.asarray(raw_shap)
    class1_shap = raw_shap[..., 1] if raw_shap.ndim == 3 else raw_shap

# Absolute SHAP values, one row per client and one column per feature
abs_shap = pd.DataFrame(
    np.abs(class1_shap),
    index=background_data.index,
    columns=background_data.columns,
)

# Average absolute SHAP value of each feature, most influential first
mean_abs_shap = abs_shap.mean(axis=0)
shap_values = pd.DataFrame({
    "feature": mean_abs_shap.index,
    "shap_values": mean_abs_shap.values,
}).sort_values("shap_values", ascending=False)

# Keep the 30 most important features
N_TOP_FEATURES = 30
most_important_features = shap_values["feature"].iloc[:N_TOP_FEATURES].tolist()
most_important_features

['EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'EXT_SOURCE_1',
 'PREV_NEW_RETURN_DAY_MAX',
 'NEW_PAYMENT_RATE',
 'CODE_GENDER',
 'NEW_LOAN_VALUE_RATIO',
 'DAYS_EMPLOYED',
 'OWN_CAR_AGE',
 'AMT_CREDIT',
 'NEW_ANNUITY_INCOME_PERC',
 'NAME_FAMILY_STATUS_Married',
 'NAME_EDUCATION_TYPE_Highereducation',
 'INSTAL_AMT_PAYMENT_SUM',
 'INSTAL_DPD_MEAN',
 'PREV_NEW_DAYS_DUE_DIFF_MEAN',
 'DAYS_BIRTH',
 'BURO_DAYS_CREDIT_MEAN',
 'INSTAL_AMT_INSTALMENT_SUM',
 'PREV_PRODUCT_COMBINATION_CashXSelllow_MEAN',
 'DAYS_ID_PUBLISH',
 'NEW_WORKING_YEAR_RANGE',
 'INSTAL_DPD_MAX',
 'POS_MONTHS_BALANCE_MAX',
 'ACTIVE_DAYS_CREDIT_ENDDATE_MIN',
 'INSTAL_PAYMENT_DIFF_MEAN',
 'PREV_RATE_DOWN_PAYMENT_MAX',
 'INSTAL_DAYS_ENTRY_PAYMENT_MAX',
 'ACTIVE_DAYS_CREDIT_ENDDATE_MAX',
 'PREV_PRODUCT_COMBINATION_POSindustrywithinterest_MEAN']

In [5]:
# Restrict the dataset to the selected features plus the model score column.
columns_to_keep = list(most_important_features) + ["prediction"]
df = df.loc[:, columns_to_keep]

# Report

In [6]:
# Number of distinct (non-null) values in every column
nunique = df.nunique()

# The model score is not a feature: set it aside before bucketing so it can
# never be treated as categorical, and so no list.remove() call can fail when
# the score happens to have two or fewer distinct values.
feature_nunique = nunique.drop("prediction", errors="ignore")

# More than two distinct values -> numerical; exactly two -> categorical
num_features = feature_nunique[feature_nunique > 2].index.tolist()
cat_features = feature_nunique[feature_nunique == 2].index.tolist()

# Store two-valued features as object dtype
df[cat_features] = df[cat_features].astype("object")

# Tell Evidently the role of each column
column_mapping = ColumnMapping()
column_mapping.numerical_features = num_features
column_mapping.categorical_features = cat_features
column_mapping.prediction = 'prediction'

Le jeu de données contient 356249 individus : 
- Les 307505 premiers individus correspondent aux clients du dataset application_train.csv
- Les 48744 derniers individus correspondent à ceux du dataset application_test.csv

Pour éviter d'avoir un fichier trop volumineux, les tests ne seront réalisés que sur des échantillons de 10000 individus. 

In [10]:
# Rows [0, N_TRAIN_ROWS) come from application_train.csv (reference data); the
# remaining rows come from application_test.csv (current data).
N_TRAIN_ROWS = 307505
SAMPLE_SIZE = 10000
SEED = 42

# Draw a fixed-seed sample without replacement from each part
train_part = df.iloc[:N_TRAIN_ROWS]
test_part = df.iloc[N_TRAIN_ROWS:]
reference = train_part.sample(n=SAMPLE_SIZE, random_state=SEED)
current = test_part.sample(n=SAMPLE_SIZE, random_state=SEED)

In [11]:
# Build the Evidently test suite with the no-target performance preset and run
# it on the current sample against the reference sample.
# The column mapping defined earlier is passed explicitly; without it the
# mapping is never used and Evidently infers the column types on its own.
report = TestSuite(tests=[NoTargetPerformanceTestPreset()])
report.run(
    reference_data=reference,
    current_data=current,
    column_mapping=column_mapping,
)

In [12]:
# Write the test-suite results to an HTML file in the working directory.
output_path = "data-drift-analysis.html"
report.save_html(output_path)