In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import pickle

# Apply seaborn's default theme to every matplotlib figure drawn below.
sns.set()

In [None]:
def distplot(series, **kwargs):
    """Plot the distribution of a sample with a box plot stacked on top.

    The figure holds two vertically stacked axes that share the same x axis:
    a thin box plot (upper part) and a ``seaborn.distplot`` (lower part) of
    the same sample.

    :arg:
        series (pd.Series): The sample you want to plot.
        kwargs : all keyword argument accepted by seaborn.distplot.
    """
    # Give 15% of the height to the box plot and 85% to the distribution.
    grid_spec = {"height_ratios": (.15, .85)}
    f, (ax_box, ax_hist) = plt.subplots(2, sharex=True, figsize=(8, 8),
                                        gridspec_kw=grid_spec)

    # Pass the sample explicitly as ``x``. Recent seaborn versions treat the
    # first positional argument of boxplot as ``data``, which can change the
    # orientation of the box; ``x=`` always draws it along the shared x axis.
    sns.boxplot(x=series, ax=ax_box)
    sns.distplot(series, ax=ax_hist, **kwargs)

    # The x label is already displayed under the lower axis.
    ax_box.set(xlabel='')


In [None]:
data = {}
# Print the name of every file found under the raw data folder (recursively).
for walk_entry in os.walk('../data/raw'):
    for raw_file in walk_entry[2]:
        print(raw_file)

On recharge les données intégralement

In [None]:
def _load_sorted(csv_path, sort_keys):
    """Read a CSV, order its rows by ``sort_keys`` and renumber the index from 0."""
    return pd.read_csv(csv_path).sort_values(sort_keys).reset_index(drop=True)


# Application tables, ordered by SK_ID_CURR.
app_train = _load_sorted('../data/raw/application_train.csv', 'SK_ID_CURR')
app_test = _load_sorted('../data/raw/application_test.csv', 'SK_ID_CURR')

# Bureau tables.
bureau = _load_sorted('../data/raw/bureau.csv', ['SK_ID_CURR', 'SK_ID_BUREAU'])
bureau_balance = _load_sorted('../data/raw/bureau_balance.csv', 'SK_ID_BUREAU')

# Tables keyed by (SK_ID_CURR, SK_ID_PREV).
prev_keys = ['SK_ID_CURR', 'SK_ID_PREV']
cash = _load_sorted('../data/raw/POS_CASH_balance.csv', prev_keys)
credit = _load_sorted('../data/raw/credit_card_balance.csv', prev_keys)
previous = _load_sorted('../data/raw/previous_application.csv', prev_keys)
installments = _load_sorted('../data/raw/installments_payments.csv', prev_keys)

## Manual feature engineering

On reprend les quelques variables qui ont le plus de sens. Voir Notebook '1.0-tg-initial-EDA'

In [None]:
# Columns kept from both application tables (chosen in notebook
# '1.0-tg-initial-EDA'). CODE_GENDER is currently commented out.
base_columns = [
    'SK_ID_CURR',
    'DAYS_BIRTH',  # Age
    # 'CODE_GENDER',  # Sex
    'OCCUPATION_TYPE',  # Job
    'AMT_INCOME_TOTAL',  # Revenues
    'AMT_CREDIT',  # Credit amount
    'NAME_CONTRACT_TYPE',  # Contract type, Cash/Revolving
    'AMT_ANNUITY',  # Annuity amount
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
]

# Same selection for train and test so the two frames can be stacked.
features = app_train[base_columns]
features_test = app_test[base_columns]

In [None]:
# Stack the train rows on top of the test rows so both get the same feature
# engineering. NOTE: `features` is rebound to its own concatenation, so
# re-running this cell appends the test rows a second time.
features = pd.concat([features, features_test], axis=0)

In [None]:
# Size of the stacked train + test frame.
features.shape

In [None]:
# Preview the stacked frame before any transformation.
features

In [None]:
# Rescale DAYS_BIRTH by -365, i.e. days -> years with the sign flipped
# (presumably the raw values are negative day offsets — TODO confirm).
# NOTE: not idempotent; re-running divides (and flips the sign) again.
features['DAYS_BIRTH'] = features['DAYS_BIRTH'] / -365

In [None]:
# Check DAYS_BIRTH after the conversion.
features

### payment default

In [None]:
# Preview the bureau table as loaded.
bureau

In [None]:
# Index bureau by SK_ID_BUREAU, then add one indicator column per
# CREDIT_ACTIVE value (columns named CREDIT_ACTIVE_<value>).
bureau = bureau.set_index('SK_ID_BUREAU')
credit_active_flags = pd.get_dummies(bureau['CREDIT_ACTIVE'], prefix='CREDIT_ACTIVE')
bureau = bureau.join(credit_active_flags)

In [None]:
# Spot-check the new indicator columns on the rows of one client (SK_ID_CURR 456116).
bureau[bureau['SK_ID_CURR'] == 456116]

In [None]:
# Turn STATUS into an integer column: the non-numeric codes 'C' and 'X' are
# mapped to 0, every other code is parsed as an int.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selection: under pandas copy-on-write an
# in-place edit of a selected column no longer reaches the parent frame, and
# the cast to int would then fail on the untouched 'C'/'X' values.
bureau_balance['STATUS'] = (
    bureau_balance['STATUS']
    .replace({'C': '0', 'X': '0'})
    .astype('int')
)

In [None]:
# For each bureau credit (SK_ID_BUREAU), add up the numeric STATUS codes over
# all of its bureau_balance rows.
count_late = bureau_balance.groupby('SK_ID_BUREAU')['STATUS'].sum()

In [None]:
# Summary statistics of the per-credit STATUS totals.
count_late.describe()

In [None]:
# Attach the per-credit STATUS total to bureau as REPORTED_DPD.
# how='left' keeps the bureau credits that have no bureau_balance rows: the
# default inner join silently dropped them, which undercounted the per-client
# CREDIT_ACTIVE_* totals built from this frame. For those credits REPORTED_DPD
# is NaN, which the later per-client sum() skips.
bureau = pd.merge(bureau, count_late, how='left',
                  left_on='SK_ID_BUREAU', right_on='SK_ID_BUREAU')
bureau = bureau.rename(columns={"STATUS": 'REPORTED_DPD'})

In [None]:
# Preview bureau with the REPORTED_DPD column added.
bureau

In [None]:
# Preview the per-client totals of the CREDIT_ACTIVE_* indicators and of
# REPORTED_DPD; the next cell stores this same expression in bureau_history.
bureau[[x for x in bureau.columns if x.startswith('CREDIT_ACTIVE_')] + ['SK_ID_CURR', 'REPORTED_DPD']].groupby('SK_ID_CURR').sum()

In [None]:
# Per-client totals of every CREDIT_ACTIVE_* indicator and of REPORTED_DPD.
credit_flag_cols = [name for name in bureau.columns
                    if name.startswith('CREDIT_ACTIVE_')]
bureau_history = (
    bureau[credit_flag_cols + ['SK_ID_CURR', 'REPORTED_DPD']]
    .groupby('SK_ID_CURR')
    .sum()
)

# Left-join the totals onto the feature frame by SK_ID_CURR, then turn
# SK_ID_CURR back into a regular column.
features = features.set_index('SK_ID_CURR').join(bureau_history).reset_index()

In [None]:
# Preview the features with the bureau totals joined in.
features

In [None]:
# List the column names, including the CREDIT_ACTIVE_* ones created above.
features.columns

In [None]:
# These two bureau status counts showed no significant contribution in the
# LightGBM feature importance, so they are removed.
# errors='ignore' lets the cell be re-run (or run on data where one of the
# statuses never occurs) without raising a KeyError.
features = features.drop(columns=['CREDIT_ACTIVE_Bad debt', 'CREDIT_ACTIVE_Sold'],
                         errors='ignore')

In [None]:
# Preview the features next to TARGET.
# Rows are matched on SK_ID_CURR instead of on row position: a positional join
# only lines up while the train rows happen to come first and in exactly the
# same order as app_train. Rows that are not in app_train get NaN.
features.join(app_train.set_index('SK_ID_CURR')['TARGET'], on='SK_ID_CURR')

### payment default from HC

In [None]:
# Preview the previous_application table.
previous

In [None]:
# Preview the POS_CASH_balance table.
cash

In [None]:
# Summary statistics of the SK_DPD column.
cash['SK_DPD'].describe()

In [None]:
# Preview the installments_payments table.
installments

In [None]:
# Flag every installment row whose paid amount differs from the amount due.
# Any difference is flagged (over-payments too), and a missing amount on
# either side also compares as "different".
installments['BAD_PAYMENT_HC'] = installments['AMT_INSTALMENT'].ne(
    installments['AMT_PAYMENT'])

In [None]:
# Summary of the boolean flag (count, number of distinct values, most frequent value).
installments['BAD_PAYMENT_HC'].describe()

In [None]:
# Number of flagged installments per client.
# BAD_PAYMENT_HC is boolean, so sum() counts the True rows. count() returned
# the number of installment rows whatever the flag value, i.e. the total
# number of installments rather than the number of bad payments.
bad_payment_hc_history = installments[['SK_ID_CURR', 'BAD_PAYMENT_HC']].groupby('SK_ID_CURR').sum()

In [None]:
# Left-join the per-client BAD_PAYMENT_HC aggregate and sort by SK_ID_CURR.
# From here on `features` is indexed by SK_ID_CURR; later cells pair model
# outputs with app_train / app_test by position, which relies on this ordering.
features = features.set_index('SK_ID_CURR').join(bad_payment_hc_history).sort_index()

In [None]:
# Look at previous_application again before deriving features from it.
previous

In [None]:
# Inspect the CNT_PAYMENT column.
previous['CNT_PAYMENT']

In [None]:
# Value range of DAYS_TERMINATION, used below to decide what counts as active.
previous['DAYS_TERMINATION'].describe()

In [None]:
# 1 when DAYS_TERMINATION is strictly positive, else 0 (missing values
# compare as False and therefore give 0).
previous['IS_ACTIVE'] = previous['DAYS_TERMINATION'].gt(0).astype('int64')

In [None]:
# Summary statistics of the 0/1 flag (the mean is the share of flagged rows).
previous['IS_ACTIVE'].describe()

In [None]:
# Number of previous applications flagged IS_ACTIVE, per client.
active_cred_hc = previous.groupby('SK_ID_CURR')[['IS_ACTIVE']].sum()

In [None]:
# Left-join the per-client IS_ACTIVE total on the SK_ID_CURR index.
features = features.join(active_cred_hc)

In [None]:
# Give the joined total its feature name.
features = features.rename(columns={'IS_ACTIVE': 'ACTIVE_CRED_HC'})

In [None]:
# Preview the features with ACTIVE_CRED_HC added.
features

In [None]:
# Confirm the frame is indexed by SK_ID_CURR; the assignment below aligns on it.
features.index

In [None]:
# Number of previous_application rows per client; the Series is aligned to
# `features` on its SK_ID_CURR index, clients without any row get NaN.
features['TOTAL_PREV_HC'] = previous.groupby('SK_ID_CURR')['IS_ACTIVE'].count()

In [None]:
# Summary statistics of the new TOTAL_PREV_HC column.
features['TOTAL_PREV_HC'].describe()

In [None]:
# Summary statistics of every numeric feature.
features.describe()

In [None]:
# Preview the full engineered feature frame.
features

In [None]:
# Attach the label by SK_ID_CURR (the index of `features`); rows that are not
# in app_train get NaN.
# Guarded so that re-running the cell does not fail on an already present
# TARGET column (join refuses overlapping column names).
if 'TARGET' not in features.columns:
    features = features.join(app_train.set_index('SK_ID_CURR')['TARGET'])

In [None]:
# Persist the engineered features (the SK_ID_CURR index is written too).
# NOTE: a later cell writes features_le to this same path, replacing this file.
features.to_csv('../data/processed/features.csv')

In [None]:
# Engineered columns to inspect visually, one figure each.
new_features = [
    "CREDIT_ACTIVE_Active", "CREDIT_ACTIVE_Closed", "REPORTED_DPD",
    "BAD_PAYMENT_HC", "ACTIVE_CRED_HC", "TOTAL_PREV_HC",
]
for feature in new_features:
    distplot(features[feature])

### Baseline

Add TARGET and fit a logistic regression as a baseline.

In [None]:
from sklearn.preprocessing import LabelEncoder

# le_ counts the encoded columns; categ_var and label_encoders remember which
# columns were encoded and the fitted encoder used for each.
le_ = 0
categ_var = []
label_encoders = {}

for col in features.columns:
    # Only text (object) columns with at most two distinct values are
    # label-encoded in place here; unique() counts a missing value as a value.
    if features[col].dtype.name != 'object':
        continue
    if len(features[col].unique()) > 2:
        continue
    print('Encoding %s' % col)
    le = LabelEncoder()
    features[col] = le.fit_transform(features[col])
    label_encoders[col] = le
    categ_var.append(col)
    le_ += 1

print(f"{le_} columns encoded")

In [None]:
# Engineered columns (those that do not exist in app_train) are NaN for
# clients without any matching history; treat that as 0.
# The filled columns are assigned back to the frame: fillna(inplace=True) on a
# selected column does not reach `features` under pandas copy-on-write.
engineered_cols = [col for col in features.columns
                   if col not in app_train.columns]
if engineered_cols:
    features[engineered_cols] = features[engineered_cols].fillna(0)

In [None]:
# One-hot encode the remaining non-numeric columns of `features`.
features_ohe = pd.get_dummies(features)

In [None]:
# Label-encoded variant of the features: `features` itself is left untouched,
# the encoded columns are written to the copy.
features_le = features.copy()

le_ = 0
for col in features.columns:
    column = features[col]
    if column.dtype.name == 'object' and len(column.unique()) >= 2:
        print('Encoding %s' % col)
        # Casting to str turns missing values into the literal 'nan', which
        # then becomes a category of its own.
        as_text = column.astype(str)
        le = LabelEncoder()
        features_le[col] = le.fit_transform(as_text)
        label_encoders[col] = le
        categ_var.append(col)
        le_ += 1
print(f"{le_} columns encoded")

In [None]:
# Make sure both encoded frames carry TARGET, matched on the SK_ID_CURR index.
# `features` may already contain TARGET (it is joined in an earlier cell), in
# which case the encoded frames inherited it and an unconditional join raises
# "columns overlap but no suffix specified".
target_by_id = app_train.set_index('SK_ID_CURR')['TARGET']
if 'TARGET' not in features_le.columns:
    features_le = features_le.join(target_by_id)
if 'TARGET' not in features_ohe.columns:
    features_ohe = features_ohe.join(target_by_id)

In [None]:
# Preview the one-hot encoded frame.
features_ohe

#### Baseline Logistic reg

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Rows with a known TARGET form the training set, the others the test set.
has_target = features_ohe['TARGET'].notna()
train = features_ohe[has_target].copy()
test = features_ohe[~has_target].copy()
target = train['TARGET']
train = train.drop(columns=['TARGET'])
test = test.drop(columns=['TARGET'])

# Both transformers are fitted on the raw training frame (so the scaler
# learns its min/max from the non-missing values only), then applied in the
# order impute -> scale.
imputer = SimpleImputer(strategy='constant', fill_value=0).fit(train)
scaler = MinMaxScaler(feature_range=(0, 1)).fit(train)

train = scaler.transform(imputer.transform(train))
test = scaler.transform(imputer.transform(test))

features_names = [col for col in features_ohe.columns if col != 'TARGET']

print(f'train set shape : {train.shape}')
print(f'test set shape : {test.shape}')

In [None]:
from sklearn.linear_model import LogisticRegression

# C is the inverse of the regularisation strength, so C=1e-4 gives a heavily
# regularised baseline model.
reg = LogisticRegression(C=1e-4)
reg.fit(train, target)

In [None]:
# Probability of the second class for every test row, stored next to the
# client id. The pairing is positional: it relies on `test` being in the same
# SK_ID_CURR order as app_test.
baseline_results = app_test[['SK_ID_CURR']].assign(
    TARGET=reg.predict_proba(test)[:, 1])

In [None]:
# Write the predictions to the reports folder, without the row index.
reports_dir = os.path.abspath('../reports/')
path = os.path.join(reports_dir, 'logistic_reg_features_engineering_baseline.csv')
baseline_results.to_csv(path, index=False)

In [None]:
# Save `features` in its current state (after the label encoding of
# two-valued columns and the zero-filling above) to the interim folder.
features.to_csv('../data/interim/features.csv')

#### Résultats:

**ROC_AUC = 0.68867**

### LightGBM

In [None]:
import lightgbm as lgb

In [None]:
# Preview the label-encoded frame used for LightGBM.
features_le

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Split the label-encoded frame on whether TARGET is known.
labelled = features_le['TARGET'].notna()
train = features_le[labelled].copy()
test = features_le[~labelled].copy()
target = train['TARGET']
train = train.drop(columns=['TARGET'])
test = test.drop(columns=['TARGET'])

# Fit both transformers on the raw training frame, then apply them in the
# order impute -> scale to train and test alike.
imputer = SimpleImputer(strategy='constant', fill_value=0).fit(train)
scaler = MinMaxScaler(feature_range=(0, 1)).fit(train)

train = scaler.transform(imputer.transform(train))
test = scaler.transform(imputer.transform(test))

features_names = [col for col in features_le.columns if col != 'TARGET']

print(f'train set shape : {train.shape}')
print(f'test set shape : {test.shape}')

In [None]:
# features_le itself is unchanged by the cell above (train/test are copies).
features_le

In [None]:
# Rebuild a labelled frame from the scaled training array for inspection:
# columns come from features_names, rows are keyed with app_train's
# SK_ID_CURR (positional pairing), and TARGET is joined back on that key.
pd.DataFrame(train, columns=features_names, index=app_train['SK_ID_CURR'])\
    .join(app_train[['SK_ID_CURR', 'TARGET']].set_index("SK_ID_CURR"))

In [None]:
# Persist the fitted MinMaxScaler.
# NOTE(review): the imputer fitted alongside it is not saved — confirm
# whether consumers of this file need it as well.
with open('../data/interim/scalers.pickle', 'wb') as f:
    pickle.dump(scaler, f)

In [None]:
# Build the LightGBM training set from the imputed + min-max-scaled matrix.
# The columns are deliberately NOT declared as categorical here: `train` has
# been scaled to [0, 1], and LightGBM truncates the values of categorical
# features to integers, which would collapse every label-encoded column to the
# two values 0 and 1. Treated as numeric (as the tuned model further down
# already does), the encoded levels stay distinct.
train_data = lgb.Dataset(train, label=target,
                         feature_name=features_names)

# Binary classification, evaluated with AUC.
param = {'num_leaves': 31, 'objective': 'binary', 'metric': 'auc'}

num_round = 100

In [None]:
# Train the booster for num_round boosting iterations on the full training set.
bst = lgb.train(param, train_data, num_round)

In [None]:
# Booster predictions for the test rows, stored next to the client id
# (positional pairing with app_test) and written to the reports folder.
baseline_results = app_test[['SK_ID_CURR']].assign(TARGET=bst.predict(test))
reports_dir = os.path.abspath('../reports/')
path = os.path.join(reports_dir, 'lgbm.csv')
baseline_results.to_csv(path, index=False)

#### Résultats:

**ROC_AUC = 0.72253**

In [None]:
# Feature importance of the trained booster; ignore_zero=False keeps the
# features whose importance is zero in the chart.
fig, ax = plt.subplots(1, figsize=(12, 8))
lgb.plot_importance(bst, ignore_zero=False, ax=ax)

### LightGBM optimization

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the labelled data for validation. The seed is fixed so the
# split — and every score computed on it — is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(train, target, random_state=42)

In [None]:
# Wrap both splits as LightGBM datasets; reference=train_data ties the
# validation set to the training Dataset it is evaluated against.
train_data = lgb.Dataset(X_train, label=y_train, feature_name=features_names)
test_data = lgb.Dataset(X_test, label=y_test, feature_name=features_names, reference=train_data)

In [None]:
# Filled during training with the metric history of each evaluation set.
evals_results = {}

In [None]:
# Same model settings as the baseline booster, with a much higher iteration
# cap for the early-stopped run.
param = {'num_leaves': 31, 'objective': 'binary', 'metric': 'auc'}

num_round = 1000


In [None]:
# Train with early stopping on the hold-out set.
# Logging, metric recording and early stopping are passed as callbacks: the
# verbose_eval / evals_result / early_stopping_rounds keyword arguments were
# removed from lgb.train() in LightGBM 4.0 (the callbacks need LightGBM >= 3.3).
# The training set is listed in valid_sets only so that its AUC is recorded
# under 'training'; the hold-out set is recorded under 'valid_1'.
bst = lgb.train(param, train_data, num_boost_round=num_round,
                valid_sets=[train_data, test_data],
                callbacks=[lgb.log_evaluation(period=1),
                           lgb.record_evaluation(evals_results),
                           lgb.early_stopping(stopping_rounds=30)])

In [None]:
# AUC per boosting iteration for the training and the hold-out set.
fig, ax = plt.subplots(1, figsize=(12, 8))
for eval_name, curve_label in (('training', 'train'), ('valid_1', 'test')):
    auc_curve = evals_results[eval_name]['auc']
    sns.lineplot(x=range(len(auc_curve)), y=auc_curve, label=curve_label)
plt.legend()
plt.show()

In [None]:
# Names under which the evaluation sets were recorded.
evals_results.keys()

In [None]:
# Predictions of the early-stopped booster for the test rows, paired
# positionally with app_test's client ids and written to the reports folder.
baseline_results = app_test[['SK_ID_CURR']].assign(TARGET=bst.predict(test))
reports_dir = os.path.abspath('../reports/')
path = os.path.join(reports_dir, 'lgbm_early_stop.csv')
baseline_results.to_csv(path, index=False)

### Résultats LightGBM 

**ROC_AUC = 0.73572**

### Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Rebuild the one-hot matrices for the random forest (this replaces the
# label-encoded train/test arrays and features_names used for LightGBM).
known_target = features_ohe['TARGET'].notna()
train = features_ohe[known_target].copy()
test = features_ohe[~known_target].copy()
target = train['TARGET']
train = train.drop(columns=['TARGET'])
test = test.drop(columns=['TARGET'])

# Fit on the raw training frame, then impute -> scale both sets.
imputer = SimpleImputer(strategy='constant', fill_value=0).fit(train)
scaler = MinMaxScaler(feature_range=(0, 1)).fit(train)

train = scaler.transform(imputer.transform(train))
test = scaler.transform(imputer.transform(test))

features_names = [col for col in features_ohe.columns if col != 'TARGET']

In [None]:
# Hold-out split and classifier for the random-forest comparison. Both are
# seeded so the reported ROC AUC can be reproduced run after run.
X_train, X_test, y_train, y_test = train_test_split(train, target, random_state=42)
clf = RandomForestClassifier(random_state=42)

In [None]:
# Show the classifier and its settings.
clf

In [None]:
# Fit the forest on the training split.
clf.fit(X_train, y_train)

In [None]:
# Class probabilities for the hold-out split (one column per class).
res = clf.predict_proba(X_test)

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Hold-out labels the probabilities are scored against.
y_test

In [None]:
# ROC AUC of the random forest on the hold-out split, computed from the second
# column of predict_proba.
roc_auc_score(y_test.values, res[:, 1])

### Interprétabilité du modèle

In [None]:
import shap

# Load SHAP's JavaScript so the interactive force plots render in the notebook.
shap.initjs()

In [None]:
# Explain the LightGBM booster on a fixed sample of 100 labelled rows.
# The sample must go through the same preprocessing as the booster's training
# data: constant-0 imputation followed by min-max scaling, both fitted on the
# whole labelled label-encoded frame. Re-fitting a scaler on the 100 sampled
# rows (and scaling before imputing) would feed the model values on a
# different scale than the one it was trained on.
shap_frame = features_le[features_le['TARGET'].notna()].drop(columns=['TARGET'])

# features_names may still hold the one-hot column list set by the
# random-forest section; reset it to the columns the booster was trained on.
features_names = list(shap_frame.columns)

shap_imputer = SimpleImputer(strategy='constant', fill_value=0).fit(shap_frame)
shap_scaler = MinMaxScaler(feature_range=(0, 1)).fit(shap_frame)

sample_rows = shap_frame.sample(100, random_state=100)
index = sample_rows.index
sample_data = pd.DataFrame(
    shap_scaler.transform(shap_imputer.transform(sample_rows)),
    columns=features_names, index=index)

# The sample doubles as background data; SHAP values are expressed on the
# probability scale.
explainer = shap.TreeExplainer(bst, data=sample_data, model_output='probability')
shap_values = explainer.shap_values(sample_data)

In [None]:
# Preview the preprocessed sample passed to the explainer.
sample_data

In [None]:
# Shape of the SHAP values; compare with sample_data.shape in the next cell.
shap_values.shape

In [None]:
# Shape of the explained sample.
sample_data.shape

In [None]:
# Global view: distribution of SHAP values per feature over the sample.
# The feature names are read from sample_data's own columns.
shap.summary_plot(shap_values, features=sample_data,
                  feature_names=list(sample_data.columns))

In [None]:
# Base value the force plots start from.
explainer.expected_value

In [None]:
# Local explanation for the first sampled row.
shap.force_plot(explainer.expected_value,
                shap_values[0,:], features=sample_data.iloc[0, :])

In [None]:
# Look up one client's raw application row.
# NOTE(review): 362145 is hard-coded — presumably an id taken from
# sample_data.index; confirm.
app_train[app_train['SK_ID_CURR'] == 362145]

In [None]:
# Stacked force plot covering every sampled row.
shap.force_plot(explainer.expected_value,
                shap_values, features=sample_data)

In [None]:
# Save the label-encoded features.
# NOTE: this is the same path an earlier cell wrote `features` to, so that
# file is overwritten here.
features_le.to_csv('../data/processed/features.csv')

In [None]:
# Columns that were label-encoded by the two encoding loops.
categ_var

In [None]:
# Save the booster currently bound to `bst` — the early-stopped model, since
# that training cell was the last one to assign `bst`.
bst.save_model('../models/booster_V0.txt')

In [None]:
# Final look at the label-encoded frame that was saved above.
features_le