In [None]:
import os

import numpy as np
import pandas as pd
import featuretools as ft
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

sns.set()

In [None]:
# Print the name of every file found under the raw-data directory.
data = {}
for walk_entry in os.walk('../data/raw'):
    # walk_entry is (directory path, sub-directories, file names)
    for raw_file in walk_entry[2]:
        print(raw_file)

In [None]:
def _read_sample(csv_name, sort_keys, nrows=10000):
    """Read the first `nrows` rows of a raw CSV, sort them by `sort_keys`
    and renumber the index from 0."""
    frame = pd.read_csv('../data/raw/' + csv_name, nrows=nrows)
    return frame.sort_values(sort_keys).reset_index(drop=True)


# 10,000-row samples of every table.
_client_prev_keys = ['SK_ID_CURR', 'SK_ID_PREV']

app_train = _read_sample('application_train.csv', 'SK_ID_CURR')
app_test = _read_sample('application_test.csv', 'SK_ID_CURR')
bureau = _read_sample('bureau.csv', ['SK_ID_CURR', 'SK_ID_BUREAU'])
bureau_balance = _read_sample('bureau_balance.csv', 'SK_ID_BUREAU')
cash = _read_sample('POS_CASH_balance.csv', _client_prev_keys)
credit = _read_sample('credit_card_balance.csv', _client_prev_keys)
previous = _read_sample('previous_application.csv', _client_prev_keys)
installments = _read_sample('installments_payments.csv', _client_prev_keys)

In [None]:
# Dimensions (rows, columns) of the sampled bureau_balance table.
bureau_balance.shape

In [None]:
# Preview the first rows of the bureau table.
bureau.head()

In [None]:
# Tag every application with the table it comes from. The test table gets a
# TARGET column filled with NaN.
app_train = app_train.assign(set='train')
app_test = app_test.assign(set='test', TARGET=np.nan)

In [None]:
# Append the dataframes: stack the train and test applications into a single
# frame so that features are generated for both. The 'set' column and the NaN
# TARGET given to the test rows keep the two parts distinguishable.
# Assumes SK_ID_CURR is unique across both tables (it is used as the entity
# index) — TODO confirm.
app = pd.concat([app_train, app_test], ignore_index=True, sort=False)

In [None]:
# Empty entity set, identified as 'clients', that will hold every table.
es = ft.EntitySet(id='clients')

In [None]:
# Tables registered with an existing key column as their index.
es = es.entity_from_dataframe(entity_id='app', dataframe=app, index='SK_ID_CURR')
es = es.entity_from_dataframe(entity_id='bureau', dataframe=bureau, index='SK_ID_BUREAU')
es = es.entity_from_dataframe(entity_id='previous', dataframe=previous, index='SK_ID_PREV')

# The remaining tables are registered under index names that are presumably
# not columns of the data, so featuretools has to create an integer index.
# make_index=True requests this explicitly instead of relying on an implicit
# fallback. The bureau_balance index was named 'SK_ID_BURR', which reads like
# a misspelling of SK_ID_BUREAU (the column linking it to bureau); it now
# follows the '<entity>_index' naming of the other generated indexes.
es = es.entity_from_dataframe(entity_id='bureau_balance',
                              dataframe=bureau_balance,
                              make_index=True,
                              index='bureau_balance_index')
es = es.entity_from_dataframe(entity_id='cash',
                              dataframe=cash,
                              make_index=True,
                              index='cash_index')
es = es.entity_from_dataframe(entity_id='installments',
                              dataframe=installments,
                              make_index=True,
                              index='installments_index')
es = es.entity_from_dataframe(entity_id='credit',
                              dataframe=credit,
                              make_index=True,
                              index='credit_index')

In [None]:
# Key variables of the three parent entities.
app_key = es['app']['SK_ID_CURR']
bureau_key = es['bureau']['SK_ID_BUREAU']
previous_key = es['previous']['SK_ID_PREV']

# Each relationship pairs a parent key (first argument) with the column of
# the same name in the child entity (second argument).
r_app_bureau = ft.Relationship(app_key, es['bureau']['SK_ID_CURR'])
r_app_previous = ft.Relationship(app_key, es['previous']['SK_ID_CURR'])

r_previous_cash = ft.Relationship(previous_key, es['cash']['SK_ID_PREV'])
r_previous_installments = ft.Relationship(previous_key, es['installments']['SK_ID_PREV'])
r_previous_credit = ft.Relationship(previous_key, es['credit']['SK_ID_PREV'])

r_bureau_balance = ft.Relationship(bureau_key, es['bureau_balance']['SK_ID_BUREAU'])


In [None]:
# Register all six links in one call.
all_relationships = [
    r_app_bureau,
    r_app_previous,
    r_previous_cash,
    r_previous_installments,
    r_previous_credit,
    r_bureau_balance,
]
es.add_relationships(all_relationships)

In [None]:
# Deep feature synthesis with the 'app' entity as target; returns the feature
# matrix and the list of feature definitions.
feature_matrix_client, features_defs = ft.dfs(
    entityset=es,
    target_entity='app',
    verbose=True,
)

In [None]:
# Display the feature matrix returned by dfs.
feature_matrix_client

In [None]:
# Display the feature definitions returned by dfs.
features_defs

In [None]:
# Pairwise correlations, computed only on the rows whose TARGET is known.
has_target = feature_matrix_client['TARGET'].notna()
corr = feature_matrix_client.loc[has_target].corr()

In [None]:
# Order the rows by increasing correlation with TARGET.
corr = corr.sort_values('TARGET')

In [None]:
# First 15 entries of the TARGET correlations (sorted in ascending order in
# the previous cell).
corr['TARGET'].head(15)

In [None]:
# Last 15 defined TARGET correlations, i.e. the largest ones.
corr['TARGET'].dropna().tail(15)

In [None]:
# Number of features whose correlation with TARGET is defined (not NaN).
corr['TARGET'].dropna().shape

In [None]:
def kde_target_plot(df, feature):
    """Plot the kernel density estimate of a feature for each target value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``TARGET`` column (0 for repaid loans, 1 for loans
        that were not repaid) and the feature to plot, either as a column or
        as the index.
    feature : str
        Name of the feature whose distribution is drawn.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Move the index into the columns so that a feature stored in the index
    # can be selected with .loc like any other column.
    df = df.reset_index()

    # Select the style before creating the figure so that it applies to it.
    plt.style.use('fivethirtyeight')
    plt.figure(figsize=(10, 6))

    # One density curve per target value. Missing values are dropped: they
    # carry no information for the density estimate.
    for target_value in (0, 1):
        values = df.loc[df['TARGET'] == target_value, feature].dropna()
        sns.kdeplot(values, label='target == %d' % target_value)

    # Label the plot
    plt.title('Distribution of Feature by Target Value')
    plt.xlabel('%s' % feature)
    plt.ylabel('Density')
    # Draw the legend explicitly so that the curve labels are always shown.
    plt.legend()
    plt.show()

In [None]:
# Correlations with TARGET that are both defined and different from zero.
target_corr = corr['TARGET'].dropna()
target_corr[target_corr != 0]

Attention à la colinéarité

On recharge les données intégralement (sans la limite `nrows` utilisée plus haut).

In [None]:
# Reload every table in full (no row limit), sorted by its key column(s) and
# with the index renumbered from 0.
app_train = (
    pd.read_csv('../data/raw/application_train.csv')
    .sort_values('SK_ID_CURR')
    .reset_index(drop=True)
)
app_test = (
    pd.read_csv('../data/raw/application_test.csv')
    .sort_values('SK_ID_CURR')
    .reset_index(drop=True)
)
bureau = (
    pd.read_csv('../data/raw/bureau.csv')
    .sort_values(['SK_ID_CURR', 'SK_ID_BUREAU'])
    .reset_index(drop=True)
)
bureau_balance = (
    pd.read_csv('../data/raw/bureau_balance.csv')
    .sort_values('SK_ID_BUREAU')
    .reset_index(drop=True)
)
cash = (
    pd.read_csv('../data/raw/POS_CASH_balance.csv')
    .sort_values(['SK_ID_CURR', 'SK_ID_PREV'])
    .reset_index(drop=True)
)
credit = (
    pd.read_csv('../data/raw/credit_card_balance.csv')
    .sort_values(['SK_ID_CURR', 'SK_ID_PREV'])
    .reset_index(drop=True)
)
previous = (
    pd.read_csv('../data/raw/previous_application.csv')
    .sort_values(['SK_ID_CURR', 'SK_ID_PREV'])
    .reset_index(drop=True)
)
installments = (
    pd.read_csv('../data/raw/installments_payments.csv')
    .sort_values(['SK_ID_CURR', 'SK_ID_PREV'])
    .reset_index(drop=True)
)

## Manual feature engineering

In [None]:
# Columns kept from the application tables; the same list is used for the
# train and the test table.
base_columns = [
    'SK_ID_CURR',
    'DAYS_BIRTH',  # Age
    'CODE_GENDER',  # Sex
    'OCCUPATION_TYPE',  # Job
    'AMT_INCOME_TOTAL',  # Revenues
    'AMT_CREDIT',  # Credit amount
    'NAME_CONTRACT_TYPE',  # Contract type, Cash/Revolving
    'AMT_ANNUITY',  # Annuity amount
    'EXT_SOURCE_1',
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
]

features = app_train[base_columns]
features_test = app_test[base_columns]

In [None]:
# Stack the train rows, then the test rows, into one frame. `features` is
# reassigned, so running this cell again appends the test rows a second time.
features = pd.concat([features, features_test], axis=0)

In [None]:
# Dimensions of the combined train + test feature frame.
features.shape

In [None]:
# Display the combined feature frame.
features

### Payment defaults reported by the credit bureau

In [None]:
# Display the bureau table.
bureau

In [None]:
# Index the bureau table by SK_ID_BUREAU and add one indicator column per
# CREDIT_ACTIVE value (columns prefixed with 'CREDIT_ACTIVE_').
bureau = bureau.set_index('SK_ID_BUREAU')
credit_active_dummies = pd.get_dummies(bureau['CREDIT_ACTIVE'], prefix='CREDIT_ACTIVE')
bureau = bureau.join(credit_active_dummies)

In [None]:
# Spot-check the bureau rows of one client (SK_ID_CURR 456116).
bureau[bureau['SK_ID_CURR'] == 456116]

In [None]:
# 'C' and 'X' are mapped to '0', then the column is converted to integers
# (the remaining values are presumably digit strings, as the conversion
# requires). The result is assigned back to the frame: calling
# replace(..., inplace=True) on a selected column is chained assignment,
# which is not guaranteed to update bureau_balance and never does with
# pandas Copy-on-Write.
bureau_balance['STATUS'] = (
    bureau_balance['STATUS']
    .replace({'C': '0', 'X': '0'})
    .astype('int')
)

In [None]:
# Sum of the STATUS values over all the bureau_balance rows of each
# SK_ID_BUREAU. Despite the name, this is a sum of status codes rather than a
# count of rows.
count_late = bureau_balance.groupby('SK_ID_BUREAU')['STATUS'].sum()

In [None]:
# Summary statistics of the per-credit STATUS totals.
count_late.describe()

In [None]:
# Same totals reduced to a boolean (True when the total is non-zero).
count_late.astype('bool').describe()

In [None]:
# Attach the per-credit STATUS total to the bureau table as REPORTED_DPD.
# how='left' keeps the bureau credits that have no row in bureau_balance: the
# default inner join silently dropped them, which undercounted the
# CREDIT_ACTIVE_* totals aggregated per client from this table. Their
# REPORTED_DPD is NaN, which the per-client sum ignores.
bureau = pd.merge(bureau, count_late.rename('REPORTED_DPD'), on='SK_ID_BUREAU', how='left')

In [None]:
# Preview the per-client totals of the CREDIT_ACTIVE_* indicators and of
# REPORTED_DPD.
bureau[[x for x in bureau.columns if x.startswith('CREDIT_ACTIVE_')] + ['SK_ID_CURR', 'REPORTED_DPD']].groupby('SK_ID_CURR').sum()

In [None]:
# Per-client totals of the CREDIT_ACTIVE_* indicators and of REPORTED_DPD.
history_columns = [name for name in bureau.columns if name.startswith('CREDIT_ACTIVE_')]
history_columns += ['SK_ID_CURR', 'REPORTED_DPD']
bureau_history = bureau[history_columns].groupby('SK_ID_CURR').sum()

# Join on SK_ID_CURR, then turn it back into a regular column.
features = (
    features
    .set_index('SK_ID_CURR')
    .join(bureau_history)
    .reset_index()
)

In [None]:
# Display the features after adding the bureau history.
features

In [None]:
# Preview the features next to the known labels. Matching on SK_ID_CURR
# (instead of joining on the positional index) keeps each label attached to
# its own client whatever the row order; clients absent from app_train get a
# NaN TARGET.
features.merge(app_train[['SK_ID_CURR', 'TARGET']], on='SK_ID_CURR', how='left')

### Payment defaults on previous Home Credit (HC) loans

In [None]:
# Display the previous-applications table.
previous

In [None]:
# Display the POS_CASH_balance table.
cash

In [None]:
# Summary statistics of the SK_DPD column.
cash['SK_DPD'].describe()

In [None]:
# Display the installments_payments table.
installments

In [None]:
# Flag an installment as a bad payment when the amount paid differs from the
# amount due.
# NOTE(review): a missing amount also compares as different, and a payment
# larger than the amount due is flagged too — confirm this is intended.
installments['BAD_PAYMENT_HC'] = installments['AMT_INSTALMENT'] != installments['AMT_PAYMENT']

In [None]:
# Summary of the boolean bad-payment flag.
installments['BAD_PAYMENT_HC'].describe()

In [None]:
# Number of bad payments per client. BAD_PAYMENT_HC is a boolean flag, so the
# total has to come from sum(): count() returns the number of installments,
# good and bad alike. The flag is cast to integers first so that the result is
# an integer count.
bad_payment_hc_history = (
    installments[['SK_ID_CURR', 'BAD_PAYMENT_HC']]
    .astype({'BAD_PAYMENT_HC': 'int64'})
    .groupby('SK_ID_CURR')
    .sum()
)

In [None]:
# From here on `features` is indexed by SK_ID_CURR and sorted on it.
features = (
    features
    .set_index('SK_ID_CURR')
    .join(bad_payment_hc_history)
    .sort_index()
)

In [None]:
# Display the previous-applications table again.
previous

In [None]:
# Display the CNT_PAYMENT column.
previous['CNT_PAYMENT']

In [None]:
# Summary statistics of DAYS_TERMINATION.
previous['DAYS_TERMINATION'].describe()

In [None]:
# 1 when DAYS_TERMINATION is strictly positive, 0 otherwise (missing values
# compare as False and therefore give 0).
previous['IS_ACTIVE'] = (previous['DAYS_TERMINATION'] > 0).astype('int64')

In [None]:
# Summary statistics of the IS_ACTIVE flag.
previous['IS_ACTIVE'].describe()

In [None]:
# Per client, number of previous applications flagged IS_ACTIVE.
active_cred_hc = previous[['SK_ID_CURR', 'IS_ACTIVE']].groupby('SK_ID_CURR').sum()

In [None]:
# Join on the index; this expects `features` to be indexed by SK_ID_CURR, as
# set in an earlier cell.
features = features.join(active_cred_hc)

In [None]:
# Give the joined IS_ACTIVE total its feature name.
features = features.rename(columns={'IS_ACTIVE': 'ACTIVE_CRED_HC'})

In [None]:
# Display the features after adding ACTIVE_CRED_HC.
features

In [None]:
# Number of previous applications per client (count of IS_ACTIVE entries),
# aligned with `features` on its index.
features['TOTAL_PREV_HC'] = previous.groupby('SK_ID_CURR')['IS_ACTIVE'].count()

In [None]:
# Summary statistics of TOTAL_PREV_HC.
features['TOTAL_PREV_HC'].describe()

In [None]:
# Summary statistics of the engineered features.
features.describe()

Add TARGET to the features and fit a logistic regression.

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le_ = 0

# Label-encode, in place, the object columns that hold at most two distinct
# values (NaN counts as a value); le_ counts how many columns were encoded.
object_columns = [name for name in features.columns
                  if features[name].dtype.name == 'object']

for col in object_columns:
    if features[col].nunique(dropna=False) > 2:
        continue
    print('Encoding %s' % col)
    features[col] = le.fit_transform(features[col])
    le_ += 1

print(f"{le_} columns encoded")

In [None]:
# One-hot encode the columns that are still categorical.
features = pd.get_dummies(features)

In [None]:
# Display the encoded features.
features

In [None]:
# Attach the labels by matching the index of `features` with SK_ID_CURR;
# clients that are not in app_train get a NaN TARGET.
features = features.join(app_train.set_index('SK_ID_CURR')['TARGET'])

In [None]:
# Display the features with their TARGET column.
features

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Rows with a known TARGET form the training set, the others the test set.
labelled = features['TARGET'].notna()
target = features.loc[labelled, 'TARGET']
train = features[labelled].drop(columns=['TARGET'])
test = features[~labelled].drop(columns=['TARGET'])

# Column names, saved before the frames are turned into arrays below.
features_names = list(train.columns)

imputer = SimpleImputer(strategy='median')
scaler = MinMaxScaler(feature_range=(0, 1))

# Both transformers learn their statistics from the training rows only. The
# scaler is fitted on the imputer's output, i.e. on the same matrix it then
# transforms, instead of on the frame that still holds missing values.
train = scaler.fit_transform(imputer.fit_transform(train))
test = scaler.transform(imputer.transform(test))

print(f'train set shape : {train.shape}')
print(f'test set shape : {test.shape}')

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with C=1e-4 (in scikit-learn, C is the inverse of the
# regularization strength), fitted on the preprocessed training matrix.
reg = LogisticRegression(C=1e-4)
reg.fit(train, target)

In [None]:
# Predicted probability of TARGET == 1 for every test client. assign()
# returns a new frame, so nothing is written into a slice of app_test.
# The predictions are matched to SK_ID_CURR by position: this presumably
# relies on the test rows being in ascending SK_ID_CURR order, like app_test.
baseline_results = app_test[['SK_ID_CURR']].assign(TARGET=reg.predict_proba(test)[:, 1])

In [None]:
# Write the predictions to the reports directory.
reports_dir = os.path.abspath('../reports/')
path = os.path.join(reports_dir, 'logistic_reg_features_engineering_baseline.csv')
baseline_results.to_csv(path, index=False)

Résultats:

ROC_AUC = 0.68867

In [None]:
import lightgbm as lgb

In [None]:
# LightGBM dataset built from the preprocessed training matrix, its labels and
# the saved column names.
train_data = lgb.Dataset(train, label=target, feature_name=features_names)

In [None]:
# Binary classifier evaluated with AUC, trained for 10 boosting rounds.
param = {
    'num_leaves': 31,
    'objective': 'binary',
    'metric': 'auc',
}

num_round = 10
bst = lgb.train(param, train_data, num_boost_round=num_round)

In [None]:
# Booster predictions for the preprocessed test matrix.
ypred = bst.predict(test)

In [None]:
# Pair each test SK_ID_CURR with its LightGBM prediction and write the result
# to the reports directory. assign() returns a new frame, so nothing is
# written into a slice of app_test. The pairing is positional: ypred is
# presumably in the same ascending SK_ID_CURR order as app_test.
baseline_results = app_test[['SK_ID_CURR']].assign(TARGET=ypred)
path = os.path.join(os.path.abspath('../reports/'), 'lgbm.csv')
baseline_results.to_csv(path, index=False)

score: 

ROC_AUC = 0.72253

In [None]:
import shap

# Load the JavaScript used by the interactive SHAP plots in the notebook.
shap.initjs()

In [None]:
# Explain the booster on a random sample of 100 training rows. The fixed seed
# makes the sample, and therefore the rows shown in the force plots, identical
# from one run to the next.
sample_data = pd.DataFrame(train, columns=features_names).sample(100, random_state=42)

# The sample is used both as background data and as the rows to explain;
# the contributions are expressed on the probability scale.
explainer = shap.TreeExplainer(bst, data=sample_data, model_output='probability')
shap_values = explainer.shap_values(sample_data)

In [None]:
# Dimensions of the SHAP values array.
shap_values.shape

In [None]:
# Dimensions of the explained sample, for comparison with the SHAP values.
sample_data.shape

In [None]:
# Summary of the SHAP values over the sample. Passing the feature values lets
# the plot colour every point by the value of its feature, which is not
# possible from the SHAP values alone.
shap.summary_plot(shap_values, features=sample_data, feature_names=features_names)

In [None]:
# Base value of the explainer, used as the starting point of the force plots.
explainer.expected_value

In [None]:
# Contribution of each feature to the prediction for the first sampled row.
shap.force_plot(
    explainer.expected_value,
    shap_values[0],
    features=sample_data.iloc[0],
)

In [None]:
# Same plot for the sampled row at position 10.
shap.force_plot(
    explainer.expected_value,
    shap_values[10],
    features=sample_data.iloc[10],
)

In [None]:
# Force plot over the whole sample at once.
shap.force_plot(
    explainer.expected_value,
    shap_values,
    features=sample_data,
)