In [None]:
import os

import featuretools as ft
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Switch matplotlib to seaborn's default look for the plots drawn in this notebook
sns.set()

In [None]:
# Placeholder mapping (left empty by this cell)
data = {}

# Print the name of every file found anywhere under the raw data folder
for _folder, _subfolders, names in os.walk('../data/raw'):
    for name in names:
        print(name)

In [None]:
def _read_sample(path, sort_keys, nrows=10000):
    """Read the first ``nrows`` rows of a CSV, sort them by ``sort_keys``
    and return the result with a fresh 0..n-1 index."""
    frame = pd.read_csv(path, nrows=nrows)
    return frame.sort_values(sort_keys).reset_index(drop=True)


# Application tables: one sort key (the client id)
app_train = _read_sample('../data/raw/application_train.csv', 'SK_ID_CURR')
app_test = _read_sample('../data/raw/application_test.csv', 'SK_ID_CURR')

# Credit bureau tables
bureau = _read_sample('../data/raw/bureau.csv', ['SK_ID_CURR', 'SK_ID_BUREAU'])
bureau_balance = _read_sample('../data/raw/bureau_balance.csv', 'SK_ID_BUREAU')

# Tables keyed by client id and previous-application id
cash = _read_sample('../data/raw/POS_CASH_balance.csv', ['SK_ID_CURR', 'SK_ID_PREV'])
credit = _read_sample('../data/raw/credit_card_balance.csv', ['SK_ID_CURR', 'SK_ID_PREV'])
previous = _read_sample('../data/raw/previous_application.csv', ['SK_ID_CURR', 'SK_ID_PREV'])
installments = _read_sample('../data/raw/installments_payments.csv', ['SK_ID_CURR', 'SK_ID_PREV'])

In [None]:
# Sanity check: (rows, columns) of the loaded bureau_balance table
bureau_balance.shape

In [None]:
# Preview the first rows of the bureau table
bureau.head()

In [None]:
# Record which file each row came from
app_train = app_train.assign(set='train')
# Test rows have no label, so they get a NaN placeholder in TARGET
app_test = app_test.assign(set='test').assign(TARGET=np.nan)

In [None]:
# Append the dataframes: stack train and test into a single table so that
# features are generated for both. Test rows carry TARGET = NaN and the
# 'set' column records where each row came from.
# (Previously this cell only aliased app_train, so nothing was appended.)
app = pd.concat([app_train, app_test], ignore_index=True, sort=False)

In [None]:
# Create an empty entity set whose id is 'clients'
es = ft.EntitySet(id='clients')

In [None]:
# These three tables are indexed by a key column they already contain
es = es.entity_from_dataframe(entity_id='app', dataframe=app, index='SK_ID_CURR')
es = es.entity_from_dataframe(entity_id='bureau', dataframe=bureau, index='SK_ID_BUREAU')
es = es.entity_from_dataframe(entity_id='previous', dataframe=previous, index='SK_ID_PREV')

# The remaining tables are given index names that do not refer to an existing
# key column, so a new integer index has to be generated for them.
# make_index=True requests that explicitly instead of relying on featuretools'
# implicit handling of an index column that is missing from the dataframe.
# NOTE(review): 'SK_ID_BURR' is only the name of the generated index; the
# relationship to `bureau` uses the SK_ID_BUREAU column. Consider renaming it
# to match the '<table>_index' pattern used below.
es = es.entity_from_dataframe(entity_id='bureau_balance',
                              dataframe=bureau_balance,
                              make_index=True,
                              index='SK_ID_BURR')
es = es.entity_from_dataframe(entity_id='cash',
                              dataframe=cash,
                              make_index=True,
                              index='cash_index')
es = es.entity_from_dataframe(entity_id='installments',
                              dataframe=installments,
                              make_index=True,
                              index='installments_index')
es = es.entity_from_dataframe(entity_id='credit',
                              dataframe=credit,
                              make_index=True,
                              index='credit_index')

In [None]:
# Parent-side key variables, looked up once and reused below
app_key = es['app']['SK_ID_CURR']
previous_key = es['previous']['SK_ID_PREV']
bureau_key = es['bureau']['SK_ID_BUREAU']

# app is the parent of bureau and previous (linked on SK_ID_CURR)
r_app_bureau = ft.Relationship(app_key, es['bureau']['SK_ID_CURR'])
r_app_previous = ft.Relationship(app_key, es['previous']['SK_ID_CURR'])

# previous is the parent of cash, installments and credit (linked on SK_ID_PREV)
r_previous_cash = ft.Relationship(previous_key, es['cash']['SK_ID_PREV'])
r_previous_installments = ft.Relationship(previous_key, es['installments']['SK_ID_PREV'])
r_previous_credit = ft.Relationship(previous_key, es['credit']['SK_ID_PREV'])

# bureau is the parent of bureau_balance (linked on SK_ID_BUREAU)
r_bureau_balance = ft.Relationship(bureau_key, es['bureau_balance']['SK_ID_BUREAU'])


In [None]:
# Collect the six parent -> child links, then register them in one call
client_relationships = [
    r_app_bureau,
    r_app_previous,
    r_previous_cash,
    r_previous_installments,
    r_previous_credit,
    r_bureau_balance,
]
es.add_relationships(client_relationships)

In [None]:
# Run deep feature synthesis with 'app' as the target entity. Only the entity
# set, the target and verbose=True are passed, so every other ft.dfs option is
# left at the library default. The call returns the feature matrix and the
# list of feature definitions.
feature_matrix_client, features_defs = ft.dfs(entityset=es, target_entity='app', verbose=True)

In [None]:
# Display the generated feature matrix
feature_matrix_client

In [None]:
# Display the definitions of the generated features
features_defs

In [None]:
# Correlation matrix computed only on rows that have a known TARGET
has_target = feature_matrix_client['TARGET'].notna()
corr = feature_matrix_client.loc[has_target].corr()

In [None]:
# Order the rows by their correlation with TARGET, lowest first
corr = corr.sort_values('TARGET')

In [None]:
# First 15 entries of the TARGET correlation column (the lowest values once corr is sorted ascending by TARGET)
corr['TARGET'].head(15)

In [None]:
# Last 15 non-NaN entries of the TARGET correlation column (the highest values once corr is sorted ascending by TARGET)
corr['TARGET'].dropna().tail(15)

In [None]:
# How many entries have a defined (non-NaN) correlation with TARGET
corr['TARGET'].dropna().shape

In [None]:
def kde_target_plot(df, feature):
    """Plot kernel density estimates of ``feature`` split by ``TARGET`` value.

    Two curves are drawn on one figure: one for rows where ``TARGET == 0``
    and one for rows where ``TARGET == 1``. Rows with any other or missing
    ``TARGET`` are not plotted. Missing feature values are dropped before
    the density is estimated, and a class with no remaining values is skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``TARGET`` column and the column named by
        ``feature``.
    feature : str
        Name of the column whose distribution is plotted.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Move the index into a regular column (kept from the original
    # implementation; it also lets the index column be used as `feature`).
    df = df.reset_index()

    # Apply the style only while drawing, so the notebook-wide theme is not
    # permanently replaced each time this function is called.
    with plt.style.context('fivethirtyeight'):
        fig, ax = plt.subplots(figsize=(10, 6))

        # 0 = repaid loans, 1 = loans that were not repaid
        for target_value in (0, 1):
            values = df.loc[df['TARGET'] == target_value, feature].dropna()
            if values.empty:
                # Nothing to estimate a density from for this class
                continue
            sns.kdeplot(values, label='target == %d' % target_value, ax=ax)

        # Label the plot
        ax.set_title('Distribution of Feature by Target Value')
        ax.set_xlabel('%s' % feature)
        ax.set_ylabel('Density')
        # Explicit legend: the labels above are not guaranteed to be shown otherwise
        ax.legend()
        plt.show()

In [None]:
# Compare the distribution of this generated feature between the two TARGET classes
kde_target_plot(feature_matrix_client, feature='MAX(previous.MEAN(credit.CNT_DRAWINGS_OTHER_CURRENT))')

In [None]:
# Compare the distribution of this generated feature between the two TARGET classes
kde_target_plot(feature_matrix_client, feature='SKEW(previous.MIN(installments.AMT_PAYMENT))')

In [None]:
# Keep only the defined, non-zero correlations with TARGET
corr['TARGET'].dropna().loc[lambda values: values != 0]

## Feature engineering results

In [None]:
# Load every raw table in full (no row limit), sorted by its id column(s)
# and with a fresh 0..n-1 index.

# Application tables
app_train = (
    pd.read_csv('../data/raw/application_train.csv')
    .sort_values('SK_ID_CURR')
    .reset_index(drop=True)
)
app_test = (
    pd.read_csv('../data/raw/application_test.csv')
    .sort_values('SK_ID_CURR')
    .reset_index(drop=True)
)

# Credit bureau tables
bureau = (
    pd.read_csv('../data/raw/bureau.csv')
    .sort_values(['SK_ID_CURR', 'SK_ID_BUREAU'])
    .reset_index(drop=True)
)
bureau_balance = (
    pd.read_csv('../data/raw/bureau_balance.csv')
    .sort_values('SK_ID_BUREAU')
    .reset_index(drop=True)
)

# Tables keyed by client id and previous-application id
cash = (
    pd.read_csv('../data/raw/POS_CASH_balance.csv')
    .sort_values(['SK_ID_CURR', 'SK_ID_PREV'])
    .reset_index(drop=True)
)
credit = (
    pd.read_csv('../data/raw/credit_card_balance.csv')
    .sort_values(['SK_ID_CURR', 'SK_ID_PREV'])
    .reset_index(drop=True)
)
previous = (
    pd.read_csv('../data/raw/previous_application.csv')
    .sort_values(['SK_ID_CURR', 'SK_ID_PREV'])
    .reset_index(drop=True)
)
installments = (
    pd.read_csv('../data/raw/installments_payments.csv')
    .sort_values(['SK_ID_CURR', 'SK_ID_PREV'])
    .reset_index(drop=True)
)

In [None]:
# Test rows have no label; give them a NaN TARGET so both tables share the same columns
app_test['TARGET'] = np.nan

# Stack train and test into one table with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
app = pd.concat([app_train, app_test], ignore_index=True, sort=False)

In [None]:
# Entity set with id 'clients' that holds all seven tables
es = ft.EntitySet(id='clients')

# These three tables are indexed by a key column they already contain
es = es.entity_from_dataframe(entity_id='app', dataframe=app, index='SK_ID_CURR')
es = es.entity_from_dataframe(entity_id='bureau', dataframe=bureau, index='SK_ID_BUREAU')
es = es.entity_from_dataframe(entity_id='previous', dataframe=previous, index='SK_ID_PREV')

# The remaining tables are given index names that do not refer to an existing
# key column, so a new integer index has to be generated for them.
# make_index=True requests that explicitly instead of relying on featuretools'
# implicit handling of an index column that is missing from the dataframe.
# NOTE(review): 'SK_ID_BURR' is only the name of the generated index; the
# relationship to `bureau` uses the SK_ID_BUREAU column. Consider renaming it
# to match the '<table>_index' pattern used below.
es = es.entity_from_dataframe(entity_id='bureau_balance',
                              dataframe=bureau_balance,
                              make_index=True,
                              index='SK_ID_BURR')
es = es.entity_from_dataframe(entity_id='cash',
                              dataframe=cash,
                              make_index=True,
                              index='cash_index')
es = es.entity_from_dataframe(entity_id='installments',
                              dataframe=installments,
                              make_index=True,
                              index='installments_index')
es = es.entity_from_dataframe(entity_id='credit',
                              dataframe=credit,
                              make_index=True,
                              index='credit_index')

# app is the parent of bureau and previous (linked on SK_ID_CURR)
r_app_bureau = ft.Relationship(es['app']['SK_ID_CURR'], es['bureau']['SK_ID_CURR'])
r_app_previous = ft.Relationship(es['app']['SK_ID_CURR'], es['previous']['SK_ID_CURR'])

# previous is the parent of cash, installments and credit (linked on SK_ID_PREV)
r_previous_cash = ft.Relationship(es['previous']['SK_ID_PREV'], es['cash']['SK_ID_PREV'])
r_previous_installments = ft.Relationship(es['previous']['SK_ID_PREV'], es['installments']['SK_ID_PREV'])
r_previous_credit = ft.Relationship(es['previous']['SK_ID_PREV'], es['credit']['SK_ID_PREV'])

# bureau is the parent of bureau_balance (linked on SK_ID_BUREAU)
r_bureau_balance = ft.Relationship(es['bureau']['SK_ID_BUREAU'], es['bureau_balance']['SK_ID_BUREAU'])

es.add_relationships([r_app_bureau,
                      r_app_previous,
                      r_previous_cash,
                      r_previous_installments,
                      r_previous_credit,
                      r_bureau_balance])

In [None]:
# Run deep feature synthesis on the full entity set with 'app' as the target
# entity. Only the entity set and the target are passed, so every other ft.dfs
# option is left at the library default. The call returns the feature matrix
# and the list of feature definitions.
feature_matrix_client, features_defs = ft.dfs(entityset=es, target_entity='app')

In [None]:
# Build model-ready train/test matrices: median-impute, then scale to [0, 1].
# NOTE(review): these matrices come from the raw application tables, not from
# feature_matrix_client computed above. Confirm that is intended.
target = app_train['TARGET']

# Drop the label from BOTH frames. app_test was given an all-NaN TARGET column
# earlier in the notebook; leaving it in would make the test matrix one column
# wider than the matrix the imputer and scaler are fitted on.
train = app_train.drop(columns=['TARGET'])
test = app_test.drop(columns=['TARGET'], errors='ignore')

# Median imputation and min-max scaling need numeric input, so one-hot encode
# any categorical columns. Then keep only the columns both frames share, so
# that the two matrices have the same layout.
train = pd.get_dummies(train, dtype=float)
test = pd.get_dummies(test, dtype=float)
train, test = train.align(test, join='inner', axis=1)

imputer = SimpleImputer(strategy='median')
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit on the training data only, then apply the same transform to the test data
train = imputer.fit_transform(train)
test = imputer.transform(test)

train = scaler.fit_transform(train)
test = scaler.transform(test)

print(f'train set shape : {train.shape}')
print(f'test set shape : {test.shape}')