In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm
import time

import gc

In [2]:
# File system management
import os

# Suppress warnings
# filterwarnings('ignore') is called with no category or module filter, so it
# applies to every warning raised from this point on in the kernel.
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
def _read_sorted(path, keys):
    """Read the CSV at `path`, order its rows by `keys`, and renumber the index from 0."""
    return pd.read_csv(path).sort_values(keys).reset_index(drop=True)


# The two application tables are loaded as-is, without sorting.
train_df = pd.read_csv('../input/application_train.csv')
test_df = pd.read_csv('../input/application_test.csv')

# Every other table is sorted on its id column(s) right after loading.
bureau = _read_sorted('../input/bureau.csv', ['SK_ID_CURR', 'SK_ID_BUREAU'])
bureau_balance = _read_sorted('../input/bureau_balance.csv', 'SK_ID_BUREAU')
cash = _read_sorted('../input/POS_CASH_balance.csv', ['SK_ID_CURR', 'SK_ID_PREV'])
credit = _read_sorted('../input/credit_card_balance.csv', ['SK_ID_CURR', 'SK_ID_PREV'])
previous = _read_sorted('../input/previous_application.csv', ['SK_ID_CURR', 'SK_ID_PREV'])
installments = _read_sorted('../input/installments_payments.csv', ['SK_ID_CURR', 'SK_ID_PREV'])

In [4]:
# Tag every row with its origin so the stacked frame can be split back into
# train and test later. This adds the column to train_df / test_df in place,
# so any later consumer of those frames sees it as well.
train_df['is_train'] = True
test_df['is_train'] = False

# Both frames come straight from read_csv and therefore each carry a 0-based
# index; ignore_index=True gives the stacked frame unique row labels instead
# of two rows per label.
all_df = pd.concat([train_df, test_df], ignore_index=True)

# Turn the 365243 value in DAYS_EMPLOYED into NaN.
# The result is assigned back to the column: calling replace(inplace=True) on
# the `all_df["DAYS_EMPLOYED"]` selection acts on an intermediate object and,
# under pandas Copy-on-Write, leaves all_df unchanged.
all_df["DAYS_EMPLOYED"] = all_df["DAYS_EMPLOYED"].replace({365243: np.nan})



In [5]:
# featuretools for automated feature engineering
import featuretools as ft

In [6]:
es = ft.EntitySet(id='clients')

# Tables registered with one of their own columns as the entity index:
# (entity id, dataframe, index column).
# NOTE(review): 'app' is built from train_df, which at this point also carries
# the 'is_train' flag column added when all_df was assembled - confirm that is
# intended.
keyed_tables = [
    ('app', train_df, 'SK_ID_CURR'),
    ('bureau', bureau, 'SK_ID_BUREAU'),
    ('previous', previous, 'SK_ID_PREV'),
]
for entity_name, frame, key_column in keyed_tables:
    es = es.entity_from_dataframe(entity_id=entity_name, dataframe=frame,
                                  index=key_column)

# Tables registered with make_index=True, i.e. featuretools is asked to create
# the named index column: (entity id, dataframe, name of the new index).
generated_index_tables = [
    ('bureau_balance', bureau_balance, 'bureaubalance_index'),
    ('cash', cash, 'cash_index'),
    ('installments', installments, 'installments_index'),
    ('credit', credit, 'credit_index'),
]
for entity_name, frame, new_index in generated_index_tables:
    es = es.entity_from_dataframe(entity_id=entity_name, dataframe=frame,
                                  make_index=True, index=new_index)

In [7]:
def _link(parent, child, key):
    """Build a Relationship joining `parent` and `child` entities of `es` on the column `key`."""
    return ft.Relationship(es[parent][key], es[child][key])


# app is the parent of both bureau and previous, joined on SK_ID_CURR.
r_app_bureau = _link('app', 'bureau', 'SK_ID_CURR')
r_app_previous = _link('app', 'previous', 'SK_ID_CURR')

# bureau is the parent of bureau_balance, joined on SK_ID_BUREAU.
r_bureau_balance = _link('bureau', 'bureau_balance', 'SK_ID_BUREAU')

# previous is the parent of cash, installments and credit, joined on SK_ID_PREV.
r_previous_cash = _link('previous', 'cash', 'SK_ID_PREV')
r_previous_installments = _link('previous', 'installments', 'SK_ID_PREV')
r_previous_credit = _link('previous', 'credit', 'SK_ID_PREV')


In [8]:
# Register all six relationships on the EntitySet in a single call.
all_relationships = [
    r_app_bureau,
    r_bureau_balance,
    r_app_previous,
    r_previous_cash,
    r_previous_installments,
    r_previous_credit,
]
es = es.add_relationships(all_relationships)

# Bare expression so the notebook renders the EntitySet summary.
es

Entityset: clients
  Entities:
    app [Rows: 50000, Columns: 123]
    bureau [Rows: 289257, Columns: 17]
    previous [Rows: 282870, Columns: 37]
    bureau_balance [Rows: 4288064, Columns: 4]
    cash [Rows: 1687552, Columns: 9]
    installments [Rows: 2295557, Columns: 9]
    credit [Rows: 650563, Columns: 24]
  Relationships:
    bureau.SK_ID_CURR -> app.SK_ID_CURR
    bureau_balance.SK_ID_BUREAU -> bureau.SK_ID_BUREAU
    previous.SK_ID_CURR -> app.SK_ID_CURR
    cash.SK_ID_PREV -> previous.SK_ID_PREV
    installments.SK_ID_PREV -> previous.SK_ID_PREV
    credit.SK_ID_PREV -> previous.SK_ID_PREV

In [9]:
# Widen the column display so the description text is not cut short.
pd.options.display.max_colwidth = 100

# Table of the primitives featuretools reports; show only the aggregation rows.
primitives = ft.list_primitives()
is_aggregation = primitives['type'] == 'aggregation'
primitives[is_aggregation]

Unnamed: 0,name,type,description
0,n_most_common,aggregation,Finds the N most common elements in a categorical feature.
1,num_true,aggregation,Finds the number of 'True' values in a boolean.
2,any,aggregation,Test if any value is 'True'.
3,percent_true,aggregation,Finds the percent of 'True' values in a boolean feature.
4,num_unique,aggregation,Returns the number of unique categorical variables.
5,mode,aggregation,Finds the most common element in a categorical feature.
6,last,aggregation,Returns the last value.
7,max,aggregation,Finds the maximum non-null value of a numeric feature.
8,mean,aggregation,Computes the average value of a numeric feature.
9,min,aggregation,Finds the minimum non-null value of a numeric feature.


In [None]:
# Same primitives table as above, restricted to the transform rows.
is_transform = primitives['type'] == 'transform'
primitives[is_transform]

In [11]:
# Aggregation and transform primitives passed to DFS explicitly.
default_agg_primitives = [
    "sum", "std", "max", "skew", "min", "mean",
    "count", "percent_true", "num_unique", "mode",
]
default_trans_primitives = [
    "day", "year", "month", "weekday",
    "haversine", "numwords", "characters",
]

# features_only=True: a single value is returned and only its length is used
# here, so this cell just counts the features DFS would produce for 'app'.
feature_names = ft.dfs(
    entityset=es,
    target_entity='app',
    agg_primitives=default_agg_primitives,
    trans_primitives=default_trans_primitives,
    max_depth=2,
    features_only=True,
)

n_features = len(feature_names)
print('%d Total Features' % n_features)

Exception ignored in: <generator object add_client at 0x7fb699bca308>
RuntimeError: generator ignored GeneratorExit


1697 Total Features


In [13]:
# DFS on 'app' with a hand-picked aggregation list ("sum" is not included
# here). trans_primitives is not passed, so ft.dfs uses whatever its own
# default for that argument is.
# NOTE(review): the output saved with this cell ends in a CommClosedError
# raised while calling the remote 'scatter' method - confirm the n_jobs=2 run
# actually completes before relying on feature_matrix.
default_agg_primitives = ["std", "max", "skew", "min", "mean", "count", "percent_true", "num_unique", "mode"]

dfs_options = dict(
    entityset=es,
    target_entity='app',
    agg_primitives=default_agg_primitives,
    max_depth=2,
    features_only=False,
    verbose=True,
    n_jobs=2,
)
feature_matrix, feature_names = ft.dfs(**dfs_options)

# Preview the first ten rows of the computed matrix.
feature_matrix.head(10)

Built 1278 features


Future exception was never retrieved
future: <Future finished exception=CommClosedError('in <closed TCP>: Stream is closed',)>
Traceback (most recent call last):
  File "/home/yyamada/.pyenv/versions/anaconda3-5.0.1/envs/kaggle/lib/python3.6/site-packages/distributed/comm/tcp.py", line 177, in read
    n_frames = yield stream.read_bytes(8)
  File "/home/yyamada/.pyenv/versions/anaconda3-5.0.1/envs/kaggle/lib/python3.6/site-packages/tornado/gen.py", line 1099, in run
    value = future.result()
tornado.iostream.StreamClosedError: Stream is closed

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/yyamada/.pyenv/versions/anaconda3-5.0.1/envs/kaggle/lib/python3.6/site-packages/tornado/gen.py", line 1107, in run
    yielded = self.gen.throw(*exc_info)
  File "/home/yyamada/.pyenv/versions/anaconda3-5.0.1/envs/kaggle/lib/python3.6/site-packages/distributed/comm/tcp.py", line 198, in read
    convert_stream_closed_error(self

CommClosedError: in <closed TCP>: Stream is closed: while trying to call remote method 'scatter'

In [None]:
# DFS on 'app' with a shorter aggregation list and n_jobs=6.
# trans_primitives is not passed, so ft.dfs uses its own default for it.
default_agg_primitives = ["std", "max", "min", "mean", "count", "mode"]

feature_matrix, feature_names = ft.dfs(
    entityset=es,
    target_entity='app',
    agg_primitives=default_agg_primitives,
    max_depth=2,
    n_jobs=6,
    verbose=True,
    features_only=False,
)

# Preview the first ten rows of the computed matrix.
feature_matrix.head(10)

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder object is reused; it is re-fitted for every column.
le = LabelEncoder()

# Columns of all_df whose dtype is object.
categorical_feats = [name for name, dtype in all_df.dtypes.items() if dtype == 'object']

for col in tqdm(categorical_feats):
    # Cast to str first (missing values become ordinary strings as well),
    # then replace the column with the integer codes from the encoder.
    as_text = all_df[col].astype('str')
    all_df[col] = le.fit(as_text).transform(as_text)

# Split the combined frame back apart using the is_train flag; the flag is
# dropped from both halves and TARGET is additionally dropped from the test half.
from_train = all_df['is_train'] == True
from_test = all_df['is_train'] == False
train_df = all_df[from_train].drop('is_train', axis=1)
test_df = all_df[from_test].drop(['is_train', 'TARGET'], axis=1)

# del all_df
# gc.collect()