In [1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# Clearing up memory
import gc

# Featuretools for automated feature engineering
import featuretools as ft

# Suppress warnings.
# NOTE: filterwarnings('ignore') is called without a category or module filter,
# so it silences every Python warning raised after this point (deprecation
# warnings included), not only the ones emitted by pandas.
import warnings
warnings.filterwarnings('ignore')

print('Reading in data')

# Load every input table in full. The CSVs are read one after another, in the
# order listed, and bound to one dataframe name per table.
(app_train, app_test, bureau, bureau_balance,
 cash, credit, previous, installments) = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/application_train.csv',
        '../input/application_test.csv',
        '../input/bureau.csv',
        '../input/bureau_balance.csv',
        '../input/POS_CASH_balance.csv',
        '../input/credit_card_balance.csv',
        '../input/previous_application.csv',
        '../input/installments_payments.csv',
    )
]

Reading in data


In [2]:
# Join the application dataframes together.
# Each frame is tagged with its origin in a 'set' column so the rows can be
# told apart after stacking. The test frame gets TARGET = -999 as a
# placeholder value.
app_test['set'] = 'test'
app_test['TARGET'] = -999
app_train['set'] = 'train'

# Stack train on top of test (this is a row bind in R).
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; with ignore_index=True the result gets a fresh
# 0..n-1 index, exactly as append(ignore_index=True) did.
app = pd.concat([app_train, app_test], ignore_index = True)


# Preprocessing by aguiar: drop the rows whose CODE_GENDER is 'XNA'.
# .copy() makes `app` an independent frame, so the column assignments done on
# it afterwards never act on a slice of the concatenated frame.
app = app[app['CODE_GENDER'] != 'XNA'].copy()
    

In [3]:
# Column groups used for the row-wise indicator statistics below.
# docs: every column whose name contains 'FLAG_DOC'.
docs = [_f for _f in app.columns if 'FLAG_DOC' in _f]
# live: the other 'FLAG_' columns, excluding names that contain '_FLAG_'.
live = [_f for _f in app.columns if ('FLAG_' in _f) and ('FLAG_DOC' not in _f) and ('_FLAG_' not in _f)]

# The three external score columns are used together several times below.
ext_sources = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

# add features by Olivier

# NaN values for DAYS_EMPLOYED: 365243 -> nan
# The result is assigned back instead of calling replace(..., inplace=True) on
# the selected column: the in-place form mutates an intermediate Series and is
# not guaranteed to update `app` (it does not under pandas copy-on-write).
app['DAYS_EMPLOYED'] = app['DAYS_EMPLOYED'].replace(365243, np.nan)

# Median AMT_INCOME_TOTAL per ORGANIZATION_TYPE, computed over every row of `app`.
inc_by_org = app.groupby('ORGANIZATION_TYPE')['AMT_INCOME_TOTAL'].median()

app['NEW_CREDIT_TO_ANNUITY_RATIO'] = app['AMT_CREDIT'] / app['AMT_ANNUITY']
app['NEW_CREDIT_TO_GOODS_RATIO'] = app['AMT_CREDIT'] / app['AMT_GOODS_PRICE']

# Row-wise statistics over the document indicator columns.
app['NEW_DOC_IND_AVG'] = app[docs].mean(axis=1)
app['NEW_DOC_IND_STD'] = app[docs].std(axis=1)
app['NEW_DOC_IND_KURT'] = app[docs].kurtosis(axis=1)

# Row-wise statistics over the remaining flag columns.
# `live` also matches FLAG_OWN_CAR and FLAG_OWN_REALTY, which are only
# factorized into integer codes in a later cell and are presumably still
# strings here (TODO confirm against the raw data). numeric_only=True
# restricts the reductions to the numeric flags explicitly: older pandas
# dropped non-numeric columns silently in this situation, while pandas >= 2.0
# raises a TypeError instead.
app['NEW_LIVE_IND_SUM'] = app[live].sum(axis=1, numeric_only=True)
app['NEW_LIVE_IND_STD'] = app[live].std(axis=1, numeric_only=True)
app['NEW_LIVE_IND_KURT'] = app[live].kurtosis(axis=1, numeric_only=True)

app['NEW_INC_PER_CHLD'] = app['AMT_INCOME_TOTAL'] / (1 + app['CNT_CHILDREN'])
app['NEW_INC_BY_ORG'] = app['ORGANIZATION_TYPE'].map(inc_by_org)
app['NEW_EMPLOY_TO_BIRTH_RATIO'] = app['DAYS_EMPLOYED'] / app['DAYS_BIRTH']
app['NEW_ANNUITY_TO_INCOME_RATIO'] = app['AMT_ANNUITY'] / (1 + app['AMT_INCOME_TOTAL'])
app['NEW_SOURCES_PROD'] = app['EXT_SOURCE_1'] * app['EXT_SOURCE_2'] * app['EXT_SOURCE_3']
app['NEW_EXT_SOURCES_MEAN'] = app[ext_sources].mean(axis=1)
app['NEW_SCORES_STD'] = app[ext_sources].std(axis=1)
# Rows where the standard deviation is undefined get the column mean instead.
app['NEW_SCORES_STD'] = app['NEW_SCORES_STD'].fillna(app['NEW_SCORES_STD'].mean())
app['NEW_CAR_TO_BIRTH_RATIO'] = app['OWN_CAR_AGE'] / app['DAYS_BIRTH']
app['NEW_CAR_TO_EMPLOY_RATIO'] = app['OWN_CAR_AGE'] / app['DAYS_EMPLOYED']
app['NEW_PHONE_TO_BIRTH_RATIO'] = app['DAYS_LAST_PHONE_CHANGE'] / app['DAYS_BIRTH']
app['NEW_PHONE_TO_EMPLOY_RATIO'] = app['DAYS_LAST_PHONE_CHANGE'] / app['DAYS_EMPLOYED']
app['NEW_CREDIT_TO_INCOME_RATIO'] = app['AMT_CREDIT'] / app['AMT_INCOME_TOTAL']


# add features by Kageyama
# NOTE(review): several denominators below (e.g. DAYS_ID_PUBLISH,
# DAYS_LAST_PHONE_CHANGE, DAYS_EMPLOYED) may contain zeros, in which case the
# ratio is +/-inf rather than NaN -- confirm that downstream consumers accept that.
app['NEW_INCOME_TO_BIRTH_RATIO'] = app['AMT_INCOME_TOTAL'] / app['DAYS_BIRTH']
app['NEW_REGISTRATION_TO_BIRTH_RATIO'] = app['DAYS_REGISTRATION'] / app['DAYS_BIRTH']
app['NEW_PUBLISH_TO_BIRTH_RATIO'] = app['DAYS_ID_PUBLISH'] / app['DAYS_BIRTH']
app['NEW_CREDIT_TO_BIRTH_RATIO'] = app['AMT_CREDIT'] / app['DAYS_BIRTH']
app['NEW_ANNUITY_TO_BIRTH_RATIO'] = app['AMT_ANNUITY'] / app['DAYS_BIRTH']
app['NEW_INCOME_TO_EMPLOY_RATIO'] = app['AMT_INCOME_TOTAL'] / app['DAYS_EMPLOYED']
app['NEW_REGISTRATION_TO_EMPLOY_RATIO'] = app['DAYS_REGISTRATION'] / app['DAYS_EMPLOYED']
app['NEW_PUBLISH_TO_EMPLOY_RATIO'] = app['DAYS_ID_PUBLISH'] / app['DAYS_EMPLOYED']
app['NEW_CREDIT_TO_EMPLOY_RATIO'] = app['AMT_CREDIT'] / app['DAYS_EMPLOYED']
app['NEW_ANNUITY_TO_EMPLOY_RATIO'] = app['AMT_ANNUITY'] / app['DAYS_EMPLOYED']
app['NEW_EXT_SOURCES_1_TO_MEAN_RATIO'] = app['EXT_SOURCE_1'] / app['NEW_EXT_SOURCES_MEAN']
app['NEW_EXT_SOURCES_2_TO_MEAN_RATIO'] = app['EXT_SOURCE_2'] / app['NEW_EXT_SOURCES_MEAN']
app['NEW_EXT_SOURCES_3_TO_MEAN_RATIO'] = app['EXT_SOURCE_3'] / app['NEW_EXT_SOURCES_MEAN']
# NEW_CREDIT_TO_ANNUITY_RATIO is already computed in the Olivier section above
# from the same, unchanged inputs, so it is not recomputed here.
# NEW_CREDIT_TO_GOODS_PRICE_RATIO holds the same values as NEW_CREDIT_TO_GOODS_RATIO;
# it is kept so the set of output column names stays unchanged.
app['NEW_CREDIT_TO_GOODS_PRICE_RATIO'] = app['AMT_CREDIT'] / app['AMT_GOODS_PRICE']
app['NEW_ANNUITY_TO_GOODS_PRICE_RATIO'] = app['AMT_ANNUITY'] / app['AMT_GOODS_PRICE']
app['NEW_INCOME_TO_GOODS_PRICE_RATIO'] = app['AMT_INCOME_TOTAL'] / app['AMT_GOODS_PRICE']
app['NEW_REGISTRATION_TO_PUBLISH_RATIO'] = app['DAYS_REGISTRATION'] / app['DAYS_ID_PUBLISH']
app['NEW_INCOME_TO_PHONE_RATIO'] = app['AMT_INCOME_TOTAL'] / app['DAYS_LAST_PHONE_CHANGE']
app['NEW_CREDIT_TO_PHONE_RATIO'] = app['AMT_CREDIT'] / app['DAYS_LAST_PHONE_CHANGE']
app['NEW_ANNUITY_TO_PHONE_RATIO'] = app['AMT_ANNUITY'] / app['DAYS_LAST_PHONE_CHANGE']
app['NEW_GOODS_PRICE_TO_PHONE_RATIO'] = app['AMT_GOODS_PRICE'] / app['DAYS_LAST_PHONE_CHANGE']

In [4]:
# Binary-encode the two-category categorical columns (0 or 1):
# pd.factorize returns the integer codes and the array of distinct values;
# only the codes are written back into the column.
for bin_feature in ('CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY'):
    codes, uniques = pd.factorize(app[bin_feature])
    app[bin_feature] = codes

## New features, by aguiar.
## The ones that overlap with Olivier's features are commented out.

# Some simple new features (percentages)
income_total = app['AMT_INCOME_TOTAL']
# app['DAYS_EMPLOYED_PERC'] = app['DAYS_EMPLOYED'] / app['DAYS_BIRTH']
app['INCOME_CREDIT_PERC'] = income_total.div(app['AMT_CREDIT'])
app['INCOME_PER_PERSON'] = income_total.div(app['CNT_FAM_MEMBERS'])
# app['ANNUITY_INCOME_PERC'] = app['AMT_ANNUITY'] / app['AMT_INCOME_TOTAL']
# app['PAYMENT_RATE'] = app['AMT_ANNUITY'] / app['AMT_CREDIT']

In [5]:
# Preprocessing by aguiar
# In these day-offset columns the value 365243 is treated as missing.
# The replacement is done on all five columns at once and assigned back:
# calling replace(..., inplace=True) on a selected column mutates an
# intermediate Series and is not guaranteed to update `previous`
# (it does not under pandas copy-on-write).
placeholder_day_cols = [
    'DAYS_FIRST_DRAWING',
    'DAYS_FIRST_DUE',
    'DAYS_LAST_DUE_1ST_VERSION',
    'DAYS_LAST_DUE',
    'DAYS_TERMINATION',
]
previous[placeholder_day_cols] = previous[placeholder_day_cols].replace(365243, np.nan)

# Add feature: value ask / value received percentage
previous['APP_CREDIT_PERC'] = previous['AMT_APPLICATION'] / previous['AMT_CREDIT']

In [6]:
# Preprocessing by aguiar
# Percentage and difference paid in each installment (amount paid and installment value)
installments['PAYMENT_PERC'] = installments['AMT_PAYMENT'] / installments['AMT_INSTALMENT']
installments['PAYMENT_DIFF'] = installments['AMT_INSTALMENT'] - installments['AMT_PAYMENT']

# Days past due and days before due (no negative values).
# Series.where keeps the values that are strictly positive and writes 0
# everywhere else (missing values included), which is what the previous
# per-row `x if x > 0 else 0` lambda produced, but vectorised instead of
# calling a Python function once per row.
days_past_due = installments['DAYS_ENTRY_PAYMENT'] - installments['DAYS_INSTALMENT']
days_before_due = installments['DAYS_INSTALMENT'] - installments['DAYS_ENTRY_PAYMENT']
installments['DPD'] = days_past_due.where(days_past_due > 0, 0)
installments['DBD'] = days_before_due.where(days_before_due > 0, 0)

In [7]:
def _register_entity(entityset, name, frame, key, **options):
    """Add `frame` to `entityset` as entity `name` indexed by `key`.

    Any extra keyword arguments are forwarded unchanged to
    entity_from_dataframe. Returns whatever entity_from_dataframe returns.
    """
    return entityset.entity_from_dataframe(entity_id = name, dataframe = frame,
                                           index = key, **options)


# Create the entity set with an id
es = ft.EntitySet(id = 'applications')

# Add in all the entities

# Tables that already have a unique key column: use it as the index
es = _register_entity(es, 'app', app, 'SK_ID_CURR')
es = _register_entity(es, 'bureau', bureau, 'SK_ID_BUREAU')
es = _register_entity(es, 'previous', previous, 'SK_ID_PREV')

# Tables without a unique key: ask featuretools to create the named index column
es = _register_entity(es, 'bureau_balance', bureau_balance, 'bb_index', make_index = True)
es = _register_entity(es, 'cash', cash, 'cash_index', make_index = True)
es = _register_entity(es, 'installments', installments, 'in_index', make_index = True)
es = _register_entity(es, 'credit', credit, 'credit_index', make_index = True)


def _link(first_entity, second_entity, key):
    """Build ft.Relationship(es[first_entity][key], es[second_entity][key]).

    The first variable is passed in the parent position and the second in the
    child position of ft.Relationship; both entities are joined on the same
    column name `key`.
    """
    return ft.Relationship(es[first_entity][key], es[second_entity][key])


# app <-> bureau, joined on the current application id
r_app_bureau = _link('app', 'bureau', 'SK_ID_CURR')

# bureau <-> bureau balance, joined on the bureau record id
r_bureau_balance = _link('bureau', 'bureau_balance', 'SK_ID_BUREAU')

# current app <-> previous apps, joined on the current application id
r_app_previous = _link('app', 'previous', 'SK_ID_CURR')

# previous apps <-> cash, installments and credit, joined on the previous application id
r_previous_cash = _link('previous', 'cash', 'SK_ID_PREV')
r_previous_installments = _link('previous', 'installments', 'SK_ID_PREV')
r_previous_credit = _link('previous', 'credit', 'SK_ID_PREV')

# Register all six relationships on the entity set in one call
all_relationships = [
    r_app_bureau,
    r_bureau_balance,
    r_app_previous,
    r_previous_cash,
    r_previous_installments,
    r_previous_credit,
]
es = es.add_relationships(all_relationships)

print(es)
                           
print('Clearing up memory')

gc.enable()
# Remove the notebook-level names of the raw tables, then run a collection pass.
del app
del bureau
del bureau_balance
del cash
del credit
del installments
del previous
gc.collect()

print('Deep Feature Synthesis in Progress')

# Primitive names handed to featuretools: aggregations first, then transforms
default_agg_primitives = [
    "std", "max", "skew", "min", "mean",
    "count", "percent_true", "num_unique", "mode",
]
default_trans_primitives = ["weekday", "haversine", "numwords", "characters"]

# Run DFS with 'app' as the target entity and a max depth of 2.
# features_only=False requests the computed feature matrix together with the
# feature definitions; verbose=True prints progress.
dfs_settings = dict(
    entityset = es,
    target_entity = 'app',
    trans_primitives = default_trans_primitives,
    agg_primitives = default_agg_primitives,
    max_depth = 2,
    features_only = False,
    verbose = True,
)
feature_matrix, feature_names = ft.dfs(**dfs_settings)

# Move the index (SK_ID_CURR) back into an ordinary column
feature_matrix = feature_matrix.reset_index()

print('Saving features')
feature_matrix.to_csv('new_feature_by_kageyama_feature_matrix.csv', index = False)

Entityset: applications
  Entities:
    app [Rows: 356251, Columns: 166]
    bureau [Rows: 1716428, Columns: 17]
    previous [Rows: 1670214, Columns: 38]
    bureau_balance [Rows: 27299925, Columns: 4]
    cash [Rows: 10001358, Columns: 9]
    installments [Rows: 13605401, Columns: 13]
    credit [Rows: 3840312, Columns: 24]
  Relationships:
    bureau.SK_ID_CURR -> app.SK_ID_CURR
    bureau_balance.SK_ID_BUREAU -> bureau.SK_ID_BUREAU
    previous.SK_ID_CURR -> app.SK_ID_CURR
    cash.SK_ID_PREV -> previous.SK_ID_PREV
    installments.SK_ID_PREV -> previous.SK_ID_PREV
    credit.SK_ID_PREV -> previous.SK_ID_PREV
Clearing up memory
Deep Feature Synthesis in Progress
Built 1430 features
Elapsed: 31:57:19 | Remaining: 00:00 | Progress: 100%|██████████| Calculated: 11/11 chunks  
Saving features
