In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from sklearn.externals import joblib
import seaborn as sns
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [47]:
X = pd.read_csv('./data/application_train.csv', nrows=None)
X.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
X['CODE_GENDER'].replace('XNA',np.nan, inplace=True)
X['DAYS_EMPLOYED'].replace(365243, np.nan, inplace=True)
X['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan, inplace=True)
X['NAME_FAMILY_STATUS'].replace('Unknown', np.nan, inplace=True)
X['ORGANIZATION_TYPE'].replace('XNA', np.nan, inplace=True)
        
X['annuity_income_percentage'] = X['AMT_ANNUITY'] / X['AMT_INCOME_TOTAL']
X['car_to_birth_ratio'] = X['OWN_CAR_AGE'] / X['DAYS_BIRTH']
X['car_to_employ_ratio'] = X['OWN_CAR_AGE'] / X['DAYS_EMPLOYED']
X['children_ratio'] = X['CNT_CHILDREN'] / X['CNT_FAM_MEMBERS']
X['credit_to_annuity_ratio'] = X['AMT_CREDIT'] / X['AMT_ANNUITY']
X['credit_to_goods_ratio'] = X['AMT_CREDIT'] / X['AMT_GOODS_PRICE']
X['credit_to_income_ratio'] = X['AMT_CREDIT'] / X['AMT_INCOME_TOTAL']
X['days_employed_percentage'] = X['DAYS_EMPLOYED'] / X['DAYS_BIRTH']
X['income_per_child'] = X['AMT_INCOME_TOTAL'] / (1 + X['CNT_CHILDREN'])
X['income_per_person'] = X['AMT_INCOME_TOTAL'] / X['CNT_FAM_MEMBERS']
X['payment_rate'] = X['AMT_ANNUITY'] / X['AMT_CREDIT']
X['phone_to_birth_ratio'] = X['DAYS_LAST_PHONE_CHANGE'] / X['DAYS_BIRTH']
X['NAME_EDUCATION_TYPE_CAT'] = X.NAME_EDUCATION_TYPE.astype("category")
X['OCCUPATION_TYPE_CAT'] = X.OCCUPATION_TYPE.astype("category")
X['ORGANIZATION_TYPE_CAT'] = X.ORGANIZATION_TYPE.astype("category")
X['cnt_non_child'] = X['CNT_FAM_MEMBERS'] - X['CNT_CHILDREN']
X['child_to_non_child_ratio'] = X['CNT_CHILDREN'] / X['cnt_non_child']
X['income_per_non_child'] = X['AMT_INCOME_TOTAL'] / X['cnt_non_child']
X['credit_per_person'] = X['AMT_CREDIT'] / X['CNT_FAM_MEMBERS']
X['credit_per_child'] = X['AMT_CREDIT'] / (1 + X['CNT_CHILDREN'])
X['credit_per_non_child'] = X['AMT_CREDIT'] / X['cnt_non_child']
X['short_employment'] = (X['DAYS_EMPLOYED'] < -2000).astype(int)
X['young_age'] = (X['DAYS_BIRTH'] < -14000).astype(int)
X['DPD_CNT_SOCIAL'] = X['OBS_30_CNT_SOCIAL_CIRCLE'] + X['DEF_30_CNT_SOCIAL_CIRCLE'] + X['OBS_60_CNT_SOCIAL_CIRCLE'] + X['DEF_60_CNT_SOCIAL_CIRCLE']

X['external_sources_weighted'] = X.EXT_SOURCE_1 * 2 + X.EXT_SOURCE_2 * 3 + X.EXT_SOURCE_3 * 4
# for function_name in ['min', 'max', 'sum', 'mean', 'nanmedian']:
#     X['external_sources_{}'.format(function_name)] = eval('np.{}'.format(function_name))(
#         X[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)
    
engineered_numerical_columns = ['annuity_income_percentage',
                                'car_to_birth_ratio',
                                'car_to_employ_ratio',
                                'credit_to_annuity_ratio',
                                'credit_to_goods_ratio',
                                'credit_to_income_ratio',
                                'days_employed_percentage',
                                'income_per_child',
                                'income_per_person',
                                'payment_rate',
                                'phone_to_birth_ratio',
                                'external_sources_weighted',
#                                 'external_sources_min',
#                                 'external_sources_max',
#                                 'external_sources_sum',
#                                 'external_sources_mean',
#                                 'external_sources_nanmedian',
                                'NAME_EDUCATION_TYPE_CAT',
                                'OCCUPATION_TYPE_CAT',
#                                 'ORGANIZATION_TYPE_CAT',
                                'REGION_POPULATION_RELATIVE',
                                'REGION_RATING_CLIENT',
                                'APARTMENTS_AVG',
                                'LANDAREA_AVG',
                                'ELEVATORS_AVG',
                                'LIVINGAREA_AVG',
                                'DPD_CNT_SOCIAL',
                                'FLAG_DOCUMENT_3',
                                'AMT_REQ_CREDIT_BUREAU_QRT',
                                'AMT_REQ_CREDIT_BUREAU_YEAR',
                                'DAYS_BIRTH',
                                'DAYS_EMPLOYED',
                                'DAYS_ID_PUBLISH',
                                'DAYS_REGISTRATION',
                                'cnt_non_child',
                                'child_to_non_child_ratio',
                                'income_per_non_child',
                                'credit_per_person',
                                'credit_per_child',
                                'credit_per_non_child',
                                'short_employment',
                                'young_age',
                                'EXT_SOURCE_1',
                                'EXT_SOURCE_2',
                                'EXT_SOURCE_3']

In [49]:
AGGREGATION_RECIPIES = [
    (['CODE_GENDER', 'NAME_EDUCATION_TYPE'], [('EXT_SOURCE_1', 'mean'),
                                              ('EXT_SOURCE_2', 'mean'),
                                              ('OWN_CAR_AGE', 'max')]),
    (['CODE_GENDER', 'ORGANIZATION_TYPE'], [('AMT_ANNUITY', 'mean'),
                                            ('AMT_INCOME_TOTAL', 'mean'),
                                            ('DAYS_REGISTRATION', 'mean'),
                                            ('EXT_SOURCE_1', 'mean')]),
    (['CODE_GENDER', 'NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY'], [('EXT_SOURCE_1', 'mean'),
                                                                                           ('EXT_SOURCE_2', 'mean')]),
    (['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE'], [('AMT_CREDIT', 'mean'),
                                                  ('AMT_REQ_CREDIT_BUREAU_YEAR', 'mean'),
                                                  ('APARTMENTS_AVG', 'mean'),
                                                  ('BASEMENTAREA_AVG', 'mean'),
                                                  ('EXT_SOURCE_1', 'mean'),
                                                  ('EXT_SOURCE_2', 'mean'),
                                                  ('EXT_SOURCE_3', 'mean'),
                                                  ('NONLIVINGAREA_AVG', 'mean'),
                                                  ('OWN_CAR_AGE', 'mean'),
                                                  ('YEARS_BUILD_AVG', 'mean')]),
    (['NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'REG_CITY_NOT_WORK_CITY'], [('ELEVATORS_AVG', 'mean'),
                                                                            ('EXT_SOURCE_1', 'mean')]),
]
groupby_aggregate_names = []
for groupby_cols, specs in tqdm(AGGREGATION_RECIPIES):
    group_object = X.groupby(groupby_cols)
    for select, agg in tqdm(specs):
        groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)
        X = X.merge(group_object[select]
                              .agg(agg)
                              .reset_index()
                              .rename(index=str,
                                      columns={select: groupby_aggregate_name})
                              [groupby_cols + [groupby_aggregate_name]],
                              on=groupby_cols,
                              how='left')
        groupby_aggregate_names.append(groupby_aggregate_name)

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




In [61]:
dsccb = pd.read_csv('./data/credit_card_balance.csv')
dsccb.loc[dsccb.AMT_DRAWINGS_ATM_CURRENT < 0, 'AMT_DRAWINGS_ATM_CURRENT'] = np.nan
dsccb.loc[dsccb.AMT_DRAWINGS_CURRENT < 0, 'AMT_DRAWINGS_CURRENT'] = np.nan
dsccb_g = dsccb.groupby('SK_ID_CURR').mean()
dsccb_g['AMT_BALANCE_LIMIT_RATIO'] = dsccb_g['AMT_BALANCE'] / dsccb_g['AMT_CREDIT_LIMIT_ACTUAL']
# ccb_columns = ['AMT_BALANCE_LIMIT_RATIO','CNT_DRAWINGS_ATM_CURRENT','AMT_BALANCE','AMT_TOTAL_RECEIVABLE','AMT_RECIVABLE','AMT_RECEIVABLE_PRINCIPAL','MONTHS_BALANCE','AMT_DRAWINGS_ATM_CURRENT','AMT_DRAWINGS_CURRENT','CNT_DRAWINGS_POS_CURRENT']

ccb_sorted = dsccb.sort_values(['SK_ID_CURR', 'MONTHS_BALANCE'])
groupby = ccb_sorted.groupby(by=['SK_ID_CURR'])
ccb_sorted['credit_card_monthly_diff'] = groupby['AMT_BALANCE'].diff()
groupby = ccb_sorted.groupby(by=['SK_ID_CURR'])

g = groupby['credit_card_monthly_diff'].agg('mean').reset_index()
dsccb_g = pd.merge(dsccb_g, g, how='left', on=['SK_ID_CURR'])
# dsccb = dsccb.merge(g, on=['SK_ID_CURR'], how='left')
dsccb_g = dsccb_g.reset_index()

In [57]:
dsbb = pd.read_csv('./data/bureau.csv')
dsbb.loc[dsbb.DAYS_CREDIT_ENDDATE < -40000, 'DAYS_CREDIT_ENDDATE'] = np.nan
dsbb.loc[dsbb.DAYS_CREDIT_UPDATE < -40000, 'DAYS_CREDIT_UPDATE'] = np.nan
dsbb.loc[dsbb.DAYS_ENDDATE_FACT < -40000, 'DAYS_ENDDATE_FACT'] = np.nan
dsbb['bureau_credit_active_binary'] = (dsbb['CREDIT_ACTIVE'] != 'Closed').astype(int)
dsbb_g = dsbb.groupby('SK_ID_CURR').mean()
dsbb_g = dsbb_g.reset_index()
# bb_columns = ['DAYS_CREDIT','CREDIT_DAY_OVERDUE','DAYS_CREDIT_ENDDATE','DAYS_ENDDATE_FACT','AMT_CREDIT_MAX_OVERDUE','CNT_CREDIT_PROLONG','AMT_CREDIT_SUM','AMT_CREDIT_SUM_DEBT','AMT_CREDIT_SUM_LIMIT','AMT_CREDIT_SUM_OVERDUE','DAYS_CREDIT_UPDATE']

In [40]:
prev = pd.read_csv('./data/previous_application.csv')
# Days 365.243 values -> nan
prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)
# Add feature: value ask / value received percentage
prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
# Previous applications numeric features
num_aggregations = {
    'AMT_ANNUITY': ['min', 'max', 'mean'],
    'AMT_APPLICATION': ['min', 'max', 'mean'],
    'AMT_CREDIT': ['min', 'max', 'mean'],
    'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
    'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
    'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
    'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
    'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
    'DAYS_DECISION': ['min', 'max', 'mean'],
    'CNT_PAYMENT': ['mean', 'sum'],
}
# Previous applications categorical features
# cat_aggregations = {}
# for cat in cat_cols:
#     cat_aggregations[cat] = ['mean']

prev_agg = prev.groupby('SK_ID_CURR').agg({**num_aggregations})
prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
# Previous Applications: Approved Applications - only numerical features
approved = prev[prev['NAME_CONTRACT_STATUS'] == 'Approved']
approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
approved_agg.columns = pd.Index(['APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')
# Previous Applications: Refused Applications - only numerical features
refused = prev[prev['NAME_CONTRACT_STATUS'] == 'Refused']
refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
refused_agg.columns = pd.Index(['REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
del refused, refused_agg, approved, approved_agg, prev

In [41]:
pos = pd.read_csv('./data/POS_CASH_balance.csv')
# Features
aggregations = {
    'MONTHS_BALANCE': ['max', 'mean', 'size'],
    'SK_DPD': ['max', 'mean'],
    'SK_DPD_DEF': ['max', 'mean']
}
# for cat in cat_cols:
#     aggregations[cat] = ['mean']

pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
# Count pos cash accounts
pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
del pos

In [42]:
ins = pd.read_csv('./data/installments_payments.csv')
# ins, cat_cols = one_hot_encoder(ins, nan_as_category= True)
# Percentage and difference paid in each installment (amount paid and installment value)
ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
# Days past due and days before due (no negative values)
ins['DPD'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
ins['DBD'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
ins['DPD'] = ins['DPD'].apply(lambda x: x if x > 0 else 0)
ins['DBD'] = ins['DBD'].apply(lambda x: x if x > 0 else 0)
# Features: Perform aggregations
aggregations = {
    'NUM_INSTALMENT_VERSION': ['nunique'],
    'DPD': ['max', 'mean', 'sum'],
    'DBD': ['max', 'mean', 'sum'],
    'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
    'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
    'AMT_INSTALMENT': ['max', 'mean', 'sum'],
    'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
    'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
}
# for cat in cat_cols:
#     aggregations[cat] = ['mean']
ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
# Count installments accounts
ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()
del ins

In [150]:
list(prev_agg.columns)

['PREV_AMT_ANNUITY_MIN',
 'PREV_AMT_ANNUITY_MAX',
 'PREV_AMT_ANNUITY_MEAN',
 'PREV_AMT_APPLICATION_MIN',
 'PREV_AMT_APPLICATION_MAX',
 'PREV_AMT_APPLICATION_MEAN',
 'PREV_AMT_CREDIT_MIN',
 'PREV_AMT_CREDIT_MAX',
 'PREV_AMT_CREDIT_MEAN',
 'PREV_APP_CREDIT_PERC_MIN',
 'PREV_APP_CREDIT_PERC_MAX',
 'PREV_APP_CREDIT_PERC_MEAN',
 'PREV_APP_CREDIT_PERC_VAR',
 'PREV_AMT_DOWN_PAYMENT_MIN',
 'PREV_AMT_DOWN_PAYMENT_MAX',
 'PREV_AMT_DOWN_PAYMENT_MEAN',
 'PREV_AMT_GOODS_PRICE_MIN',
 'PREV_AMT_GOODS_PRICE_MAX',
 'PREV_AMT_GOODS_PRICE_MEAN',
 'PREV_HOUR_APPR_PROCESS_START_MIN',
 'PREV_HOUR_APPR_PROCESS_START_MAX',
 'PREV_HOUR_APPR_PROCESS_START_MEAN',
 'PREV_RATE_DOWN_PAYMENT_MIN',
 'PREV_RATE_DOWN_PAYMENT_MAX',
 'PREV_RATE_DOWN_PAYMENT_MEAN',
 'PREV_DAYS_DECISION_MIN',
 'PREV_DAYS_DECISION_MAX',
 'PREV_DAYS_DECISION_MEAN',
 'PREV_CNT_PAYMENT_MEAN',
 'PREV_CNT_PAYMENT_SUM',
 'APPROVED_AMT_ANNUITY_MIN',
 'APPROVED_AMT_ANNUITY_MAX',
 'APPROVED_AMT_ANNUITY_MEAN',
 'APPROVED_AMT_APPLICATION_MIN',
 'APPRO

In [62]:
X_final2 = X[['SK_ID_CURR'] + engineered_numerical_columns + groupby_aggregate_names]
X_final2 = pd.merge(X_final2, dsbb_g, how='left', on=['SK_ID_CURR'])
X_final2 = pd.merge(X_final2, dsccb_g, how='left', on=['SK_ID_CURR'])
X_final2 = pd.merge(X_final2, prev_agg, how='left', on=['SK_ID_CURR'])
X_final2 = pd.merge(X_final2, pos_agg, how='left', on=['SK_ID_CURR'])
X_final2 = pd.merge(X_final2, ins_agg, how='left', on=['SK_ID_CURR'])
X_final2 = X_final2[list(prev_agg.columns) + list(pos_agg.columns) + list(ins_agg.columns) + list(dsccb_g.columns) + list(dsbb_g.columns) + engineered_numerical_columns + groupby_aggregate_names]

# X_final2 = X[engineered_numerical_columns + groupby_aggregate_names]
X_final2 = X_final2.replace(-np.inf, np.nan)
y_ = X['TARGET']
X_ = X_final2

X_train, X_val, y_train, y_val = train_test_split(X_, y_, test_size=0.2, random_state=1)

lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

In [106]:
y_.value_counts()

0    282686
1     24825
Name: TARGET, dtype: int64

In [None]:
clf = lgb.LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
#     is_unbalance=True,
    n_estimators=10000,
    metric='auc',
    nthread=4,
    num_leaves=35,
    learning_rate=0.02,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    max_depth=8,
    reg_alpha=0.041545473,
    reg_lambda=0.0735294,
    min_split_gain=0.0222415,
    sub_sample=0.9,
    min_child_weight=39.3259775,
    bagging_freq=5,
    verbose=0
    )
clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], 
            eval_metric='auc', verbose=100, early_stopping_rounds=200)
# gbm = lgb.fit(params,
# lgb_train,
# num_boost_round=1500,
# valid_sets=lgb_eval,
# early_stopping_rounds=20)

Training until validation scores don't improve for 200 rounds.


In [71]:
# print('Feature importances:', sorted(list(zip(list(clf.feature_importance), clf.feature_name()))))
fold_importance_df = pd.DataFrame()
fold_importance_df["feature"] = list(X_train.columns)
fold_importance_df["importance"] = clf.feature_importances_
tmp = fold_importance_df.sort_values('importance', ascending=False)
for item in tmp.as_matrix():
    print(item)

['ORGANIZATION_TYPE_CAT' 4416]
['EXT_SOURCE_3' 859]
['EXT_SOURCE_2' 716]
['EXT_SOURCE_1' 705]
['OCCUPATION_TYPE_CAT' 608]
['DAYS_BIRTH' 564]
['payment_rate' 514]
['credit_to_annuity_ratio' 514]
['AMT_CREDIT_SUM' 487]
['AMT_CREDIT_SUM_DEBT' 463]
['credit_to_goods_ratio' 418]
['APPROVED_CNT_PAYMENT_MEAN' 416]
['phone_to_birth_ratio' 406]
['annuity_income_percentage' 406]
['credit_per_non_child' 391]
['INSTAL_DPD_MEAN' 376]
['DAYS_ID_PUBLISH' 366]
['AMT_CREDIT_MAX_OVERDUE' 361]
['DAYS_CREDIT' 357]
['INSTAL_DAYS_ENTRY_PAYMENT_MAX' 351]
['DAYS_CREDIT_ENDDATE' 346]
['DAYS_REGISTRATION' 336]
['external_sources_weighted' 313]
['DAYS_EMPLOYED' 310]
['INSTAL_DBD_SUM' 295]
['SK_ID_BUREAU' 276]
['POS_MONTHS_BALANCE_SIZE' 274]
['INSTAL_AMT_PAYMENT_SUM' 271]
['PREV_CNT_PAYMENT_MEAN' 263]
['DAYS_CREDIT_UPDATE' 261]
['REGION_POPULATION_RELATIVE' 260]
['bureau_credit_active_binary' 259]
['APPROVED_DAYS_DECISION_MAX' 241]
['days_employed_percentage' 238]
['car_to_birth_ratio' 236]
['car_to_employ_ratio'

  


In [18]:
# X_test = pd.merge(X_test, dsbb_g, how='left', on=['SK_ID_CURR'])
# X_test = pd.merge(X_test, dsccb_g, how='left', on=['SK_ID_CURR'])
# X_test = pd.merge(X_test, prev_agg, how='left', on=['SK_ID_CURR'])
# X_test = pd.merge(X_test, pos_agg, how='left', on=['SK_ID_CURR'])
# X_test = pd.merge(X_test, ins_agg, how='left', on=['SK_ID_CURR'])
dsccb_g.columns

Index(['SK_ID_CURR', 'SK_ID_PREV', 'MONTHS_BALANCE', 'AMT_BALANCE',
       'AMT_CREDIT_LIMIT_ACTUAL', 'AMT_DRAWINGS_ATM_CURRENT',
       'AMT_DRAWINGS_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT',
       'AMT_DRAWINGS_POS_CURRENT', 'AMT_INST_MIN_REGULARITY',
       'AMT_PAYMENT_CURRENT', 'AMT_PAYMENT_TOTAL_CURRENT',
       'AMT_RECEIVABLE_PRINCIPAL', 'AMT_RECIVABLE', 'AMT_TOTAL_RECEIVABLE',
       'CNT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_CURRENT',
       'CNT_DRAWINGS_OTHER_CURRENT', 'CNT_DRAWINGS_POS_CURRENT',
       'CNT_INSTALMENT_MATURE_CUM', 'SK_DPD', 'SK_DPD_DEF',
       'AMT_BALANCE_LIMIT_RATIO'],
      dtype='object')

In [64]:
XT = pd.read_csv('./data/application_test.csv', nrows=None)
XT['CODE_GENDER'].replace('XNA',np.nan, inplace=True)
XT['annuity_income_percentage'] = XT['AMT_ANNUITY'] / XT['AMT_INCOME_TOTAL']
XT['car_to_birth_ratio'] = XT['OWN_CAR_AGE'] / XT['DAYS_BIRTH']
XT['car_to_employ_ratio'] = XT['OWN_CAR_AGE'] / XT['DAYS_EMPLOYED']
XT['children_ratio'] = XT['CNT_CHILDREN'] / XT['CNT_FAM_MEMBERS']
XT['credit_to_annuity_ratio'] = XT['AMT_CREDIT'] / XT['AMT_ANNUITY']
XT['credit_to_goods_ratio'] = XT['AMT_CREDIT'] / XT['AMT_GOODS_PRICE']
XT['credit_to_income_ratio'] = XT['AMT_CREDIT'] / XT['AMT_INCOME_TOTAL']
XT['days_employed_percentage'] = XT['DAYS_EMPLOYED'] / XT['DAYS_BIRTH']
XT['income_per_child'] = XT['AMT_INCOME_TOTAL'] / (1 + XT['CNT_CHILDREN'])
XT['income_per_person'] = XT['AMT_INCOME_TOTAL'] / XT['CNT_FAM_MEMBERS']
XT['payment_rate'] = XT['AMT_ANNUITY'] / XT['AMT_CREDIT']
XT['phone_to_birth_ratio'] = XT['DAYS_LAST_PHONE_CHANGE'] / XT['DAYS_BIRTH']
XT['NAME_EDUCATION_TYPE_CAT'] = XT.NAME_EDUCATION_TYPE.astype("category")
XT['OCCUPATION_TYPE_CAT'] = XT.OCCUPATION_TYPE.astype("category")
XT['ORGANIZATION_TYPE_CAT'] = XT.ORGANIZATION_TYPE.astype("category")
XT['cnt_non_child'] = X['CNT_FAM_MEMBERS'] - X['CNT_CHILDREN']
XT['child_to_non_child_ratio'] = XT['CNT_CHILDREN'] / XT['cnt_non_child']
XT['income_per_non_child'] = XT['AMT_INCOME_TOTAL'] / XT['cnt_non_child']
XT['credit_per_person'] = XT['AMT_CREDIT'] / XT['CNT_FAM_MEMBERS']
XT['credit_per_child'] = XT['AMT_CREDIT'] / (1 + XT['CNT_CHILDREN'])
XT['credit_per_non_child'] = XT['AMT_CREDIT'] / XT['cnt_non_child']
XT['short_employment'] = (XT['DAYS_EMPLOYED'] < -2000).astype(int)
XT['young_age'] = (XT['DAYS_BIRTH'] < -14000).astype(int)
XT['DPD_CNT_SOCIAL'] = XT['OBS_30_CNT_SOCIAL_CIRCLE'] + XT['DEF_30_CNT_SOCIAL_CIRCLE'] + XT['OBS_60_CNT_SOCIAL_CIRCLE'] + XT['DEF_60_CNT_SOCIAL_CIRCLE']

XT['external_sources_weighted'] = XT.EXT_SOURCE_1 * 2 + XT.EXT_SOURCE_2 * 3 + XT.EXT_SOURCE_3 * 4
# for function_name in ['min', 'max', 'sum', 'mean', 'nanmedian']:
#     XT['external_sources_{}'.format(function_name)] = eval('np.{}'.format(function_name))(
#         XT[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)
    
for groupby_cols, specs in tqdm(AGGREGATION_RECIPIES):
    group_object = XT.groupby(groupby_cols)
    for select, agg in tqdm(specs):
        groupby_aggregate_name = '{}_{}_{}'.format('_'.join(groupby_cols), agg, select)
        XT = XT.merge(group_object[select]
                              .agg(agg)
                              .reset_index()
                              .rename(index=str,
                                      columns={select: groupby_aggregate_name})
                              [groupby_cols + [groupby_aggregate_name]],
                              on=groupby_cols,
                              how='left')
        
X_test = XT[['SK_ID_CURR'] + engineered_numerical_columns + groupby_aggregate_names]
X_test = pd.merge(X_test, dsbb_g, how='left', on=['SK_ID_CURR'])
X_test = pd.merge(X_test, dsccb_g, how='left', on=['SK_ID_CURR'])
X_test = pd.merge(X_test, prev_agg, how='left', on=['SK_ID_CURR'])
X_test = pd.merge(X_test, pos_agg, how='left', on=['SK_ID_CURR'])
X_test = pd.merge(X_test, ins_agg, how='left', on=['SK_ID_CURR'])
X_test = X_test.replace(-np.inf, np.nan)
X_test = X_test[list(prev_agg.columns) + list(pos_agg.columns) + list(ins_agg.columns) + list(dsccb_g.columns) + list(dsbb_g.columns) + engineered_numerical_columns + groupby_aggregate_names]
# X_test = XT[engineered_numerical_columns + groupby_aggregate_names]
y_test = clf.predict_proba(X_test)
X_raw = XT.values
f = open("./result4.csv", "w")
result = 'SK_ID_CURR,TARGET\n'
for i in range(len(y_test)):
    result += (str(X_raw[i][0]) + ',' + str(y_test[i][1]) + '\n')
f.write(result)
f.close()
print('finished')

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))


finished
