## Home Credit Default Risk - Assignment 4

In [1]:
# References used:
# https://www.kaggle.com/jsaguiar/lightgbm-with-simple-features
# https://www.kaggle.com/prashantkikani/home-rf-et-xgb-cb-stack-oof1-lb-0-789
# Various documentation about sklearn/fastai and other kernels on the challenge discussion

# Configuration
sample_size = None # None for full training, else number of rows, careful this will break fastai preprocessing (it fails if any test feature has NaN that did not exist with NaN in the training set)
folder = 'dataset' # data folder
load_preprocessed = True # load preprocessed data from disk to save time
fname_preprocessed = 'my_preprocessed_data.csv'

In [2]:
import os
import gc
import pandas as pd
import numpy as np
import lightgbm as lgbm
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from fastai import *
from fastai.tabular import *

df_train = pd.read_csv(folder+'/application_train.csv', nrows=sample_size)
df_test = pd.read_csv(folder+'/application_test.csv', nrows=sample_size)

df_bureau = pd.read_csv(folder+'/bureau.csv', nrows=sample_size)
df_bureau_balance = pd.read_csv(folder+'/bureau_balance.csv', nrows=sample_size)
df_credit_card = pd.read_csv(folder+'/credit_card_balance.csv', nrows=sample_size)
df_pos_cash = pd.read_csv(folder+'/POS_CASH_balance.csv', nrows=sample_size)
df_prev_app = pd.read_csv(folder+'/previous_application.csv', nrows=sample_size)
df_install = pd.read_csv(folder+'/installments_payments.csv', nrows=sample_size)
    
print(f'Training shape = {df_train.shape}')
print(f'Training shape = {df_test.shape}')
print(f'Target proportion = {round(df_train["TARGET"].mean(), 2)}')

Training shape = (307511, 122)
Training shape = (48744, 121)
Target proportion = 0.08


Target proportotion shows target column is highly imbalanced (92% vs. 8%).

## Preprocessing & Feature Engineering

In [3]:
# combine both train & test into one dataframe for preprocessing
key = 'SK_ID_CURR' # primary key
len_train = len(df_train) # length of training data 
df_both = pd.concat([df_train, df_test], sort=False) # combined train & testdataframe

if not load_preprocessed:
    # combine additional datasets into one big dataframe & engineer some features (mostly aggregations)
    print(f'Combined train & test before processing {df_both.shape}')

    # main dataframe
    print('Processing main data..')
    ## loan relative to salary
    df_both['LOAN_INCOME_RATIO'] = df_both['AMT_CREDIT'] / df_both['AMT_INCOME_TOTAL']
    ## annuities (fixed stream of payments) to income
    df_both['ANNUITY_INCOME_RATIO'] = df_both['AMT_ANNUITY'] / df_both['AMT_INCOME_TOTAL']
    ## number of days employed in life
    df_both['WORKING_LIFE_RATIO'] = df_both['DAYS_EMPLOYED'] / df_both['DAYS_BIRTH']
    ## income per family member
    df_both['INCOME_PER_FAM'] = df_both['AMT_INCOME_TOTAL'] / df_both['CNT_FAM_MEMBERS']
    ## income credit ratio
    df_both['INCOME_CREDIT_PERC'] = df_both['AMT_INCOME_TOTAL'] / df_both['AMT_CREDIT']
    ## children per household, relative to amount of familymembers
    df_both['CHILDREN_RATIO'] = df_both['CNT_CHILDREN'] / df_both['CNT_FAM_MEMBERS']
    df_merged = df_both
    print(f'\t-> {df_merged.shape}')

    # bureau
    print('Processing bureau data..')
    ## just join them, so we can use SK_ID_BUREAU in the next step
    df_merged = df_merged.merge(df_bureau, left_on=key, right_index=True, how='left', suffixes=['', '_BUR'])    
    print(f'\t-> {df_merged.shape}')

    # bureau balance
    print('Processing bureau balance..')
    most_recent_index = df_bureau_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].idxmax() # first occurence of maximum
    df_bureau_balance = df_bureau_balance.loc[most_recent_index, :]
    df_merged = df_merged.merge(df_bureau_balance, left_on='SK_ID_BUREAU', right_on='SK_ID_BUREAU', how='left', suffixes=['', '_B_B'])
    print(f'\t-> {df_merged.shape}')

    # credit card balance
    print('Processing credit card balance..')
    ## weighted average of monthly credit card balances
    wm = lambda x: np.average(x, weights=-1/df_credit_card.loc[x.index, 'MONTHS_BALANCE'])
    credit_card_avgs = df_credit_card.groupby(key).agg(wm)   
    df_merged = df_merged.merge(credit_card_avgs, left_on=key, right_index=True, how='left', suffixes=['', '_CC_WAVG'])                      
    print(f'\t-> {df_merged.shape}')

    # pos_cash
    print('Processing pos cash..')
    ## weighted positive cash balance
    wm = lambda x: np.average(x, weights=-1/df_pos_cash.loc[x.index, 'MONTHS_BALANCE'])
    cash_avg = df_pos_cash.groupby('SK_ID_CURR')['CNT_INSTALMENT','CNT_INSTALMENT_FUTURE','SK_DPD', 'SK_DPD_DEF'].agg({'CNT_INSTALMENT': wm, 'CNT_INSTALMENT_FUTURE': wm, 'SK_DPD': wm, 'SK_DPD_DEF':wm})
    print(f'\t-> {df_merged.shape}')

    # prev applications
    print('Processing previous applications..')
    ## clean up, 365243 == nan in this table
    df_prev_app['DAYS_LAST_DUE'].replace(365243, np.nan, inplace=True)
    df_prev_app['DAYS_TERMINATION'].replace(365243, np.nan, inplace=True)
    df_prev_app['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace=True)
    df_prev_app['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace=True)
    df_prev_app['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace=True)
    df_prev_app = df_prev_app.groupby('SK_ID_CURR').agg({'SK_ID_CURR': 'count', 'AMT_CREDIT': 'sum'})
    ## we engineer 2 new columns, previous application count and total previous loans
    df_prev_app.columns = ['PREV APP COUNT', 'TOTAL PREV LOAN AMT']
    df_merged = df_merged.merge(df_prev_app, left_on='SK_ID_CURR', right_index=True, how='left')
    print(f'\t-> {df_merged.shape}')

    # installment payments
    print('Processing installments data..')         
    ## ignore
    print(f'\t-> {df_merged.shape}')

    # other value counts
    print('Adding simple value counts..')
    df_merged = df_merged.merge(pd.DataFrame(df_bureau[key].value_counts()), left_on=key, right_index=True, how='left', suffixes=['', '_CNT_BUREAU'])
    df_merged = df_merged.merge(pd.DataFrame(df_credit_card[key].value_counts()), left_on=key, right_index=True, how='left', suffixes=['', '_CNT_CRED_CARD'])
    df_merged = df_merged.merge(pd.DataFrame(df_pos_cash[key].value_counts()), left_on=key, right_index=True, how='left', suffixes=['', '_CNT_POS_CASH'])
    df_merged = df_merged.merge(pd.DataFrame(df_install[key].value_counts()), left_on=key, right_index=True, how='left', suffixes=['', '_CNT_INSTALL'])
    print(f'\t-> {df_merged.shape}')

    print(f'Combined train & test after preprocessing {df_merged.shape}')

    # save as csv & allow loading of csv for speedup 
    print(f'Saving to {fname_preprocessed}')
    df_merged.to_csv(fname_preprocessed)
else:
    df_merged = pd.read_csv(fname_preprocessed)

df_meta = pd.DataFrame()
df_meta[key] = df_merged[key]

Combined train & test before processing (356255, 122)
Processing main data..
	-> (356255, 128)
Processing bureau data..
	-> (356255, 145)
Processing bureau balance..
	-> (356255, 147)
Processing credit card balance..
	-> (356255, 168)
Processing pos cash..
	-> (356255, 168)
Processing previous applications..
	-> (356255, 170)
Processing installments data..
	-> (356255, 170)
Adding simple value counts..
	-> (356255, 174)
Combined train & test after preprocessing (356255, 174)
Saving to my_preprocessed_data.csv


Now we label encode the categorical values because fastai does not do that on its own

In [4]:
from sklearn import preprocessing 

df_proc = df_merged
cat_feats = df_proc.columns[df_proc.dtypes == 'object']

for feat in cat_feats:
    encoder = preprocessing.LabelEncoder()
    df_proc[feat] = encoder.fit_transform(df_proc[feat].fillna('NULL'))

In [5]:
df_proc.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,SK_DPD,SK_DPD_DEF,PREV APP COUNT,TOTAL PREV LOAN AMT,SK_ID_CURR_CNT_BUREAU,SK_ID_CURR_CNT_CRED_CARD,SK_ID_CURR_CNT_POS_CASH,SK_ID_CURR_CNT_INSTALL
0,100002,1.0,0,1,0,1,0,202500.0,406597.5,24700.5,...,,,,,1.0,179055.0,8.0,,19.0,19.0
1,100003,0.0,0,0,0,0,0,270000.0,1293502.5,35698.5,...,,,,,3.0,1452573.0,4.0,,28.0,25.0
2,100004,0.0,1,1,1,1,0,67500.0,135000.0,6750.0,...,,,,,1.0,20106.0,2.0,,4.0,3.0
3,100006,0.0,0,0,0,1,0,135000.0,312682.5,29686.5,...,,0.0,0.0,0.0,9.0,2625259.5,,6.0,21.0,16.0
4,100007,0.0,0,1,0,1,0,121500.0,513000.0,21865.5,...,,,,,6.0,999832.5,1.0,,66.0,66.0


## Model & Training

### FastAI

In [6]:
train = df_proc[:len_train]
test = df_proc[len_train:].drop(columns='TARGET')
target = train['TARGET']

dep_var = 'TARGET'
valid = train.sample(frac=0.2, random_state=42)
valid_idx = valid.index
procs = [FillMissing, Categorify, Normalize]

# dont normalize the key column, we dont need it either in here
data = TabularDataBunch.from_df("./", train.drop(columns=[key]), dep_var, valid_idx=valid_idx, procs=procs, bs=2048)
data.add_test(TabularList.from_df(test.drop(columns=[key]), path="./"))
data.show_batch(rows=5)

NONLIVINGAREA_MEDI_na,PREV APP COUNT_na,COMMONAREA_MEDI_na,FLOORSMIN_MEDI_na,DEF_30_CNT_SOCIAL_CIRCLE_na,DEF_60_CNT_SOCIAL_CIRCLE_na,AMT_REQ_CREDIT_BUREAU_HOUR_na,EXT_SOURCE_1_na,ENTRANCES_MEDI_na,DAYS_LAST_PHONE_CHANGE_na,YEARS_BUILD_MEDI_na,AMT_TOTAL_RECEIVABLE_na,LIVINGAREA_MODE_na,DAYS_CREDIT_ENDDATE_na,LIVINGAPARTMENTS_MODE_na,YEARS_BEGINEXPLUATATION_MEDI_na,SK_ID_CURR_CNT_INSTALL_na,ANNUITY_INCOME_RATIO_na,OWN_CAR_AGE_na,AMT_CREDIT_SUM_na,AMT_PAYMENT_CURRENT_na,TOTAL PREV LOAN AMT_na,AMT_ANNUITY_BUR_na,APARTMENTS_MODE_na,FLOORSMAX_MEDI_na,OBS_60_CNT_SOCIAL_CIRCLE_na,MONTHS_BALANCE_CC_WAVG_na,AMT_REQ_CREDIT_BUREAU_YEAR_na,FLOORSMAX_AVG_na,AMT_DRAWINGS_CURRENT_na,CHILDREN_RATIO_na,YEARS_BUILD_MODE_na,AMT_REQ_CREDIT_BUREAU_DAY_na,BASEMENTAREA_MEDI_na,LANDAREA_MEDI_na,AMT_CREDIT_LIMIT_ACTUAL_na,FLOORSMIN_AVG_na,FLOORSMIN_MODE_na,LIVINGAREA_AVG_na,COMMONAREA_MODE_na,AMT_DRAWINGS_OTHER_CURRENT_na,LIVINGAPARTMENTS_MEDI_na,NONLIVINGAPARTMENTS_MEDI_na,NONLIVINGAPARTMENTS_AVG_na,YEARS_BEGINEXPLUATATION_MODE_na,SK_ID_CURR_CNT_BUREAU_na,NONLIVINGAREA_MODE_na,CNT_DRAWINGS_ATM_CURRENT_na,ELEVATORS_MODE_na,FLOORSMAX_MODE_na,LANDAREA_MODE_na,SK_DPD_na,AMT_CREDIT_MAX_OVERDUE_na,AMT_DRAWINGS_ATM_CURRENT_na,AMT_INST_MIN_REGULARITY_na,AMT_RECIVABLE_na,SK_ID_CURR_CNT_POS_CASH_na,CNT_DRAWINGS_OTHER_CURRENT_na,LIVINGAREA_MEDI_na,APARTMENTS_MEDI_na,EXT_SOURCE_3_na,TOTALAREA_MODE_na,LIVINGAPARTMENTS_AVG_na,AMT_BALANCE_na,NONLIVINGAREA_AVG_na,OBS_30_CNT_SOCIAL_CIRCLE_na,YEARS_BUILD_AVG_na,EXT_SOURCE_2_na,APARTMENTS_AVG_na,LANDAREA_AVG_na,NONLIVINGAPARTMENTS_MODE_na,MONTHS_BALANCE_na,CNT_DRAWINGS_CURRENT_na,AMT_CREDIT_SUM_LIMIT_na,CNT_DRAWINGS_POS_CURRENT_na,CNT_FAM_MEMBERS_na,AMT_RECEIVABLE_PRINCIPAL_na,COMMONAREA_AVG_na,BASEMENTAREA_AVG_na,AMT_ANNUITY_na,SK_ID_CURR_CNT_CRED_CARD_na,DAYS_ENDDATE_FACT_na,ENTRANCES_MODE_na,BASEMENTAREA_MODE_na,AMT_REQ_CREDIT_BUREAU_WEEK_na,ENTRANCES_AVG_na,SK_DPD_DEF_na,AMT_CREDIT_SUM_DEBT_na,AMT_PAYMENT_TOTAL_CURRENT_na,AMT_GOODS_PRICE_na,AMT_REQ_CREDIT_BUREAU_QRT_na,SK_ID_PREV_na,CNT_INSTALMENT_MATURE_CUM_na,YEARS_BEGINEXPLUATATION_AVG_na,AMT_DRAWINGS_POS_CURRENT_na,ELEVATORS_AVG_na,ELEVATORS_MEDI_na,AMT_REQ_CREDIT_BUREAU_MON_na,INCOME_PER_FAM_na,NONLIVINGAREA_MEDI,CODE_GENDER,PREV APP COUNT,COMMONAREA_MEDI,FLOORSMIN_MEDI,STATUS,DEF_30_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,AMT_REQ_CREDIT_BUREAU_HOUR,EXT_SOURCE_1,ENTRANCES_MEDI,DAYS_LAST_PHONE_CHANGE,YEARS_BUILD_MEDI,AMT_TOTAL_RECEIVABLE,CREDIT_DAY_OVERDUE,LIVINGAREA_MODE,DAYS_CREDIT_ENDDATE,LIVINGAPARTMENTS_MODE,FLAG_DOCUMENT_15,NAME_INCOME_TYPE,FLAG_EMAIL,FLAG_DOCUMENT_3,REG_CITY_NOT_WORK_CITY,AMT_INCOME_TOTAL,WORKING_LIFE_RATIO,YEARS_BEGINEXPLUATATION_MEDI,SK_ID_CURR_CNT_INSTALL,REGION_RATING_CLIENT_W_CITY,SK_ID_CURR_BUR,ANNUITY_INCOME_RATIO,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,OWN_CAR_AGE,NAME_HOUSING_TYPE,WALLSMATERIAL_MODE,AMT_CREDIT_SUM,AMT_PAYMENT_CURRENT,TOTAL PREV LOAN AMT,AMT_CREDIT_SUM_OVERDUE,FLAG_MOBIL,AMT_ANNUITY_BUR,APARTMENTS_MODE,NAME_FAMILY_STATUS,FLOORSMAX_MEDI,FLAG_DOCUMENT_18,NAME_TYPE_SUITE,LIVE_CITY_NOT_WORK_CITY,OBS_60_CNT_SOCIAL_CIRCLE,MONTHS_BALANCE_CC_WAVG,AMT_REQ_CREDIT_BUREAU_YEAR,FLOORSMAX_AVG,FLAG_DOCUMENT_21,CNT_CHILDREN,DAYS_REGISTRATION,AMT_DRAWINGS_CURRENT,CHILDREN_RATIO,YEARS_BUILD_MODE,AMT_REQ_CREDIT_BUREAU_DAY,FLAG_DOCUMENT_13,DAYS_ID_PUBLISH,EMERGENCYSTATE_MODE,CREDIT_ACTIVE,BASEMENTAREA_MEDI,LANDAREA_MEDI,INCOME_CREDIT_PERC,FLAG_DOCUMENT_4,AMT_CREDIT_LIMIT_ACTUAL,FLOORSMIN_AVG,CNT_CREDIT_PROLONG,FLOORSMIN_MODE,LIVINGAREA_AVG,COMMONAREA_MODE,AMT_CREDIT,AMT_DRAWINGS_OTHER_CURRENT,LIVINGAPARTMENTS_MEDI,NONLIVINGAPARTMENTS_MEDI,WEEKDAY_APPR_PROCESS_START,NONLIVINGAPARTMENTS_AVG,NAME_EDUCATION_TYPE,YEARS_BEGINEXPLUATATION_MODE,FLAG_DOCUMENT_5,FLAG_PHONE,SK_ID_CURR_CNT_BUREAU,ORGANIZATION_TYPE,NONLIVINGAREA_MODE,CNT_DRAWINGS_ATM_CURRENT,ELEVATORS_MODE,REGION_POPULATION_RELATIVE,REG_CITY_NOT_LIVE_CITY,FLOORSMAX_MODE,LANDAREA_MODE,SK_DPD,AMT_CREDIT_MAX_OVERDUE,FLAG_DOCUMENT_11,AMT_DRAWINGS_ATM_CURRENT,AMT_INST_MIN_REGULARITY,FLAG_DOCUMENT_10,LIVE_REGION_NOT_WORK_REGION,AMT_RECIVABLE,SK_ID_CURR_CNT_POS_CASH,FLAG_DOCUMENT_19,FLAG_DOCUMENT_12,CNT_DRAWINGS_OTHER_CURRENT,REG_REGION_NOT_LIVE_REGION,LIVINGAREA_MEDI,APARTMENTS_MEDI,EXT_SOURCE_3,TOTALAREA_MODE,LIVINGAPARTMENTS_AVG,AMT_BALANCE,NONLIVINGAREA_AVG,HOUSETYPE_MODE,FLAG_DOCUMENT_6,OBS_30_CNT_SOCIAL_CIRCLE,DAYS_CREDIT_UPDATE,YEARS_BUILD_AVG,EXT_SOURCE_2,APARTMENTS_AVG,LANDAREA_AVG,NONLIVINGAPARTMENTS_MODE,MONTHS_BALANCE,CNT_DRAWINGS_CURRENT,HOUR_APPR_PROCESS_START,AMT_CREDIT_SUM_LIMIT,CNT_DRAWINGS_POS_CURRENT,LOAN_INCOME_RATIO,CNT_FAM_MEMBERS,AMT_RECEIVABLE_PRINCIPAL,COMMONAREA_AVG,BASEMENTAREA_AVG,AMT_ANNUITY,FLAG_DOCUMENT_14,SK_ID_CURR_CNT_CRED_CARD,FLAG_DOCUMENT_2,DAYS_ENDDATE_FACT,ENTRANCES_MODE,FLAG_DOCUMENT_7,BASEMENTAREA_MODE,FLAG_EMP_PHONE,AMT_REQ_CREDIT_BUREAU_WEEK,ENTRANCES_AVG,FLAG_DOCUMENT_9,CREDIT_TYPE,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FONDKAPREMONT_MODE,OCCUPATION_TYPE,SK_DPD_DEF,REG_REGION_NOT_WORK_REGION,AMT_CREDIT_SUM_DEBT,AMT_PAYMENT_TOTAL_CURRENT,FLAG_CONT_MOBILE,FLAG_OWN_REALTY,AMT_GOODS_PRICE,FLAG_DOCUMENT_20,AMT_REQ_CREDIT_BUREAU_QRT,DAYS_CREDIT,SK_ID_BUREAU,SK_ID_PREV,CNT_INSTALMENT_MATURE_CUM,FLAG_DOCUMENT_8,REGION_RATING_CLIENT,YEARS_BEGINEXPLUATATION_AVG,CREDIT_CURRENCY,AMT_DRAWINGS_POS_CURRENT,DAYS_EMPLOYED,ELEVATORS_AVG,ELEVATORS_MEDI,FLAG_WORK_PHONE,AMT_REQ_CREDIT_BUREAU_MON,DAYS_BIRTH,INCOME_PER_FAM,target
True,False,True,True,False,False,False,True,True,False,True,False,True,False,True,True,False,False,False,False,True,False,True,True,True,False,False,False,True,False,False,True,False,True,True,False,True,True,True,True,True,True,True,True,True,False,True,True,True,True,True,False,True,True,False,False,False,True,True,True,True,True,True,False,True,False,True,False,True,True,True,False,False,True,True,False,False,True,True,False,False,True,True,True,False,True,False,False,False,False,False,False,False,True,True,True,True,False,False,-0.2324,1.3867,-0.2013,-0.1664,-0.0817,0.8707,-0.3194,-0.2749,-0.0715,0.0124,-0.0788,-1.0093,0.0123,-0.3896,-0.0219,-0.2033,0.0406,-0.1593,-0.0341,0.9162,-0.2458,0.6393,1.829,0.9714,0.448,0.0459,-0.5512,-0.063,0.827,-0.5529,-0.3251,1.391,0.4172,-0.3047,-0.6763,0.9044,-0.0725,-0.1945,-0.0056,0.002,-0.0871,-0.1925,1.3117,-0.2787,-0.0905,0.4407,2.1406,-0.5855,1.4388,1.8231,-0.2811,-0.0176,-0.5776,-1.2489,-0.2631,-0.6287,0.0251,-0.0586,-0.0588,0.599,-1.0376,-1.3154,-0.0947,-0.1411,-0.3625,-0.0086,-0.7644,-0.083,-0.0644,-0.0697,-0.2057,-0.1684,0.9436,-0.038,-0.1512,-0.0992,-1.4388,-0.1003,0.6261,0.0501,-0.124,1.6009,-1.0581,-1.0971,-0.2384,-0.1276,-0.3563,-0.3071,-0.2916,-0.2653,-0.146,-0.0467,-0.0372,-0.0626,-0.1178,-0.4141,-0.004,-0.2059,-0.3895,-0.4682,-0.0241,-0.002,-0.0425,-0.1243,-0.2075,-0.198,0.1194,-0.2213,-0.1491,-0.3901,-0.2312,-0.9597,-0.3107,-0.5873,0.7915,0.0119,-0.3677,-0.1897,-0.1403,-0.095,0.2516,-0.2124,0.5926,-0.1126,-0.1288,-0.1223,-1.2684,-0.388,-0.1648,-0.0944,0.5236,-0.0544,-1.1086,-0.0057,0.1311,-0.0505,-0.014,-0.0981,0.4689,-0.1554,-0.0827,-0.0627,0.6992,-0.1003,-0.0156,-0.6571,-0.3008,-0.0058,-0.2316,1.7841,-0.3134,0.0435,0.6633,0.4409,-0.0223,-0.2956,-0.0249,0.1379,1.2104,-0.6225,-0.297,-0.1034,0.0525,-0.0287,-0.1038,-0.4551,-0.3688,-0.3656,-0.4983,0.9019,1.5223,2.4201,0.0
False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,True,False,False,False,False,False,True,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,True,True,True,True,True,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,True,False,True,False,False,False,True,True,False,False,False,False,True,False,True,False,False,True,True,False,True,False,False,False,False,1.0509,-0.721,-0.2013,-0.472,-0.0817,-2.7396,1.9191,2.4803,-0.0715,0.0124,-0.0788,-2.0641,-0.7134,-0.2418,-0.0219,-0.3971,0.0314,-0.4822,-0.0341,0.9162,-0.2458,0.6393,-0.5468,-0.4062,0.4706,-0.0234,-0.2757,-0.063,-0.5003,2.5539,-0.3251,-0.7189,-0.1459,-0.3047,1.3047,-0.1798,-0.0725,-0.1066,-0.0056,0.002,0.1352,-0.3418,-0.4028,-0.2787,-0.0905,-0.9698,-0.4672,1.0871,0.2026,-0.4413,-0.2811,-0.0176,-0.5776,0.4584,-0.1875,-0.6287,-0.6941,-0.0586,-0.0588,1.4625,0.9079,-1.3154,-0.0609,-0.1241,-0.9881,-0.0086,-0.1547,-0.083,-0.0644,-0.0697,-0.4477,-0.4355,3.5083,-0.038,-0.5465,0.6242,-0.5081,0.613,-1.6819,-0.0144,-0.124,-0.6246,0.6477,0.0269,1.1322,-0.1276,-0.3563,0.1281,-0.2916,-0.2653,-0.0722,-0.0467,-0.0372,-0.0626,-0.1178,-0.1861,-0.004,-0.2059,-0.2418,0.0815,-0.0241,-0.002,-0.0425,-0.1243,-0.4383,-0.3852,1.2853,-0.3653,-0.5563,-0.242,1.0232,0.9204,-0.3107,1.0704,0.7487,-0.7153,-1.8438,-0.4032,-0.1307,0.6499,0.1536,-0.1784,-0.6314,-0.1126,-0.1288,4.4605,-0.168,-0.2417,-0.4778,-0.07,1.801,-0.0544,-0.2238,-0.0057,0.1311,-0.0505,-0.014,0.0117,0.4689,-0.1554,-0.0827,-0.0627,-0.3271,-0.1003,-0.0156,1.4586,-1.4602,-0.0058,-0.2316,0.1015,-0.1618,0.0435,0.6633,3.4094,-0.0223,-0.2956,1.3021,1.247,-0.0072,-0.1274,-0.297,-0.1034,-0.0293,-0.0287,-0.1038,-0.4761,-0.3688,-0.3656,-0.4983,-0.2704,-0.2997,-0.4095,0.0
False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,True,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,-0.2943,-0.721,0.0468,-0.6213,-1.4334,-2.7396,-0.3194,-0.2749,-0.0715,-0.5976,-1.0527,-1.1809,-1.8529,-0.1211,-0.0219,-1.0084,0.1141,-1.3257,-0.0341,0.9162,-0.2458,0.6393,-0.5468,1.1867,0.5038,-0.1483,3.2309,-0.063,0.2596,-0.1548,-0.3251,1.391,-0.5683,-0.3047,1.3047,1.7202,-0.1504,0.1158,-0.0056,0.002,0.5262,-1.1195,-1.26,-1.4469,-0.0905,0.4407,-0.4672,-0.5855,-2.4123,0.1248,-1.4518,-0.0176,0.81,-0.636,-0.1997,1.0428,-1.8216,-0.0586,-0.0588,-0.9735,0.9079,-1.3154,-1.5184,-0.7809,-0.4693,-0.0086,-0.4386,-1.4388,-0.0644,-1.4304,-1.0406,-0.5905,1.5464,-0.038,-1.3668,-0.0992,-0.5081,-0.1003,0.6261,-0.1305,-0.124,-0.6246,0.4041,-1.1948,-0.2611,-0.1276,-0.3563,-0.8104,-0.2916,-1.4494,-0.7344,-0.0467,-0.0372,-0.0626,-0.1178,-0.1861,-0.004,-0.2059,-0.121,2.1956,-0.0241,-0.002,-0.0425,-0.1243,-1.0321,-1.1454,0.5031,-1.002,-1.3725,-0.1231,-0.304,0.9204,-0.3107,-0.5873,0.8081,-1.858,-0.9438,-1.1624,-0.7822,-0.095,0.2516,-0.1774,0.5926,-0.1126,-0.1288,0.0823,0.9325,-0.1208,-0.6297,-1.5222,1.4825,-0.0544,2.9513,-0.0057,0.1311,-1.02,-0.014,-1.4636,0.4689,-0.1554,-1.0594,-0.0627,-2.3796,-0.1003,-0.0156,1.4586,0.6268,-0.0058,-0.2316,1.8769,-0.1677,0.0435,-1.5076,1.5845,-0.0223,-0.2956,0.4297,0.1017,-1.8163,-0.1274,-0.297,-0.1034,-0.1557,-0.0287,-0.1038,-0.4906,-0.3688,-0.3656,-0.4983,-0.2704,0.6771,0.0621,0.0
True,False,True,True,False,False,True,False,True,False,True,True,True,False,True,True,False,False,True,False,True,False,True,True,True,False,True,True,True,True,False,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,False,True,False,True,True,True,True,True,False,True,False,True,True,True,False,True,False,True,True,True,True,True,False,True,False,True,True,True,True,True,True,True,True,False,-0.2324,-0.721,-0.6976,-0.1664,-0.0817,0.4194,-0.3194,-0.2749,-0.0715,2.0007,-0.0788,-0.6939,0.0123,-0.2418,-0.0219,-0.2033,-0.005,-0.1593,-0.0341,-0.6557,-0.2458,0.6393,-0.5468,0.1104,-2.4339,0.0459,-0.5261,-0.063,-0.3855,-0.0445,-0.3251,-0.7189,-0.1459,-0.3047,-0.6763,0.4595,-0.0725,-0.5499,-0.0056,0.002,-0.0871,-0.1925,1.3117,-0.2787,-0.0905,0.4407,-0.4672,-0.5855,0.2026,-0.4413,-0.2811,-0.0176,-0.5776,0.2125,-0.1875,-0.6287,0.0251,-0.0586,-0.0588,0.1975,-1.0376,0.751,-0.0947,-0.1411,-0.1574,-0.0086,-0.1547,-0.083,-0.0644,-0.0697,-0.2057,-0.1684,-0.1944,-0.038,-0.1512,-0.0992,-1.4388,-0.1003,0.6261,0.0501,-0.124,-0.6246,-0.327,1.3463,-0.2384,-0.1276,-0.3563,-1.1503,-0.2916,-0.2653,-0.146,-0.0467,-0.0372,-0.0626,-0.1178,-0.1861,-0.004,-0.2059,-0.2418,-0.4682,-0.0241,-0.002,-0.0425,-0.1243,-0.2075,-0.198,0.1194,-0.2213,-0.1491,-0.242,-0.2312,-0.9597,-0.3107,-0.5873,-0.3788,0.0119,0.5059,-0.1897,-0.1403,-0.095,0.2516,-0.1784,-0.6314,-0.1126,-0.1288,-0.3945,-1.2684,-0.2417,-0.1648,-0.0944,0.3229,-0.0544,-0.2238,-0.0057,0.1345,-0.0505,-0.014,-0.0981,-2.1328,-0.1554,-0.0827,-0.0627,-0.3271,-0.1003,-0.0156,-0.6571,0.6268,-0.0058,-0.2316,-0.1951,-0.1618,0.0435,-1.5076,-0.2403,-0.0223,-0.2956,-0.3027,0.5967,-0.0072,-0.1274,-0.297,-0.1034,0.0525,-0.0287,-0.1038,2.1329,-0.3688,-0.3656,-0.4983,-0.2704,-0.7161,1.1898,0.0
True,False,True,True,False,False,False,True,True,False,True,True,True,False,True,True,False,False,True,False,True,False,False,True,True,False,True,False,True,True,False,True,False,True,True,True,True,True,True,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,True,True,False,True,True,True,False,True,True,True,True,False,True,False,True,True,True,False,True,False,True,False,True,True,True,False,True,True,True,True,False,True,True,False,True,False,False,True,True,True,True,True,True,False,False,-0.2324,-0.721,0.0468,-0.1664,-0.0817,-2.7396,-0.3194,-0.2749,-0.0715,0.0124,-0.0788,-0.6831,0.0123,-0.2418,-0.0219,-0.2033,0.0626,-0.1593,-0.0341,0.9162,-0.2458,0.6393,-0.5468,-0.4062,0.4712,0.0459,0.601,-0.063,-1.6083,-0.8089,-0.3251,-0.7189,-0.1459,-0.3047,-0.6763,0.005,-0.0725,0.8639,-0.0056,0.002,0.3212,-0.1925,-0.4028,-0.2787,-0.0905,0.4407,-0.4672,-0.5855,0.2026,0.6909,-0.2811,-0.0176,-0.5776,0.2199,-0.1875,-0.6287,0.0251,-0.0586,-0.0588,0.8475,-1.0376,-1.3154,-0.0947,-0.1411,0.6448,-0.0086,-0.1547,-0.083,-0.0644,-0.0697,-0.2057,-0.1684,-0.9821,-0.038,-0.1512,-0.0992,-0.9735,-0.1003,0.6261,0.0501,-0.124,-0.6246,-0.5707,0.6133,-0.2384,-0.1276,-0.3563,-0.7749,-0.2916,-0.2653,-0.146,-0.0467,-0.0372,-0.0626,-0.1178,-0.1861,-0.004,-0.2059,-0.2418,1.5191,-0.0241,-0.002,-0.0425,-0.1243,-0.2075,-0.198,0.4137,-0.2213,-0.1491,-0.242,-0.2312,-0.9597,-0.3107,-0.5873,0.7846,0.0119,-0.4907,-0.1897,-0.1403,-0.095,0.2516,-0.1784,-0.3254,-0.1126,-0.1288,-0.8689,-0.168,-0.2417,-0.1648,-0.0944,-0.9628,-0.0544,-0.2238,-0.0057,0.1311,-0.0505,-0.014,-0.0981,0.4689,-0.1554,-0.0827,-0.0627,0.6992,-0.1003,-0.0156,-0.6571,1.3224,-0.0058,-0.2316,0.3937,-0.1618,0.0435,0.6633,-0.9703,-0.0223,-0.2956,-0.2761,1.6622,-0.0072,-0.1274,-0.297,-0.1034,0.0525,-0.0287,-0.1038,-0.4778,-0.3688,-0.3656,-0.4983,-0.2704,-0.4893,-0.4095,0.0


In [7]:
def accuracy_fixed(input:Tensor, targs:Tensor)->Rank0Tensor:
    targs = targs.view(-1).long()
    n = targs.shape[0]
    input = input.argmax(dim=-1).view(n,-1)
    targs = targs.view(n,-1)
    return (input==targs).float().mean()

wd=0.2 # https://becominghuman.ai/this-thing-called-weight-decay-a7cd4bcfccab, 0.2 seems to work best
learn = tabular_learner(data, layers=[200,100], metrics=accuracy_fixed, wd=wd)
learn.fit_one_cycle(4)

epoch,train_loss,valid_loss,accuracy_fixed,time
0,0.083959,0.074481,0.919531,02:05
1,0.071101,0.114668,0.919531,02:04
2,0.068736,0.06954,0.919531,02:03
3,0.06691,0.551512,0.919531,02:02


In [8]:
from sklearn.metrics import roc_auc_score

valid_predicts, _ = learn.get_preds(ds_type=DatasetType.Valid)
valid_probs = np.array(valid_predicts)
valid_targets = df_train.loc[valid_idx].TARGET.values
valid_score = roc_auc_score(valid_targets, valid_probs)
print(f'AUC/ROC: {valid_score}')

preds, y = learn.get_preds(ds_type=DatasetType.Test)
out = pd.DataFrame()
out[key] = test[key]
out['TARGET'] = preds.numpy()[:, 0]
out['TARGET'] = out['TARGET'].astype('float')
out['TARGET'] = out['TARGET'].clip(lower=0, upper=1)
out.to_csv('sub_fastai.csv', index=False)
# ~74% without agg

### Other approaches (lgbm, keras, scikit,..)

In [9]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn import tree, ensemble
from sklearn.pipeline import Pipeline
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.models import Sequential
from keras import layers
import lightgbm as lgbm
import xgboost # conda install -c anaconda py-xgboost

def print_scores(y_test, preds):
    print(f'Classification: {classification_report(y_test, preds)}')
    print(f'Confusion Matrix: {confusion_matrix(y_test, preds)}')
    print(f'Accuracy: {accuracy_score(y_test,preds)}')
    print(f'ROC/AU: {roc_auc_score(y_test,  preds)}')
    
    
def create_sub(name, model, testset, test):
    y_pred = model.predict_proba(testset)[:,1]
    df_sub = pd.DataFrame()
    df_sub['SK_ID_CURR'] = test['SK_ID_CURR']
    df_sub['TARGET'] = y_pred
    df_sub['TARGET'] = df_sub['TARGET'].astype('float')
    df_sub['TARGET'] = df_sub['TARGET'].clip(lower=0, upper=1)
    df_sub.head()
    df_sub.to_csv(name, index=False)
           
# https://lightgbm.readthedocs.io/en/latest/Parameters.html
lgbm_params = {
    'boosting': 'dart', # gbdt, rf, dart, goss
    'application': 'binary', # binary, multiclassova, ..
    'metric': 'auc',
    'feature_fraction': 0.5, # default=1, randomly select part of features with each iteration, helps with overfitting
    'scale_pos_weight': 2, # default=1, exclusive with is_unbalance, increaes overall performance
    'drop_rate': 0.02, # default=0.1, dropout, used against overfitting
}

# this does not use the preprocessing from fastai
lgbm_train = lgbm.Dataset(train.drop(columns=['TARGET']), label=target, categorical_feature=cat_feats.tolist(), free_raw_data=False)

results = lgbm.cv(train_set=lgbm_train,
                     params=lgbm_params,
                     nfold=5,
                     num_boost_round=500,
                     verbose_eval=100,
                     metrics=['auc'])
opt = np.argmax(results['auc-mean'])
print(f'Best result = {np.max(results["auc-mean"])}')

clf = lgbm.train(lgbm_params, lgbm_train, num_boost_round=opt)
y_pred = clf.predict(test)
out = pd.DataFrame()
out[key] = test[key]
out['TARGET'] = y_pred
out.to_csv('sub_lgbm.csv', index=False)
# ~76% without aggregatopm
print('Feature importances:', list(clf.feature_importance()))

Using TensorFlow backend.


[100]	cv_agg's auc: 0.763167 + 0.00495816
[200]	cv_agg's auc: 0.764178 + 0.00499839
[300]	cv_agg's auc: 0.765714 + 0.00490112
[400]	cv_agg's auc: 0.765983 + 0.00516591
[500]	cv_agg's auc: 0.766047 + 0.00495535
Best result = 0.766229175205221




Feature importances: [110, 50, 111, 30, 13, 7, 99, 307, 302, 322, 20, 25, 79, 70, 38, 127, 377, 158, 180, 255, 175, 0, 3, 45, 0, 18, 1, 288, 16, 19, 75, 60, 73, 3, 3, 1, 44, 2, 2, 1433, 505, 503, 577, 48, 47, 43, 46, 41, 15, 28, 31, 31, 65, 29, 50, 31, 49, 45, 43, 57, 52, 49, 6, 26, 13, 18, 60, 54, 43, 23, 30, 36, 39, 54, 19, 37, 8, 25, 11, 10, 53, 40, 46, 18, 48, 2, 2, 57, 29, 1, 39, 67, 42, 56, 165, 0, 60, 0, 0, 2, 0, 2, 0, 0, 1, 0, 6, 5, 3, 21, 0, 23, 0, 0, 0, 5, 14, 11, 20, 70, 64, 155, 194, 209, 113, 121, 9, 96, 102, 2, 0, 115, 5, 127, 74, 41, 1, 71, 55, 27, 5, 3, 98, 52, 42, 2, 64, 64, 96, 182, 48, 41, 28, 58, 78, 57, 62, 58, 69, 27, 121, 161, 18, 30, 47, 41, 63, 124, 415, 177, 52, 287, 233]


In [10]:
# For keras models we need to deal with the imbalance manually & it helps to min-max scale

X = data.train_ds.inner_df.drop(columns=['TARGET'])
y = data.train_ds.inner_df['TARGET']

rs = RandomOverSampler(random_state=42)
X_resampled, y_resampled = rs.fit_resample(X, y)

scaler = MinMaxScaler()
X_resampled_tf = scaler.fit_transform(X_resampled)
X_train, X_test, y_train, y_test = train_test_split(X_resampled_tf, y_resampled, test_size=0.2, random_state=42)

In [13]:
def create_model():
    model = Sequential()
    model.add(Dense(200, input_dim=(271), activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(2, activation='softmax'))
    optimizer = Adam(lr=0.001)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])  
    return model

clf = KerasClassifier(build_fn=create_model, epochs=20, batch_size=64, verbose=1)
pipeline = Pipeline([('clf',  clf)])
model = pipeline.fit(X_train, y_train)
print_scores(y_test, model.predict(X_test))
create_sub('sub_ann.csv', model, data.test_ds.inner_df, test)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Classification:               precision    recall  f1-score   support

         0.0       0.76      0.65      0.70     45171
         1.0       0.69      0.79      0.74     45283

    accuracy                           0.72     90454
   macro avg       0.73      0.72      0.72     90454
weighted avg       0.73      0.72      0.72     90454

Confusion Matrix: [[29239 15932]
 [ 9300 35983]]
Accuracy: 0.7210515842306586
ROC/AU: 0.7209603729116487


In [14]:
clf = ensemble.RandomForestClassifier(criterion='entropy', n_estimators=100, max_depth=15, min_samples_leaf=5)
clf.fit(X_train, y_train)
print_scores(y_test, clf.predict(X_test))
# the roc/au score is wrong here for some reason (way lower in reality)
create_sub('sub_rf.csv', clf, data.test_ds.inner_df, test)

Classification:               precision    recall  f1-score   support

         0.0       0.87      0.81      0.84     45171
         1.0       0.83      0.88      0.85     45283

    accuracy                           0.85     90454
   macro avg       0.85      0.85      0.85     90454
weighted avg       0.85      0.85      0.85     90454

Confusion Matrix: [[36794  8377]
 [ 5344 39939]]
Accuracy: 0.8483096380480686
ROC/AU: 0.8482678875699079


## Blends

In [17]:
def merge_dataframes(dfs, merge_keys):
    dfs_merged = reduce(lambda left,right: pd.merge(left, right, on=merge_keys), dfs)
    return dfs_merged

M1 = pd.read_csv('sub_fastai.csv')
M2 = pd.read_csv('sub_lgbm.csv')
M3 = pd.read_csv('sub_ann.csv')
M4 = pd.read_csv('sub_rf.csv')

dfs = [M1,M2,M3,M4]
merge_keys=['SK_ID_CURR']
df_blend = merge_dataframes(dfs, merge_keys=merge_keys)
df_blend.columns = ['SK_ID_CURR','T1','T2','T3','T4']

prob_preds = 0.2 * df_blend['T1'] + 0.8 * df_blend['T2'] #+ 0.2 * df_blend['T3'] + 0.2 * df_blend['T4']

sub_blend = pd.DataFrame()
sub_blend['SK_ID_CURR'] = df_blend['SK_ID_CURR']
sub_blend['TARGET']= prob_preds
sub_blend.head()
sub_blend.to_csv('sub_blend.csv', index=False)

The fastai model, which was scoring lower than the lgbm model helped to increase its score on submission slightly.

![kaggle.PNG](attachment:kaggle.PNG)

The scores are not impressive, but it was a good way to learn about how to appaoch kaggle challenges.