In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.feature_selection import VarianceThreshold
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#меняем рабочую директорию
import os
os.chdir('../scoring')

In [3]:
print('Importing data...')
data = pd.read_csv('application_train.csv')
test = pd.read_csv('application_test.csv')
prev = pd.read_csv('previous_application.csv')
buro = pd.read_csv('bureau.csv')
buro_balance = pd.read_csv('bureau_balance.csv')
credit_card  = pd.read_csv('credit_card_balance.csv')
POS_CASH  = pd.read_csv('POS_CASH_balance.csv')
payments = pd.read_csv('installments_payments.csv')
lgbm_submission = pd.read_csv('sample_submission.csv')

Importing data...


In [4]:
#Separate target variable
y = data['TARGET']
del data['TARGET']

In [5]:
data.shape

(307511, 121)

In [6]:
#Feature engineering
data['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)
data['CODE_GENDER'].replace({'XNA': 'F'}, inplace=True)
data['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)
data['YEARS_BUILD_CREDIT'] = data['AMT_CREDIT']/data['YEARS_BUILD_AVG']
data['Annuity_Income'] = data['AMT_ANNUITY']/data['AMT_INCOME_TOTAL']
data['Income_Cred'] = data['AMT_CREDIT']/data['AMT_INCOME_TOTAL']
data['EMP_AGE'] = data['DAYS_EMPLOYED']/data['DAYS_BIRTH']
data['Income_PP'] = data['AMT_INCOME_TOTAL']/data['CNT_FAM_MEMBERS']
data['CHILDREN_RATIO'] = (1 + data['CNT_CHILDREN']) / data['CNT_FAM_MEMBERS']
data['PAYMENTS'] = data['AMT_ANNUITY']/ data['AMT_CREDIT']
#data['Annuity_Credit'] = data['AMT_CREDIT']/data['AMT_ANNUITY']
data['NEW_CREDIT_TO_GOODS_RATIO'] = data['AMT_CREDIT'] / data['AMT_GOODS_PRICE']
data['GOODS_INCOME'] =  data['AMT_GOODS_PRICE']/data['AMT_INCOME_TOTAL']
data['Ext_source_mult'] = data['EXT_SOURCE_1'] * data['EXT_SOURCE_2'] * data['EXT_SOURCE_3']
data['Ext_SOURCE_MEAN'] = data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis = 1)
data['Ext_SOURCE_SD'] = data[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis = 1)

In [7]:
#Feature engineering
test['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)
test['CODE_GENDER'].replace({'XNA': 'F'}, inplace=True)
test['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)
test['YEARS_BUILD_CREDIT'] = test['AMT_CREDIT']/test['YEARS_BUILD_AVG']
test['Annuity_Income'] = test['AMT_ANNUITY']/test['AMT_INCOME_TOTAL']
test['Income_Cred'] = test['AMT_CREDIT']/test['AMT_INCOME_TOTAL']
test['EMP_AGE'] = test['DAYS_EMPLOYED']/test['DAYS_BIRTH']
test['Income_PP'] = test['AMT_INCOME_TOTAL']/test['CNT_FAM_MEMBERS']
test['CHILDREN_RATIO'] = (1 + test['CNT_CHILDREN']) / test['CNT_FAM_MEMBERS']
test['PAYMENTS'] = test['AMT_ANNUITY']/ test['AMT_CREDIT']
#test['Annuity_Credit'] = test['AMT_CREDIT']/test['AMT_ANNUITY']
test['NEW_CREDIT_TO_GOODS_RATIO'] = test['AMT_CREDIT'] / test['AMT_GOODS_PRICE']
test['GOODS_INCOME'] =  test['AMT_GOODS_PRICE']/test['AMT_INCOME_TOTAL']
test['Ext_source_mult'] = test['EXT_SOURCE_1'] * test['EXT_SOURCE_2'] * test['EXT_SOURCE_3']
test['Ext_SOURCE_MEAN'] = test[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis = 1)
test['Ext_SOURCE_SD'] = test[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis = 1)

In [8]:
data.shape

(307511, 133)

In [9]:
#One-hot encoding of categorical features in data and test sets
categorical_features = [col for col in data.columns if data[col].dtype == 'object']

In [10]:
#view list of categorical features
categorical_features

['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE',
 'EMERGENCYSTATE_MODE']

In [11]:
one_hot_df = pd.concat([data,test])
one_hot_df = pd.get_dummies(one_hot_df, columns=categorical_features)

In [12]:
data = one_hot_df.iloc[:data.shape[0],:]
test = one_hot_df.iloc[data.shape[0]:,]

In [13]:
#Pre-processing buro_balance
print('Pre-processing buro_balance...')
buro_grouped_size = buro_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].size()
buro_grouped_max = buro_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].max()
buro_grouped_min = buro_balance.groupby('SK_ID_BUREAU')['MONTHS_BALANCE'].min()

Pre-processing buro_balance...


In [14]:
buro_counts = buro_balance.groupby('SK_ID_BUREAU')['STATUS'].value_counts(normalize = False)
buro_counts_unstacked = buro_counts.unstack('STATUS')
buro_counts_unstacked.columns = ['STATUS_0', 'STATUS_1','STATUS_2','STATUS_3','STATUS_4','STATUS_5','STATUS_C','STATUS_X',]
buro_counts_unstacked['MONTHS_COUNT'] = buro_grouped_size
buro_counts_unstacked['MONTHS_MIN'] = buro_grouped_min
buro_counts_unstacked['MONTHS_MAX'] = buro_grouped_max

In [15]:
buro = buro.join(buro_counts_unstacked, how='left', on='SK_ID_BUREAU')

In [16]:
#Pre-processing previous_application
print('Pre-processing previous_application...')
# Days 365.243 values -> nan
prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace= True)
prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace= True)
prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace= True)
prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace= True)
prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace= True)
prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
#One-hot encoding of categorical features in previous application data set
prev_cat_features = [pcol for pcol in prev.columns if prev[pcol].dtype == 'object']
prev = pd.get_dummies(prev, columns=prev_cat_features)
avg_prev = prev.groupby('SK_ID_CURR').mean()
cnt_prev = prev[['SK_ID_CURR', 'SK_ID_PREV']].groupby('SK_ID_CURR').count()
avg_prev['nb_app'] = cnt_prev['SK_ID_PREV']
del avg_prev['SK_ID_PREV']

Pre-processing previous_application...


In [17]:
#Pre-processing buro
print('Pre-processing buro...')
#One-hot encoding of categorical features in buro data set
buro_cat_features = [bcol for bcol in buro.columns if buro[bcol].dtype == 'object']
buro = pd.get_dummies(buro, columns=buro_cat_features)
avg_buro = buro.groupby('SK_ID_CURR').mean()
avg_buro['buro_count'] = buro[['SK_ID_BUREAU', 'SK_ID_CURR']].groupby('SK_ID_CURR').count()['SK_ID_BUREAU']
del avg_buro['SK_ID_BUREAU']

Pre-processing buro...


In [18]:
#Pre-processing POS_CASH
print('Pre-processing POS_CASH...')
le = LabelEncoder()
POS_CASH['NAME_CONTRACT_STATUS'] = le.fit_transform(POS_CASH['NAME_CONTRACT_STATUS'].astype(str))
nunique_status = POS_CASH[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']].groupby('SK_ID_CURR').nunique()
nunique_status2 = POS_CASH[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']].groupby('SK_ID_CURR').max()
POS_CASH['NUNIQUE_STATUS'] = nunique_status['NAME_CONTRACT_STATUS']
POS_CASH['NUNIQUE_STATUS2'] = nunique_status2['NAME_CONTRACT_STATUS']
POS_CASH.drop(['SK_ID_PREV', 'NAME_CONTRACT_STATUS'], axis=1, inplace=True)

Pre-processing POS_CASH...


In [19]:
#Pre-processing credit_card
print('Pre-processing credit_card...')
credit_card['NAME_CONTRACT_STATUS'] = le.fit_transform(credit_card['NAME_CONTRACT_STATUS'].astype(str))
nunique_status = credit_card[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']].groupby('SK_ID_CURR').nunique()
nunique_status2 = credit_card[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']].groupby('SK_ID_CURR').max()
credit_card['NUNIQUE_STATUS'] = nunique_status['NAME_CONTRACT_STATUS']
credit_card['NUNIQUE_STATUS2'] = nunique_status2['NAME_CONTRACT_STATUS']
credit_card.drop(['SK_ID_PREV', 'NAME_CONTRACT_STATUS'], axis=1, inplace=True)

Pre-processing credit_card...


In [21]:
#Pre-processing payments
print('Pre-processing payments...')
# Percentage and difference paid in each installment (amount paid and installment value)
payments['PAYMENT_PERC'] = payments['AMT_PAYMENT'] / payments['AMT_INSTALMENT']
payments['PAYMENT_DIFF'] = payments['AMT_INSTALMENT'] - payments['AMT_PAYMENT']
# Days past due and days before due (no negative values)
payments['DPD'] = payments['DAYS_ENTRY_PAYMENT'] - payments['DAYS_INSTALMENT']
payments['DBD'] = payments['DAYS_INSTALMENT'] - payments['DAYS_ENTRY_PAYMENT']
payments['DPD'] = payments['DPD'].apply(lambda x: x if x > 0 else 0)
payments['DBD'] = payments['DBD'].apply(lambda x: x if x > 0 else 0)
avg_payments = payments.groupby('SK_ID_CURR').mean()
avg_payments2 = payments.groupby('SK_ID_CURR').max()
avg_payments3 = payments.groupby('SK_ID_CURR').min()
del avg_payments['SK_ID_PREV']

Pre-processing payments...


In [22]:
#Join data and test tables
print('Joining databases...')
data = data.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')
test = test.merge(right=avg_prev.reset_index(), how='left', on='SK_ID_CURR')

Joining databases...


In [23]:
data = data.merge(right=avg_buro.reset_index(), how='left', on='SK_ID_CURR')
test = test.merge(right=avg_buro.reset_index(), how='left', on='SK_ID_CURR')

data = data.merge(POS_CASH.groupby('SK_ID_CURR').mean().reset_index(), how='left', on='SK_ID_CURR')
test = test.merge(POS_CASH.groupby('SK_ID_CURR').mean().reset_index(), how='left', on='SK_ID_CURR')

data = data.merge(credit_card.groupby('SK_ID_CURR').mean().reset_index(), how='left', on='SK_ID_CURR')
test = test.merge(credit_card.groupby('SK_ID_CURR').mean().reset_index(), how='left', on='SK_ID_CURR')

data = data.merge(right=avg_payments.reset_index(), how='left', on='SK_ID_CURR')
test = test.merge(right=avg_payments.reset_index(), how='left', on='SK_ID_CURR')

data = data.merge(right=avg_payments2.reset_index(), how='left', on='SK_ID_CURR')
test = test.merge(right=avg_payments2.reset_index(), how='left', on='SK_ID_CURR')

data = data.merge(right=avg_payments3.reset_index(), how='left', on='SK_ID_CURR')
test = test.merge(right=avg_payments3.reset_index(), how='left', on='SK_ID_CURR')

In [24]:
data.shape

(307511, 528)

In [25]:
test.shape

(48744, 528)

In [26]:
#Remove features with many missing values
print('Removing features with more than 80% missing...')
test = test[test.columns[data.isnull().mean() < 0.85]]
data = data[data.columns[data.isnull().mean() < 0.85]]

Removing features with more than 80% missing...


In [27]:
data.shape

(307511, 519)

In [28]:
#Delete customer Id
del data['SK_ID_CURR']
del test['SK_ID_CURR']

In [29]:
data.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in data.columns]
test.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in test.columns]

In [30]:
#Create train and validation set
train_x, valid_x, train_y, valid_y = train_test_split(data, y, test_size=0.2, shuffle=True, random_state = 42)

In [31]:
#------------------------Build LightGBM Model-----------------------
train_data=lgb.Dataset(train_x,label=train_y)
valid_data=lgb.Dataset(valid_x,label=valid_y)

In [33]:
#Select Hyper-Parameters
params = {'boosting_type': 'gbdt',
          'max_depth' : 10,
          'objective': 'binary',
          'nthread': 5,
          'num_leaves': 64,
          'learning_rate': 0.05,
          'max_bin': 512,
          'subsample_for_bin': 200,
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 5,
          'reg_lambda': 10,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 5,
          'scale_pos_weight': 1,
          'num_class' : 1,
          'metric' : 'auc'
          }

#Train model on selected parameters and number of iterations
lgbm = lgb.train(params,
                 train_data,
                 2500,
                 valid_sets=valid_data,
                 early_stopping_rounds= 40,
                 verbose_eval= 10
                 )


Training until validation scores don't improve for 40 rounds
[10]	valid_0's auc: 0.748434
[20]	valid_0's auc: 0.754334
[30]	valid_0's auc: 0.760127
[40]	valid_0's auc: 0.764095
[50]	valid_0's auc: 0.767527
[60]	valid_0's auc: 0.771133
[70]	valid_0's auc: 0.773476
[80]	valid_0's auc: 0.775816
[90]	valid_0's auc: 0.777544
[100]	valid_0's auc: 0.778941
[110]	valid_0's auc: 0.779994
[120]	valid_0's auc: 0.780963
[130]	valid_0's auc: 0.781893
[140]	valid_0's auc: 0.782591
[150]	valid_0's auc: 0.783082
[160]	valid_0's auc: 0.783815
[170]	valid_0's auc: 0.784332
[180]	valid_0's auc: 0.784836
[190]	valid_0's auc: 0.784962
[200]	valid_0's auc: 0.785129
[210]	valid_0's auc: 0.785612
[220]	valid_0's auc: 0.785729
[230]	valid_0's auc: 0.785854
[240]	valid_0's auc: 0.786295
[250]	valid_0's auc: 0.786395
[260]	valid_0's auc: 0.786432
[270]	valid_0's auc: 0.786538
[280]	valid_0's auc: 0.786561
[290]	valid_0's auc: 0.786601
[300]	valid_0's auc: 0.786748
[310]	valid_0's auc: 0.786847
[320]	valid_0's au

In [35]:
#Predict on test set and write to submit
predictions_lgbm_prob = lgbm.predict(test)
lgbm_submission.TARGET = predictions_lgbm_prob
lgbm_submission.to_csv('lgbm_submission_28052020.csv', index=False)