In [1]:
from google.colab import drive
drive.mount('/content/drive')
path = '/content/drive/My Drive/Colab Notebooks/home-credit-default-risk' 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install category_encoders
!pip install xgboost
!pip install joblib



In [3]:
import sklearn
import sklearn.metrics
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')
import os
import pickle
import category_encoders
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBClassifier

In [4]:
data = pd.read_csv(path + '/train.csv')

In [5]:
# 30 most valuable features
features = ['CODE_GENDER', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
       'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE',
       'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START',
       'HOUR_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'EXT_SOURCE_1',
       'EXT_SOURCE_2', 'EXT_SOURCE_3', 'LANDAREA_AVG', 'APARTMENTS_MODE',
       'YEARS_BEGINEXPLUATATION_MEDI', 'DAYS_LAST_PHONE_CHANGE',
       'FLAG_DOCUMENT_3', 'b_closed_Consumer credit_num',
       'b_active_all_num', 'b_Consumer credit_sum_1', 'b_all_sum_1',
       'b_Credit card_sum_3']

In [6]:
# categorical features
categorical = ['CODE_GENDER', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 
               'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE',
               'FLAG_DOCUMENT_3']
# numerical features
numerical = list(set(features) - set(categorical))

In [7]:
y = data['TARGET']
X = data.drop(columns = 'TARGET')
X = X.drop(columns = 'SK_ID_CURR')
X = X[features]

In [8]:
X[X == np.inf] = np.NaN
X_mean = X[numerical].mean()

In [9]:
# the most frequent value in each categorical column to fill missing values
train_mode = dict(X[categorical].mode().iloc[0])
# the mean value in each numeric column to fill missing values
for i in range(len(numerical)):
  train_mode[numerical[i]] = X_mean[i]
train_mode

{'AMT_ANNUITY': 27098.711187502868,
 'AMT_CREDIT': 598791.1144006519,
 'AMT_INCOME_TOTAL': 168933.15970279355,
 'APARTMENTS_MODE': 0.11435157552821915,
 'CODE_GENDER': 'F',
 'DAYS_BIRTH': -16034.765785204909,
 'DAYS_EMPLOYED': 63763.424413889144,
 'DAYS_ID_PUBLISH': -2994.4622738958774,
 'DAYS_LAST_PHONE_CHANGE': -962.6356763829323,
 'DAYS_REGISTRATION': -4979.819900988584,
 'EXT_SOURCE_1': 0.501821413793641,
 'EXT_SOURCE_2': 0.5143236546127142,
 'EXT_SOURCE_3': 0.5109342692505695,
 'FLAG_DOCUMENT_3': 1,
 'HOUR_APPR_PROCESS_START': 12.066033116028525,
 'LANDAREA_AVG': 0.06636075445041069,
 'NAME_EDUCATION_TYPE': 'Secondary / secondary special',
 'NAME_FAMILY_STATUS': 'Married',
 'NAME_INCOME_TYPE': 'Working',
 'OCCUPATION_TYPE': 'Laborers',
 'ORGANIZATION_TYPE': 'Business Entity Type 3',
 'OWN_CAR_AGE': 12.07186301955097,
 'REGION_POPULATION_RELATIVE': 0.020870456944583983,
 'WEEKDAY_APPR_PROCESS_START': 'TUESDAY',
 'YEARS_BEGINEXPLUATATION_MEDI': 0.9776657025039116,
 'b_Consumer credi

In [10]:
X[numerical] = X[numerical].fillna(X_mean)
X[categorical] = X[categorical].fillna('-1')

from sklearn.preprocessing import LabelEncoder
label_encoders = {}
for col in categorical:
        le = LabelEncoder()
        le.fit(X[col].astype('str'))
        X[col] = le.transform(X[col].astype('str'))
        label_encoders[col] = le

In [11]:
data_test = pd.read_csv(path + '/test.csv')
test_Y = data_test['TARGET']
test = data_test.drop(columns = ['TARGET','SK_ID_CURR'])
test = test[features]

test[test == np.inf] = np.NaN
test_mean = test[numerical].mean()
test[numerical] = test[numerical].fillna(test_mean)
test[categorical] = test[categorical].fillna('-1')
from sklearn.preprocessing import LabelEncoder
for col in categorical:
        le = LabelEncoder()
        le.fit(test[col].astype('str'))
        test[col] = le.transform(test[col].astype('str'))

In [12]:
X_mte = X.copy()
test_mte = test.copy()

target_encoders = {}
for col in categorical:
        TE = TargetEncoder()
        TE.fit(X[col].astype('str'), y)
        X_mte[col] = TE.transform(X[col].astype('str'))
        test_mte[col] = TE.transform(test[col].astype('str'))
        target_encoders[col] = TE
X_mte.head()

Unnamed: 0,CODE_GENDER,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,OCCUPATION_TYPE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,LANDAREA_AVG,APARTMENTS_MODE,YEARS_BEGINEXPLUATATION_MEDI,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_3,b_closed_Consumer credit_num,b_active_all_num,b_Consumer credit_sum_1,b_all_sum_1,b_Credit card_sum_3
0,0.070154,247500.0,450000.0,27324.0,0.09585,0.053353,0.081905,0.009175,-13480,-3009,-4507.0,-4323,12.071863,0.06795,0.083753,15,0.065841,0.501821,0.745131,0.510934,0.066361,0.114352,0.977666,-970.0,0.088509,3.014555,2.056737,292859.207456,454251.3,148118.8
1,0.101038,202500.0,180000.0,9000.0,0.09585,0.08943,0.07529,0.002042,-17748,-1733,-4666.0,-1303,12.071863,0.113395,0.081383,10,0.093886,0.501821,0.518422,0.538863,0.066361,0.114352,0.977666,-230.0,0.061686,0.0,6.0,66105.0,66105.0,22500.0
2,0.101038,382500.0,760225.5,32337.0,0.074519,0.053353,0.07529,0.04622,-11533,-1088,-589.0,-4069,12.071863,0.104819,0.0818,16,0.09291,0.501821,0.608844,0.610991,0.0,0.2542,0.999,-578.0,0.088509,2.0,2.0,0.0,2018639.0,2018639.0
3,0.101038,90000.0,450000.0,17095.5,0.09585,0.08943,0.07529,0.008068,-19184,-1758,-782.0,-2554,12.071863,0.065108,0.083753,9,0.102642,0.501821,0.054112,0.510934,0.066361,0.114352,0.977666,-655.0,0.088509,3.014555,2.056737,292859.207456,454251.3,148118.8
4,0.070154,292500.0,675000.0,53460.0,0.074519,0.112346,0.07529,0.019101,-14265,-566,-1889.0,-4846,12.071863,0.099366,0.083753,13,0.09291,0.501315,0.669662,0.170446,0.066361,0.114352,0.977666,-2240.0,0.088509,9.0,3.0,408780.0,408780.0,0.0


In [13]:
#0.778
params = {'n_estimators': 817, 'max_depth': 6, 'reg_alpha': 2, 'reg_lambda': 0, 'min_child_weight': 0, 'gamma': 3, 'learning_rate': 0.02927458043402023, 'colsample_bytree': 0.19}

In [14]:
clf = XGBClassifier(**params)
clf.fit(X_mte, y)
pred = clf.predict_proba(test_mte)[:,1]
roc_auc_score(test_Y, pred)

0.7525062477213109

In [15]:
from sklearn.externals import joblib

joblib.dump(clf, path + '/XGBoost.pkl', compress=True)
joblib.dump(label_encoders, path + '/label_encoders.joblib', compress=True)
joblib.dump(target_encoders, path + '/target_encoders.joblib', compress=True)
joblib.dump(train_mode, path + '/train_mode.joblib', compress=True)

['/content/drive/My Drive/Colab Notebooks/home-credit-default-risk/train_mode.joblib']

In [16]:
modelReload=joblib.load(path + '/XGBoost.pkl')
pred_new = modelReload.predict_proba(test_mte)[:,1]

In [17]:
roc_auc_score(test_Y, pred_new)

0.7525062477213109