# Project 3 part 3
Workflow: LightGBM (final model)

Author: HE HAOKAI

In [None]:
import os
import time
import scipy
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import lightgbm
import warnings
from sklearn import datasets
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, auc, classification_report, confusion_matrix, mean_squared_error, precision_score, roc_auc_score, roc_curve
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')
# Show DataFrames as plain text rather than HTML tables.
pd.set_option('display.notebook_repr_html', False)
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names. Use the new name when this installation has it
# and fall back to the legacy name on older releases.
plt.style.use('seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white')
# Print the path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
def RandomSeedGetInt():
    """Return an integer seed derived from the current wall-clock time.

    The current time in whole milliseconds is shifted right by 12 bits and
    XOR-ed with its own low 32 bits, so successive calls generally yield
    different seeds.
    """
    now_ms = int(time.time() * 1000)
    high_part = now_ms >> 12
    low_word = now_ms & 0xffffffff
    return high_part ^ low_word

In [2]:
# Load the training and test application tables.
app_train, app_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/home-credit-default-risk/application_train.csv',
        '../input/home-credit-default-risk/application_test.csv',
    )
)
# Inspect the data types in the training data: number of columns per dtype.
app_train.dtypes.value_counts()

In [3]:
# Label-encode the categorical (object) columns that have at most two categories.
le = LabelEncoder()
le_count = 0

# Iterate through the columns
for col in app_train:
    if app_train[col].dtype != 'object':
        continue
    # Mark missing values with an explicit placeholder category. The result is
    # assigned back instead of calling fillna(inplace=True) on a column
    # selection, because that chained form is not guaranteed to update the
    # frame (it is a no-op under pandas copy-on-write).
    app_train[col] = app_train[col].fillna('m1ss')
    # Apply the same placeholder to the test set so both frames are
    # preprocessed identically and end up with matching categories.
    in_test = col in app_test
    if in_test:
        app_test[col] = app_test[col].fillna('m1ss')
    # Only columns with 2 or fewer unique categories are label encoded
    # (the placeholder counts as a category).
    if app_train[col].nunique() > 2:
        continue
    # Train on the training data
    le.fit(app_train[col])
    # Transform both training and testing data
    app_train[col] = le.transform(app_train[col])
    if in_test:
        app_test[col] = le.transform(app_test[col])

    # Keep track of how many columns were label encoded
    le_count += 1

print('%d columns were label encoded.' % le_count)

In [4]:
# One-hot encode the categorical columns still present in both tables
# (the original note singled out the gender variable, but pd.get_dummies is
# applied to the whole frame).
app_train, app_test = pd.get_dummies(app_train), pd.get_dummies(app_test)

print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

In [5]:
# Set the label column aside; the inner join below keeps only columns that
# exist in both frames, and the test data has no TARGET column.
targ = app_train.pop('TARGET')

# Restrict both frames to their shared columns.
app_train, app_test = app_train.align(app_test, join = 'inner', axis = 1)

# Re-attach the labels to the aligned training frame.
app_train['TARGET'] = targ

print('Training Features shape: ', app_train.shape)
print('Testing Features shape: ', app_test.shape)

In [6]:
# Missing values are still present in the training data, but correlations can
# be computed regardless and used to shortlist variables.
# Correlate every column with TARGET and sort ascending, so the most negative
# coefficients come first and the most positive come last.

corr_with_target = app_train.corr()['TARGET']
correlations = corr_with_target.sort_values()

In [7]:
# `correlations` is sorted ascending: the tail holds the largest coefficients
# and the head the smallest. TARGET's own entry is part of `correlations`, so
# it can show up in the positive list.
top_positive = correlations.tail(15)
top_negative = correlations.head(15)
print('Variables with TOP 15 Positive Correlation coefficient:\n', top_positive)
print('\nVariables with TOP 15 Negative Correlation coefficient:\n', top_negative)

In [8]:
def hcdrlgbm(xtrn, ytrn, xtst, fileprefix, lr, foldcnt = 10):
    """Cross-validate LightGBM per learning rate and save averaged test predictions.

    For each learning rate in ``lr`` the training rows are split into
    ``foldcnt`` shuffled folds. One ``LGBMClassifier`` is fitted per fold with
    early stopping (40 rounds) evaluated on that fold's held-out rows, the
    positive-class probabilities predicted for ``xtst`` are averaged over the
    folds, and the result is written to
    ``<fileprefix>_result_learningrate<rate>.csv``. The mean held-out accuracy
    (a prediction within 0.5 of the label counts as correct) is printed for
    each learning rate.

    Args:
        xtrn: Training feature matrix that supports positional row indexing
            with an integer array (the callers in this notebook pass NumPy
            arrays).
        ytrn: Training labels, one per row of ``xtrn``; a pandas Series or any
            array-like.
        xtst: Test feature matrix with the same columns as ``xtrn``.
        fileprefix: Prefix used for the output CSV name and progress messages.
        lr: Iterable of learning rates to try.
        foldcnt: Number of cross-validation folds.

    Note:
        The ``SK_ID_CURR`` column of the output is read from the module-level
        ``app_test`` frame, so the rows of ``xtst`` must be in that order.
        Fold and model seeds come from ``RandomSeedGetInt()`` (time based), so
        repeated runs are not reproducible.
    """
    # KFold yields positional row numbers. Indexing a pandas Series with them
    # is label-based, which only coincides with positions for a default
    # RangeIndex, so work on a plain array instead.
    ytrn = np.asarray(ytrn)
    percentagestep = 100 / float(foldcnt)
    # LightGBM 4 removed the early_stopping_rounds/verbose arguments of fit();
    # callbacks are the supported replacement. log_evaluation superseded
    # print_evaluation in LightGBM 3.3, so accept whichever one exists.
    log_evaluation = getattr(lightgbm, 'log_evaluation', None) or lightgbm.print_evaluation
    for rate in lr:
        foldidx = KFold(n_splits = foldcnt, shuffle = True, random_state = RandomSeedGetInt()).split(xtrn)
        percentagestage = 0
        pred = np.zeros(xtst.shape[0])
        cvtstavgaccuracy = []
        print(fileprefix + ' -- Learning rate ' + str(rate) + ' in progress:')
        for cvtrnidx, cvtstidx in foldidx:
            cvtrn, cvtrnlbl = xtrn[cvtrnidx], ytrn[cvtrnidx]
            cvtst, cvtstlbl = xtrn[cvtstidx], ytrn[cvtstidx]
            # NOTE(review): per the LightGBM parameter docs, subsample
            # (bagging_fraction) only takes effect when subsample_freq > 0,
            # which is left at its default here - confirm whether bagging
            # was intended.
            model = lightgbm.LGBMClassifier(n_estimators = 10000, objective = 'binary', class_weight = 'balanced', learning_rate = rate, reg_alpha = 0.1, reg_lambda = 0.1, subsample = 0.8, random_state = RandomSeedGetInt())
            model.fit(
                cvtrn, cvtrnlbl,
                eval_metric = 'auc',
                eval_set = [(cvtst, cvtstlbl), (cvtrn, cvtrnlbl)],
                eval_names = ['cvtst', 'cvtrn'],
                categorical_feature = 'auto',
                callbacks = [lightgbm.early_stopping(40), log_evaluation(10000)],
            )
            bestiter = model.best_iteration_
            cvtstproba = model.predict_proba(cvtst, num_iteration = bestiter)[:, 1]
            # Fraction of held-out rows whose predicted probability lies
            # within 0.5 of the true 0/1 label.
            cvtstavgaccuracy.append(np.mean(np.abs(cvtstproba - cvtstlbl) <= 0.5))
            pred += model.predict_proba(xtst, num_iteration = bestiter)[:, 1]
            percentagestage += percentagestep
            print(str(percentagestage) + '%')
        # Average the per-fold test predictions.
        pred /= foldcnt
        print(np.mean(cvtstavgaccuracy))
        ans = pd.DataFrame({'SK_ID_CURR': app_test['SK_ID_CURR'], 'TARGET': pred})
        ans.to_csv(fileprefix + '_result_learningrate' + str(rate) + '.csv', index = False)

In [9]:
# Select the 15 lowest and 15 highest entries of the sorted TARGET correlations
# Fill missing values with column medians
imputer = SimpleImputer(strategy = 'median')
# NaN correlations (e.g. constant columns) sort to the end and would displace
# the real top entries, so drop them before ranking.
ranked_correlations = correlations.dropna()
selected_columns = list(ranked_correlations.head(15).index) + list(ranked_correlations.tail(15).index)
# Remove TARGET by name rather than by position: it is the label, it does not
# exist in the test data, and it is only guaranteed to be the last entry when
# nothing sorts after it. dict.fromkeys drops duplicates (possible when there
# are fewer than 30 columns) while keeping the order.
# NOTE: TARGET normally occupies one of the 15 highest slots, so 29 feature
# columns are used.
feature_columns = [name for name in dict.fromkeys(selected_columns) if name != 'TARGET']
# Separate y label from the training data
lgbm_target = app_train['TARGET']
lgbm_features = imputer.fit_transform(app_train[feature_columns])
lgbm_features_test = imputer.transform(app_test[feature_columns])
hcdrlgbm(lgbm_features, lgbm_target, lgbm_features_test, 'corrtop15_10fold', np.array([0.01, 0.05]))

In [10]:
# Use every column shared by the training and test frames as a feature.
# The `imputer` object already defined in the session is refitted here.
# NOTE(review): the feature set includes SK_ID_CURR, which hcdrlgbm also writes
# out as the submission key - presumably an identifier; confirm it should be
# a model input.
lgbm_target = app_train['TARGET']
lgbm_features = imputer.fit_transform(app_train.drop(columns = ['TARGET']))
lgbm_features_test = imputer.transform(app_test)
hcdrlgbm(
    lgbm_features,
    lgbm_target,
    lgbm_features_test,
    'allfeatures_10fold',
    np.array([0.01, 0.05]),
)

In [11]:
# Re-run with 6 folds on the feature matrices currently held in
# lgbm_features / lgbm_features_test, at learning rate 0.05 only.
hcdrlgbm(
    xtrn = lgbm_features,
    ytrn = lgbm_target,
    xtst = lgbm_features_test,
    fileprefix = 'allfeatures_6fold',
    lr = np.array([0.05]),
    foldcnt = 6,
)

In [12]:
# Re-run with 3 folds on the feature matrices currently held in
# lgbm_features / lgbm_features_test, at learning rate 0.05 only.
hcdrlgbm(
    xtrn = lgbm_features,
    ytrn = lgbm_target,
    xtst = lgbm_features_test,
    fileprefix = 'allfeatures_3fold',
    lr = np.array([0.05]),
    foldcnt = 3,
)

In [16]:
# Impute missing values with per-column means, then run 3-fold
# cross-validation on all shared columns at learning rate 0.05.
imputer = SimpleImputer(strategy = 'mean')
lgbm_target = app_train['TARGET']
lgbm_features = imputer.fit_transform(app_train.drop(columns = ['TARGET']))
lgbm_features_test = imputer.transform(app_test)
hcdrlgbm(
    lgbm_features,
    lgbm_target,
    lgbm_features_test,
    'allfeatures_3fold_fillmeans',
    np.array([0.05]),
    3,
)

In [17]:
# Impute missing values with the constant 0, then run 3-fold
# cross-validation on all shared columns at learning rate 0.05.
imputer = SimpleImputer(strategy = 'constant', fill_value = 0)
lgbm_target = app_train['TARGET']
lgbm_features = imputer.fit_transform(app_train.drop(columns = ['TARGET']))
lgbm_features_test = imputer.transform(app_test)
hcdrlgbm(
    lgbm_features,
    lgbm_target,
    lgbm_features_test,
    'allfeatures_3fold_fillzeros',
    np.array([0.05]),
    3,
)