In [59]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
import random
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.preprocessing import Imputer
from sklearn import metrics

In [60]:
# Load data_transform.csv into a DataFrame; the trailing expression displays
# its (rows, columns) shape as the cell output.
cust_data = pd.read_csv(filepath_or_buffer='data_transform.csv')
cust_data.shape

(100233, 174)

In [61]:
# Narrow the frame to the label column (TARGET) plus the 22 predictor
# columns that the modeling cells below work with.
model_columns = [
    'TARGET',
    'AGE',
    'AVG_CALL_FREQ',
    'AVG_CALL_TIME',
    'CRDT_CARD',
    'CRDT_CARD_CNT',
    'CRDT_OCCR_MDIF',
    'CRMM_OVDU_AMT',
    'CTCD_OCCR_MDIF',
    'HIGH_AMT_RATE',
    'L_H_RATE',
    'LNIF_CNT',
    'LOW_AMT_RATE',
    'LT1Y_MXOD_AMT',
    'MOBL_PRIN',
    'MON_TLFE_AMT',
    'OVDU_HIGH_RATE',
    'PAYM_METD_G',
    'SPTCT_OCCR_MDIF',
    'TEL_CNTT_QTR',
    'TEL_OVDU_RATE',
    'TOT_LNIF_AMT',
    'TOT_LOAN_CNT',
]
cust_data = cust_data.loc[:, model_columns]

In [62]:
def model_performance(y_test, y_pred):
    """Print the confusion matrix and headline classification scores.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned element-wise with ``y_test``.

    Returns
    -------
    None
        All results are written to stdout.

    Notes
    -----
    Precision, recall and F1 are computed with label ``1`` as the positive
    class. Scores are rounded to three decimals with the built-in ``round``
    rather than a ``.round()`` method call, so the function works whether
    the metric returns a NumPy scalar or a plain Python ``float`` (which
    has no ``.round`` method).
    """
    print('confusion matrix')
    print(metrics.confusion_matrix(y_test, y_pred))

    scores = (
        ('accuracy', metrics.accuracy_score(y_test, y_pred)),
        ('precision', metrics.precision_score(y_test, y_pred, pos_label=1)),
        ('recall', metrics.recall_score(y_test, y_pred, pos_label=1)),
        ('F1', metrics.f1_score(y_test, y_pred, pos_label=1)),
    )
    for label, value in scores:
        print('{} : {}'.format(label, round(value, 3)))

### ML modeling

In [63]:
# Train / test split: the label is the TARGET column, the features are
# every other column; 20% of the rows are held out with a fixed seed.
y = cust_data['TARGET']
x = cust_data.drop('TARGET', axis=1)

x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=0,
)

# Report how many label-1 rows ended up in each partition.
print('연체자 수 train : {} / {}'.format((y_train == 1).sum(), len(y_train)))
print('연체자 수 test  : {} / {}'.format((y_test == 1).sum(), len(y_test)))

연체자 수 train : 3439 / 80186
연체자 수 test  : 848 / 20047


In [64]:
# Keep only the label-1 (overdue) rows of the training split.
is_overdue = y_train == 1
x_overdue = x_train[is_overdue]
y_overdue = y_train[is_overdue]
print(x_overdue.shape)
print(y_overdue.shape)

(3439, 22)
(3439,)


In [65]:
# Inspect the training features: per-column dtype and non-null count.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80186 entries, 59527 to 68268
Data columns (total 22 columns):
AGE                80186 non-null int64
AVG_CALL_FREQ      80186 non-null int64
AVG_CALL_TIME      80186 non-null int64
CRDT_CARD          80186 non-null int64
CRDT_CARD_CNT      80186 non-null int64
CRDT_OCCR_MDIF     80186 non-null int64
CRMM_OVDU_AMT      80186 non-null int64
CTCD_OCCR_MDIF     80186 non-null int64
HIGH_AMT_RATE      80186 non-null float64
L_H_RATE           80186 non-null float64
LNIF_CNT           80186 non-null int64
LOW_AMT_RATE       80186 non-null float64
LT1Y_MXOD_AMT      80186 non-null int64
MOBL_PRIN          80186 non-null int64
MON_TLFE_AMT       80186 non-null int64
OVDU_HIGH_RATE     80186 non-null float64
PAYM_METD_G        80186 non-null int64
SPTCT_OCCR_MDIF    80186 non-null int64
TEL_CNTT_QTR       80186 non-null int64
TEL_OVDU_RATE      80186 non-null float64
TOT_LNIF_AMT       80186 non-null int64
TOT_LOAN_CNT       80186 non-null flo

In [66]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=100):
    """Train a two-class XGBoost booster and predict on ``test_X``.

    Parameters
    ----------
    train_X, train_y
        Training features and labels; wrapped in an ``xgb.DMatrix``.
    test_X
        Features to predict on.
    test_y : optional
        Labels for ``test_X``. When given, the test matrix is added to the
        watch list and training stops early once its metric has not improved
        for 20 rounds. When omitted, all ``num_rounds`` rounds are run.
    feature_names : list of str, optional
        Names attached to both matrices. ``None`` (the default) leaves the
        choice of names to XGBoost.
    seed_val : int
        Value of the ``seed`` training parameter.
    num_rounds : int
        Maximum number of boosting rounds.

    Returns
    -------
    tuple
        ``(pred_test_y, model, feature_names)``: the result of
        ``model.predict`` on the test matrix (the objective is
        ``multi:softprob`` with two classes), the trained booster, and the
        feature names stored on the training matrix.

    Notes
    -----
    NOTE(review): when ``test_y`` is supplied the same data drives early
    stopping and is later scored by the caller, so those scores may be
    optimistic. Consider a separate validation split.
    """
    params = {
        'objective': 'multi:softprob',
        'eta': 0.03,
        'max_depth': 6,
        'silent': 1,
        'num_class': 2,
        'eval_metric': "mlogloss",
        'min_child_weight': 1,
        'subsample': 0.7,          # use 70% of the rows per tree
        'colsample_bytree': 0.7,   # fraction of feature columns per tree
        'seed': seed_val,          # fixed seed so runs are repeatable
    }

    # XGBoost trains on its own DMatrix container, not on raw frames/arrays.
    # Caller-supplied feature names are forwarded instead of being discarded.
    xgtrain = xgb.DMatrix(train_X, label=train_y, feature_names=feature_names)

    if test_y is not None:
        # Labelled test set: monitor it and enable early stopping.
        xgtest = xgb.DMatrix(test_X, label=test_y, feature_names=feature_names)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(params, xgtrain, num_rounds, evals=watchlist, early_stopping_rounds=20)
    else:
        # Unlabelled test set: train for the full number of rounds.
        xgtest = xgb.DMatrix(test_X, feature_names=feature_names)
        model = xgb.train(params, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model, xgtrain.feature_names

In [67]:
# Train with several boosting-round budgets and report test-set metrics
# for each run.
train_epochs = [300, 400, 500]
for epoch in train_epochs:
    preds, model, feature_names = runXGB(x_train, y_train, x_test, y_test, num_rounds=epoch)
    # Pick the highest-scoring class for every row of the prediction matrix.
    y_pred = np.argmax(preds, axis=1)
    print('overdue pred :', np.sum(y_pred))
    # Score the predictions computed in this iteration. The original passed
    # an undefined name (`aa`), which raises NameError on a fresh kernel.
    model_performance(y_test, y_pred)

[0]	train-mlogloss:0.66758	test-mlogloss:0.667668
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:0.643384	test-mlogloss:0.643534
[2]	train-mlogloss:0.620565	test-mlogloss:0.620791
[3]	train-mlogloss:0.598996	test-mlogloss:0.599304
[4]	train-mlogloss:0.578572	test-mlogloss:0.578962
[5]	train-mlogloss:0.559265	test-mlogloss:0.559717
[6]	train-mlogloss:0.540921	test-mlogloss:0.541469
[7]	train-mlogloss:0.523524	test-mlogloss:0.524163
[8]	train-mlogloss:0.506951	test-mlogloss:0.507653
[9]	train-mlogloss:0.4912	test-mlogloss:0.491985
[10]	train-mlogloss:0.476172	test-mlogloss:0.47704
[11]	train-mlogloss:0.461838	test-mlogloss:0.462802
[12]	train-mlogloss:0.44817	test-mlogloss:0.449223
[13]	train-mlogloss:0.435119	test-mlogloss:0.436271
[14]	train-mlogloss:0.422657	test-mlogloss:0.423895
[15]	train-mlogloss:0.410788	test-mlogloss:0.412108
[16]	train-mlogloss:0.399419	tes