In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import model_selection

In [20]:
# Load data_transform.csv into cust_data and show its (rows, columns) shape.
# Later cells read the label from this table's TARGET column.
cust_data = pd.read_csv('data_transform.csv')
cust_data.shape

(100233, 126)

In [21]:
# Load test_transform.csv into t_data and show its (rows, columns) shape.
t_data = pd.read_csv('test_transform.csv')
t_data.shape

(2019, 125)

In [22]:
# Preview the first rows of the customer table.
cust_data.head()

Unnamed: 0,ACTL_FMLY_NUM,AGE,ARPU,AUTR_FAIL_MCNT,AVG_CALL_FREQ,AVG_CALL_TIME,AVG_STLN_RATE,CB_GUIF_AMT,CB_GUIF_CNT,CNTT_LAMT_CNT,...,MATE_OCCP_NAME_C_07,MATE_OCCP_NAME_C_08,MATE_OCCP_NAME_C_09,MATE_OCCP_NAME_C_10,MATE_OCCP_NAME_C_11,MATE_OCCP_NAME_C_12,MATE_OCCP_NAME_C_13,MATE_OCCP_NAME_C_14,MATE_OCCP_NAME_C_15,MATE_OCCP_NAME_C_16
0,4,50,30000,10,493,450,0,420001,3,0,...,0,0,0,0,0,0,0,0,1,0
1,4,50,30000,0,22,81,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,4,60,30000,0,17,139,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,35,30000,0,0,1118,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,4,45,50000,0,354,396,95,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [23]:
def model_performance(y_test, y_pred):
    """Print a confusion matrix and summary scores for binary predictions.

    Precision, recall and F1 are computed for the positive class (label 1).
    Every score is shown rounded to three decimal places.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, in the same order as ``y_test``.

    Returns:
        None. The report is written to standard output.
    """
    print('confusion matrix')
    print(metrics.confusion_matrix(y_test, y_pred))

    # The built-in round() is used instead of the NumPy-only ``.round()``
    # method so the report works whether a score comes back as a NumPy scalar
    # or as a plain Python float (which has no ``.round`` attribute).
    print('accuracy : {}'.format(round(metrics.accuracy_score(y_test, y_pred), 3)))
    print('precision : {}'.format(round(metrics.precision_score(y_test, y_pred, pos_label=1), 3)))
    print('recall : {}'.format(round(metrics.recall_score(y_test, y_pred, pos_label=1), 3)))
    print('F1 : {}'.format(round(metrics.f1_score(y_test, y_pred, pos_label=1), 3)))

## 연체자 (overdue customer) data duplication

In [24]:
# Split the customers by label and print both row counts:
# cust holds the TARGET == 0 rows, cust_overdue the TARGET == 1 (overdue) rows.
cust = cust_data.loc[cust_data['TARGET'].eq(0)]
cust_overdue = cust_data.loc[cust_data['TARGET'].eq(1)]

print(len(cust))
print(len(cust_overdue))

95946
4287


In [25]:
# Names of all feature columns, i.e. every column except the TARGET label.
# `axis` is passed by keyword: giving it positionally is deprecated and is
# rejected by newer pandas releases.
variables = cust_data.drop('TARGET', axis=1).columns
variables

Index(['ACTL_FMLY_NUM', 'AGE', 'ARPU', 'AUTR_FAIL_MCNT', 'AVG_CALL_FREQ',
       'AVG_CALL_TIME', 'AVG_STLN_RATE', 'CB_GUIF_AMT', 'CB_GUIF_CNT',
       'CNTT_LAMT_CNT',
       ...
       'MATE_OCCP_NAME_C_07', 'MATE_OCCP_NAME_C_08', 'MATE_OCCP_NAME_C_09',
       'MATE_OCCP_NAME_C_10', 'MATE_OCCP_NAME_C_11', 'MATE_OCCP_NAME_C_12',
       'MATE_OCCP_NAME_C_13', 'MATE_OCCP_NAME_C_14', 'MATE_OCCP_NAME_C_15',
       'MATE_OCCP_NAME_C_16'],
      dtype='object', length=125)

### ML modeling

In [26]:
# Separate the TARGET label from the feature columns, then hold out 20% of
# the rows as a test set (fixed seed, so the split is repeatable).
y = cust_data['TARGET']
x = cust_data.drop(columns=['TARGET'])

x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=0,
)

# Overdue (TARGET == 1) count against the total size of each split.
print('연체자 수 train : {} / {}'.format((y_train == 1).sum(), len(y_train)))
print('연체자 수 test  : {} / {}'.format((y_test == 1).sum(), len(y_test)))

연체자 수 train : 3439 / 80186
연체자 수 test  : 848 / 20047


In [27]:
# Pull the overdue (TARGET == 1) rows out of the training split; the same
# label-based mask is applied to both the features and the labels.
x_overdue = x_train.loc[y_train.eq(1)]
y_overdue = y_train.loc[y_train.eq(1)]
print(x_overdue.shape, y_overdue.shape, sep='\n')

(3439, 125)
(3439,)


In [28]:
# Oversampling schedule: epoch 0 trains on the original training split, and
# each later epoch trains on data holding one more full copy of the overdue
# rows (x_overdue / y_overdue).
max_epoch = 3

# Accumulate the duplicated rows under loop-local names so that x_train and
# y_train are never rebound. Re-running this cell (or the cell that derives
# x_overdue from x_train) then gives the same result instead of stacking
# further duplicates on top of a previous run.
x_aug, y_aug = x_train, y_train

for i in range(max_epoch + 1):
    # Inner 80/20 split of the (possibly oversampled) training data. Only the
    # size and overdue count of its held-out part are reported; after epoch 0
    # that part can contain copies of rows that are also in x2_train.
    x2_train, x2_test, y2_train, y2_test = train_test_split(x_aug, y_aug, test_size=0.2, random_state=0)

    print('========================================= ', i)
    print('test {} : test overdue {}'.format(y2_test.shape[0], sum(y2_test == 1)))
    print('')

    clf = GradientBoostingClassifier(n_estimators=500, random_state=0)
    clf.fit(x2_train, y2_train)

    # Score on x_test / y_test, which were split off before any duplication.
    print('------------------------')
    y_pred_0 = clf.predict(x_test)
    model_performance(y_test, y_pred_0)
    print('------------------------')

    # Add one more copy of the overdue rows for the next epoch. Skipped after
    # the final epoch, where the enlarged data would never be used.
    if i < max_epoch:
        x_aug = pd.concat([x_aug, x_overdue])
        y_aug = pd.concat([y_aug, y_overdue])

# Predict labels for the test file with the model fitted in the final epoch
# and store them in a new TARGET column.
test_data = pd.read_csv('test_transform.csv')
print(test_data.shape)

y_pred_test = clf.predict(test_data)
test_data['TARGET'] = y_pred_test

test 16038 : test overdue 720

------------------------
confusion matrix
[[19062   137]
 [  645   203]]
accuracy : 0.961
precision : 0.597
recall : 0.239
F1 : 0.342
------------------------
test 16725 : test overdue 1415

------------------------
confusion matrix
[[18896   303]
 [  541   307]]
accuracy : 0.958
precision : 0.503
recall : 0.362
F1 : 0.421
------------------------
test 17413 : test overdue 2095

------------------------
confusion matrix
[[18739   460]
 [  460   388]]
accuracy : 0.954
precision : 0.458
recall : 0.458
F1 : 0.458
------------------------
test 18101 : test overdue 2740

------------------------
confusion matrix
[[18565   634]
 [  406   442]]
accuracy : 0.948
precision : 0.411
recall : 0.521
F1 : 0.459
------------------------
(2019, 125)


In [29]:
# Display the TARGET column of test_data, i.e. the labels predicted for the
# test file by the training cell.
test_data['TARGET']

0       0
1       0
2       0
3       1
4       0
5       0
6       0
7       1
8       0
9       1
10      0
11      0
12      0
13      0
14      0
15      0
16      0
17      0
18      0
19      0
20      0
21      0
22      0
23      0
24      0
25      0
26      0
27      0
28      0
29      0
       ..
1989    0
1990    0
1991    0
1992    0
1993    0
1994    0
1995    0
1996    0
1997    0
1998    0
1999    0
2000    0
2001    0
2002    0
2003    0
2004    0
2005    0
2006    0
2007    0
2008    0
2009    0
2010    1
2011    0
2012    0
2013    0
2014    0
2015    1
2016    0
2017    0
2018    1
Name: TARGET, dtype: int64

In [32]:
# Write only the predicted TARGET column to the output file, without the
# index column. The column is selected into its own name instead of rebinding
# test_data, so test_data stays a DataFrame and both this cell and the earlier
# cell that displays test_data['TARGET'] can be re-run without a KeyError.
# NOTE(review): whether a "TARGET" header line is written depends on the
# installed pandas version's default for Series.to_csv -- pass header=
# explicitly if the consumer of this file needs a specific layout.
test_target = test_data['TARGET']
test_target.to_csv("testset_0923.csv", encoding='utf-8', index=False)