In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read data_transform.csv (decoded as EUC-KR) into cust_data, then display its
# (rows, columns) shape as the cell output.
cust_data = pd.read_csv('data_transform.csv', encoding='euckr')
cust_data.shape

(100233, 126)

In [3]:
# Preview the first five rows of cust_data.
cust_data.head()

Unnamed: 0,ACTL_FMLY_NUM,AGE,ARPU,AUTR_FAIL_MCNT,AVG_CALL_FREQ,AVG_CALL_TIME,AVG_STLN_RATE,CB_GUIF_AMT,CB_GUIF_CNT,CNTT_LAMT_CNT,...,MATE_OCCP_NAME_C_07,MATE_OCCP_NAME_C_08,MATE_OCCP_NAME_C_09,MATE_OCCP_NAME_C_10,MATE_OCCP_NAME_C_11,MATE_OCCP_NAME_C_12,MATE_OCCP_NAME_C_13,MATE_OCCP_NAME_C_14,MATE_OCCP_NAME_C_15,MATE_OCCP_NAME_C_16
0,4,50,30000,10,493,450,0,420001,3,0,...,0,0,0,0,0,0,0,0,1,0
1,4,50,30000,0,22,81,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,4,60,30000,0,17,139,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,35,30000,0,0,1118,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,4,45,50000,0,354,396,95,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [4]:
def model_performance(y_test, y_pred):
    """Print the confusion matrix and headline classification metrics.

    Precision, recall and F1 are computed with label 1 as the positive class.
    All scores are printed rounded to three decimals.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print('confusion matrix')
    print(metrics.confusion_matrix(y_test, y_pred))
    # Built-in round() is used instead of the .round() method so this works
    # whether the metric comes back as a NumPy scalar or a plain Python float
    # (the latter has no .round() method).
    print('accuracy : {}'.format(round(metrics.accuracy_score(y_test, y_pred), 3)))
    print('precision : {}'.format(round(metrics.precision_score(y_test, y_pred, pos_label=1), 3)))
    print('recall : {}'.format(round(metrics.recall_score(y_test, y_pred, pos_label=1), 3)))
    print('F1 : {}'.format(round(metrics.f1_score(y_test, y_pred, pos_label=1), 3)))

## Duplicating overdue-customer (연체자) data

In [5]:
# Partition the customers by label: TARGET == 0 versus TARGET == 1.
target = cust_data['TARGET']
cust = cust_data.loc[target == 0]
cust_overdue = cust_data.loc[target == 1]     # overdue customers

# Row counts of the two groups.
print(len(cust))
print(len(cust_overdue))

95946
4287


In [6]:
# Names of every column except the label. `columns=` is used because pandas 2.0
# no longer accepts `axis` as a positional argument to DataFrame.drop.
variables = cust_data.drop(columns='TARGET').columns
variables

Index(['ACTL_FMLY_NUM', 'AGE', 'ARPU', 'AUTR_FAIL_MCNT', 'AVG_CALL_FREQ',
       'AVG_CALL_TIME', 'AVG_STLN_RATE', 'CB_GUIF_AMT', 'CB_GUIF_CNT',
       'CNTT_LAMT_CNT',
       ...
       'MATE_OCCP_NAME_C_07', 'MATE_OCCP_NAME_C_08', 'MATE_OCCP_NAME_C_09',
       'MATE_OCCP_NAME_C_10', 'MATE_OCCP_NAME_C_11', 'MATE_OCCP_NAME_C_12',
       'MATE_OCCP_NAME_C_13', 'MATE_OCCP_NAME_C_14', 'MATE_OCCP_NAME_C_15',
       'MATE_OCCP_NAME_C_16'],
      dtype='object', length=125)

In [9]:
# Modelling frame: non-overdue rows followed by overdue rows, split into
# feature matrix `x` and label vector `y`.
x_data = pd.concat([cust, cust_overdue])
x = x_data.drop('TARGET', axis=1)
y = x_data['TARGET']

# Split ONCE, before any duplication. Duplicating overdue rows first and
# splitting afterwards would let copies of the same customer fall on both
# sides of the split and inflate the test metrics.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Overdue rows of the training split only; each extra round appends one more
# copy of them to the data the tree is fitted on.
overdue_mask = y_train == 1
x_train_overdue = x_train[overdue_mask]
y_train_overdue = y_train[overdue_mask]

for i in range(1):
    # Round i fits on the base training split plus i extra copies of its
    # overdue rows (round 0 is the plain split). The test split never changes.
    x_fit = pd.concat([x_train] + [x_train_overdue] * i)
    y_fit = pd.concat([y_train] + [y_train_overdue] * i)

    print('---------------------------------------- ', i)
    print('total overdue {} : test {} : test overdue {}'.format(
        int((y_fit == 1).sum() + (y_test == 1).sum()),
        x_test.shape[0],
        len(y_test[y_test == 1])))
    print('')

    # Seeded so that the tree, and therefore the printed metrics, are reproducible.
    tree = DecisionTreeClassifier(random_state=0)
    tree.fit(x_fit, y_fit)
    y_pred = tree.predict(x_test)
    model_performance(y_test, y_pred)

    # Feature names are taken from the fitted frame itself so they are
    # guaranteed to line up with feature_importances_.
    varDic = {'var': x_fit.columns, 'importance': tree.feature_importances_}
    importance = pd.DataFrame(varDic)
    print(importance.sort_values(by='importance', ascending=False).head(10))


----------------------------------------  0
total overdue 4287 : test 20047 : test overdue 883

confusion matrix
[[18485   679]
 [  610   273]]
accuracy : 0.936
precision : 0.287
recall : 0.309
F1 : 0.298
    importance              var
78    0.059800      PAYM_METD_G
14    0.053742    CRDT_CARD_CNT
54    0.035875  SPTCT_OCCR_MDIF
17    0.030078   CRDT_OCCR_MDIF
63    0.027304     TOT_LNIF_AMT
64    0.026506     TOT_LOAN_CNT
5     0.026278    AVG_CALL_TIME
49    0.025577        MOBL_PRIN
34    0.022698   HSHD_INFR_INCM
58    0.021126     TEL_CNTT_QTR


In [10]:
from sklearn.ensemble import GradientBoostingClassifier

In [11]:
# Fit a seeded gradient-boosting classifier (n_estimators=500) on the current
# training split, then report its metrics on the held-out split.
gb = GradientBoostingClassifier(n_estimators=500, random_state=0).fit(x_train, y_train)
y_pred = gb.predict(x_test)
model_performance(y_test, y_pred)

confusion matrix
[[19058   106]
 [  669   214]]
accuracy : 0.961
precision : 0.669
recall : 0.242
F1 : 0.356


In [15]:
from sklearn.svm import SVC

In [None]:
# Linear SVM on standardized features.
# SVC(kernel='linear') was replaced because its fit time grows at least
# quadratically with the number of samples, which is impractical for a training
# split of roughly 80k rows; LinearSVC is the solver meant for this size.
# NOTE(review): LinearSVC optimizes squared hinge loss by default, so this is a
# closely related but not identical model to SVC(kernel='linear').
# Features are standardized first because the columns are on very different
# scales. dual=False is the recommended setting when there are more samples
# than features.
clf = make_pipeline(StandardScaler(), LinearSVC(dual=False))
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
model_performance(y_test, y_pred)