In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras import backend as K

Using TensorFlow backend.


In [2]:
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

In [3]:
def f1(y_true, y_pred):
    """Batch-wise F1 score computed with Keras backend ops.

    Both inputs are clipped to [0, 1] and rounded, so every element is
    counted as either 0 or 1 before precision and recall are formed.

    Args:
        y_true: ground-truth tensor.
        y_pred: prediction tensor, same shape as ``y_true``.

    Returns:
        A scalar tensor: the harmonic mean of precision and recall. The
        result is 0 (instead of NaN) when there are no true positives.
    """
    def _recall(y_true, y_pred):
        # Share of actual positives that were predicted positive.
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        return true_positives / (possible_positives + K.epsilon())

    def _precision(y_true, y_pred):
        # Share of predicted positives that are actual positives.
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        return true_positives / (predicted_positives + K.epsilon())

    precision = _precision(y_true, y_pred)
    recall = _recall(y_true, y_pred)
    # The epsilon guards the 0/0 case: with no true positives both terms are
    # 0 and the unguarded division would produce NaN.
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

In [4]:
def model_performance(y_test, y_pred):
    """Print a confusion matrix plus accuracy, precision, recall and F1.

    Args:
        y_test: ground-truth labels, passed unchanged to ``sklearn.metrics``.
        y_pred: predicted labels, passed unchanged to ``sklearn.metrics``.

    Precision, recall and F1 are computed for the positive class
    (``pos_label=1``). Every score is rounded to three decimals.
    """
    print('confusion matrix')
    print(metrics.confusion_matrix(y_test, y_pred))
    # Built-in round() accepts NumPy scalars and plain Python floats alike;
    # the ``.round`` method is only available on NumPy scalars.
    print('accuracy : {}'.format(round(metrics.accuracy_score(y_test, y_pred), 3)))
    print('precision : {}'.format(round(metrics.precision_score(y_test, y_pred, pos_label=1), 3)))
    print('recall : {}'.format(round(metrics.recall_score(y_test, y_pred, pos_label=1), 3)))
    print('F1 : {}'.format(round(metrics.f1_score(y_test, y_pred, pos_label=1), 3)))

In [5]:
# Read the transformed customer table (the CSV is decoded as EUC-KR)
# and display its (rows, columns) shape as the cell output.
cust_data = pd.read_csv(
    filepath_or_buffer='data_transform.csv',
    encoding='euckr',
)
cust_data.shape

(100233, 126)

In [6]:
# Total number of overdue customers in the full dataset
cust_overdue = cust_data.loc[cust_data['TARGET'].eq(1)]     # rows with TARGET == 1 (overdue)
print(len(cust_overdue))

4287


In [7]:
# Split into train / test sets
# DataFrame.as_matrix() no longer exists in pandas >= 1.0; .values returns
# the same feature ndarray on old and new pandas versions.
x = cust_data.drop('TARGET', axis=1).values
y = cust_data['TARGET']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Printed label means "number of overdue customers"; counts use the
# vectorized Series.sum() rather than Python-level iteration.
print('연체자 수 train : {} / {}'.format((y_train == 1).sum(), y_train.shape[0]))
print('연체자 수 test  : {} / {}'.format((y_test == 1).sum(), y_test.shape[0]))

연체자 수 train : 3439 / 80186
연체자 수 test  : 848 / 20047


In [8]:
# Keep only the overdue customers from the training set
overdue_mask = y_train == 1
x_overdue = x_train[overdue_mask]
y_overdue = y_train[overdue_mask]
print(x_overdue.shape, y_overdue.shape, sep='\n')

(3439, 125)
(3439,)


In [9]:
# One-hot encode the labels into two columns.
# The ndim guards keep this cell idempotent: it rebinds y_train / y_test in
# place, so re-running it would otherwise feed already-encoded 2-D labels
# through to_categorical a second time.
if np.ndim(y_train) == 1:
    y_train = to_categorical(y_train, 2)
if np.ndim(y_test) == 1:
    y_test = to_categorical(y_test, 2)

print(y_train.shape)

# Number of input features, used as the model's input width.
n_cols = x_train.shape[1]

(80186, 2)


In [18]:
# Column totals of the one-hot training labels: column 0 first, then column 1
for class_column in y_train.T:
    print(sum(class_column))

76747.0
3439.0


In [19]:
# Column totals of the one-hot test labels; column 1 is the overdue class
for class_idx in (0, 1):
    print(sum(y_test[:, class_idx]))

19199.0
848.0


In [35]:
# Dense classifier: 125 -> 100 -> 100 ReLU units, then a 2-unit softmax output.
model = Sequential([
    Dense(125, activation='relu', input_shape=(n_cols,)),
    Dense(100, activation='relu'),
    Dense(100, activation='relu'),
    Dense(2, activation='softmax'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_14 (Dense)             (None, 125)               15750     
_________________________________________________________________
dense_15 (Dense)             (None, 100)               12600     
_________________________________________________________________
dense_16 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_17 (Dense)             (None, 2)                 202       
Total params: 38,652.0
Trainable params: 38,652
Non-trainable params: 0.0
_________________________________________________________________


In [36]:
# Configure training: SGD optimizer, binary cross-entropy loss, accuracy metric.
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])

# Train for 5 epochs in batches of 1000, holding out 10% of the training
# rows for validation. The returned History object is the cell's output.
model.fit(
    x_train,
    y_train,
    validation_split=0.1,
    epochs=5,
    batch_size=1000,
    verbose=1,
)

Train on 72167 samples, validate on 8019 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2050ddd1048>

In [37]:
# Predict on the test set and print the summed output of each softmax column
# (column 0 first, then column 1 = overdue).
# NOTE(review): the recorded run printed 0.0 and 20047.0, i.e. every test row
# puts all of its probability on column 1 — confirm whether the inputs need
# scaling before relying on this model.
y_pred = model.predict(x_test)
for class_idx in range(2):
    print(sum(y_pred[:, class_idx]))

0.0
20047.0
