# 모델링
- 레이블 인코딩한 데이터
    * 오버샘플링 하지 않은 데이터 - 학습시 Cost-sensitive learning 사용해 클래스 불균형 해소 예정
    * SMOTE NC 오버샘플링한 데이터
<br><br>
- 원-핫 인코딩한 데이터-> ``메모리 과부하 걸려서 Kernel이 죽어버림..``
    * 오버샘플링 하지 않은 데이터 - 학습시 Cost-sensitive learning 사용해 클래스 불균형 해소 예정
    * SMOTE NC 오버샘플링한 데이터 
    
<br>

- 선형기반 알고리즘인 Logistic Regression 하나 적용
- 트리기반 알고리즘인 Decision Tree 하나 적용
- 두 모델에서 각각 가장 성능이 좋은 데이터 종류 하나씩 선정
- Train : Test 비율은 7 : 3으로 적용

In [7]:
import pandas as pd
import numpy as np
import os
os.chdir('/Users/younghun/Desktop/gitrepo/data/healthcare/encoding_df')

In [8]:
# index_col=[0] : index 칼럼을 0번째 칼럼으로 설정해서 Unnamed 제거시키기

label_df = pd.read_csv('label_df.csv', index_col=[0])
#label_smote_df = pd.read_csv('label_smote_df.csv')
#ohe_df = pd.read_csv('ohe_df.csv', index_col=[0])
#ohe_smote_df = pd.read_csv('ohe_smote_df.csv')

## Multi-Logistic Regression

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

feature = label_df.iloc[:, :-1]
target = label_df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(feature, target, test_size=0.3, random_state=42)

lr_clf = LogisticRegression(multi_class='multinomial')
lr_clf.fit(X_train, y_train)
y_pred = lr_clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Test 데이터에 대한 정확도: {acc: .4f}")

Test 데이터에 대한 정확도:  0.2808


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## Random Forest 
- 레이블 인코딩은 트리 기반에 강하다..!

In [5]:
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(feature, target, test_size=0.3, random_state=42)

rf_clf = RandomForestClassifier(n_estimators=200,
                               criterion='entropy',
                               min_samples_split=10)
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"테스트 데이터에 대한 정확도: {acc: .4f}")

테스트 데이터에 대한 정확도:  0.3878


## Light GBM
- Light GBM도 결과가 좋지 않다면... Tensorflow 딥러닝을 활용해보자

In [10]:
label_df.head(1)

Unnamed: 0,Unnamed0,Hospitaltypecode,CityCodeHospital,Hospitalregioncode,Department,WardType,WardFacilityCode,TypeofAdmission,SeverityofIllness,Age,Hospitaltypeabinary,AvailableExtraRoomsinHospital,VisitorswithPatient,AdmissionDeposit,VisitorsRobust,VisitorsQ3ohe,AdmissionDepositRobust,AdmissionDepositQ3ohe,AdmissionDepositDiscretization,YStay
0,0,2,2,2,3,2,5,0,0,5,0.0,3,2,4911.0,-0.5,0.0,0.139002,0.0,5.0,0


In [9]:
import re
label_df = label_df.rename(columns= lambda x : re.sub('[^A-Za-z0-9]+', '', x))

In [13]:
from lightgbm import LGBMClassifier

feature = label_df.iloc[:, :-1]
target = label_df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(feature,
                                                   target,
                                                   test_size=0.3,
                                                   random_state=12)
lgbm = LGBMClassifier(objective='multiclass',
                      n_estimators=400, learning_rate=0.1,
                     random_state=42)
lgbm.fit(X_train, y_train)
y_pred = lgbm.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"테스트 데이터에 대한 정확도: {acc: .4f}")

테스트 데이터에 대한 정확도:  0.4032


## Hybrid Voting

# Tensorflow 이용해야 할듯...

In [4]:
# 우선 Train : Test 7:3으로 나누기
from sklearn.model_selection import train_test_split

feature = ohe_df.iloc[:, :-1]
target = ohe_df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(feature, target,
                                                   test_size=0.3,
                                                   random_state=42)
print("Train shape: ", X_train.shape)
print("Test shape: ", X_test.shape)

Train shape:  (222906, 53)
Test shape:  (95532, 53)


In [5]:
def next_batch(num, data, labels):
    idx = np.arange(0, len(data))
    np.random.shuffle(idx)
    idx = idx[:num]
    data_shuffle = [data[i] for i in idx]
    labels_shuffle = [labels[i] for i in idx]
    
    return np.asarray(data_shuffle), np.asarray(labels_shuffle)

In [None]:
import tensorflow as tf

X_train = X_train.values
y_train = y_train.values

y_train_ohe = tf.one_hot(y_train, 10)

num_classes = 10

X = tf.placeholder(tf.float32, shape=[None, 53])
y = tf.placeholder(tf.float32, shape=[None, num_classes])

# layer1 - RELU
W1 = tf.Variable(tf.random_normal([53, 256]), name='weight1')
b1 = tf.Variable(tf.random_normal([256]), name='bias1')
layer1 = tf.nn.sigmoid(tf.matmul(X, W1) + b1)
# layer2 - RELU
W2 = tf.Variable(tf.random_normal([256, 1024]), name='weight2')
b2 = tf.Variable(tf.random_normal([1024]), name='bias2')
layer2 = tf.nn.relu(tf.matmul(layer1, W2) + b2)
# layer3 - RELU
W3 = tf.Variable(tf.random_normal([1024, 512]), name='weight3')
b3 = tf.Variable(tf.random_normal([512]), name='bias3')
layer3 = tf.nn.sigmoid(tf.matmul(layer2, W3) + b3)
# layer4 - Softmax로 최종 분류
W4 = tf.Variable(tf.random_normal([512, num_classes]), name='weight4')
b4 = tf.Variable(tf.random_normal([num_classes]), name='bias4')
hypothesis = tf.nn.softmax(tf.matmul(layer3, W4) + b4)

# cost function - multi_class cross entropy
cost = -tf.reduce_mean(tf.reduce_mean(y * tf.log(hypothesis), axis=1))
# SGD
train = tf.train.AdamOptimizer(learning_rate=0.01).minimize(cost)

correct_pred = tf.equal(tf.argmax(hypothesis, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, dtype=tf.float32))

num_epoch = 10
batch_size = 200
num_iter = int(X_train.shape[0] / batch_size)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(num_epoch):
        # epoch 한 번 수행한 후 평균 cost 계산하기 위해 정의
        avg_cost = 0
        avg_acc = 0
        # iteration 
        for i in range(num_iter):
            batch = next_batch(batch_size, X_train, y_train_ohe.eval())
            
            _, cost_val, acc_val = sess.run([train, cost, accuracy], feed_dict={X: batch[0],
                                                            y: batch[1]})
            # Epoch 한번 수행할 동안 평균 cost값 계속 계산
            avg_cost += cost_val / num_iter
            avg_acc  += acc_val / num_iter
        print(f"## Epoch:{epoch+1}번 후, 평균 Cost:{avg_cost:.4f}, 평균 Accuracy:{avg_acc:.4f}")
        print()

In [43]:
int(X_train.shape[0] / 200)

1114