# 로지스틱회귀분석 3: 실전

## 모듈 불러오기

In [2]:
from IPython.display import display, HTML
import numpy as np
import pandas as pd

import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

import seaborn as sns
import matplotlib.pyplot as plt

## 데이터 불러오기

### Parkinson 질병 예측

### 타겟변수(Y): status

### 0: 정상 / 1: 환자

In [4]:
# Read the Parkinson dataset from disk and preview its first rows.
csv_path = 'Data/Parkinson.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,name,status,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,...,MDVP:APQ,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,1,-0.8293,-0.436165,-0.952037,0.334914,0.749759,0.132963,0.7608,0.131755,...,0.332985,0.607532,-0.067893,-0.193225,-0.807838,1.760814,0.801323,0.480477,-0.210531,0.868886
1,phon_R01_S01_2,1,-0.770972,-0.530974,-0.057721,0.715418,1.037674,0.453892,1.276809,0.452684,...,1.159454,1.548254,-0.137843,-0.634508,-0.387524,1.837562,1.479853,1.311185,0.275077,1.803605
2,phon_R01_S01_3,1,-0.909476,-0.723168,-0.109875,0.884991,1.325589,0.72077,1.585687,0.721813,...,0.699187,1.175323,-0.291633,-0.27976,-0.662075,1.942048,1.141445,1.017682,-0.103629,1.402661
3,phon_R01_S01_4,1,-0.909622,-0.649092,-0.114229,0.775389,1.325589,0.578885,1.284076,0.577677,...,0.806859,1.340229,-0.280719,-0.281346,-0.613134,1.83238,1.440945,1.29384,0.062145,1.806954
4,phon_R01_S01_5,1,-0.925657,-0.606245,-0.130608,1.368893,1.901418,1.09575,2.047187,1.096793,...,1.216839,1.899461,-0.178026,-0.506745,-0.783021,1.909364,1.78094,0.096195,-0.130026,2.267082


## 데이터 전처리

### 필요하지 않은 변수 제거

In [5]:
# 'name' is a per-recording identifier string, not a feature, so it is removed.
data = data.drop(columns=['name'])

### 학습/테스트 데이터 분리

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=55,
)

### 학습 데이터와 테스트 데이터에서 클래스 비율 확인

In [7]:
# Count of each target class (0: healthy, 1: patient) in the training split.
train_data.loc[:, 'status'].value_counts()

1    118
0     38
Name: status, dtype: int64

In [8]:
# Count of each target class (0: healthy, 1: patient) in the test split.
test_data.loc[:, 'status'].value_counts()

1    29
0    10
Name: status, dtype: int64

## 모델링

In [9]:
# Fit a logistic regression of `status` on every remaining column of the
# training split, then show the fitted-model summary table.
# NOTE(review): no constant column is added before fitting, so the model
# appears to be fit without an intercept -- confirm this is intended.
train_target = train_data['status']
train_features = train_data.drop(columns=['status'])
logistic = sm.Logit(train_target, train_features)
logistic_trained = logistic.fit()
logistic_trained.summary()

Optimization terminated successfully.
         Current function value: 0.446425
         Iterations 10


0,1,2,3
Dep. Variable:,status,No. Observations:,156.0
Model:,Logit,Df Residuals:,134.0
Method:,MLE,Df Model:,21.0
Date:,"Tue, 23 Apr 2019",Pseudo R-squ.:,0.1959
Time:,14:53:02,Log-Likelihood:,-69.642
converged:,True,LL-Null:,-86.608
,,LLR p-value:,0.03685

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
MDVP:Fo(Hz),-1.1359,0.634,-1.792,0.073,-2.378,0.107
MDVP:Fhi(Hz),0.0176,0.244,0.072,0.942,-0.460,0.495
MDVP:Flo(Hz),-0.3346,0.353,-0.948,0.343,-1.026,0.357
MDVP:Jitter(%),-6.1660,3.925,-1.571,0.116,-13.859,1.527
MDVP:Jitter(Abs),-0.7244,1.498,-0.483,0.629,-3.661,2.213
MDVP:RAP,-143.0350,254.641,-0.562,0.574,-642.121,356.051
MDVP:PPQ,-1.6761,2.416,-0.694,0.488,-6.412,3.060
Jitter:DDP,150.2003,254.632,0.590,0.555,-348.869,649.269
MDVP:Shimmer,5.0427,6.878,0.733,0.463,-8.438,18.523


## 모델 결과물 확인

In [10]:
# Predicted probabilities from the fitted model for the training and test rows
# (the target column is dropped so only the features are passed in).
train_prob = logistic_trained.predict(train_data.drop('status', axis=1))
test_prob = logistic_trained.predict(test_data.drop('status', axis=1))

# Assign class 1 when the predicted probability exceeds 0.5, else class 0.
# The builtin `float` is used because the `np.float` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
train_pred = (train_prob.values > 0.5).astype(float)
test_pred = (test_prob.values > 0.5).astype(float)

In [11]:
# Accuracy of the current hard predictions against the true `status` labels.
train_acc = accuracy_score(train_data['status'], train_pred)
test_acc = accuracy_score(test_data['status'], test_pred)
print('Train Accuracy: {:.4f}'.format(train_acc))
print('Test Accuracy: {:.4f}'.format(test_acc))

Train Accuracy: 0.7756
Test Accuracy: 0.7436


## 확률 값을 기준(Threshold)으로 최종 클래스 결정하기

In [12]:
# Recompute the predicted probabilities for the training and test rows.
train_prob = logistic_trained.predict(train_data.drop('status', axis=1))
test_prob = logistic_trained.predict(test_data.drop('status', axis=1))

# Use a lower cut-off of 0.2: a row is labelled class 1 when its predicted
# probability exceeds 0.2.
# The builtin `float` is used because the `np.float` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
train_pred = (train_prob.values > 0.2).astype(float)
test_pred = (test_prob.values > 0.2).astype(float)

### 학습 데이터/테스트 데이터에 대한 정확도 확인

In [13]:
# Accuracy of the current hard predictions against the true `status` labels.
train_acc = accuracy_score(train_data['status'], train_pred)
test_acc = accuracy_score(test_data['status'], test_pred)
print('Train Accuracy: {:.4f}'.format(train_acc))
print('Test Accuracy: {:.4f}'.format(test_acc))

Train Accuracy: 0.9167
Test Accuracy: 0.8462


## 다양한 확률 값 Threshold에 대해 정확도와 Confusion Matrix 확인하기

In [14]:
# Sweep cut-offs from 0.1 to 0.9 in steps of 0.1 and print the test accuracy
# obtained at each one.
for threshold in np.arange(0.1, 1.0, 0.1):
    # Builtin `float` replaces the `np.float` alias (deprecated in NumPy 1.20,
    # removed in 1.24, where it raises AttributeError).
    # train_pred is refreshed alongside test_pred, but only test_pred feeds
    # the accuracy printed below.
    train_pred = (train_prob.values > threshold).astype(float)
    test_pred = (test_prob.values > threshold).astype(float)
    print('Threshold: {:.2f} - Test Accuracy: {:.4f}'.format(threshold, accuracy_score(test_data['status'], test_pred)))

Threshold: 0.10 - Test Accuracy: 0.8718
Threshold: 0.20 - Test Accuracy: 0.8462
Threshold: 0.30 - Test Accuracy: 0.7692
Threshold: 0.40 - Test Accuracy: 0.7692
Threshold: 0.50 - Test Accuracy: 0.7436
Threshold: 0.60 - Test Accuracy: 0.6154
Threshold: 0.70 - Test Accuracy: 0.5385
Threshold: 0.80 - Test Accuracy: 0.4359
Threshold: 0.90 - Test Accuracy: 0.3846


In [15]:
# Sweep cut-offs from 0.1 to 0.9 in steps of 0.1 and print the test-set
# confusion matrix (true labels first, predictions second) at each one.
for threshold in np.arange(0.1, 1.0, 0.1):
    # Builtin `float` replaces the `np.float` alias (deprecated in NumPy 1.20,
    # removed in 1.24, where it raises AttributeError).
    test_pred = (test_prob.values > threshold).astype(float)
    print('Threshold: {:.2f}'.format(threshold))
    print(confusion_matrix(test_data['status'], test_pred))

Threshold: 0.10
[[ 5  5]
 [ 0 29]]
Threshold: 0.20
[[ 5  5]
 [ 1 28]]
Threshold: 0.30
[[ 6  4]
 [ 5 24]]
Threshold: 0.40
[[ 7  3]
 [ 6 23]]
Threshold: 0.50
[[ 8  2]
 [ 8 21]]
Threshold: 0.60
[[ 8  2]
 [13 16]]
Threshold: 0.70
[[ 8  2]
 [16 13]]
Threshold: 0.80
[[ 9  1]
 [21  8]]
Threshold: 0.90
[[10  0]
 [24  5]]


## 최적의 Threshold 결정하기

### 정확도를 기준으로

In [16]:
from pycm import ConfusionMatrix

# For each cut-off from 0.1 to 0.9, build a pycm ConfusionMatrix on the test
# set and print the accuracy entry for class label 1, rounded to 4 decimals.
for threshold in np.arange(0.1, 1.0, 0.1):
    # Builtin `float` / `int` replace the `np.float` / `np.int` aliases, which
    # were deprecated in NumPy 1.20 and removed in 1.24 (AttributeError).
    test_pred = (test_prob.values > threshold).astype(float)
    # Predictions are cast to int before being handed to pycm together with
    # the true `status` values.
    cm = ConfusionMatrix(test_data['status'].values, test_pred.astype(int))
    print(np.round(cm.ACC[1], 4))

0.8718
0.8462
0.7692
0.7692
0.7436
0.6154
0.5385
0.4359
0.3846


### 환자를 환자로 판단하는 비율을 기준으로

In [17]:
# For each cut-off from 0.1 to 0.9, print the true positive rate for class
# label 1 (share of actual patients predicted as patients) on the test set.
for threshold in np.arange(0.1, 1.0, 0.1):
    # Builtin `float` / `int` replace the `np.float` / `np.int` aliases, which
    # were deprecated in NumPy 1.20 and removed in 1.24 (AttributeError).
    test_pred = (test_prob.values > threshold).astype(float)
    cm = ConfusionMatrix(test_data['status'].values, test_pred.astype(int))
    print(np.round(cm.TPR[1], 4))

1.0
0.9655
0.8276
0.7931
0.7241
0.5517
0.4483
0.2759
0.1724


### 정상을 정상으로 판단하는 비율을 기준으로

In [44]:
# For each cut-off from 0.1 to 0.9, print the true negative rate with class
# label 1 as the positive class (share of actual healthy subjects predicted
# as healthy) on the test set.
for threshold in np.arange(0.1, 1.0, 0.1):
    # Builtin `float` / `int` replace the `np.float` / `np.int` aliases, which
    # were deprecated in NumPy 1.20 and removed in 1.24 (AttributeError).
    test_pred = (test_prob.values > threshold).astype(float)
    cm = ConfusionMatrix(test_data['status'].values, test_pred.astype(int))
    print(np.round(cm.TNR[1], 4))

0.5
0.5
0.6
0.7
0.8
0.8
0.8
0.9
1.0


### F1 Score를 기준으로

In [45]:
# For each cut-off from 0.1 to 0.9, print the F1 score for class label 1 on
# the test set, rounded to 4 decimals.
for threshold in np.arange(0.1, 1.0, 0.1):
    # Builtin `float` / `int` replace the `np.float` / `np.int` aliases, which
    # were deprecated in NumPy 1.20 and removed in 1.24 (AttributeError).
    test_pred = (test_prob.values > threshold).astype(float)
    cm = ConfusionMatrix(test_data['status'].values, test_pred.astype(int))
    print(np.round(cm.F1[1], 4))

0.9206
0.9032
0.8421
0.8364
0.8077
0.6809
0.5909
0.4211
0.2941


### MCC를 기준으로

In [47]:
# For each cut-off from 0.1 to 0.9, print the Matthews correlation coefficient
# entry for class label 1 on the test set, rounded to 4 decimals.
for threshold in np.arange(0.1, 1.0, 0.1):
    # Builtin `float` / `int` replace the `np.float` / `np.int` aliases, which
    # were deprecated in NumPy 1.20 and removed in 1.24 (AttributeError).
    test_pred = (test_prob.values > threshold).astype(float)
    cm = ConfusionMatrix(test_data['status'].values, test_pred.astype(int))
    print(np.round(cm.MCC[1], 4))

0.653
0.5634
0.4149
0.4568
0.4653
0.3081
0.2228
0.1823
0.2252


## 모델에서 유의하지 않은 변수 제거하기

## 모델에서 유의하지 않은 변수 제거한 이후, 최적의 Threshold 결정하기