#### from blog : https://blog.naver.com/data_station/222493270368

- 의료 보조 서비스
- 수식화(모델링) 단계
- 어떤 대상이 이 서비스(모델)를 사용할 것인가?
- **어떤 설명변수를 선택하는가**
- 의사를 대상으로 할 때
- 환자를 대상으로 할 때


### EDA

In [None]:
import pandas as pd
df1 = pd.read_csv('../../../datasets/recurrenceOfSurgery.csv')
print(df1.shape)
df1.head(2)


(1894, 52)


Unnamed: 0.1,Unnamed: 0,환자ID,Large Lymphocyte,Location of herniation,ODI,가족력,간질성폐질환,고혈압여부,과거수술횟수,당뇨여부,...,Modic change,PI,PT,Seg Angle(raw),Vaccum disc,골밀도,디스크단면적,디스크위치,척추이동척도,척추전방위증
0,0,1PT,22.8,3,51.0,0.0,0,0,0,0,...,3,51.6,36.6,14.4,0,-1.01,2048.5,4,Down,0
1,1,2PT,44.9,4,26.0,0.0,0,0,0,0,...,0,40.8,7.2,17.8,0,-1.14,1753.1,4,Up,0


In [None]:
df1.columns


Index(['Unnamed: 0', '환자ID', 'Large Lymphocyte', 'Location of herniation',
       'ODI', '가족력', '간질성폐질환', '고혈압여부', '과거수술횟수', '당뇨여부', '말초동맥질환여부', '빈혈여부',
       '성별', '스테로이드치료', '신부전여부', '신장', '심혈관질환', '암발병여부', '연령', '우울증여부', '입원기간',
       '입원일자', '종양진행여부', '직업', '체중', '퇴원일자', '헤모글로빈수치', '혈전합병증여부', '환자통증정도',
       '흡연여부', '통증기간(월)', '수술기법', '수술시간', '수술실패여부', '수술일자', '재발여부', '혈액형',
       '전방디스크높이(mm)', '후방디스크높이(mm)', '지방축적도', 'Instability', 'MF + ES',
       'Modic change', 'PI', 'PT', 'Seg Angle(raw)', 'Vaccum disc', '골밀도',
       '디스크단면적', '디스크위치', '척추이동척도', '척추전방위증'],
      dtype='object')

In [None]:
# 변수 선택(서비스의 대상을 기준으로) / 결측치 제거
df2 = df1[['성별','신장','체중', '흡연여부', '연령', '혈액형', '직업', '재발여부' ]]
df3 = df2.dropna()
print(df3.shape)


(1479, 8)


- 설명변수 및 목표변수 설정


In [None]:
# 더미변수처리(문자 데이터 -> One Hot Encoding)
X = df3.drop(columns='재발여부')
X1 = pd.get_dummies(X)
X1


Unnamed: 0,성별,신장,체중,흡연여부,연령,혈액형_RH+A,혈액형_RH+AB,혈액형_RH+B,혈액형_RH+O,직업_건설업,...,직업_사무직,직업_사업가,직업_예술가,직업_운동선수,직업_운수업,직업_의료직,직업_자영업,직업_주부,직업_특수전문직,직업_학생
0,2,163,60.3,0,66,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1,171,71.7,0,47,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,1,178,77.1,0,39,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1,174,74.2,0,40,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,1,183,80.7,0,42,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1889,2,157,64.0,0,59,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1890,2,157,59.0,0,42,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
1891,1,167,70.0,0,61,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1892,1,177,77.0,0,29,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [None]:
Y = df3['재발여부']


### machine learning

In [None]:
# 기계학습 알고리즘 호출
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.tree import plot_tree


In [None]:
pipe_list = [('scaler',MinMaxScaler()),('model',DecisionTreeClassifier())]
pipe_model = Pipeline(pipe_list)
pipe_model

In [None]:
from sklearn.metrics import make_scorer, f1_score
scoring = make_scorer(f1_score)
hyper_list = {'model__max_depth':range(2, 10),
              'model__min_samples_leaf': range(2, 10),
              'model__criterion': ['gini','entropy'],
              'model__class_weight': [None, 'balanced'],
              'model__min_samples_split': range(2, 10)}
# hyper_list = {'model__max_depth':range(2, 3),}

grid_model = GridSearchCV(pipe_model, param_grid=hyper_list,
                          scoring=scoring,
                          n_jobs = -1,
                          cv = 5)

In [None]:
from collections import Counter
Counter(Y)

Counter({0: 1302, 1: 177})

## resampling과 hypertuning 필요

### UnderSampling

In [None]:
from imblearn.under_sampling import NearMiss, EditedNearestNeighbours
from datetime import datetime
import numpy as np
sampling_strategy_list=np.arange(0.3, 1.0, 0.05)

for sampling_strategy in sampling_strategy_list :
  print(30*'-','{}'.format(sampling_strategy))
  resampler = NearMiss(sampling_strategy=sampling_strategy)
  features_under_resample, target_under_resample = resampler.fit_resample(X1, Y)
  print(Counter(target_under_resample))
  features_under_resample_train, features_under_resample_test, target_under_resample_train, target_under_resample_test = train_test_split(features_under_resample, target_under_resample, test_size = 0.3, random_state= 1234)
  print(Counter(target_under_resample_train), Counter(target_under_resample_test))
  # re learning with oversampling datasets

  start = datetime.now()  # start date and time

  grid_model.fit(features_under_resample_train, target_under_resample_train)

  end = datetime.now()  # end date and time - set to current date and time

  duration = end - start  # duration as a timedelta object

  print("Duration: {0}".format(duration))

  best_model = grid_model.best_estimator_
  target_under_resample_train_pred = best_model.predict(features_under_resample_train)
  target_under_resample_test_pred = best_model.predict(features_under_resample_test)

  print(classification_report(target_under_resample_train, target_under_resample_train_pred))
  print(classification_report(target_under_resample_test, target_under_resample_test_pred))
  print(30*'-')


------------------------------ 0.3
Counter({0: 590, 1: 177})
Counter({0: 408, 1: 128}) Counter({0: 182, 1: 49})
Duration: 0:01:37.471800
              precision    recall  f1-score   support

           0       0.85      0.86      0.85       408
           1       0.53      0.52      0.52       128

    accuracy                           0.78       536
   macro avg       0.69      0.69      0.69       536
weighted avg       0.77      0.78      0.77       536

              precision    recall  f1-score   support

           0       0.85      0.84      0.84       182
           1       0.42      0.45      0.44        49

    accuracy                           0.75       231
   macro avg       0.64      0.64      0.64       231
weighted avg       0.76      0.75      0.76       231

------------------------------
------------------------------ 0.35
Counter({0: 505, 1: 177})
Counter({0: 355, 1: 122}) Counter({0: 150, 1: 55})
Duration: 0:01:40.975634
              precision    recall  f1-sc