<a href="https://colab.research.google.com/github/sweetmjkim/study_AIs/blob/main/docs/quests/MLs/Q_SpineSurgeryList_GridSearchCV_resampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Quests
- copy SpineSurgeryList_GridSearchCV.ipynb
- refer : iris_samplings.ipynb
- resampling 종류 따른 F1 score 변화 관찰

## 데이터

In [1]:
import pandas as pd
df_SSL = pd.read_csv('/content/SpineSurgeryList.csv')
df_SSL[:2]

Unnamed: 0.1,Unnamed: 0,환자ID,Large Lymphocyte,Location of herniation,ODI,가족력,간질성폐질환,고혈압여부,과거수술횟수,당뇨여부,...,Modic change,PI,PT,Seg Angle(raw),Vaccum disc,골밀도,디스크단면적,디스크위치,척추이동척도,척추전방위증
0,0,1PT,22.8,3,51.0,0.0,0,0,0,0,...,3,51.6,36.6,14.4,0,-1.01,2048.5,4,Down,0
1,1,2PT,44.9,4,26.0,0.0,0,0,0,0,...,0,40.8,7.2,17.8,0,-1.14,1753.1,4,Up,0


In [2]:
df_SSL_extract = df_SSL[['성별','신장','체중', '흡연여부', '연령', '혈액형', '직업', '재발여부' ]]
# df_SSL_extract = df_SSL[['성별','신장','체중', '흡연여부', '연령', '재발여부' ]]
df_SSL_extract[:2]

Unnamed: 0,성별,신장,체중,흡연여부,연령,혈액형,직업,재발여부
0,2,163,60.3,0,66,RH+A,자영업,0
1,1,171,71.7,0,47,RH+A,운동선수,0


In [3]:
df_SSL_extract.isnull().sum()

성별        0
신장        0
체중        0
흡연여부      0
연령        0
혈액형       0
직업      415
재발여부      0
dtype: int64

### 데이터 전처리

In [4]:
df_SSL_extract.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_SSL_extract.dropna(inplace=True)


In [5]:
target = df_SSL_extract['재발여부']
Xs = df_SSL_extract.drop(columns=['재발여부'])
features = pd.get_dummies(Xs)

In [6]:
features[:2]

Unnamed: 0,성별,신장,체중,흡연여부,연령,혈액형_RH+A,혈액형_RH+AB,혈액형_RH+B,혈액형_RH+O,직업_건설업,...,직업_사무직,직업_사업가,직업_예술가,직업_운동선수,직업_운수업,직업_의료직,직업_자영업,직업_주부,직업_특수전문직,직업_학생
0,2,163,60.3,0,66,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1,171,71.7,0,47,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


## resampling

In [7]:
from collections import Counter
Counter(target)

Counter({0: 1302, 1: 177})

### Oversampling (많은 갯수 기준으로 맞춤)

In [8]:
from imblearn.over_sampling import SMOTE
overSampling = SMOTE(sampling_strategy=0.8)
feature_over_sample, target_over_sample = overSampling.fit_resample(features,target)
feature_over_sample.shape, target_over_sample.shape

((2343, 26), (2343,))

In [9]:
Counter(target_over_sample)

Counter({0: 1302, 1: 1041})

### UnderSampling(적은 갯수 기준으로 맞춤)

In [10]:
from imblearn.under_sampling import NearMiss
underSampling = NearMiss(sampling_strategy=0.8)
features_under_sample, target_under_sample = underSampling.fit_resample(features,target)
features_under_sample.shape, target_under_sample.shape

((398, 26), (398,))

In [11]:
Counter(target_under_sample)

Counter({0: 221, 1: 177})

### over와 under 결합 sampling

In [12]:
from imblearn.combine import SMOTEENN
combineSampling = SMOTEENN(sampling_strategy=0.5)
features_combine_sample, target_combine_sample = combineSampling.fit_resample(features,target)
features_combine_sample.shape, target_combine_sample.shape

((1157, 26), (1157,))

In [13]:
Counter(target_combine_sample)

Counter({0: 785, 1: 372})

## 특성공학 - Cross Validation

In [14]:
 # 모델:hyper parameters 교차 적용, 데이터셋 : 알맞게 섞어서 대입
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

hyper_parameters = {'max_depth':range(2, 10),
              'min_samples_leaf': range(2, 10),
              'criterion': ['gini','entropy'],
              'class_weight': [None, 'balanced'],
              'min_samples_split': range(2, 10)}

estimator_model = DecisionTreeClassifier()
# grid_model = GridSearchCV(estimator_model, hyper_parameters)

# score 방식 지정
from sklearn.metrics import f1_score, make_scorer
scoring = make_scorer(f1_score)
grid_model = GridSearchCV(estimator_model, hyper_parameters
                          , scoring=scoring
                          , cv=5)   # 데이터셋 폴드

grid_model

## 3가지 모델 확인

### Over_sampling 모델

In [15]:
grid_model.fit(feature_over_sample, target_over_sample)

In [16]:
over_best_model = grid_model.best_estimator_

### under_sampling 모델

In [17]:
grid_model.fit(features_under_sample, target_under_sample)

In [18]:
under_best_model = grid_model.best_estimator_

### over와 under 결합 sampling 모델

In [19]:
grid_model.fit(features_combine_sample, target_combine_sample)

In [20]:
combine_best_model = grid_model.best_estimator_

In [21]:
grid_model.scorer_

make_scorer(f1_score)

In [22]:
best_model = grid_model.best_estimator_

## 평가

In [23]:
from sklearn.metrics import classification_report
target_predict = best_model.predict(features)
print(classification_report(target, target_predict))

              precision    recall  f1-score   support

           0       0.93      0.89      0.91      1302
           1       0.39      0.51      0.44       177

    accuracy                           0.85      1479
   macro avg       0.66      0.70      0.68      1479
weighted avg       0.87      0.85      0.85      1479



In [24]:
target.value_counts()

0    1302
1     177
Name: 재발여부, dtype: int64

## samplings

In [25]:
best_model.fit(features, target)