# Google drive mount
#### * google colab 사용 시, google drive 내 데이터를 사용하기 위한 절차

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 작업 경로 설정
#### * csv가 존재하는 작업 경로 설정

In [2]:
import os
os.chdir('/content/drive/MyDrive/숨고/양희성님(요통발생예측)')

# 현재 분석 코딩을 위한 필요 library import


In [None]:
# 아래 코드 실행 후, 런타임 다시시작 필요
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

In [None]:
!pip install mljar-supervised

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from supervised.automl import AutoML
import matplotlib.pyplot as plt
import seaborn as sns

plt.rc('font', family='NanumBarunGothic')

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


# Data load
#### * pandas를 이용해 csv파일 load
- 데이터를 보면 범주형과 수치형 변수들이 섞여 있음을 확인 가능
- 결측치 다수 존재

In [6]:
data = pd.read_csv("./data.csv")

In [7]:
data

Unnamed: 0,요통,체중조절,음주경험,수면시간,흡연여부,근력운동,유연성운동,아침,점심,저녁,외식,식이요법,앉는시간
0,1,4,2,5,2,1,1,1,1,1,7,2,18
1,1,3,2,6,2,1,1,1,1,1,6,2,18
2,1,1,2,6,3,5,6,4,1,1,5,2,18
3,2,4,2,6,2,1,3,1,1,1,3,2,18
4,1,1,2,3,3,1,6,1,1,1,7,1,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1860,2,1,2,7,3,1,4,1,1,1,5,2,1
1861,2,3,1,6,3,1,6,,,,,,1
1862,1,2,2,6,2,1,1,1,3,1,7,2,1
1863,1,1,2,8,2,5,6,1,1,1,5,1,1


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1865 entries, 0 to 1864
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   요통      1865 non-null   int64 
 1   체중조절    1865 non-null   int64 
 2   음주경험    1865 non-null   int64 
 3   수면시간    1865 non-null   int64 
 4   흡연여부    1865 non-null   int64 
 5   근력운동    1865 non-null   int64 
 6   유연성운동   1865 non-null   int64 
 7   아침      1865 non-null   object
 8   점심      1865 non-null   object
 9   저녁      1865 non-null   object
 10  외식      1865 non-null   object
 11  식이요법    1865 non-null   object
 12  앉는시간    1865 non-null   int64 
dtypes: int64(8), object(5)
memory usage: 189.5+ KB


# 각 변수별 처리 진행
- 순서형에 맞게 크기순으로 값 재배열(ex, 4:노력안함 -> 0:노력안함)
- 결측값 처리

In [9]:
#요통 변수 처리
data[['요통']] = data[['요통']].replace({2:0})

In [10]:
#체중조절 변수 처리
data[['체중조절']] = data[['체중조절']].replace({4:0})

In [11]:
#음주경험 변수 처리
data[['음주경험']] = data[['음주경험']].replace({2:0})

In [12]:
#흡연여부 변수 처리
data[['흡연여부']] = data[['흡연여부']].replace({3:0})

In [13]:
#근력운동 변수 처리
data[['근력운동']] = data[['근력운동']].replace({1:0,2:1,3:2,4:3,5:4,6:5})

In [14]:
#유연성운동 변수 처리
data[['유연성운동']] = data[['유연성운동']].replace({1:0,2:1,3:2,4:3,5:4,6:5})

In [15]:
#아침, 점심, 저녁 변수 처리
# - 요통 범주별 최빈값으로 대체
def replace_value(col):
    data[[col]] = data[[col]].replace({" ":np.nan})
    for i in range(len(data)):
        if pd.isna(data.loc[i, col]):
            mode_value = data.loc[data['요통']==data.loc[i, '요통'], col].mode()[0]
            data.loc[i, col] = mode_value
    data[[col]] = data[[col]].astype(int)
    data[[col]] = data[[col]].replace({1:3,2:2,3:1,4:0})
    return data[[col]]

data[['아침']] = replace_value('아침')
data[['점심']] = replace_value('점심')
data[['저녁']] = replace_value('저녁')

In [16]:
#외식 변수 처리
data[["외식"]] = data[["외식"]].replace({" ":np.nan})
for i in range(len(data)):
    if pd.isna(data.loc[i, "외식"]):
        mode_value = data.loc[data['요통']==data.loc[i, '요통'], "외식"].mode()[0]
        data.loc[i, "외식"] = mode_value
data[["외식"]] = data[["외식"]].astype(int)
data[["외식"]] = data[["외식"]].replace({1:6,2:5,3:4,4:3,5:2,6:1,7:0})

In [17]:
#식이요법 변수 처리
data[["식이요법"]] = data[["식이요법"]].replace({" ":np.nan})
for i in range(len(data)):
    if pd.isna(data.loc[i, "식이요법"]):
        mode_value = data.loc[data['요통']==data.loc[i, '요통'], "식이요법"].mode()[0]
        data.loc[i, "식이요법"] = mode_value
data[["식이요법"]] = data[["식이요법"]].astype(int)
data[["식이요법"]] = data[["식이요법"]].replace({1:0,2:1})

In [18]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1865 entries, 0 to 1864
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   요통      1865 non-null   int64
 1   체중조절    1865 non-null   int64
 2   음주경험    1865 non-null   int64
 3   수면시간    1865 non-null   int64
 4   흡연여부    1865 non-null   int64
 5   근력운동    1865 non-null   int64
 6   유연성운동   1865 non-null   int64
 7   아침      1865 non-null   int64
 8   점심      1865 non-null   int64
 9   저녁      1865 non-null   int64
 10  외식      1865 non-null   int64
 11  식이요법    1865 non-null   int64
 12  앉는시간    1865 non-null   int64
dtypes: int64(13)
memory usage: 189.5 KB


# 학습을 위한 데이터 처리
 - 데이터 불균형으로, 요통발생 기준 6:4 비율로 샘플링
 - train:test = 7:3 비율로 분리
 - 0 또는 1의 값만 가지는 변수는 놔두고 나머지 순서형 연속형 변수의 경우 minmax normalization 진행

In [19]:
# sampling
data_0 = data.loc[data['요통']==0].sample(n=round((6*len(data.loc[data['요통']==1]))/4)).reset_index(drop=True)
data_1 = data.loc[data['요통']==1].reset_index(drop=True)
data_sampled = pd.concat([data_0, data_1], axis=0).reset_index(drop=True)
data_sampled

Unnamed: 0,요통,체중조절,음주경험,수면시간,흡연여부,근력운동,유연성운동,아침,점심,저녁,외식,식이요법,앉는시간
0,0,2,0,8,0,1,3,3,3,3,1,1,4
1,0,1,0,8,0,0,5,3,3,3,1,1,10
2,0,2,0,8,0,0,4,3,3,3,2,1,7
3,0,1,1,5,0,0,5,0,3,3,1,1,4
4,0,0,1,8,0,0,0,3,3,3,1,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
778,1,0,1,6,0,0,0,3,3,3,0,1,1
779,1,0,0,7,2,0,0,3,3,3,2,1,1
780,1,3,0,8,2,0,1,3,3,3,1,1,1
781,1,2,0,6,2,0,0,3,1,3,0,1,1


In [20]:
# split
x_train, x_test, y_train, y_test = train_test_split(data_sampled.drop(['요통'],axis=1), data_sampled[['요통']], test_size=0.3, stratify=data_sampled['요통'])

In [21]:
y_train = y_train.values.reshape(-1)
y_test = y_test.values.reshape(-1)

# AutoML을 통한 자동 최적화 진행

In [31]:
automl = AutoML(
    mode = "Explain",
    eval_metric = "auc",
    algorithms=["Linear","Nearest Neighbors", "Xgboost", "Random Forest", "CatBoost", "LightGBM", "Neural Network"],
    train_ensemble=False,
    validation_strategy={"validation_type": "custom"}
)
cv = [(x_train.index.values, x_test.index.values)]
automl.fit(data_sampled.drop(['요통'],axis=1), data_sampled[['요통']], cv=cv)

AutoML directory: AutoML_1
The task is binary_classification with evaluation metric auc
AutoML will use algorithms: ['Linear', 'Nearest Neighbors', 'Xgboost', 'Random Forest', 'CatBoost', 'LightGBM', 'Neural Network']
AutoML steps: ['simple_algorithms', 'default_algorithms']
* Step simple_algorithms will try to check up to 1 model
Custom validation strategy
Split 0.
Train 548 samples.
Validation 235 samples.




1_Linear auc 0.503697 trained in 32.49 seconds




* Step default_algorithms will try to check up to 6 models




2_Default_LightGBM auc 0.562962 trained in 16.58 seconds




3_Default_Xgboost auc 0.540592 trained in 12.13 seconds




4_Default_CatBoost auc 0.513845 trained in 8.11 seconds




5_Default_NeuralNetwork auc 0.481817 trained in 7.12 seconds




6_Default_RandomForest auc 0.520975 trained in 11.82 seconds




7_Default_NearestNeighbors auc 0.557228 trained in 9.93 seconds




AutoML fit time: 121.62 seconds
AutoML best model: 2_Default_LightGBM


In [29]:
automl.report()