In [None]:
# Libraries for reading and handling the data
import numpy as np
import pandas as pd
import re

# Libraries for data visualization
import matplotlib.pyplot as plt
import seaborn 

# Libraries for data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# Libraries for creating ML model
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Library for Analysing the ML model
from sklearn import metrics

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
 
%config InlineBackend.figure_format = 'retina'
 
!apt -qq -y install fonts-nanum
 
import matplotlib.font_manager as fm

fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
plt.rc('font', family='NanumBarunGothic') 
mpl.font_manager._rebuild()


fonts-nanum is already the newest version (20170925-1).
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.


1.데이터 불러오기 (다중분류) 

In [None]:
# Load the raw dataset from CSV and print a summary of its columns.
df = pd.read_csv('/content/호읍기완성본.csv') # load the file
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38537 entries, 0 to 38536
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Symptoms   37841 non-null  object 
 1   Age        38195 non-null  float64
 2   Sex        37615 non-null  object 
 3   Disease    38197 non-null  object 
 4   Treatment  35696 non-null  object 
 5   Nature     36347 non-null  object 
dtypes: float64(1), object(5)
memory usage: 1.8+ MB


In [None]:
# Discard the 'Nature' column in place, then print the remaining columns.
df.drop(columns=['Nature'], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38537 entries, 0 to 38536
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Symptoms   37841 non-null  object 
 1   Age        38195 non-null  float64
 2   Sex        37615 non-null  object 
 3   Disease    38197 non-null  object 
 4   Treatment  35696 non-null  object 
dtypes: float64(1), object(4)
memory usage: 1.5+ MB


2. 전처리과정
2-1 각 컬럼에 데이터 값 널 제거 

In [None]:
# Remove every row that has a missing value in any of the listed columns.
# The data is clean when df.info() reports the same non-null count for every
# column as the number of index entries.
required_columns = ['Symptoms', 'Age', 'Sex', 'Disease', 'Treatment']
df.dropna(subset=required_columns, axis='index', inplace=True)
df.head()

Unnamed: 0,Symptoms,Age,Sex,Disease,Treatment
0,coughing,5.0,female,Asthma,Omalizumab
1,tight feeling in the chest,4.0,female,Asthma,Mepolizumab
2,wheezing,6.0,male,Asthma,Mepolizumab
3,shortness of breath,7.0,male,Asthma,Mepolizumab
4,shortness of breath,9.0,male,Asthma,Mepolizumab


In [None]:
df.info() # check column names, non-null counts and dtypes

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34409 entries, 0 to 38536
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Symptoms   34409 non-null  object 
 1   Age        34409 non-null  float64
 2   Sex        34409 non-null  object 
 3   Disease    34409 non-null  object 
 4   Treatment  34409 non-null  object 
dtypes: float64(1), object(4)
memory usage: 1.6+ MB


2-2 'not to say' 값 삭제 

In [None]:
# Some rows carry the answer 'not to say' for Sex; turn that answer into NaN so
# a later dropna on 'Sex' can remove those rows.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the df['Sex'] selection: under pandas
# Copy-on-Write the in-place form acts on a temporary object and leaves df
# unchanged.
df['Sex'] = df['Sex'].replace('not to say', np.nan)
df.head()

Unnamed: 0,Symptoms,Age,Sex,Disease,Treatment
0,coughing,5.0,female,Asthma,Omalizumab
1,tight feeling in the chest,4.0,female,Asthma,Mepolizumab
2,wheezing,6.0,male,Asthma,Mepolizumab
3,shortness of breath,7.0,male,Asthma,Mepolizumab
4,shortness of breath,9.0,male,Asthma,Mepolizumab


In [None]:
# Dropping rows whose Sex is NaN removes the former 'not to say' answers.
df.dropna(subset=['Sex'], axis='index', inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33749 entries, 0 to 38536
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Symptoms   33749 non-null  object 
 1   Age        33749 non-null  float64
 2   Sex        33749 non-null  object 
 3   Disease    33749 non-null  object 
 4   Treatment  33749 non-null  object 
dtypes: float64(1), object(4)
memory usage: 1.5+ MB


3. 성별을 원 핫 코드 형식으로 바꿔줌 

In [None]:
# One-hot encode Sex: one indicator column per distinct value, named 'Sex_<value>'.
onehot_sex = pd.get_dummies(df['Sex'], prefix='Sex')
onehot_sex

Unnamed: 0,Sex_female,Sex_male
0,1,0
1,1,0
2,0,1
3,0,1
4,0,1
...,...,...
38532,1,0
38533,1,0
38534,1,0
38535,1,0


In [None]:
# Append the Sex indicator columns (Sex_female, Sex_male) to df side by side:
# female rows are (1, 0) and male rows are (0, 1).
df = pd.concat([df, onehot_sex], axis='columns')
df.head()

Unnamed: 0,Symptoms,Age,Sex,Disease,Treatment,Sex_female,Sex_male
0,coughing,5.0,female,Asthma,Omalizumab,1,0
1,tight feeling in the chest,4.0,female,Asthma,Mepolizumab,1,0
2,wheezing,6.0,male,Asthma,Mepolizumab,0,1
3,shortness of breath,7.0,male,Asthma,Mepolizumab,0,1
4,shortness of breath,9.0,male,Asthma,Mepolizumab,0,1


In [None]:
df['Sex'].unique() # check the distinct values left in the Sex column

array(['female', 'male'], dtype=object)

In [None]:
# Print the column summary again now that the Sex indicator columns were added.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33749 entries, 0 to 38536
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Symptoms    33749 non-null  object 
 1   Age         33749 non-null  float64
 2   Sex         33749 non-null  object 
 3   Disease     33749 non-null  object 
 4   Treatment   33749 non-null  object 
 5   Sex_female  33749 non-null  uint8  
 6   Sex_male    33749 non-null  uint8  
dtypes: float64(1), object(4), uint8(2)
memory usage: 1.6+ MB


4.Symptoms 원핫 코딩으로 바꿈

In [None]:
# One-hot encode Symptoms: one indicator column per distinct symptom string,
# named 'Symptoms_<value>'.
onehot_symptoms = pd.get_dummies(df['Symptoms'], prefix='Symptoms')
onehot_symptoms.info()



<class 'pandas.core.frame.DataFrame'>
Int64Index: 33749 entries, 0 to 38536
Data columns (total 82 columns):
 #   Column                                                           Non-Null Count  Dtype
---  ------                                                           --------------  -----
 0   Symptoms_A cough that lasts more than three weeks                33749 non-null  uint8
 1   Symptoms_A dry, crackling sound in the lungs while breathing in  33749 non-null  uint8
 2   Symptoms_Bluish skin                                             33749 non-null  uint8
 3   Symptoms_Chest congestion                                        33749 non-null  uint8
 4   Symptoms_Chest pain                                              33749 non-null  uint8
 5   Symptoms_Chills                                                  33749 non-null  uint8
 6   Symptoms_Daytime sleepiness                                      33749 non-null  uint8
 7   Symptoms_Difficulties with memory and concentration       

In [None]:
# Append the Symptoms indicator columns to df side by side.
df = pd.concat([df, onehot_symptoms], axis='columns')


In [None]:
df.Symptoms.unique() # check the distinct values of the Symptoms column

array(['coughing', 'tight feeling in the chest', 'wheezing',
       'shortness of breath', 'fever', 'cold', 'allergy',
       'coughingup yellow or green mucus daily',
       'shortness of breath that gets worse during flare-ups',
       'fatigue, feeling run-down or tired', 'Chest pain',
       'whistling sound while you breathe', 'coughingup blood',
       'runny nose', 'stuffy nose', 'loss of appetite', 'cough',
       'Runny nose', 'Low-grade fever', 'Chest congestion',
       'whistling sound while breathing', 'yellow cough',
       'Feeling run-down or tired', 'mucus', 'chest pain',
       'chronic cough', 'fatigue', 'lower back pain', 'dry cough',
       'greenish cough', 'cough with blood', 'Fever', 'sweating',
       'shaking', 'Rapid breathing', 'shallow breathing', 'low energy',
       'Loss of appetite', 'Nausea', 'vomiting', 'sharp chest pain',
       'Bluish skin', 'Rapid heartbeat', 'Rapid heartbeatz', 'high fever',
       'Headache', 'muscle aches', 'joint pain', 'Chill

5.Disease 원핫 코드로 변경

In [None]:
# One-hot encode Disease: one indicator column per distinct disease name,
# named 'Disease_<value>'.
onehot_dummies = pd.get_dummies(df['Disease'], prefix='Disease')
onehot_dummies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33749 entries, 0 to 38536
Data columns (total 17 columns):
 #   Column                                         Non-Null Count  Dtype
---  ------                                         --------------  -----
 0   Disease_Acute Respiratory Distress Syndrome    33749 non-null  uint8
 1   Disease_Asbestosis                             33749 non-null  uint8
 2   Disease_Aspergillosis                          33749 non-null  uint8
 3   Disease_Asthma                                 33749 non-null  uint8
 4   Disease_Bronchiectasis                         33749 non-null  uint8
 5   Disease_Chronic Bronchitis                     33749 non-null  uint8
 6   Disease_Influenza                              33749 non-null  uint8
 7   Disease_Mesothelioma                           33749 non-null  uint8
 8   Disease_Pneumonia                              33749 non-null  uint8
 9   Disease_Pneumothorax                           33749 non-null  uint8
 10

In [None]:
# Append the Disease indicator columns to df side by side.
df = pd.concat([df, onehot_dummies], axis='columns')
df.head()

Unnamed: 0,Symptoms,Age,Sex,Disease,Treatment,Sex_female,Sex_male,Symptoms_A cough that lasts more than three weeks,"Symptoms_A dry, crackling sound in the lungs while breathing in",Symptoms_Bluish skin,Symptoms_Chest congestion,Symptoms_Chest pain,Symptoms_Chills,Symptoms_Daytime sleepiness,Symptoms_Difficulties with memory and concentration,Symptoms_Dry mouth,Symptoms_Feeling run-down or tired,Symptoms_Fever,Symptoms_Frequently waking,Symptoms_Headache,Symptoms_Loss of appetite,Symptoms_Loss of appetite and unintentional weight loss,Symptoms_Low-grade fever,Symptoms_Morning headaches,Symptoms_Nasal congestion,Symptoms_Nausea,Symptoms_Night sweats,Symptoms_Pauses in breathing,Symptoms_Persistent dry coug,Symptoms_Persistent dry cough,Symptoms_Rapid breathing,Symptoms_Rapid heartbeat,Symptoms_Rapid heartbeatz,Symptoms_Runny nose,Symptoms_Snoring,Symptoms_Sore throat,Symptoms_Unusual moodiness,Symptoms_Weight loss from loss of appetite,Symptoms_Wider and rounder than normal fingertips and toes,Symptoms_allergy,...,Symptoms_lower back pain,Symptoms_mucus,Symptoms_muscle aches,Symptoms_nausea,Symptoms_pain,Symptoms_runny nose,Symptoms_shaking,Symptoms_shallow breathing,Symptoms_sharp chest pain,Symptoms_short of breath,"Symptoms_short, shallow and rapid breathing",Symptoms_shortness of breath,Symptoms_shortness of breath that gets worse during flare-ups,Symptoms_stuffy nose,Symptoms_sweating,Symptoms_tight feeling in the chest,Symptoms_vomiting,Symptoms_weight loss,Symptoms_wheezing,Symptoms_wheezing cough,Symptoms_whistling sound while breathing,Symptoms_whistling sound while you breathe,Symptoms_yellow cough,Disease_Acute Respiratory Distress Syndrome,Disease_Asbestosis,Disease_Aspergillosis,Disease_Asthma,Disease_Bronchiectasis,Disease_Chronic Bronchitis,Disease_Influenza,Disease_Mesothelioma,Disease_Pneumonia,Disease_Pneumothorax,Disease_Pulmonary hypertension,Disease_Respiratory syncytial 
virus,Disease_Tuberculosis,Disease_bronchiolitis,Disease_bronchitis,Disease_chronic obstructive pulmonary disease,Disease_sleep apnea
0,coughing,5.0,female,Asthma,Omalizumab,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,tight feeling in the chest,4.0,female,Asthma,Mepolizumab,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,wheezing,6.0,male,Asthma,Mepolizumab,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,shortness of breath,7.0,male,Asthma,Mepolizumab,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,shortness of breath,9.0,male,Asthma,Mepolizumab,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Preview the first rows of the frame with all indicator columns attached.
df.head()

Unnamed: 0,Symptoms,Age,Sex,Disease,Treatment,Sex_female,Sex_male,Symptoms_A cough that lasts more than three weeks,"Symptoms_A dry, crackling sound in the lungs while breathing in",Symptoms_Bluish skin,Symptoms_Chest congestion,Symptoms_Chest pain,Symptoms_Chills,Symptoms_Daytime sleepiness,Symptoms_Difficulties with memory and concentration,Symptoms_Dry mouth,Symptoms_Feeling run-down or tired,Symptoms_Fever,Symptoms_Frequently waking,Symptoms_Headache,Symptoms_Loss of appetite,Symptoms_Loss of appetite and unintentional weight loss,Symptoms_Low-grade fever,Symptoms_Morning headaches,Symptoms_Nasal congestion,Symptoms_Nausea,Symptoms_Night sweats,Symptoms_Pauses in breathing,Symptoms_Persistent dry coug,Symptoms_Persistent dry cough,Symptoms_Rapid breathing,Symptoms_Rapid heartbeat,Symptoms_Rapid heartbeatz,Symptoms_Runny nose,Symptoms_Snoring,Symptoms_Sore throat,Symptoms_Unusual moodiness,Symptoms_Weight loss from loss of appetite,Symptoms_Wider and rounder than normal fingertips and toes,Symptoms_allergy,...,Symptoms_lower back pain,Symptoms_mucus,Symptoms_muscle aches,Symptoms_nausea,Symptoms_pain,Symptoms_runny nose,Symptoms_shaking,Symptoms_shallow breathing,Symptoms_sharp chest pain,Symptoms_short of breath,"Symptoms_short, shallow and rapid breathing",Symptoms_shortness of breath,Symptoms_shortness of breath that gets worse during flare-ups,Symptoms_stuffy nose,Symptoms_sweating,Symptoms_tight feeling in the chest,Symptoms_vomiting,Symptoms_weight loss,Symptoms_wheezing,Symptoms_wheezing cough,Symptoms_whistling sound while breathing,Symptoms_whistling sound while you breathe,Symptoms_yellow cough,Disease_Acute Respiratory Distress Syndrome,Disease_Asbestosis,Disease_Aspergillosis,Disease_Asthma,Disease_Bronchiectasis,Disease_Chronic Bronchitis,Disease_Influenza,Disease_Mesothelioma,Disease_Pneumonia,Disease_Pneumothorax,Disease_Pulmonary hypertension,Disease_Respiratory syncytial 
virus,Disease_Tuberculosis,Disease_bronchiolitis,Disease_bronchitis,Disease_chronic obstructive pulmonary disease,Disease_sleep apnea
0,coughing,5.0,female,Asthma,Omalizumab,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,tight feeling in the chest,4.0,female,Asthma,Mepolizumab,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,wheezing,6.0,male,Asthma,Mepolizumab,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,shortness of breath,7.0,male,Asthma,Mepolizumab,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,shortness of breath,9.0,male,Asthma,Mepolizumab,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
from sklearn import preprocessing

# Treatment is stored as text; map each distinct treatment string to an integer
# so it can serve as the target (the "answer sheet") the models are scored on.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(df['Treatment'])
# Despite its name, this variable holds the integer-encoded Treatment values.
onehot_location = label_encoder.transform(df['Treatment'])


In [None]:
# Store the label-encoded Treatment values as the target column.
df['y_label'] = onehot_location

6. 학습에 사용할 특성(X)과 정답(y) 선택

In [None]:

# Feature columns fed to the models: Age, the Sex indicators, the Symptoms
# indicators and the Disease indicators.
# Each name appears exactly once. The earlier list repeated several entries
# (Age, Sex_female, Sex_male, wheezing, fatigue, shortness of breath, coughing),
# which made df[[...]] return the same column several times, and one string
# literal was split across a line break.
# NOTE(review): 'Symptoms_Rapid heartbeatz' exists in df but was not part of the
# original selection either; confirm whether that omission is intentional.
feature_columns = [
    'Age',
    'Sex_female',
    'Sex_male',
    'Symptoms_wheezing',
    'Symptoms_fatigue',
    'Symptoms_shortness of breath',
    'Symptoms_coughing',
    'Symptoms_low energy',
    'Symptoms_A cough that lasts more than three weeks',
    'Symptoms_A dry, crackling sound in the lungs while breathing in',
    'Symptoms_Bluish skin',
    'Symptoms_Chest congestion',
    'Symptoms_Chest pain',
    'Symptoms_Chills',
    'Symptoms_Daytime sleepiness',
    'Symptoms_Difficulties with memory and concentration',
    'Symptoms_Dry mouth',
    'Symptoms_fatigue, feeling run-down or tired',
    'Symptoms_Feeling run-down or tired',
    'Symptoms_Fever',
    'Symptoms_Frequently waking',
    'Symptoms_Headache',
    'Symptoms_Loss of appetite',
    'Symptoms_Loss of appetite and unintentional weight loss',
    'Symptoms_Low-grade fever',
    'Symptoms_Morning headaches',
    'Symptoms_Nasal congestion',
    'Symptoms_Nausea',
    'Symptoms_Night sweats',
    'Symptoms_Pauses in breathing',
    'Symptoms_Persistent dry coug',
    'Symptoms_Persistent dry cough',
    'Symptoms_Rapid breathing',
    'Symptoms_Rapid heartbeat',
    'Symptoms_Runny nose',
    'Symptoms_shortness of breath that gets worse during flare-ups',
    'Symptoms_Snoring',
    'Symptoms_Sore throat',
    'Symptoms_Unusual moodiness',
    'Symptoms_Weight loss from loss of appetite',
    'Symptoms_Wider and rounder than normal fingertips and toes',
    'Symptoms_allergy',
    'Symptoms_breath',
    'Symptoms_chest pain',
    'Symptoms_chronic cough',
    'Symptoms_cold',
    'Symptoms_cough',
    'Symptoms_cough with blood',
    'Symptoms_coughingup blood',
    'Symptoms_coughingup yellow or green mucus daily',
    'Symptoms_diarrhea',
    'Symptoms_distressing',
    'Symptoms_dizziness',
    'Symptoms_dry cough',
    'Symptoms_edema',
    'Symptoms_fainting',
    'Symptoms_faster heart beating',
    'Symptoms_fever',
    'Symptoms_greenish cough',
    'Symptoms_heart palpitations',
    'Symptoms_high fever',
    'Symptoms_irritability',
    'Symptoms_joint pain',
    'Symptoms_loss of appetite',
    'Symptoms_lower back pain',
    'Symptoms_mucus',
    'Symptoms_muscle aches',
    'Symptoms_nausea',
    'Symptoms_pain',
    'Symptoms_runny nose',
    'Symptoms_shaking',
    'Symptoms_shallow breathing',
    'Symptoms_sharp chest pain',
    'Symptoms_short of breath',
    'Symptoms_short, shallow and rapid breathing',
    'Symptoms_stuffy nose',
    'Symptoms_sweating',
    'Symptoms_tight feeling in the chest',
    'Symptoms_vomiting',
    'Symptoms_weight loss',
    'Symptoms_wheezing cough',
    'Symptoms_whistling sound while breathing',
    'Symptoms_whistling sound while you breathe',
    'Symptoms_yellow cough',
    'Disease_Acute Respiratory Distress Syndrome',
    'Disease_Asbestosis',
    'Disease_Aspergillosis',
    'Disease_Asthma',
    'Disease_Bronchiectasis',
    'Disease_Chronic Bronchitis',
    'Disease_Influenza',
    'Disease_Mesothelioma',
    'Disease_Pneumonia',
    'Disease_Pneumothorax',
    'Disease_Pulmonary hypertension',
    'Disease_Respiratory syncytial virus',
    'Disease_Tuberculosis',
    'Disease_bronchiolitis',
    'Disease_bronchitis',
    'Disease_chronic obstructive pulmonary disease',
    'Disease_sleep apnea',
]

# Feature matrix.
X = df[feature_columns]

# Target: the integer-encoded Treatment stored in 'y_label'.
y = df['y_label']




In [None]:
# List the distinct raw symptom strings.
df['Symptoms'].unique()

데이터 분리하기

In [None]:
# Split into 70% training rows and 30% test rows (train_size=0.7).
# random_state pins which rows land in each part, so the split, and every score
# computed from it, is the same on each run instead of changing randomly.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.7,
                                                    random_state=0)

In [None]:
# Standardize the explanatory variables: StandardScaler shifts each column to
# zero mean and scales it to unit variance (it does not squeeze values into the
# 0-1 range), which puts all features on a comparable scale for the models.

from sklearn import preprocessing

# Fit the scaler on the training rows only, so no statistics from the test rows
# leak into training, then apply that same transform to both splits.
# Previously only the full X was scaled, after the split had been made, so the
# models never saw scaled data.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Keep X itself scaled as well, as this cell did before.
X = scaler.transform(X)


In [None]:
from tensorflow.keras.utils import  to_categorical

# Convert the integer class label of each row (0~32 per the original note) into
# categorical (one-hot) form.
# NOTE(review): y_train / y_test were already created by the earlier
# train_test_split call, so reassigning y here does not change them; confirm
# whether this one-hot y is still needed.

y = to_categorical(y)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

**머신러닝 모델 구축 1)-LogisticRegression** 나는 다중분류 이기 때문에 이진분류를 사용할 필요가 없음

In [None]:
# Fit a logistic-regression classifier on the training split.
# max_iter is raised because the default limit was reached before the solver
# converged ("TOTAL NO. of ITERATIONS REACHED LIMIT" in the recorded output).
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
# Check accuracy on the training split and on the held-out test split.
train_accuracy = log_reg.score(X_train, y_train)
test_accuracy = log_reg.score(X_test, y_test)
print('Train data score:', train_accuracy)
print('test data score:', test_accuracy)

Train data score: 0.7716728750423298
test data score: 0.7746172839506172


**머신러닝 모델 구축 -2) Decision Tree**

In [None]:
# Fit a decision tree limited in depth and in minimum node sizes.
# random_state is fixed so repeated runs build the same tree, matching the
# RandomForestClassifier cell which also pins random_state=0.
tree = DecisionTreeClassifier(max_depth=5,
                              min_samples_leaf=20,
                              min_samples_split=40,
                              random_state=0).fit(X_train, y_train)

In [None]:
# Check accuracy; a training score far above the test score indicates overfitting.
train_accuracy = tree.score(X_train, y_train)
test_accuracy = tree.score(X_test, y_test)
print('Train data score:', train_accuracy)
print('test data score:', test_accuracy)

Train data score: 0.5165932949542837
test data score: 0.5217777777777778


**머신러닝 모델 구축 -3)GradientBoostingClassifier**

In [None]:
# Fit a gradient-boosting classifier on the training split.
# random_state is fixed so repeated runs give the same model, matching the
# RandomForestClassifier cell which also pins random_state=0.
boost = GradientBoostingClassifier(max_depth=3,  # maximum depth of each individual tree (not the number of trees)
                                   learning_rate=0.05,
                                   random_state=0).fit(X_train, y_train)

In [None]:
# Check accuracy on the training split and on the held-out test split.
train_accuracy = boost.score(X_train, y_train)
test_accuracy = boost.score(X_test, y_test)
print('Train data score:', train_accuracy)
print('test data score:', test_accuracy)

Train data score: 0.9824331188621741
test data score: 0.9816296296296296


**머신러닝 모델 구축 -4) RandomForestClassifier**

In [None]:
# Build a 300-tree random forest with a fixed seed, then train it.
# Original tuning note: lower n_estimators when the scores look off.
random = RandomForestClassifier(n_estimators=300, random_state=0)
random.fit(X_train, y_train)

In [None]:
# Check accuracy on the training split and on the held-out test split.
train_accuracy = random.score(X_train, y_train)
test_accuracy = random.score(X_test, y_test)
print('Train accuracy score is:', train_accuracy)
print('test accuracy score is:', test_accuracy)

Train accuracy score is: 0.9910684050118523
test accuracy score is: 0.9899259259259259


<                                                                                  

<

<

<

<

<