In [1]:
from IPython.display import display, HTML
display(HTML("""
<style>
div.container{width:99% !important;}
div.cell.code_cell.rendered{width:90%;}
div.CodeMirror {font-family:Consolas; font-size:20pt;}
div.output {font-size:20pt; font-weight:bold;}
div.input {font-family:Consolas; font-size:19pt;}
div.prompt {min-width:70px;}
div#toc-wrapper{padding-top:120px;}
span.toc-item-num{display:none;}
div.text_cell_render ul li{font-size:16pt;padding:5px;}
div.CodeMirror-lines > div {padding-left:10px;}
table.dataframe{font-size:19px;}
</style>
"""))

In [32]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split # 훈련셋과 테스트셋을 분리
from tensorflow.keras.models import Sequential # 모델생성
from tensorflow.keras.layers import Dense, Dropout # Dense(모델layer), Dropout(과적합 줄이기)
import matplotlib.pyplot as plt
from tensorflow.keras import metrics # 성능지표들

- 심장병 발병 예측
```
1. 데이터 셋 생성 & 전처리
    엑셀데이터 -> ?를 nan -> nan 처리 -> X, y 분리 -> X 스케일조정 -> 훈련셋과 테스트셋분리(api이용)
2. 모델 생성 & 구성(과적합 줄이기 단계 추가)
3. 모델 학습과정설정 (accuracy, precision, recall)
4. 모델 학습시키기 (훈련셋의 일부를 검증셋으로 학습)
5. 모델 평가 - 과정살펴보기(그래프), evaluate, pd.crosstab/confusion_matrix(혼돈행렬)
6. 모델 사용하기
```

## 1. 데이터 셋 생성 & 전처리
- 엑셀데이터 -> ?를 nan -> nan 처리 -> X, y 분리 -> X 스케일조정 -> 훈련셋과 테스트셋분리(api이용)

In [4]:
raw_data = pd.read_excel('./data/heart-disease.xlsx')
raw_data.head(3)

Unnamed: 0,age,sex,cp,treshtbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,hsl,heartDisease
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
1,67,1,4,160,286,0,0,108,1,1.5,2,3,3,1
2,67,1,4,120,?,0,2,129,1,2.6,2,2,7,1


In [5]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   age           303 non-null    int64  
 1   sex           303 non-null    int64  
 2   cp            303 non-null    int64  
 3   treshtbps     303 non-null    int64  
 4   chol          303 non-null    object 
 5   fbs           303 non-null    int64  
 6   restecg       303 non-null    int64  
 7   thalach       303 non-null    int64  
 8   exang         303 non-null    int64  
 9   oldpeak       303 non-null    float64
 10  slope         303 non-null    int64  
 11  ca            303 non-null    object 
 12  hsl           303 non-null    object 
 13  heartDisease  303 non-null    int64  
dtypes: float64(1), int64(10), object(3)
memory usage: 33.3+ KB


In [9]:
# raw_data의 타입이 object인 열에 ?가 있는 데이터 추출
raw_data[(raw_data['chol']=='?') | (raw_data['ca']=='?') | (raw_data['hsl']=='?')]

Unnamed: 0,age,sex,cp,treshtbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,hsl,heartDisease
2,67,1,4,120,?,0,2,129,1,2.6,2,2,7,1
87,53,0,3,128,216,0,2,115,0,0.0,1,0,?,0
166,52,1,3,138,223,0,0,169,0,0.0,1,?,3,0
192,43,1,4,132,247,1,2,143,1,0.1,2,?,7,1
266,52,1,4,128,204,1,0,156,1,1.0,2,0,?,1
287,58,1,2,125,220,0,0,144,0,0.4,2,?,7,0
302,38,1,3,138,175,0,0,173,0,0.0,1,?,3,0


In [11]:
# '?'를 결측치(np.nan)로 대체
clean_data = raw_data.replace('?', np.nan)
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   age           303 non-null    int64  
 1   sex           303 non-null    int64  
 2   cp            303 non-null    int64  
 3   treshtbps     303 non-null    int64  
 4   chol          302 non-null    float64
 5   fbs           303 non-null    int64  
 6   restecg       303 non-null    int64  
 7   thalach       303 non-null    int64  
 8   exang         303 non-null    int64  
 9   oldpeak       303 non-null    float64
 10  slope         303 non-null    int64  
 11  ca            299 non-null    float64
 12  hsl           301 non-null    float64
 13  heartDisease  303 non-null    int64  
dtypes: float64(4), int64(10)
memory usage: 33.3 KB


In [20]:
# clean_data에 결측치가 포함된 데이터 추출
clean_data[clean_data.isna().any(axis=1)]

Unnamed: 0,age,sex,cp,treshtbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,hsl,heartDisease
2,67,1,4,120,,0,2,129,1,2.6,2,2.0,7.0,1
87,53,0,3,128,216.0,0,2,115,0,0.0,1,0.0,,0
166,52,1,3,138,223.0,0,0,169,0,0.0,1,,3.0,0
192,43,1,4,132,247.0,1,2,143,1,0.1,2,,7.0,1
266,52,1,4,128,204.0,1,0,156,1,1.0,2,0.0,,1
287,58,1,2,125,220.0,0,0,144,0,0.4,2,,7.0,0
302,38,1,3,138,175.0,0,0,173,0,0.0,1,,3.0,0


In [23]:
# 결측치를 중앙값으로 대체
# 열별 중앙값 : clean_data.median(axis=0) - axis=0는 기본값이므로 생략 가능
clean_data.fillna(value=clean_data.median())

Unnamed: 0,age,sex,cp,treshtbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,hsl,heartDisease
0,63,1,1,145,233.0,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286.0,0,0,108,1,1.5,2,3.0,3.0,1
2,67,1,4,120,241.5,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,130,250.0,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204.0,0,2,172,0,1.4,1,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45,1,1,110,264.0,0,0,132,0,1.2,2,0.0,7.0,1
299,68,1,4,144,193.0,1,0,141,0,3.4,2,2.0,7.0,1
300,57,1,4,130,131.0,0,0,115,1,1.2,2,1.0,7.0,1
301,57,0,2,130,236.0,0,2,174,0,0.0,2,1.0,3.0,1


In [25]:
# 결측치가 하나라도 있는 행은 제거
clean_data.dropna(how='any', inplace=True)
clean_data.isnull().sum()

age             0
sex             0
cp              0
treshtbps       0
chol            0
fbs             0
restecg         0
thalach         0
exang           0
oldpeak         0
slope           0
ca              0
hsl             0
heartDisease    0
dtype: int64

In [26]:
clean_data.head()

Unnamed: 0,age,sex,cp,treshtbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,hsl,heartDisease
0,63,1,1,145,233.0,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286.0,0,0,108,1,1.5,2,3.0,3.0,1
3,37,1,3,130,250.0,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204.0,0,2,172,0,1.4,1,0.0,3.0,0
5,56,1,2,120,236.0,0,0,178,0,0.8,1,0.0,3.0,0


In [39]:
# 독립변수(입력변수, X), 종속변수(타겟변수, y)
Input = clean_data.iloc[:, :-1]
Target = clean_data.iloc[:, -1:]
Target = clean_data.iloc[:, [-1]]
Input.shape, Target.shape

((296, 13), (296, 1))

In [41]:
# 전체 296개 데이터 중 심장병이 걸린 데이터 수(1)
Target['heartDisease'].sum()

136

In [42]:
# 심장병이 걸린 데이터 확률
Target['heartDisease'].mean()

0.4594594594594595

In [43]:
# unique 별 데이터 갯수
Target['heartDisease'].value_counts()

0    160
1    136
Name: heartDisease, dtype: int64

In [48]:
# scale 조정(Input 데이터만)
scaler = MinMaxScaler()
scaled_input = pd.DataFrame(scaler.fit_transform(Input))
print(Input.shape, scaled_input.shape)
scaled_input.head(2)

(296, 13) (296, 13)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.708333,1.0,0.0,0.481132,0.244292,1.0,1.0,0.603053,0.0,0.370968,1.0,0.0,0.75
1,0.791667,1.0,1.0,0.622642,0.365297,0.0,0.0,0.282443,1.0,0.241935,0.5,1.0,0.0


In [53]:
# scaled_input와 Target => 학습셋(=훈련셋,train set):시험셋(=test set) = 7:3
X_train, X_test, y_train, y_test =train_test_split(scaled_input, # 입력변수(독립변수)
                 Target,       # 타겟변수(종속변수)
                 test_size=0.3, # 시험셋 사이즈가 지정되면 학습셋은 자동 지정
                 random_state=42 # random seed값
                )
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((207, 13), (207, 1), (89, 13), (89, 1))

## 2. 모델 생성 & 구성(과적합 줄이기 단계 추가)

## 3. 모델 학습과정설정 (accuracy, precision, recall)

## 4. 모델 학습시키기 (훈련셋의 일부를 검증셋으로 학습)

## 5. 모델 평가 
- 과정살펴보기(그래프), evaluate, pd.crosstab/confusion_matrix(혼돈행렬)

## 6. 모델 사용하기