<a href="https://colab.research.google.com/github/vitamingyu/ml-for-Healthcare-Analytics/blob/main/diabetes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
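# Analysis outline
# Diagnosing diabetes with deep learning and grid search
# Introducing the dataset
# Designing the Keras model
# Running a grid search with scikit-learn
# Reducing overfitting with dropout regularization
# Finding the optimal hyperparameters
# Making predictions with the optimal hyperparameters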

In [3]:
# This notebook covers how to predict the onset of diabetes with a deep-learning algorithm and how to optimize that algorithm with grid search.
import sys
import pandas as pd
import numpy as np
import sklearn
from tensorflow import keras

In [4]:
# Source CSV for the diabetes dataset; the file carries no header row, so the
# column labels are supplied explicitly below.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = [
    'n_pregnant',
    'glucose_concentration',
    'blood_pressure (mm Hg)',
    'skin_thickness (mm)',
    'serum_insulin (mu U/ml)',
    'BMI',
    'pedigree_function',
    'age',
    'class',
]

# header=None states outright that the first line of the file is data.
df = pd.read_csv(url, header=None, names=names)
df.describe()

Unnamed: 0,n_pregnant,glucose_concentration,blood_pressure (mm Hg),skin_thickness (mm),serum_insulin (mu U/ml),BMI,pedigree_function,age,class
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [7]:
# Show the rows whose glucose concentration is 0 (treated as outliers).
df.loc[df['glucose_concentration'].eq(0)]

Unnamed: 0,n_pregnant,glucose_concentration,blood_pressure (mm Hg),skin_thickness (mm),serum_insulin (mu U/ml),BMI,pedigree_function,age,class
75,1,0,48,20,0,24.7,0.14,22,0
182,1,0,74,20,23,27.7,0.299,21,0
342,1,0,68,35,0,32.0,0.389,22,0
349,5,0,80,32,0,41.0,0.346,37,1
502,6,0,68,41,0,39.0,0.727,41,1


In [8]:
# Columns in which a value of 0 is treated as a missing measurement.
columns = ['glucose_concentration', 'blood_pressure (mm Hg)', 'skin_thickness (mm)', 'serum_insulin (mu U/ml)', 'BMI']
for col in columns:
    # Assign the result back to the column instead of calling
    # replace(..., inplace=True) on df[col]: the in-place call operates on a
    # column selection, which warns in recent pandas and leaves df unchanged
    # once Copy-on-Write is enabled.
    df[col] = df[col].replace(0, np.nan)  # turn 0 into a missing value
df.describe()  # count is the number of non-missing rows in each column

Unnamed: 0,n_pregnant,glucose_concentration,blood_pressure (mm Hg),skin_thickness (mm),serum_insulin (mu U/ml),BMI,pedigree_function,age,class
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.535641,12.382158,10.476982,118.775855,6.924988,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,64.0,22.0,76.25,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,141.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [9]:
# Remove, in place, every row that has a missing value in any column, then
# summarize what is left.
df.dropna(axis=0, how='any', inplace=True)
df.describe()

Unnamed: 0,n_pregnant,glucose_concentration,blood_pressure (mm Hg),skin_thickness (mm),serum_insulin (mu U/ml),BMI,pedigree_function,age,class
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,3.30102,122.627551,70.663265,29.145408,156.056122,33.086224,0.523046,30.864796,0.331633
std,3.211424,30.860781,12.496092,10.516424,118.84169,7.027659,0.345488,10.200777,0.471401
min,0.0,56.0,24.0,7.0,14.0,18.2,0.085,21.0,0.0
25%,1.0,99.0,62.0,21.0,76.75,28.4,0.26975,23.0,0.0
50%,2.0,119.0,70.0,29.0,125.5,33.2,0.4495,27.0,0.0
75%,5.0,143.0,78.0,37.0,190.0,37.1,0.687,36.0,1.0
max,17.0,198.0,110.0,63.0,846.0,67.1,2.42,81.0,1.0


In [10]:
# Convert the DataFrame into a NumPy array for the modelling steps below.
dataset = df.to_numpy()
print(dataset.shape)  # (392, 9)

(392, 9)


In [11]:
# NumPy arrays are convenient to slice: the first eight columns become the
# inputs X, and the ninth column ('class') becomes the integer labels Y.
X, Y = dataset[:, :8], dataset[:, 8].astype(int)

# Shapes: (392, 8) for X and (392,) for Y.
print(X.shape, Y.shape, sep='\n')

# First five rows: X holds floating-point values, Y holds integers.
print(X[:5], Y[:5], sep='\n')

(392, 8)
(392,)
[[1.000e+00 8.900e+01 6.600e+01 2.300e+01 9.400e+01 2.810e+01 1.670e-01
  2.100e+01]
 [0.000e+00 1.370e+02 4.000e+01 3.500e+01 1.680e+02 4.310e+01 2.288e+00
  3.300e+01]
 [3.000e+00 7.800e+01 5.000e+01 3.200e+01 8.800e+01 3.100e+01 2.480e-01
  2.600e+01]
 [2.000e+00 1.970e+02 7.000e+01 4.500e+01 5.430e+02 3.050e+01 1.580e-01
  5.300e+01]
 [1.000e+00 1.890e+02 6.000e+01 2.300e+01 8.460e+02 3.010e+01 3.980e-01
  5.900e+01]]
[0 1 1 1 1]


In [15]:
from sklearn.preprocessing import StandardScaler
# Fitting computes the scaling parameters (per-column mean and standard
# deviation) from X and stores them on the scaler for later use.
scaler = StandardScaler()
scaler.fit(X)

# Apply the fitted parameters to standardize X.
X_standardized = scaler.transform(X)

# Each column now has mean ~0 and standard deviation ~1 (describe() reports
# 1.001278 because it uses the sample standard deviation). With every feature
# on the same scale, the learning algorithm does not favor or over-weight any
# particular column.
data = pd.DataFrame(X_standardized)
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,-9.063045e-18,1.132881e-17,-4.531523e-16,1.087565e-16,1.064908e-16,1.631348e-16,1.8126090000000003e-17,1.110223e-16
std,1.001278,1.001278,1.001278,1.001278,1.001278,1.001278,1.001278,1.001278
min,-1.029213,-2.161731,-3.739001,-2.108484,-1.196867,-2.120941,-1.269525,-0.9682991
25%,-0.7174265,-0.7665958,-0.694164,-0.7755315,-0.6681786,-0.667678,-0.7340909,-0.771985
50%,-0.4056403,-0.1176959,-0.05314565,-0.01384444,-0.2574448,0.01621036,-0.2131475,-0.3793569
75%,0.5297185,0.6609841,0.5878727,0.7478426,0.2859877,0.5718696,0.4751644,0.5040564
max,4.271153,2.445459,3.151946,3.223325,5.81299,4.846172,5.497667,4.921123


In [25]:
!pip install scikeras

from sklearn.model_selection import GridSearchCV, KFold
from keras.models import Sequential
from keras.layers import Dense
from scikeras.wrappers import KerasClassifier
from keras.optimizers import Adam

Collecting scikeras
  Downloading scikeras-0.12.0-py3-none-any.whl (27 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.12.0


In [26]:
# Model construction is wrapped in a function because the model will be
# re-initialized many times while the hyperparameters are varied.

def create_model():
    """Build and compile the baseline Keras binary classifier.

    The network takes 8 input features and stacks three fully connected
    layers: Dense(8, relu) -> Dense(4, relu) -> Dense(1, sigmoid).

    Returns:
        A compiled ``Sequential`` model that uses the Adam optimizer with a
        learning rate of 0.01, binary cross-entropy loss, and an accuracy
        metric.
    """
    # Define the Keras model.
    model = Sequential()
    model.add(Dense(8, input_dim=8, kernel_initializer='normal', activation='relu'))
    # Only the first layer declares the input size; this layer takes its
    # input width from the layer before it.
    model.add(Dense(4, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model. The loss identifier must be spelled exactly
    # 'binary_crossentropy'; an unknown name is rejected by Keras (possibly
    # only once training starts).
    adam = Adam(learning_rate=0.01)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
    return model

# Build one model instance and show its layer/parameter summary.
model = create_model()
# NOTE(review): model.summary() prints the table itself and returns None, so
# wrapping it in print() also emits the trailing "None" seen in the output.
print(model.summary())



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 72        
                                                                 
 dense_1 (Dense)             (None, 4)                 36        
                                                                 
 dense_2 (Dense)             (None, 1)                 5         
                                                                 
Total params: 113 (452.00 Byte)
Trainable params: 113 (452.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [None]:
# The network above has 113 parameters in total (Total params: 113).
