In [1]:
import numpy as np
import pandas as pd

# Load the patient records from the CSV file in the data directory.
csv_path = "data/Health_Data.csv"
patient_data = pd.read_csv(csv_path)
# Preview the first five rows as a quick sanity check.
patient_data.head(5)

Unnamed: 0,Patient_id,Age,Admission_type,PreExistingDisease,PreviousSurgery,Gender,Smoker,Homeless,DaysinHospital,Readmitted
0,1,33,Urgent,Y,0,M,1,0,1,0
1,2,34,Emergency,N,0,M,1,0,22,0
2,3,88,Trauma,Y,1,M,1,1,100,1
3,4,56,Elective,Y,0,M,1,0,2,0
4,5,45,Trauma,Y,0,M,1,0,34,0


In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
patient_data.describe()

Unnamed: 0,Patient_id,Age,PreviousSurgery,Smoker,Homeless,DaysinHospital,Readmitted
count,357.0,357.0,357.0,357.0,357.0,357.0,357.0
mean,179.0,42.57423,0.341737,0.596639,0.378151,43.182073,0.193277
std,103.20126,29.274624,0.474957,0.491261,0.485606,47.362609,0.395423
min,1.0,0.0,0.0,0.0,0.0,1.0,0.0
25%,90.0,14.0,0.0,0.0,0.0,12.0,0.0
50%,179.0,35.0,0.0,1.0,0.0,32.0,0.0
75%,268.0,67.0,1.0,1.0,1.0,55.0,0.0
max,357.0,96.0,1.0,1.0,1.0,352.0,1.0


In [5]:
# Summary of the string (object) columns: count, distinct values, most frequent value and its frequency.
patient_data.describe(include=['object'])

Unnamed: 0,Admission_type,PreExistingDisease,Gender
count,357,357,357
unique,5,2,2
top,Urgent,N,F
freq,131,228,186


In [7]:
# Feature matrix: the columns at positions 1-8 (Age ... DaysinHospital), i.e.
# everything between the Patient_id column and the target.
# .copy() makes X an independent frame, so the later in-place column drop does
# not operate on a slice of patient_data (which can raise SettingWithCopyWarning).
X = patient_data.iloc[:, 1:9].copy()
# Target vector: the column at position 9 (Readmitted).
y = patient_data.iloc[:, 9]

In [8]:
# Preview the selected feature columns.
X.head()

Unnamed: 0,Age,Admission_type,PreExistingDisease,PreviousSurgery,Gender,Smoker,Homeless,DaysinHospital
0,33,Urgent,Y,0,M,1,0,1
1,34,Emergency,N,0,M,1,0,22
2,88,Trauma,Y,1,M,1,1,100
3,56,Elective,Y,0,M,1,0,2
4,45,Trauma,Y,0,M,1,0,34


In [9]:
# Preview the target column.
y.head()

0    0
1    0
2    1
3    0
4    0
Name: Readmitted, dtype: int64

In [10]:
# One-hot encode the three categorical features. Columns are selected by name
# rather than by position: once X is rebuilt without these columns, a positional
# X.iloc[:, k] would silently pick up a different column if this cell is re-run,
# whereas a name lookup fails loudly.
# drop_first=True drops the first level of each feature so it acts as the baseline.

## Encoded admission type
A_type = pd.get_dummies(X['Admission_type'], drop_first=True, prefix='Atype')
## Encoded gender
New_gender = pd.get_dummies(X['Gender'], drop_first=True, prefix='Gender')
## Encoded pre-existing disease flag
Pre_exdis = pd.get_dummies(X['PreExistingDisease'], drop_first=True, prefix='PreExistDis')

In [11]:
# Preview the encoded admission-type indicator columns.
A_type.head()

Unnamed: 0,Atype_Emergency,Atype_Newborn,Atype_Trauma,Atype_Urgent
0,0,0,0,1
1,1,0,0,0
2,0,0,1,0
3,0,0,0,0
4,0,0,1,0


In [12]:
# Preview the encoded gender indicator column.
New_gender.head()

Unnamed: 0,Gender_M
0,1
1,1
2,1
3,1
4,1


In [13]:
# Preview the encoded pre-existing-disease indicator column.
Pre_exdis.head()

Unnamed: 0,PreExistDis_Y
0,1
1,0
2,1
3,1
4,1


In [14]:
## Remove the original categorical columns now that they have been one-hot encoded.
## The result is rebound instead of using inplace=True: X was sliced out of
## patient_data, and an in-place drop on such a slice can raise
## SettingWithCopyWarning.
X = X.drop(['Admission_type', 'PreExistingDisease', 'Gender'], axis=1)
## Append the newly encoded indicator columns to X (column-wise, aligned on the index).
X = pd.concat([X, A_type, New_gender, Pre_exdis], axis=1)

In [15]:
# Preview the feature matrix after the categorical columns were replaced by indicator columns.
X.head()

Unnamed: 0,Age,PreviousSurgery,Smoker,Homeless,DaysinHospital,Atype_Emergency,Atype_Newborn,Atype_Trauma,Atype_Urgent,Gender_M,PreExistDis_Y
0,33,0,1,0,1,0,0,0,1,1,1
1,34,0,1,0,22,1,0,0,0,1,0
2,88,1,1,1,100,0,0,1,0,1,1
3,56,0,1,0,2,0,0,0,0,1,1
4,45,0,1,0,34,0,0,1,0,1,1


In [16]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the target is imbalanced (mean of Readmitted is about 0.19 in
# describe()); consider stratify=y so both splits keep that ratio.
test_fraction = 0.3
split_seed = 110
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=test_fraction,
    random_state=split_seed,
)

In [18]:
## Initialise the StandardScaler
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# Column labels shared by both splits; captured once because the scaler returns
# plain arrays without labels.
feature_names = xtrain.columns
# Fit the scaler on the training split and transform it
xtrain = pd.DataFrame(sc.fit_transform(xtrain), columns=feature_names)
# Transform the test split with the statistics learned from the training split
xtest = pd.DataFrame(sc.transform(xtest), columns=feature_names)

In [19]:
# Preview the standardised training features.
xtrain.head()

Unnamed: 0,Age,PreviousSurgery,Smoker,Homeless,DaysinHospital,Atype_Emergency,Atype_Newborn,Atype_Trauma,Atype_Urgent,Gender_M,PreExistDis_Y
0,1.415411,1.340803,0.792118,1.210515,1.883715,-0.363068,-0.303488,1.835326,-0.739313,-0.911527,1.306339
1,1.583479,1.340803,0.792118,1.210515,1.069799,-0.363068,-0.303488,1.835326,-0.739313,-0.911527,1.306339
2,0.675914,-0.745822,0.792118,1.210515,-0.453684,-0.363068,-0.303488,-0.544862,1.352607,1.09706,-0.765498
3,0.306166,-0.745822,-1.262438,-0.826095,-0.244988,2.754307,-0.303488,-0.544862,-0.739313,-0.911527,-0.765498
4,-1.071987,-0.745822,0.792118,-0.826095,-0.912816,-0.363068,-0.303488,-0.544862,-0.739313,1.09706,-0.765498


In [20]:
# Convert the DataFrames / Series into plain NumPy arrays
x_train, x_test = xtrain.values, xtest.values
y_train, y_test = ytrain.values, ytest.values

In [22]:
# Display the training feature array.
x_train

array([[ 1.41541111,  1.34080305,  0.79211803, ..., -0.73931309,
        -0.91152748,  1.30633906],
       [ 1.58347853,  1.34080305,  0.79211803, ..., -0.73931309,
        -0.91152748,  1.30633906],
       [ 0.67591449, -0.74582169,  0.79211803, ...,  1.35260691,
         1.09705963, -0.76549805],
       ...,
       [-1.00475965,  1.34080305,  0.79211803, ..., -0.73931309,
        -0.91152748, -0.76549805],
       [-0.29887651, -0.74582169, -1.26243812, ...,  1.35260691,
        -0.91152748,  1.30633906],
       [ 0.70952797, -0.74582169,  0.79211803, ...,  1.35260691,
        -0.91152748, -0.76549805]])

In [23]:
## Import the Keras classes needed to build the network
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

Using TensorFlow backend.


In [24]:
# Input width is read from the training matrix instead of being hard-coded to 11,
# so the network still builds if the number of encoded feature columns changes.
n_features = x_train.shape[1]

model = Sequential()
# First dense layer (6 ReLU units) followed by dropout
model.add(Dense(units=6, activation='relu', kernel_initializer='uniform', input_dim=n_features))
model.add(Dropout(rate=0.3))
# Second dense layer (6 ReLU units) followed by dropout
model.add(Dense(units=6, activation='relu', kernel_initializer='uniform'))
model.add(Dropout(rate=0.3))
# Output layer: a single sigmoid unit for the binary target
model.add(Dense(units=1, activation='sigmoid', kernel_initializer='uniform'))
# Compile the model for binary classification
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Fit the model on the training data
# NOTE(review): no random seed is set for weight initialisation / dropout, so
# results will vary between runs.
model.fit(x_train, y_train, epochs=100, batch_size=20)





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Epoch 1/100





Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100


Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7f207919d6a0>

In [25]:
# Create two prediction variables: y_pred_prob and y_pred_class.
# For this single sigmoid-output network, predict() already returns the predicted
# probability, so one forward pass serves both variables.
# Sequential.predict_proba() returned the same values as predict() here and is not
# available in newer Keras/TensorFlow releases, so it is not used.
# y_pred_prob keeps the raw probabilities; y_pred_class starts as a copy of them
# and is converted to class labels by thresholding in a later cell.

y_pred_prob = model.predict(x_test)
y_pred_class = y_pred_prob.copy()

In [26]:
# First five entries of y_pred_class (still raw model outputs at this point).
y_pred_class[:5]

array([[0.08845335],
       [0.00634357],
       [0.05947018],
       [0.05425912],
       [0.03723252]], dtype=float32)

In [27]:
# First five predicted probabilities.
y_pred_prob[:5]

array([[0.08845335],
       [0.00634357],
       [0.05947018],
       [0.05425912],
       [0.03723252]], dtype=float32)

In [28]:
## Apply the decision threshold: values above 0.5 become True (class 1), the rest False (class 0).
y_pred_class = np.greater(y_pred_class, 0.5)

In [29]:
# First five thresholded predictions (boolean).
y_pred_class[:5]

array([[False],
       [False],
       [False],
       [False],
       [False]])

In [30]:
# Same five predictions shown as 0/1 integers (display only; y_pred_class itself is not modified).
y_pred_class.astype(int)[:5]

array([[0],
       [0],
       [0],
       [0],
       [0]])

In [31]:
# Compute the test-set accuracy with scikit-learn's accuracy_score.
from sklearn.metrics import accuracy_score

accuracy_score(y_true=y_test, y_pred=y_pred_class)

0.8703703703703703

In [32]:
###### Null accuracy: start from the class counts in the test target
ytest.value_counts()

0    91
1    17
Name: Readmitted, dtype: int64

In [33]:
# Null accuracy: the share of test rows in the most frequent class, i.e. the
# accuracy obtained by always predicting that class.
ytest.value_counts().iloc[:1] / ytest.shape[0]

0    0.842593
Name: Readmitted, dtype: float64