# 피마 인디언 당뇨병 예측

In [1]:
import numpy as np
import pandas as pd

- 데이터 전처리

In [2]:
df = pd.read_csv('pima-indians-diabetes.csv', skiprows=9, header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [30]:
df.columns = ['P','I','BP','ST','I','BMI','DPF','A','Class']
df.head()

Unnamed: 0,P,I,BP,ST,I.1,BMI,DPF,A,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [10]:
X = df.iloc[:,:-1]
y = df['Class']
X.shape, y.shape

((768, 8), (768,))

In [11]:
type(y)

pandas.core.series.Series

In [12]:
X = df.iloc[:,:-1].values
y = df['Class'].values
X.shape, y.shape

((768, 8), (768,))

In [13]:
type(y)

numpy.ndarray

In [14]:
np.unique(y, return_counts=True)

(array([0, 1], dtype=int64), array([500, 268], dtype=int64))

- Train/Test dataset 분리

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=2021
)

In [16]:
np.unique(y_train, return_counts=True)

(array([0, 1], dtype=int64), array([400, 214], dtype=int64))

- Model 생성 및 학습

In [17]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(random_state=2021)

In [18]:
dtc.fit(X_train, y_train)

DecisionTreeClassifier(random_state=2021)

- 예측 및 평가

In [19]:
pred = dtc.predict(X_test)

In [20]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.7077922077922078

In [21]:
dtc.score(X_test, y_test)

0.7077922077922078

- 최적의 하이퍼 파라메터 도출 및 교차 검증

In [22]:
params = {
    'max_depth': [2,4,6],
    'min_samples_split': [2,4,6]
}

In [24]:
from sklearn.model_selection import GridSearchCV

grid_dt = GridSearchCV(dtc, param_grid=params, scoring='accuracy', cv=3)
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 4, 6],
                         'min_samples_split': [2, 4, 6]},
             scoring='accuracy')

In [25]:
grid_dt.best_params_

{'max_depth': 2, 'min_samples_split': 2}

In [26]:
params = {
    'max_depth': [2,3,4],
    'min_samples_split': [2,3,4]
}

In [27]:
grid_dt = GridSearchCV(dtc, param_grid=params, scoring='accuracy', cv=3)
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 3, 4],
                         'min_samples_split': [2, 3, 4]},
             scoring='accuracy')

In [28]:
grid_dt.best_params_

{'max_depth': 3, 'min_samples_split': 2}

In [29]:
best_dt = grid_dt.best_estimator_
best_dt.score(X_test, y_test)

0.7142857142857143

- 실제 값 하나가 주어졌을 때 당뇨병 여부를 확인하는 방법

In [31]:
y_test[33]

0

In [33]:
X_test[33]

array([  0.   , 126.   ,  86.   ,  27.   , 120.   ,  27.4  ,   0.515,
        21.   ])

In [34]:
test_data = X_test[33]

In [38]:
result = best_dt.predict(test_data.reshape(1,8))[0]
print('음성' if result == 0 else '양성')

음성
