### 피마 인디언 당뇨병 예측

In [2]:
import numpy as np
import pandas as pd

- 1. 데이터 전처리 및 탐색

In [3]:
# Load the raw data file. The first 9 lines of the file are skipped and no
# line is treated as a header, so pandas labels the columns 0..8.
csv_path = 'pima-indians-diabetes.csv'
df = pd.read_csv(csv_path, header=None, skiprows=9)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Assign short column names, in file order:
#   P      - Number of times pregnant
#   G      - Plasma glucose concentration at 2 hours in an oral glucose tolerance test
#   BP     - Diastolic blood pressure (mm Hg)
#   S      - Triceps skin fold thickness (mm)
#   I      - 2-Hour serum insulin (mu U/ml)
#   BMI    - Body mass index (weight in kg/(height in m)^2)
#   D      - Diabetes pedigree function
#   Age    - Age (years)
#   Target - Class variable (0 or 1)
df.columns = ['P', 'G', 'BP', 'S', 'I', 'BMI', 'D', 'Age', 'Target']
df.head()

Unnamed: 0,P,G,BP,S,I,BMI,D,Age,Target
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Check for missing values: count NaNs per column, then total them.
# NOTE(review): the preview rows contain 0 for S and I; such zeros are not
# counted here - confirm whether they stand for unrecorded measurements.
nan_per_column = df.isna().sum()
nan_per_column.sum()

0

In [6]:
# Split the frame into a feature matrix X (every column but the last) and a
# label vector y (the last column), both as NumPy arrays. The selection is
# purely positional, so it works for any table whose target is the final column.
# to_numpy() is used because the pandas documentation recommends it over .values.
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y preserves the class
# proportions in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=2023
)
# Show the shape of each of the four resulting arrays
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((614, 8), (154, 8), (614,), (154,))

In [8]:
from sklearn.tree import DecisionTreeClassifier
# Baseline decision tree: only the random seed is set, everything else is default
dtc = DecisionTreeClassifier(random_state=2023)
# Display the full hyper-parameter set of the estimator
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 2023,
 'splitter': 'best'}

In [9]:
# Train the baseline tree on the training split
dtc.fit(X_train, y_train)

In [10]:
# Predict labels for the held-out test split
pred = dtc.predict(X_test)

In [11]:
# Put the true and predicted test labels side by side for inspection
# (column names: 'y_실제값' = actual value, 'y_예측값' = predicted value)
res_df = pd.DataFrame({'y_실제값':y_test, 'y_예측값':pred})
res_df

Unnamed: 0,y_실제값,y_예측값
0,0,0
1,1,1
2,0,0
3,1,1
4,1,0
...,...,...
149,0,0
150,0,0
151,0,0
152,0,0


In [12]:
from sklearn.metrics import accuracy_score
# Accuracy of the baseline predictions against the true test labels
accuracy_score(y_test, pred)

0.7402597402597403

In [13]:
# First, coarse hyper-parameter grid: 3 x 3 = 9 combinations
params = {
    'max_depth': [2, 5, 8],
    'min_samples_split': [2, 3, 4]
}

In [14]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `params`, scoring every combination by
# 5-fold cross-validated accuracy.
grid_dt = GridSearchCV(dtc, params, scoring='accuracy', cv=5)

In [15]:
# Run the grid search on the training split only
grid_dt.fit(X_train, y_train)

In [16]:
# Parameter combination that achieved the best cross-validation score
grid_dt.best_params_

{'max_depth': 5, 'min_samples_split': 2}

In [17]:
# Mean cross-validated accuracy of the best parameter combination
grid_dt.best_score_

0.7410635745701719

In [18]:
# Second, finer grid: max_depth is narrowed to the neighbourhood of 5,
# the best depth reported by the first search
params = {
    'max_depth': [4, 5, 6],
    'min_samples_split': [2, 3, 4]
}

In [19]:
# Repeat the 5-fold accuracy grid search, now over the narrowed `params`,
# and fit it on the training split.
grid_dt = GridSearchCV(
    estimator=dtc,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)
grid_dt.fit(X_train, y_train)

In [20]:
# Best parameter combination found by the narrowed search
grid_dt.best_params_

{'max_depth': 6, 'min_samples_split': 3}

In [21]:
# Keep the best model from the search (GridSearchCV refits it on the full
# training data by default; refit was not overridden when it was created)
best_dt = grid_dt.best_estimator_

In [22]:
# Accuracy of the tuned tree on the held-out test split
best_dt.score(X_test, y_test)

0.7207792207792207

In [23]:
# Predict with the fitted search object (it delegates to its best estimator)
# and pair the predictions with the true test labels.
# Column names: 'y_실제값' = actual value, 'y_예측값' = predicted value.
pred = grid_dt.predict(X_test)
comparison = {'y_실제값': y_test, 'y_예측값': pred}
res_dt = pd.DataFrame(comparison)
res_dt.head()

Unnamed: 0,y_실제값,y_예측값
0,0,0
1,1,1
2,0,0
3,1,1
4,1,1


In [24]:
# Test accuracy of the tuned tree (repeats the evaluation already shown for best_dt)
best_dt.score(X_test, y_test)

0.7207792207792207

- 실제 적용

In [25]:
# Inspect one test sample (index 10): its feature vector and its true label
X_test[10], y_test[10]

(array([  0.   , 101.   ,  62.   ,   0.   ,   0.   ,  21.9  ,   0.336,
         25.   ]),
 0)

In [26]:
# Bind the sample at index 10 of the test split: features and true label
test_data = X_test[10]
test_target = y_test[10]

In [29]:
# predict() 메소드를 사용하기 위해서는 2차원 모양이 되어야 함
result = best_dt.predict(test_data.reshape(1, -1))[0]
result

0