# pima-indians-diabetes
- 피마 인디언 당뇨병 예측

In [1]:
import pandas as pd
import numpy as np

## 1. 데이터 전처리

In [2]:
df = pd.read_csv('pima-indians-diabetes.csv')
df.head()

Unnamed: 0,pregnant,plasma,pressure,triceps,insulin,bmi,pedigree,age,target
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
df.isna().sum()  # NA 필드가 있는지 확인

pregnant    0
plasma      0
pressure    0
triceps     0
insulin     0
bmi         0
pedigree    0
age         0
target      0
dtype: int64

In [4]:
df.iloc[:, :-1]  # 슬라이싱이 잘 되나 확인

Unnamed: 0,pregnant,plasma,pressure,triceps,insulin,bmi,pedigree,age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [5]:
df.iloc[:5, :8]

Unnamed: 0,pregnant,plasma,pressure,triceps,insulin,bmi,pedigree,age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [6]:
df.target.value_counts()

0    500
1    268
Name: target, dtype: int64

### 2.Train/Test 데이터 셋으로 분리

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1], df.target, stratify=df.target,
    test_size=0.2,random_state=2021
)

In [8]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((614, 8), (154, 8), (614,), (154,))

In [9]:
y_test.value_counts()

0    100
1     54
Name: target, dtype: int64

### 3. 모델을 생성하고 학습

In [10]:
from sklearn.tree import DecisionTreeClassifier

In [11]:
dtc = DecisionTreeClassifier(random_state=2021)
dtc.fit(X_train, y_train)

DecisionTreeClassifier(random_state=2021)

### 4. 학습된 모델로 예측

In [12]:
pred = dtc.predict(X_test)

### 5. 예측값과 실제값을 비교하여 모델 성능을 평가

In [13]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.7077922077922078

### GridSearchCV 사용

In [14]:
from sklearn.model_selection import GridSearchCV

In [15]:
params = {
    'max_depth' : [2, 4, 6],
    'min_samples_split' : [2, 4, 6]
}

In [16]:
dtc = DecisionTreeClassifier(random_state=2021)
grid_dtc = GridSearchCV(dtc, param_grid=params, cv=3)
grid_dtc.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 4, 6],
                         'min_samples_split': [2, 4, 6]})

In [17]:
grid_dtc.best_score_

0.7443248844253149

In [18]:
grid_dtc.best_params_

{'max_depth': 2, 'min_samples_split': 2}

In [19]:
pred = grid_dtc.best_estimator_.predict(X_test)
accuracy_score(y_test, pred)

0.7337662337662337

In [20]:
pred.shape

(154,)

- 실제 값 하나를 주고 당뇨병인지 아닌지 여부를 확인

In [21]:
estimator = grid_dtc.best_estimator_
test_data = [4, 110, 92, 0, 0, 97.6, 0.191, 30]
predict = dtc.predict(test_data)  # 오류 : 차원이 안 맞아서

NotFittedError: This DecisionTreeClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [28]:
estimator = grid_dtc.best_estimator_
test_data = [4,110,92,0,0,37.6,0.191,30]
predict = estimator.predict(np.array(test_data).reshape(1,8))

In [29]:
predict

array([0], dtype=int64)

In [30]:
predict[0]

0

In [31]:
estimator.predict([test_data])[0]

0