In [1]:
import numpy as np
import pandas as pd
df = pd.read_csv("diabetes_prediction_dataset.csv")

df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [2]:
# Female - 0, Male - 1, Other - 2
"""
No Info - 0
current - 1
ever - 2
former - 3
never - 4 
not current - 5
"""

gender = {'Female':0, 'Male': 1, 'Other': 2}
smoking_history = {'No Info': 0, 'current': 1, 'ever': 2, 'former': 3, 'never': 4, 'not current': 5}

In [3]:
df = df.replace(to_replace={'gender': gender, 'smoking_history': smoking_history})
df.head()

  df = df.replace(to_replace={'gender': gender, 'smoking_history': smoking_history})


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,4,25.19,6.6,140,0
1,0,54.0,0,0,0,27.32,6.6,80,0
2,1,28.0,0,0,4,27.32,5.7,158,0
3,0,36.0,0,0,1,23.45,5.0,155,0
4,1,76.0,1,1,1,20.14,4.8,155,0


In [4]:
# 학습 데이터, 훈련 데이터
diabetes_target = df['diabetes'].to_numpy()
diabetes_data = df.drop('diabetes', axis=1).to_numpy()

In [7]:
diabetes_target[:5]

array([0, 0, 0, 0, 0])

In [8]:
# 훈련세트, 테스트 세트 분리
from sklearn.model_selection import train_test_split, GridSearchCV

In [9]:
train_input, test_input, train_target, test_target = train_test_split(
    diabetes_data, diabetes_target, test_size=0.1
)

In [10]:
# 최적의 모델 찾기
from sklearn.ensemble import HistGradientBoostingClassifier

In [11]:
params = {
    'max_iter': np.arange(90, 150, 5),
    'learning_rate': [0.0001, 0.001, 0.002, 0.003, 0.004, 0.01, 0.1],
}

gs = GridSearchCV(HistGradientBoostingClassifier(), params, n_jobs=-1)
gs.fit(train_input, train_target)

0,1,2
,estimator,HistGradientB...ngClassifier()
,param_grid,"{'learning_rate': [0.0001, 0.001, ...], 'max_iter': array([ 90, ...35, 140, 145])}"
,scoring,
,n_jobs,-1
,refit,True
,cv,
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,max_iter,np.int64(120)
,max_leaf_nodes,31
,max_depth,
,min_samples_leaf,20
,l2_regularization,0.0
,max_features,1.0
,max_bins,255
,categorical_features,'from_dtype'


In [12]:
np.mean(gs.cv_results_['mean_test_score'])

np.float64(0.9309625661375662)

In [13]:
gs.best_params_

{'learning_rate': 0.1, 'max_iter': np.int64(120)}

In [14]:
hgb = gs.best_estimator_

In [15]:
print("훈련세트:", hgb.score(train_input, train_target))
print("테스트세트:", hgb.score(test_input, test_target))

훈련세트: 0.9730111111111112
테스트세트: 0.9725


In [16]:
# 최적의 모델을 저장
import pickle
with open("model_ensemble.pkl", "wb") as f:
    pickle.dump(hgb, f)

In [18]:
test_input[:30].tolist()

[[0.0, 65.0, 0.0, 0.0, 3.0, 25.97, 6.2, 160.0],
 [0.0, 71.0, 0.0, 0.0, 4.0, 27.32, 6.2, 155.0],
 [0.0, 1.64, 0.0, 0.0, 0.0, 13.5, 4.5, 200.0],
 [1.0, 1.24, 0.0, 0.0, 0.0, 20.03, 5.8, 100.0],
 [0.0, 30.0, 0.0, 0.0, 0.0, 27.32, 5.8, 159.0],
 [1.0, 56.0, 0.0, 0.0, 1.0, 27.32, 4.8, 159.0],
 [1.0, 80.0, 1.0, 0.0, 3.0, 24.42, 7.5, 155.0],
 [0.0, 76.0, 0.0, 0.0, 4.0, 27.32, 6.0, 80.0],
 [0.0, 29.0, 0.0, 0.0, 0.0, 21.17, 3.5, 158.0],
 [0.0, 54.0, 0.0, 0.0, 0.0, 21.68, 4.0, 90.0],
 [1.0, 56.0, 0.0, 0.0, 1.0, 27.32, 5.0, 140.0],
 [0.0, 42.0, 0.0, 0.0, 0.0, 22.88, 4.0, 85.0],
 [0.0, 41.0, 0.0, 0.0, 1.0, 18.84, 6.0, 130.0],
 [0.0, 46.0, 0.0, 0.0, 0.0, 22.45, 4.0, 85.0],
 [0.0, 65.0, 0.0, 0.0, 3.0, 45.7, 6.5, 260.0],
 [1.0, 55.0, 0.0, 0.0, 1.0, 29.64, 5.7, 280.0],
 [1.0, 38.0, 1.0, 0.0, 2.0, 46.89, 6.2, 155.0],
 [1.0, 24.0, 0.0, 0.0, 4.0, 27.32, 4.8, 130.0],
 [0.0, 37.0, 0.0, 0.0, 0.0, 27.32, 3.5, 130.0],
 [1.0, 18.0, 0.0, 0.0, 4.0, 21.9, 3.5, 200.0],
 [0.0, 27.0, 0.0, 0.0, 3.0, 40.02, 4.5, 158.0],