In [3]:
import pandas as pd
import numpy as np

from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [4]:
df = pd.read_csv('../datasets/diabetes_dropped_imputed_encoded.csv')

In [5]:
df.head()

Unnamed: 0,id,chol,stab.glu,hdl,ratio,glyhb,location,age,gender,height,...,bp.1d,bp.2s,bp.2d,waist,hip,time.ppn,height_meters,BMI,hip_waist_ratio,diabetic
0,0,203.0,27,56.0,3.6,4.31,0,27,0,62.0,...,59.0,152.382979,92.524823,29.0,38.0,720.0,1.5748,48.79042,0.763158,0
1,1,165.0,42,24.0,6.9,4.44,0,10,0,64.0,...,68.0,152.382979,92.524823,46.0,48.0,360.0,1.6256,82.495282,0.958333,0
2,2,228.0,37,37.0,6.2,4.64,0,39,0,61.0,...,92.0,185.0,92.0,49.0,57.0,180.0,1.5494,106.638214,0.859649,0
3,3,78.0,38,12.0,6.5,4.63,0,48,1,67.0,...,50.0,152.382979,92.524823,33.0,38.0,480.0,1.7018,41.089412,0.868421,0
4,4,249.0,35,28.0,8.9,7.72,0,45,1,68.0,...,80.0,152.382979,92.524823,44.0,41.0,300.0,1.7272,61.343116,1.073171,1


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403 entries, 0 to 402
Data columns (total 23 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               403 non-null    int64  
 1   chol             403 non-null    float64
 2   stab.glu         403 non-null    int64  
 3   hdl              403 non-null    float64
 4   ratio            403 non-null    float64
 5   glyhb            403 non-null    float64
 6   location         403 non-null    int64  
 7   age              403 non-null    int64  
 8   gender           403 non-null    int64  
 9   height           403 non-null    float64
 10  weight           403 non-null    float64
 11  frame            403 non-null    int64  
 12  bp.1s            403 non-null    float64
 13  bp.1d            403 non-null    float64
 14  bp.2s            403 non-null    float64
 15  bp.2d            403 non-null    float64
 16  waist            403 non-null    float64
 17  hip             

In [7]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,403.0,201.0,116.480327,0.0,100.5,201.0,301.5,402.0
chol,403.0,207.845771,44.390242,78.0,179.0,204.0,230.0,443.0
stab.glu,403.0,41.491315,24.51757,0.0,26.0,34.0,51.0,115.0
hdl,403.0,50.445274,17.241141,12.0,38.0,46.0,59.0,120.0
ratio,403.0,4.521642,1.725736,1.5,3.2,4.2,5.4,19.299999
glyhb,403.0,5.589769,2.206036,2.68,4.39,4.87,5.589769,16.110001
location,403.0,0.503722,0.500608,0.0,0.0,1.0,1.0,1.0
age,403.0,27.776675,16.139774,0.0,15.0,26.0,41.0,67.0
gender,403.0,0.419355,0.494067,0.0,0.0,0.0,1.0,1.0
height,403.0,66.020101,3.89407,52.0,63.0,66.0,69.0,76.0


In [8]:
df.isnull().values.any()

False

In [9]:
df.columns

Index(['id', 'chol', 'stab.glu', 'hdl', 'ratio', 'glyhb', 'location', 'age',
       'gender', 'height', 'weight', 'frame', 'bp.1s', 'bp.1d', 'bp.2s',
       'bp.2d', 'waist', 'hip', 'time.ppn', 'height_meters', 'BMI',
       'hip_waist_ratio', 'diabetic'],
      dtype='object')

In [10]:
train_cols = ['chol', 'stab.glu', 'hdl', 'ratio', 'glyhb', 'age',
       'gender', 'weight', 'frame', 'bp.1s', 'bp.1d', 'bp.2s',
       'bp.2d', 'waist', 'hip', 'time.ppn', 'height_meters', 'BMI',
       'hip_waist_ratio']

x = df[train_cols]
y = df['diabetic']

In [28]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [29]:
x_train_scale = StandardScaler().fit_transform(x_train)
x_test_scale = StandardScaler().fit_transform(x_test)

In [30]:
space = {
    'max_depth': hp.quniform('max_depth', 3, 18, 1), 
    'gamma': hp.uniform('gamma', 1, 9), 
    'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1), 
    'reg_lambda': hp.uniform('reg-lambda', 0, 1), 
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10,  1),
    'n_estimators': 180,
    'seed': 0,
    'use_label_encoder' : False
}

In [31]:
def objective(space): 
    clf = XGBClassifier(
        n_estimators = space['n_estimators'], 
        max_depth = int(space['max_depth']),
        gamma = space['gamma'],
        reg_alpha = int(space['reg_alpha']),
        reg_lambda = int(space['reg_lambda']),
        min_child_weight = int(space['min_child_weight']),
        colsample_bytree = int(space['colsample_bytree']),
        use_label_encoder= space['use_label_encoder']
    )
    
    evaluation = [(x_train, y_train), (x_test, y_test)]
    
    clf.fit(x_train, y_train, eval_set=evaluation, eval_metric='auc', early_stopping_rounds=10, verbose=False)
    
    pred = clf.predict(x_test)
    
    accuracy = accuracy_score(y_test, pred>0.5)
    
    print('Score: ', accuracy)
    
    return {'loss' : -accuracy, 'status' : STATUS_OK}

In [32]:
trials = Trials()

best_hyperparameters = fmin(
    fn = objective,
    space = space,
    algo = tpe.suggest,
    max_evals = 100,
    trials = trials
)

Score:                                                 
0.8024691358024691                                     
Score:                                                                            
0.8024691358024691                                                                
Score:                                                                            
0.8024691358024691                                                                
Score:                                                                            
0.8024691358024691                                                                
Score:                                                                            
0.8024691358024691                                                                
Score:                                                                            
0.8024691358024691                                                                
Score:                                                    

In [33]:
print(best_hyperparameters)

{'colsample_bytree': 0.5806692234849613, 'gamma': 4.649197180671891, 'max_depth': 8.0, 'min_child_weight': 3.0, 'reg-lambda': 0.7326776941164995, 'reg_alpha': 99.0}


In [39]:
xgb_md = XGBClassifier(
    colsample_bytree = 0.5,
    gamma = 4.64,
    max_depth = 8, 
    min_child_weight = 3,
    reg_lambda = 0.73,
    reg_alpha = 99,
    use_label_encoder=False,
    objective='reg:logistic'
)

In [40]:
xgb_md.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=4.64, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=8,
              min_child_weight=3, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1,
              objective='reg:logistic', random_state=0, reg_alpha=99,
              reg_lambda=0.73, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [41]:
xg_preds = xgb_md.predict(x_test)

In [42]:
xgb_rmse = np.sqrt(mean_squared_error(y_test, xg_preds))

In [43]:
xgb_rmse

0.4444444444444444