In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [2]:
df = pd.read_csv('../datasets/synthesized_diabetes_data.csv')

In [3]:
df.head()

Unnamed: 0,chol,stab.glu,hdl,ratio,glyhb,location,age,gender,height,weight,...,bp.1d,bp.2s,bp.2d,waist,hip,time.ppn,height_meters,BMI,hip_waist_ratio,diabetic
0,156.700394,31,52.154896,3.314927,4.299748,1,7,1,68.496912,146.234609,...,85.186571,151.973658,92.697071,29.222653,34.012633,761.958003,1.66703,53.028606,0.815154,0
1,205.708528,50,74.558454,2.397619,4.750497,1,50,0,67.514518,155.252176,...,88.848128,152.708004,92.802874,32.88762,40.64047,715.021735,1.68185,42.542497,0.855346,1
2,184.25974,39,50.107744,2.436952,5.033467,1,56,0,63.226992,134.186382,...,94.615906,153.064309,92.212116,32.548504,36.741043,210.965098,1.565997,54.781513,0.863442,0
3,236.859522,40,46.163972,3.794523,5.382256,0,51,0,66.799333,202.246191,...,79.333852,151.949308,92.541618,45.72684,50.127763,801.80059,1.752456,65.815471,0.851003,1
4,189.616025,92,40.045044,3.633247,4.634788,1,10,1,69.283944,186.218617,...,67.017735,151.475941,92.036697,32.192694,41.216135,160.570897,1.779865,58.610103,0.890379,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6500 entries, 0 to 6499
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   chol             6500 non-null   float64
 1   stab.glu         6500 non-null   int64  
 2   hdl              6500 non-null   float64
 3   ratio            6500 non-null   float64
 4   glyhb            6500 non-null   float64
 5   location         6500 non-null   int64  
 6   age              6500 non-null   int64  
 7   gender           6500 non-null   int64  
 8   height           6500 non-null   float64
 9   weight           6500 non-null   float64
 10  frame            6500 non-null   int64  
 11  bp.1s            6500 non-null   float64
 12  bp.1d            6500 non-null   float64
 13  bp.2s            6500 non-null   float64
 14  bp.2d            6500 non-null   float64
 15  waist            6500 non-null   float64
 16  hip              6500 non-null   float64
 17  time.ppn      

In [5]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
chol,6500.0,194.529923,38.786233,88.190343,168.465145,191.462452,215.610411,404.115523
stab.glu,6500.0,57.202308,29.590738,0.0,32.0,47.0,85.0,133.0
hdl,6500.0,42.209101,13.987214,13.239606,32.197867,39.158042,49.521328,114.52473
ratio,6500.0,4.662652,2.081794,1.624225,3.236861,4.110966,5.770165,21.911009
glyhb,6500.0,6.546574,2.654119,2.399973,4.476406,5.230143,8.739059,15.881888
location,6500.0,0.444923,0.496996,0.0,0.0,0.0,1.0,1.0
age,6500.0,32.684462,16.465577,-10.0,18.0,35.0,46.0,74.0
gender,6500.0,0.436615,0.496004,0.0,0.0,0.0,1.0,1.0
height,6500.0,66.10659,3.860517,56.437835,63.03828,65.571497,69.272857,77.176333
weight,6500.0,179.415893,36.611845,84.528396,154.144628,173.301007,197.386422,321.264639


In [6]:
df.isnull().values.any()

False

In [7]:
df.columns

Index(['chol', 'stab.glu', 'hdl', 'ratio', 'glyhb', 'location', 'age',
       'gender', 'height', 'weight', 'frame', 'bp.1s', 'bp.1d', 'bp.2s',
       'bp.2d', 'waist', 'hip', 'time.ppn', 'height_meters', 'BMI',
       'hip_waist_ratio', 'diabetic'],
      dtype='object')

In [8]:
train_cols = ['chol', 'stab.glu', 'hdl', 'ratio', 'glyhb', 'age',
       'gender', 'weight', 'frame', 'bp.1s', 'bp.1d', 'bp.2s',
       'bp.2d', 'waist', 'hip', 'time.ppn', 'height_meters', 'BMI',
       'hip_waist_ratio']

train_cols_few = ['chol', 'stab.glu', 'glyhb', 'age',
       'gender', 'weight', 'bp.1s', 'waist', 'hip', 'BMI',
       'hip_waist_ratio']

x = df[train_cols_few]
y = df['diabetic']

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

In [10]:
x_train_scale = StandardScaler().fit_transform(x_train)
x_test_scale = StandardScaler().fit_transform(x_test)

In [11]:
space = {
    'max_depth': hp.quniform('max_depth', 3, 18, 1), 
    'gamma': hp.uniform('gamma', 1, 9), 
    'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1), 
    'reg_lambda': hp.uniform('reg-lambda', 0, 1), 
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10,  1),
    'n_estimators': 180,
    'seed': 0,
    'use_label_encoder' : False
}

In [12]:
def objective(space): 
    clf = XGBClassifier(
        n_estimators = space['n_estimators'], 
        max_depth = int(space['max_depth']),
        gamma = space['gamma'],
        reg_alpha = int(space['reg_alpha']),
        reg_lambda = int(space['reg_lambda']),
        min_child_weight = int(space['min_child_weight']),
        colsample_bytree = int(space['colsample_bytree']),
        use_label_encoder= space['use_label_encoder']
    )
    
    evaluation = [(x_train_scale, y_train), (x_test_scale, y_test)]
    
    clf.fit(x_train_scale, y_train, eval_set=evaluation, eval_metric='auc', early_stopping_rounds=10, verbose=False)
    
    pred = clf.predict(x_test_scale)
    
    accuracy = accuracy_score(y_test, pred>0.5)
    
    print('Score: ', accuracy)
    
    return {'loss' : -accuracy, 'status' : STATUS_OK}

In [13]:
trials = Trials()

best_hyperparameters = fmin(
    fn = objective,
    space = space,
    algo = tpe.suggest,
    max_evals = 100,
    trials = trials
)

Score:                                                 
0.8512820512820513                                     
Score:                                                                            
0.8435897435897436                                                                
Score:                                                                            
0.8492307692307692                                                                
Score:                                                                            
0.8502564102564103                                                                
Score:                                                                            
0.8569230769230769                                                                
Score:                                                                            
0.8476923076923077                                                                
Score:                                                    

In [14]:
print(best_hyperparameters)

{'colsample_bytree': 0.5033374009337901, 'gamma': 4.2693170362258535, 'max_depth': 5.0, 'min_child_weight': 9.0, 'reg-lambda': 0.004098361803673067, 'reg_alpha': 40.0}


In [15]:
xgb_md = XGBClassifier(
    colsample_bytree = 0.97,
    gamma = 4.19,
    max_depth = 4, 
    min_child_weight = 8,
    reg_lambda = 0.12,
    reg_alpha = 58,
    use_label_encoder=False,
    objective='reg:logistic'
)

In [16]:
xgb_md.fit(x_train_scale, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.97, gamma=4.19, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=4,
              min_child_weight=8, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1,
              objective='reg:logistic', random_state=0, reg_alpha=58,
              reg_lambda=0.12, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [17]:
xg_preds = xgb_md.predict(x_test_scale)

In [18]:
xgb_rmse = np.sqrt(mean_squared_error(y_test, xg_preds))

In [19]:
xgb_rmse

0.3816294349772114