# Import dataset

In [20]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

df = pd.read_csv('diabetes.csv')

In [21]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [22]:
from sklearn.model_selection import train_test_split

X = df.drop(columns='Outcome')
y = df.Outcome
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

### Use XGBoost to predict outcome

In [23]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

parameters = {'objective':['reg:linear'],
              'learning_rate': [.01, 0.05, 0.1, 0.2], #so called `eta` value
              'gamma': [0.5, 1, 1.5, 2, 5],
              'subsample': [0.6, 0.8, 1.0],
              'min_child_weight': [1, 5, 10],
              'silent': [1],
              'n_estimators': [500, 600, 700],
                'colsample_bytree': [0.6, 0.8, 1.0],
                'max_depth': [3, 4, 5],
                'reg_alpha': [1.1, 1.2, 1.3],
                'reg_lambda': [1.1, 1.2, 1.3],
                'subsample': [0.7, 0.8, 0.9]
             }

xgb_grid = GridSearchCV(XGBClassifier(),
                        parameters,
                        cv = 2,
                        n_jobs = 2,
                        verbose=True)

In [24]:
xgb_grid.fit(X_train,y_train)

Fitting 2 folds for each of 43740 candidates, totalling 87480 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    5.5s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   14.9s
[Parallel(n_jobs=2)]: Done 546 tasks      | elapsed:   36.6s
[Parallel(n_jobs=2)]: Done 1246 tasks      | elapsed:  1.6min
[Parallel(n_jobs=2)]: Done 2146 tasks      | elapsed:  2.6min
[Parallel(n_jobs=2)]: Done 3246 tasks      | elapsed:  3.9min
[Parallel(n_jobs=2)]: Done 4546 tasks      | elapsed:  5.5min
[Parallel(n_jobs=2)]: Done 6046 tasks      | elapsed:  7.2min
[Parallel(n_jobs=2)]: Done 7746 tasks      | elapsed:  9.1min
[Parallel(n_jobs=2)]: Done 9646 tasks      | elapsed: 11.3min
[Parallel(n_jobs=2)]: Done 11746 tasks      | elapsed: 13.7min
[Parallel(n_jobs=2)]: Done 14046 tasks      | elapsed: 16.2min
[Parallel(n_jobs=2)]: Done 16546 tasks      | elapsed: 18.9min
[Parallel(n_jobs=2)]: Done 19246 tasks      | elapsed: 22.0min
[Parallel(n_jobs=2)]: Done 22146 tasks      | elapsed: 2

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




GridSearchCV(cv=2,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs...
                                     tree_method=None, validate_parameters=None,
                                     verbosity=None),
             n_jobs=2,
             param_grid={'colsample_bytree': [0.6, 0.8, 1.0],
                         'gamma': [0.5, 1, 1.5, 2, 5],
        

In [26]:
print(xgb_grid.best_params_)
gbm = XGBClassifier(**xgb_grid.best_params_)
gbm.fit(X_train,y_train)

{'colsample_bytree': 0.6, 'gamma': 1, 'learning_rate': 0.2, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 700, 'objective': 'reg:linear', 'reg_alpha': 1.3, 'reg_lambda': 1.1, 'silent': 1, 'subsample': 0.8}
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=1, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.2, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=700, n_jobs=0, num_parallel_tree=1,
              objective='reg:linear', random_state=0, reg_alpha=1.3,
              reg_lambda=1.1, scale_pos_weight=1, silent=1, subsample=0.8,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [27]:
from sklearn.metrics import accuracy_score

predictions = gbm.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 79.22%
