# Import dataset

In [4]:
# Import libraries and data set
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

df = pd.read_csv('diabetes.csv')

In [5]:
# Have a look at the first five rows
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Check if any column has missing data
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
# Split into train and test sets
from sklearn.model_selection import train_test_split

X = df.drop(columns='Outcome')
y = df.Outcome
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)

### Use XGBoost to predict outcome

In [8]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

parameters = {'objective':['reg:linear'],
              'learning_rate': [.01, 0.05, 0.1, 0.2], #so called `eta` value
              'gamma': [0.5, 1, 1.5, 2, 5],
              'subsample': [0.6, 0.8, 1.0],
              'min_child_weight': [1, 5, 10],
              'silent': [1],
              'n_estimators': [500, 600, 700],
                'colsample_bytree': [0.6, 0.8, 1.0],
                'max_depth': [3, 4, 5],
                'reg_alpha': [1.1, 1.2, 1.3],
                'reg_lambda': [1.1, 1.2, 1.3],
                'subsample': [0.7, 0.8, 0.9]
             }

xgb_grid = GridSearchCV(XGBClassifier(),
                        parameters,
                        cv = 2,
                        n_jobs = 2,
                        verbose=True)

In [9]:
xgb_grid.fit(X_train,y_train)

Fitting 2 folds for each of 43740 candidates, totalling 87480 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  50 tasks      | elapsed:    4.3s
[Parallel(n_jobs=2)]: Done 350 tasks      | elapsed:   23.6s
[Parallel(n_jobs=2)]: Done 850 tasks      | elapsed:  1.0min
[Parallel(n_jobs=2)]: Done 1550 tasks      | elapsed:  2.0min
[Parallel(n_jobs=2)]: Done 2450 tasks      | elapsed:  2.9min
[Parallel(n_jobs=2)]: Done 3550 tasks      | elapsed:  4.3min
[Parallel(n_jobs=2)]: Done 4850 tasks      | elapsed:  5.9min
[Parallel(n_jobs=2)]: Done 6350 tasks      | elapsed:  7.7min
[Parallel(n_jobs=2)]: Done 8050 tasks      | elapsed:  9.8min
[Parallel(n_jobs=2)]: Done 9950 tasks      | elapsed: 12.2min
[Parallel(n_jobs=2)]: Done 12050 tasks      | elapsed: 14.7min
[Parallel(n_jobs=2)]: Done 14350 tasks      | elapsed: 17.7min
[Parallel(n_jobs=2)]: Done 16850 tasks      | elapsed: 20.7min
[Parallel(n_jobs=2)]: Done 19550 tasks      | elapsed: 24.1min
[Parallel(n_jobs=2)]: Done 22450 tasks      | elapsed: 2

GridSearchCV(cv=2, estimator=XGBClassifier(), n_jobs=2,
             param_grid={'colsample_bytree': [0.6, 0.8, 1.0],
                         'gamma': [0.5, 1, 1.5, 2, 5],
                         'learning_rate': [0.01, 0.05, 0.1, 0.2],
                         'max_depth': [3, 4, 5], 'min_child_weight': [1, 5, 10],
                         'n_estimators': [500, 600, 700],
                         'objective': ['reg:linear'],
                         'reg_alpha': [1.1, 1.2, 1.3],
                         'reg_lambda': [1.1, 1.2, 1.3], 'silent': [1],
                         'subsample': [0.7, 0.8, 0.9]},
             verbose=True)

In [10]:
print(xgb_grid.best_params_)
gbm = XGBClassifier(**xgb_grid.best_params_)
gbm.fit(X_train,y_train)

{'colsample_bytree': 0.8, 'gamma': 1.5, 'learning_rate': 0.2, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 500, 'objective': 'reg:linear', 'reg_alpha': 1.2, 'reg_lambda': 1.1, 'silent': 1, 'subsample': 0.8}


XGBClassifier(colsample_bytree=0.8, gamma=1.5, learning_rate=0.2, max_depth=4,
              n_estimators=500, objective='reg:linear', reg_alpha=1.2,
              reg_lambda=1.1, silent=1, subsample=0.8)

In [11]:
from sklearn.metrics import accuracy_score

predictions = gbm.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 73.38%


In [13]:
# Import Logistic Regression library
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold

# define models and parameters
lr = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=lr, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.775120 using {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [15]:
# Import kNN library
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']

# define grid search
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=knn, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.759125 using {'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'uniform'}


In [16]:
# Import SVC library
from sklearn.svm import SVC

# define model and parameters
svc = SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']

# define grid search
grid = dict(kernel=kernel,C=C,gamma=gamma)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=svc, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.759125 using {'C': 10, 'gamma': 'scale', 'kernel': 'poly'}


In [17]:
# Import Bagging Classifier library
from sklearn.ensemble import BaggingClassifier

# define model and parameters
bc = BaggingClassifier()
n_estimators = [10, 100, 1000]

# define grid search
grid = dict(n_estimators=n_estimators)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=bc, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

NameError: name 'BaggingClassifier' is not defined