# Import dataset

In [14]:
# Import libraries and data set
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

# Read the diabetes data set into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [15]:
# Preview the first five rows as a quick sanity check of the load
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [16]:
# Count the missing values in every column (isnull is an alias of isna)
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [17]:
# Split into train and test sets
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Separate the target label from the feature columns
y = df['Outcome']
X = df.drop('Outcome', axis=1)

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

### Use XGBoost to predict the outcome

In [18]:
# NOTE: this entire cell is disabled (commented out) and is kept for reference
# only; nothing below is executed.
# NOTE(review): points to check before re-enabling it:
#   - 'subsample' is listed twice in the parameter dict. In a Python dict
#     literal the later entry ([0.7, 0.8, 0.9]) silently replaces the earlier
#     one ([0.6, 0.8, 1.0]), so only the later list would be searched.
#   - 'reg:linear' looks like a regression objective, while the estimator is
#     XGBClassifier and Outcome is a 0/1 label -- presumably a binary
#     classification objective was intended; confirm against the xgboost docs.
#   - cv = 2 differs from the repeated 10-fold scheme used by the other model
#     cells, so its scores would not be directly comparable with theirs.

# from xgboost import XGBClassifier

# parameters = {'objective':['reg:linear'],
#               'learning_rate': [.01, 0.05, 0.1, 0.2], #so called `eta` value
#               'gamma': [0.5, 1, 1.5, 2, 5],
#               'subsample': [0.6, 0.8, 1.0],
#               'min_child_weight': [1, 5, 10],
#               'silent': [1],
#               'n_estimators': [500, 600, 700],
#                 'colsample_bytree': [0.6, 0.8, 1.0],
#                 'max_depth': [3, 4, 5],
#                 'reg_alpha': [1.1, 1.2, 1.3],
#                 'reg_lambda': [1.1, 1.2, 1.3],
#                 'subsample': [0.7, 0.8, 0.9]
#              }

# xgb_grid = GridSearchCV(XGBClassifier(),
#                         parameters,
#                         cv = 2,
#                         n_jobs = 2,
#                         verbose=True)

# # Try fitting training data sets with all parameters
# xgb_grid.fit(X_train,y_train)

# # Print the best parameters
# print(xgb_grid.best_params_)

# # Fit the training set using the best parameters
# gbm = XGBClassifier(**xgb_grid.best_params_)
# gbm.fit(X_train,y_train)

# # Print the accuracy of prediction
# predictions = gbm.predict(X_test)
# accuracy = accuracy_score(y_test, predictions)
# print("Accuracy: %.2f%%" % (accuracy * 100.0))

### Use Logistic Regression to predict the outcome

In [19]:
# Import Logistic Regression library
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold

# Define the model and the hyper-parameter candidates to search over
lr = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

# Define the grid search: every solver/penalty/C combination is scored on
# accuracy with 10-fold stratified CV repeated 3 times (30 fits per candidate).
# error_score=0 records a candidate whose fit raises as accuracy 0 instead of
# aborting the whole search.
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=lr, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)

# Try fitting training data sets with all parameters
grid_result = grid_search.fit(X_train, y_train)

# Print the best parameters
print(grid_result.best_params_)

# GridSearchCV (refit=True by default) has already retrained the winning
# candidate on the full training set, so reuse that estimator instead of
# building and fitting a second, identical model.
model = grid_result.best_estimator_

# Print the accuracy of prediction on the held-out test set
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

{'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
Accuracy: 78.57%


### Use kNN to predict the outcome

In [20]:
# Import kNN library
from sklearn.neighbors import KNeighborsClassifier

# Define the model and the hyper-parameter candidates to search over
knn = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)  # odd k from 1 to 19
weights = ['uniform', 'distance']
# 'minkowski' is intentionally not listed: the grid never varies p, and with
# the default p=2 it is the same distance as 'euclidean', so it would only
# repeat a third of the search without being able to change the winner.
metric = ['euclidean', 'manhattan']

# Define the grid search: every combination is scored on accuracy with
# 10-fold stratified CV repeated 3 times (30 fits per candidate).
# error_score=0 records a candidate whose fit raises as accuracy 0 instead of
# aborting the whole search.
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=knn, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)

# Try fitting training data sets with all parameters
grid_result = grid_search.fit(X_train, y_train)

# Print the best parameters
print(grid_result.best_params_)

# GridSearchCV (refit=True by default) has already retrained the winning
# candidate on the full training set, so reuse that estimator instead of
# building and fitting a second, identical model.
model = grid_result.best_estimator_

# Print the accuracy of prediction on the held-out test set
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

{'metric': 'manhattan', 'n_neighbors': 19, 'weights': 'uniform'}
Accuracy: 75.32%


### Use SVC to predict the outcome

In [None]:
# Import SVC library
from sklearn.svm import SVC

# Define the model and the hyper-parameter candidates to search over
svc = SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']

# Define the grid search: every kernel/C/gamma combination is scored on
# accuracy with 10-fold stratified CV repeated 3 times (30 fits per candidate).
# error_score=0 records a candidate whose fit raises as accuracy 0 instead of
# aborting the whole search.
grid = dict(kernel=kernel,C=C,gamma=gamma)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=svc, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)

# Try fitting training data sets with all parameters
grid_result = grid_search.fit(X_train, y_train)

# Print the best parameters
print(grid_result.best_params_)

# GridSearchCV (refit=True by default) has already retrained the winning
# candidate on the full training set, so reuse that estimator instead of
# building and fitting a second, identical model.
model = grid_result.best_estimator_

# Print the accuracy of prediction on the held-out test set
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

### Use Bagging Classifier to predict the outcome

In [None]:
# Import Bagging Classifier library
from sklearn.ensemble import BaggingClassifier

# Define the model and the hyper-parameter candidates to search over.
# Bagging draws random bootstrap samples, so the estimator is seeded (with the
# same seed value used for the split and the CV folds) to make the CV scores,
# the selected parameters and the final accuracy repeatable across runs.
bc = BaggingClassifier(random_state=1)
n_estimators = [10, 100, 1000]

# Define the grid search: every candidate is scored on accuracy with 10-fold
# stratified CV repeated 3 times (30 fits per candidate).
# error_score=0 records a candidate whose fit raises as accuracy 0 instead of
# aborting the whole search.
grid = dict(n_estimators=n_estimators)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=bc, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)

# Try fitting training data sets with all parameters
grid_result = grid_search.fit(X_train, y_train)

# Print the best parameters
print(grid_result.best_params_)

# GridSearchCV (refit=True by default) has already retrained the winning
# candidate on the full training set. Reusing it avoids a second fit and keeps
# the seed set on `bc`, which a freshly constructed BaggingClassifier built
# from best_params_ alone would not carry.
model = grid_result.best_estimator_

# Print the accuracy of prediction on the held-out test set
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

### Use Random Forest Classifier (RFC) to predict the outcome

In [None]:
# NOTE: this entire cell is disabled (commented out) and is kept for reference
# only; nothing below is executed.
# NOTE(review): RandomForestClassifier() is created without a random_state, so
# results would presumably vary between runs if this cell is re-enabled --
# consider seeding it.

# # Import RFC library
# from sklearn.ensemble import RandomForestClassifier

# # define models and parameters
# rfc = RandomForestClassifier()
# n_estimators = [10, 100, 1000]
# max_features = ['sqrt', 'log2']

# # define grid search
# grid = dict(n_estimators=n_estimators,max_features=max_features)
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# grid_search = GridSearchCV(estimator=rfc, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)

# # Try fitting training data sets with all parameters
# grid_result = grid_search.fit(X_train, y_train)

# # Print the best parameters
# print(grid_result.best_params_)

# # Fit the training set using the best parameters.
# # The parameters were tuned for RandomForestClassifier, so the final model
# # must be built from that same class (not BaggingClassifier).
# model = RandomForestClassifier(**grid_result.best_params_)
# model.fit(X_train,y_train)

# # Print the accuracy of prediction
# predictions = model.predict(X_test)
# accuracy = accuracy_score(y_test, predictions)
# print("Accuracy: %.2f%%" % (accuracy * 100.0))

### Use Gradient Boosting Classifier to predict the outcome

In [None]:
# Import GBC
from sklearn.ensemble import GradientBoostingClassifier

# Define the model and the hyper-parameter candidates to search over.
# The grid includes subsample values below 1.0, which makes each boosting
# stage train on a random subset of rows; the estimator is therefore seeded
# (with the same seed value used for the split and the CV folds) so that the
# CV scores, the selected parameters and the final accuracy are repeatable.
gbc = GradientBoostingClassifier(random_state=1)
n_estimators = [10, 100, 1000]
learning_rate = [0.001, 0.01, 0.1]
subsample = [0.5, 0.7, 1.0]
max_depth = [3, 7, 9]

# Define the grid search: 3 * 3 * 3 * 3 = 81 candidates, each scored on
# accuracy with 10-fold stratified CV repeated 3 times, i.e. 2430 model fits
# in total -- expect this cell to take a while.
# error_score=0 records a candidate whose fit raises as accuracy 0 instead of
# aborting the whole search.
grid = dict(learning_rate=learning_rate, n_estimators=n_estimators, subsample=subsample, max_depth=max_depth)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=gbc, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)

# Try fitting training data sets with all parameters
grid_result = grid_search.fit(X_train, y_train)

# Print the best parameters
print(grid_result.best_params_)

# GridSearchCV (refit=True by default) has already retrained the winning
# candidate on the full training set. Reusing it avoids another expensive fit
# and keeps the seed set on `gbc`, which a freshly constructed
# GradientBoostingClassifier built from best_params_ alone would not carry.
model = grid_result.best_estimator_

# Print the accuracy of prediction on the held-out test set
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))