IMPORTING LIBRARIES AND LOADING DIABETES DATA

In [None]:
import pickle
import xgboost
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
%matplotlib inline

import warnings

# Blanket-silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation notices from the ML libraries.
warnings.simplefilter("ignore")

In [None]:
# Show every column when a DataFrame is rendered (None lifts the column cap).
pd.set_option('display.max_columns', None)

# Read the dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="diabetes.csv")

# Preview the first five rows.
df.head(n=5)

FEATURE ENGINEERING

In [None]:
# (rows, columns) of the loaded frame; the column count includes the label column.
print(df.shape)

# True if at least one cell anywhere in the frame is null.
print(df.isna().to_numpy().any())

In [None]:
# Pairwise correlation between all columns of the dataset
# (DataFrame.corr defaults to the Pearson coefficient).
corr_mat = df.corr()
top_corr_features = corr_mat.index

# Plot the correlation matrix itself. Calling .corr() on corr_mat a second
# time would chart correlations between the *columns of the correlation
# matrix*, not between the original features.
plt.figure(figsize=(15, 15))
g = sns.heatmap(corr_mat, annot=True, cmap="Blues")

In [None]:
# Encode the target column df['Outcome'] as integer 0/1.
# astype(int) converts a boolean column (True/False -> 1/0) and leaves a
# column that is already 0/1 unchanged, so this cell is safe to re-run.
# Mapping through a {True: 1, False: 0} dict is fragile: depending on the
# pandas version, integer labels are not matched against boolean keys and the
# whole column turns into NaN.
df['Outcome'] = df['Outcome'].astype(int)

In [None]:
# Re-inspect the first rows after the 'Outcome' column has been converted.
df.head()

In [None]:
# Class balance: number of rows carrying each outcome label.
diabetes_true_count = int((df['Outcome'] == 1).sum())
diabetes_false_count = int((df['Outcome'] == 0).sum())

In [None]:
# Report the size of each class, one line per label.
print(
    "Data having 1 as output: {}".format(diabetes_true_count),
    "Data having 0 as output: {}".format(diabetes_false_count),
    sep="\n",
)

FEATURE SELECTION BASED ON CORRELATION AND LATER BY RANDOM FOREST FEATURE IMPORTANCE

In [None]:
# Predictor columns used for the first (full-feature) model.
selected_features = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

# Label column, kept as a one-element list so that selecting it from the
# frame yields a 2-D (n, 1) result.
output_class = ['Outcome']

In [None]:
# Pull plain NumPy arrays out of the frame for scikit-learn.
X = df[selected_features].to_numpy()
# y is 2-D with a single column because output_class is a one-element list.
y = df[output_class].to_numpy()

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Count zero readings per column; a zero is treated as a missing measurement.
print("total number of rows : {0}".format(df.shape[0]))

# (message template, column to inspect), in the order the lines are printed.
zero_reports = [
    ("number of rows missing Glucose: {0}", 'Glucose'),
    ("number of rows missing BloodPressure: {0}", 'BloodPressure'),
    ("number of rows missing insulin: {0}", 'Insulin'),
    ("number of rows missing bmi: {0}", 'BMI'),
    ("number of rows missing DiabetesPedigreeFunction: {0}", 'DiabetesPedigreeFunction'),
    ("number of rows missing age: {0}", 'Age'),
    ("number of rows missing SkinThickness: {0}", 'SkinThickness'),
]
for template, column in zero_reports:
    print(template.format(int((df[column] == 0).sum())))

In [None]:
# Impute zero readings (treated as missing measurements) with the per-column
# mean of the remaining values.
# NOTE(review): the imputer is applied to every selected feature, including
# 'Pregnancies', where 0 is plausibly a genuine value -- confirm this is intended.
missing_values_imputer = SimpleImputer(missing_values=0, strategy='mean')

# Learn the column means from the training split only ...
X_train = missing_values_imputer.fit_transform(X_train)
# ... and reuse those same means on the test split. Re-fitting on the test
# data would leak test-set statistics into the evaluation and impute the two
# splits with different values.
X_test = missing_values_imputer.transform(X_test)

In [None]:
# Sanity check: (rows, columns) of the imputed train and test matrices.
X_train.shape, X_test.shape

APPLYING MACHINE LEARNING ALGORITHM

In [None]:
# Baseline model: a random forest with a fixed seed for repeatable results.
rfc = RandomForestClassifier(random_state=10)

# Flatten the label column from (n, 1) to (n,) before fitting.
rfc.fit(X_train, y_train.reshape(-1))

In [None]:
# Score the random forest on the held-out split, reported as a percentage.
y_preds = rfc.predict(X_test)
print("Accuracy : {:.2f}%".format(100 * accuracy_score(y_true=y_test, y_pred=y_preds)))

RANDOM FOREST FEATURE IMPORTANCE FOR FEATURE SELECTION

In [None]:
# Importance score of each input column, as reported by the fitted forest.
f_importance = rfc.feature_importances_

# The importance scores on their own, in ascending order (names are not kept).
final_features = sorted(f_importance)

# One line per feature, in the same order as selected_features.
for name, score in zip(selected_features, f_importance):
    print('{}, Score: {:.5f}'.format(name, score))

print(final_features)

# Reduced feature set used for the second model.
# NOTE(review): this list is hard-coded rather than derived from the scores
# printed above -- confirm it still matches them.
final_selected_features = ['Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

In [None]:
# Rebuild the design matrix from the reduced feature list.
X_new = df[final_selected_features].to_numpy()

# Split with the same test fraction and seed as before; y_train and y_test
# are reassigned by this call.
X_train_new, X_test_new, y_train, y_test = train_test_split(
    X_new,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Impute zeros in the reduced feature set. The column means are fitted on the
# training split only and then applied unchanged to the test split; calling
# fit_transform on the test data would recompute the means from the test rows
# themselves and leak test-set statistics into the evaluation.
X_train_new = missing_values_imputer.fit_transform(X_train_new)
X_test_new = missing_values_imputer.transform(X_test_new)

In [None]:
# Retrain the same forest object on the reduced feature set; from here on
# rfc expects the columns listed in final_selected_features.
rfc.fit(X_train_new, y_train.reshape(-1))

# Random forest accuracy on the held-out split with the reduced feature set.
y_preds_new = rfc.predict(X_test_new)
print("Accuracy : {:.2f}%".format(100 * accuracy_score(y_true=y_test, y_pred=y_preds_new)))

In [None]:
# XGBoost: randomized hyperparameter search.

# Candidate values the search samples from.
params = {
    "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth"        : [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight" : [1, 3, 5, 7],
    "gamma"            : [0.0, 0.1, 0.2 , 0.3, 0.4],
    "colsample_bytree" : [0.3, 0.4, 0.5, 0.7]
}

clf = xgboost.XGBClassifier()

# Evaluate 5 randomly drawn parameter combinations, each scored by ROC AUC
# under 5-fold cross-validation. random_state pins which combinations are
# drawn, so Restart & Run All repeats the same search instead of sampling a
# different set of candidates on every run.
random_search = RandomizedSearchCV(
    clf,
    param_distributions=params,
    n_iter=5,
    scoring='roc_auc',
    n_jobs=-1,
    cv=5,
    verbose=3,
    random_state=42,
)

# Labels are flattened from (n, 1) to (n,) for fitting.
random_search.fit(X_train, y_train.ravel())

In [None]:
# Display the estimator (with its parameters) that scored best in the search.
random_search.best_estimator_

In [None]:
# Fixed XGBoost configuration.
# NOTE(review): these values are hard-coded (presumably copied from an earlier
# best_estimator_ printout); they are not read from random_search at run time.
xgb_settings = dict(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=0.7,
    gamma=0.1,
    learning_rate=0.2,
    max_delta_step=0,
    max_depth=5,
    min_child_weight=7,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    nthread=None,
    objective='binary:logistic',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=1,
    verbosity=1,
)
classifier = xgboost.XGBClassifier(**xgb_settings)

# Train on the training split, with labels flattened from (n, 1) to (n,).
classifier.fit(X_train, y_train.reshape(-1))

In [None]:
# 10-fold cross-validation of the fixed XGBoost configuration on the training
# split, using the estimator's default score; print the mean over the folds.
score = cross_val_score(classifier, X_train, y_train.reshape(-1), cv=10)
print(np.mean(score))

In [None]:
# Score the XGBoost classifier on the held-out split, reported as a percentage.
# This reassigns y_preds, which previously held the random forest predictions.
y_preds = classifier.predict(X_test)
print("Accuracy : {:.2f}%".format(100 * accuracy_score(y_true=y_test, y_pred=y_preds)))

In [None]:
# Persist the trained random forest (rfc) to disk.
# The context manager guarantees the file is flushed and closed, even if
# pickling raises, instead of leaving an open handle behind.
filename = 'diabetes.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(rfc, model_file)

# Loading the trained model back. Only unpickle files you trust: pickle can
# execute arbitrary code on load. rfc was last fitted on the reduced feature
# set, so it must be scored against X_test_new rather than X_test.
# with open(filename, 'rb') as model_file:
#     model = pickle.load(model_file)
# result = model.score(X_test_new, y_test)
# print(result)