In [2]:
import numpy as np 
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle

from xgboost import XGBClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Load the diabetes dataset into a DataFrame.
# A raw string keeps the Windows backslashes literal; the previous literal
# depended on invalid escape sequences (e.g. "\I", "\D", "\d"), which newer
# Python versions report as warnings. The resulting path is unchanged.
# NOTE(review): this is an absolute, machine-specific path -- consider a
# configurable data directory so the notebook runs on other machines.
df = pd.read_csv(r"C:\Users\IRONMAN\Desktop\AML Project\Dataset\diabetes.csv")

In [4]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Columns in which a literal 0 is treated as a missing measurement.
cols_missing_vals = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI','Age']

# Turn the placeholder zeros into NaN so they are skipped when the column
# means are computed. `np.nan` is used because the `np.NaN` alias was removed
# in NumPy 2.0 and raises AttributeError there.
df[cols_missing_vals] = df[cols_missing_vals].replace(to_replace=0, value=np.nan)

# Fill every NaN with the mean of its own column (fillna aligns the Series of
# means to the frame by column label). Only the imputed columns are averaged.
# NOTE(review): the means are computed over the whole frame, i.e. before the
# train/test split, so test rows influence the imputed values.
df[cols_missing_vals] = df[cols_missing_vals].fillna(value=df[cols_missing_vals].mean())

In [6]:
# Report how many rows the frame holds
n_rows = len(df)
print("Dataset Length : ", n_rows)

Dataset Length :  768


In [7]:
# Separate the features from the label by column name rather than by position,
# so a change in the CSV column layout cannot silently shift which column is
# used as the target.
# NOTE(review): assumes the frame holds the eight feature columns followed by
# "Outcome", as in the preview above; under that layout this selects exactly
# the same columns as the former positional slices.
TARGET_COLUMN = "Outcome"
X = df.drop(columns=TARGET_COLUMN)
y = df[TARGET_COLUMN]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=1992
)

# Train models


In [8]:
# Candidate classifiers, keyed by a display name. The insertion order of this
# dict is the order in which the models are later trained and compared.
models = {
    # Seeded like the other estimators: without random_state the forest is
    # rebuilt differently on every run, so scores were not reproducible.
    'RandomForest': RandomForestClassifier(n_estimators=20, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=200, max_depth=25, learning_rate=0.1, subsample=0.5),
    'SVM': SVC(kernel='linear', random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
    # verbose=0 suppresses CatBoost's per-iteration training log.
    'CatBoost': CatBoostClassifier(random_state=42, verbose=0),
    'KNNclassifier': KNeighborsClassifier(n_neighbors=11, p=2, metric='euclidean'),
}

In [9]:
# Train every candidate on the training split, in dictionary order.
# Note: `model_name` and `model` stay bound after the loop, pointing at the
# last entry of `models`.
for model_name in models:
    model = models[model_name]
    model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 215, number of negative: 399
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000987 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 675
[LightGBM] [Info] Number of data points in the train set: 614, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.350163 -> initscore=-0.618323
[LightGBM] [Info] Start training from score -0.618323


In [10]:
# Score each fitted model on the held-out split and keep the top scorer
# (on a tie, the model that appears first in `models` wins).
# NOTE(review): the choice is made on the test split itself, so the winning
# accuracy is an optimistic estimate of performance on unseen data.
test_accuracy = {
    name: accuracy_score(y_test, fitted.predict(X_test))
    for name, fitted in models.items()
}
best_model_name = max(test_accuracy, key=test_accuracy.get)
best_model = models[best_model_name]
print(f'The best model selected is: {best_model_name}')

The best model selected is: XGBoost


In [11]:
# Persist the selected classifier to a pickle file.
# `best_model` is written rather than `model`: `model` is only the leftover
# loop variable from the training cell (the last entry of `models`), not the
# estimator chosen by the accuracy comparison.
# NOTE(review): the file name is fixed even though the best model is chosen
# automatically; it is kept unchanged in case other code loads this path.
filename = 'XGB model.pkl'
# The context manager guarantees the file is flushed and closed after dumping.
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)