### Model Training - Diabetes Prediction

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')


In [2]:
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from xgboost import XGBClassifier


In [3]:
df = pd.read_csv('diabetes.csv')

In [4]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### Splitting the Input and Output features

In [5]:
X = df.drop(columns=['Outcome'],axis=1)

In [6]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [7]:
y = df[['Outcome']]

In [8]:
y.head()

Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1


In [9]:
from sklearn.model_selection import train_test_split
X_train ,X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [10]:
X_train.shape , y_train.shape

((614, 8), (614, 1))

In [11]:
X_test.shape , y_test.shape

((154, 8), (154, 1))

In [12]:
def evaluate_model(true , predicted):
    cm = confusion_matrix(true,predicted)
    clf_report  = classification_report(true,predicted)
    print('Confusion Matrix',cm)
    print('Classification Report:-',clf_report)
    

In [13]:
models = {
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Support Vector Classifier": SVC(probability=True),
    "CatBoost": CatBoostClassifier(verbose=0),
    "XGBoost": XGBClassifier()
}

model_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)
    

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)


    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])


     
    print('Model performance for Training set')
    evaluate_model(y_train,y_pred_train)
    

    print('----------------------------------')
    
    print('Model performance for Test set')
    evaluate_model(y_test,y_pred_test)
    



K-Nearest Neighbors
Model performance for Training set
Confusion Matrix [[354  47]
 [ 77 136]]
Classification Report:-               precision    recall  f1-score   support

           0       0.82      0.88      0.85       401
           1       0.74      0.64      0.69       213

    accuracy                           0.80       614
   macro avg       0.78      0.76      0.77       614
weighted avg       0.79      0.80      0.79       614

----------------------------------
Model performance for Test set
Confusion Matrix [[70 29]
 [23 32]]
Classification Report:-               precision    recall  f1-score   support

           0       0.75      0.71      0.73        99
           1       0.52      0.58      0.55        55

    accuracy                           0.66       154
   macro avg       0.64      0.64      0.64       154
weighted avg       0.67      0.66      0.67       154

Decision Tree
Model performance for Training set
Confusion Matrix [[401   0]
 [  0 213]]
Classificati

In [14]:
params = {
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'iterations': [100, 200, 300],
    'l2_leaf_reg': [1, 3, 5, 7, 9]
}

cbc = CatBoostClassifier(verbose=0)
search = RandomizedSearchCV(cbc, param_distributions=params, n_iter=20, cv=5, scoring='f1')
search.fit(X_train, y_train)

print("Best Params:", search.best_params_)
print("Best Score:", search.best_score_)

Best Params: {'learning_rate': 0.05, 'l2_leaf_reg': 9, 'iterations': 300, 'depth': 4}
Best Score: 0.6560259310999372


In [15]:
models = {
    "CatBoost": CatBoostClassifier(
        learning_rate=0.05,
        l2_leaf_reg=9,
        iterations=300,
        depth=4
    ),
  
}

In [16]:
model_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)
    

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)


    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])


     
    print('Model performance for Training set')
    evaluate_model(y_train,y_pred_train)
    

    print('----------------------------------')
    
    print('Model performance for Test set')
    evaluate_model(y_test,y_pred_test)
    


0:	learn: 0.6709662	total: 5.84ms	remaining: 1.75s
1:	learn: 0.6530019	total: 7.05ms	remaining: 1.05s
2:	learn: 0.6319629	total: 7.68ms	remaining: 760ms
3:	learn: 0.6154493	total: 8.26ms	remaining: 611ms
4:	learn: 0.6019412	total: 9.13ms	remaining: 539ms
5:	learn: 0.5928285	total: 9.82ms	remaining: 481ms
6:	learn: 0.5781694	total: 10.4ms	remaining: 437ms
7:	learn: 0.5640957	total: 11ms	remaining: 402ms
8:	learn: 0.5542606	total: 11.7ms	remaining: 377ms
9:	learn: 0.5451063	total: 12.2ms	remaining: 354ms
10:	learn: 0.5362632	total: 12.9ms	remaining: 338ms
11:	learn: 0.5286995	total: 13.5ms	remaining: 324ms
12:	learn: 0.5230016	total: 14.1ms	remaining: 311ms
13:	learn: 0.5166155	total: 14.7ms	remaining: 300ms
14:	learn: 0.5123959	total: 15.3ms	remaining: 291ms
15:	learn: 0.5065160	total: 16ms	remaining: 283ms
16:	learn: 0.5011857	total: 16.6ms	remaining: 276ms
17:	learn: 0.4963198	total: 17.2ms	remaining: 270ms
18:	learn: 0.4926442	total: 18.9ms	remaining: 279ms
19:	learn: 0.4887105	total