In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install --upgrade scikit-learn
!pip install --upgrade xgboost



In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

Data Collection and Analysis

In [None]:
# PIMA diabetes dataset for only female
diabetes_df = pd.read_csv('/content/drive/MyDrive/ML/clg_project_1/Diabetes.csv')
diabetes_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [None]:
diabetes_df.isnull().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [None]:
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [None]:
diabetes_df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
diabetes_df.shape

(768, 9)

In [None]:
diabetes_df['Outcome'].value_counts()

Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,500
1,268


In [None]:
diabetes_df.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [None]:
X = diabetes_df.drop(columns='Outcome', axis=1)
y = diabetes_df['Outcome']

In [None]:
X.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [None]:
y.head(5)

Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((614, 8), (154, 8), (614,), (154,))

In [None]:
models = {
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "Logistic Regression": LogisticRegression(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier()
}

In [None]:
# evaluate models
for i in range(len(list(models))):
  name = list(models.keys())[i]
  model = list(models.values())[i]

  pipeline = Pipeline([
      ('scaler',StandardScaler()),
      ('model',model)
  ])

  pipeline.fit(X_train,y_train)

  y_train_pred = pipeline.predict(X_train)
  y_test_pred = pipeline.predict(X_test)

  print(f"{name} - model performance for training set")
  print("- Accuracy: {:.4f}".format(accuracy_score(y_train, y_train_pred)))
  print("- F1 score: {:.4f}".format(f1_score(y_train, y_train_pred, average='weighted')))
  print("- Precision: {:.4f}".format(precision_score(y_train, y_train_pred, average='weighted')))
  print("- Recall: {:.4f}".format(recall_score(y_train, y_train_pred, average='weighted')))
  print("- ROC AUC Score: {:.4f}".format(roc_auc_score(y_train, y_train_pred)))

  print('-' * 35)

  print(f"{name} - Model performance for Test set")
  print("- Accuracy: {:.4f}".format(accuracy_score(y_test, y_test_pred)))
  print("- F1 score: {:.4f}".format(f1_score(y_test, y_test_pred, average='weighted')))
  print("- Precision: {:.4f}".format(precision_score(y_test, y_test_pred, average='weighted')))
  print("- Recall: {:.4f}".format(recall_score(y_test, y_test_pred, average='weighted')))
  print("- ROC AUC Score: {:.4f}".format(roc_auc_score(y_test, y_test_pred)))
  print('=' * 40, '\n')


Gradient Boosting - model performance for training set
- Accuracy: 0.9186
- F1 score: 0.9171
- Precision: 0.9200
- Recall: 0.9186
- ROC AUC Score: 0.8951
-----------------------------------
Gradient Boosting - Model performance for Test set
- Accuracy: 0.7532
- F1 score: 0.7496
- Precision: 0.7483
- Recall: 0.7532
- ROC AUC Score: 0.7163

AdaBoost - model performance for training set
- Accuracy: 0.8062
- F1 score: 0.8019
- Precision: 0.8029
- Recall: 0.8062
- ROC AUC Score: 0.7687
-----------------------------------
AdaBoost - Model performance for Test set
- Accuracy: 0.7792
- F1 score: 0.7771
- Precision: 0.7762
- Recall: 0.7792
- ROC AUC Score: 0.7491

Random Forest - model performance for training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- ROC AUC Score: 1.0000
-----------------------------------
Random Forest - Model performance for Test set
- Accuracy: 0.7532
- F1 score: 0.7496
- Precision: 0.7483
- Recall: 0.7532
- ROC AUC Score: 0.7163

SVM

In [None]:
# Define hyperparameters
adaboost_params = {
    "model__n_estimators": [50, 60, 70, 80, 90],
    "model__algorithm": ['SAMME', 'SAMME.R']
}

gradient_params = {
    "model__loss": ['log_loss', 'deviance', 'exponential'],
    "model__criterion": ['friedman_mse', 'squared_error', 'mse'],
    "model__min_samples_split": [2, 15, 20],
    "model__n_estimators": [100, 200, 300],
    "model__max_depth": [5, 15, None]
}

svm_params = {
    "model__C": [0.1, 1, 10],
    "model__kernel": ['linear', 'rbf', 'poly'],
    "model__gamma": ['scale', 'auto'],
    "model__degree": [3, 4, 5]
}

random_forest_params = {
    "model__n_estimators": [100, 200, 300],
    "model__max_depth": [None, 10, 20],
    "model__min_samples_split": [2, 10],
    "model__min_samples_leaf": [1, 5],
    "model__bootstrap": [True, False]
}

In [None]:
# Models list for RandomizedSearchCV
randomcv_models = [
    ("ab", AdaBoostClassifier(), adaboost_params),
    ("gb", GradientBoostingClassifier(), gradient_params),
    ("svm", SVC(), svm_params),
    ("rf", RandomForestClassifier(), random_forest_params)
]

In [None]:
# Hyperparameter tuning with Pipelines
model_param = {}
for name, model, params in randomcv_models:
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])

    random = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=params,
        n_iter=100,
        cv=3,
        verbose=2,
        n_jobs=-1
    )
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

# Print best parameters
for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])

Fitting 3 folds for each of 10 candidates, totalling 30 fits


15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 660, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/

Fitting 3 folds for each of 100 candidates, totalling 300 fits


171 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 660, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File 

Fitting 3 folds for each of 54 candidates, totalling 162 fits




Fitting 3 folds for each of 72 candidates, totalling 216 fits
---------------- Best Params for ab -------------------
{'model__n_estimators': 60, 'model__algorithm': 'SAMME'}
---------------- Best Params for gb -------------------
{'model__n_estimators': 300, 'model__min_samples_split': 2, 'model__max_depth': 5, 'model__loss': 'exponential', 'model__criterion': 'friedman_mse'}
---------------- Best Params for svm -------------------
{'model__kernel': 'linear', 'model__gamma': 'scale', 'model__degree': 3, 'model__C': 1}
---------------- Best Params for rf -------------------
{'model__n_estimators': 100, 'model__min_samples_split': 10, 'model__min_samples_leaf': 5, 'model__max_depth': 10, 'model__bootstrap': True}


In [None]:
# Best hyperparameters obtained from RandomizedSearchCV
models = {
    "Adaboost": AdaBoostClassifier(n_estimators=60, algorithm='SAMME'),
    "Gradient_boosting": GradientBoostingClassifier(n_estimators=300, min_samples_split=20, max_depth=15, loss='exponential', criterion='squared_error'),
    "SVM": SVC(kernel='linear', gamma='scale', degree=3, C=1),
    "RandomForest": RandomForestClassifier(n_estimators=300, min_samples_split=2, min_samples_leaf=1, max_depth=None, bootstrap=True)
}

In [None]:
for name, model in models.items():
    # Create a pipeline with StandardScaler and the model
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Make predictions
    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)

    print(f"{name} - model performance for training set")
    print("- Accuracy: {:.4f}".format(accuracy_score(y_train, y_train_pred)))
    print("- F1 score: {:.4f}".format(f1_score(y_train, y_train_pred, average='weighted')))
    print("- Precision: {:.4f}".format(precision_score(y_train, y_train_pred, average='weighted')))
    print("- Recall: {:.4f}".format(recall_score(y_train, y_train_pred, average='weighted')))
    print("- ROC AUC Score: {:.4f}".format(roc_auc_score(y_train, y_train_pred)))

    print('-' * 35)

    print(f"{name} - Model performance for Test set")
    print("- Accuracy: {:.4f}".format(accuracy_score(y_test, y_test_pred)))
    print("- F1 score: {:.4f}".format(f1_score(y_test, y_test_pred, average='weighted')))
    print("- Precision: {:.4f}".format(precision_score(y_test, y_test_pred, average='weighted')))
    print("- Recall: {:.4f}".format(recall_score(y_test, y_test_pred, average='weighted')))
    print("- ROC AUC Score: {:.4f}".format(roc_auc_score(y_test, y_test_pred)))
    print('=' * 40, '\n')



Adaboost - model performance for training set
- Accuracy: 0.8127
- F1 score: 0.8077
- Precision: 0.8101
- Recall: 0.8127
- ROC AUC Score: 0.7726
-----------------------------------
Adaboost - Model performance for Test set
- Accuracy: 0.7922
- F1 score: 0.7879
- Precision: 0.7883
- Recall: 0.7922
- ROC AUC Score: 0.7548

Gradient_boosting - model performance for training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- ROC AUC Score: 1.0000
-----------------------------------
Gradient_boosting - Model performance for Test set
- Accuracy: 0.7078
- F1 score: 0.7072
- Precision: 0.7066
- Recall: 0.7078
- ROC AUC Score: 0.6770

SVM - model performance for training set
- Accuracy: 0.7915
- F1 score: 0.7843
- Precision: 0.7880
- Recall: 0.7915
- ROC AUC Score: 0.7444
-----------------------------------
SVM - Model performance for Test set
- Accuracy: 0.7208
- F1 score: 0.7141
- Precision: 0.7126
- Recall: 0.7208
- ROC AUC Score: 0.6743

RandomForest - model pe

In [None]:
# final code
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import joblib

from google.colab import drive
drive.mount('/content/drive')

diabetes_df = pd.read_csv('/content/drive/MyDrive/ML/clg_project_1/Diabetes.csv')

# dependent and independent features
X = diabetes_df.drop(columns='Outcome', axis=1)
y = diabetes_df['Outcome']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


# Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', AdaBoostClassifier(n_estimators=60, algorithm='SAMME'))
])

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions
y_train_pred = pipeline.predict(X_train)
y_test_pred = pipeline.predict(X_test)

# Evaluate the model
metrics = {
    "Training Accuracy": accuracy_score(y_train, y_train_pred),
    "Training F1 Score": f1_score(y_train, y_train_pred),
    "Training Precision": precision_score(y_train, y_train_pred),
    "Training Recall": recall_score(y_train, y_train_pred),
    "Training ROC AUC": roc_auc_score(y_train, y_train_pred),

    "Test Accuracy": accuracy_score(y_test, y_test_pred),
    "Test F1 Score": f1_score(y_test, y_test_pred),
    "Test Precision": precision_score(y_test, y_test_pred),
    "Test Recall": recall_score(y_test, y_test_pred),
    "Test ROC AUC": roc_auc_score(y_test, y_test_pred),
}

# Print evaluation results
for metric, value in metrics.items():
    print(f'{metric}: {value:.4f}')

# Save the trained pipeline as a joblib file
# # Define the path to save the model
# model_path = '/content/drive/MyDrive/ML/clg_project_1/Diabetes_model.joblib'

# # Save the model
# joblib.dump(pipeline, model_path)
# print(f"Model saved as '{model_path}'")


Mounted at /content/drive




Training Accuracy: 0.8127
Training F1 Score: 0.7044
Training Precision: 0.7829
Training Recall: 0.6402
Training ROC AUC: 0.7726
Test Accuracy: 0.7922
Test F1 Score: 0.6800
Test Precision: 0.7391
Test Recall: 0.6296
Test ROC AUC: 0.7548
Model saved as '/content/drive/MyDrive/ML/clg_project_1/Diabetes_model.joblib'


In [None]:
# loaded_model = joblib.load('/content/drive/MyDrive/ML/clg_project_1/Diabetes_model.joblib')

In [None]:
# sample_input = pd.DataFrame([{
#     'Pregnancies': 1,
#     'Glucose': 93,
#     'BloodPressure': 70,
#     'SkinThickness': 31,
#     'Insulin': 0,
#     'BMI': 30.4,
#     'DiabetesPedigreeFunction': 0.315,
#     'Age': 23
# }])

# prediction = loaded_model.predict(sample_input)

In [None]:
# prediction

array([0])

In [None]:
# prediction_proba = loaded_model.predict_proba(sample_input)

# # Print prediction result
# if prediction[0] == 1:
#     print(f"The model predicts: Diabetes (Class {prediction[0]})")
# else:
#     print(f"The model predicts: No Diabetes (Class {prediction[0]})")

# # Print prediction probabilities
# print(f"Prediction Probabilities: {prediction_proba[0]}")

The model predicts: No Diabetes (Class 0)
Prediction Probabilities: [0.67442677 0.32557323]
