### 1-Importing The Libraries

In [355]:
#!pip install scikit-learn
# %load_ext cudf.pandas

#Importing the data analysis libraries
import numpy as np # linear algebra
import pandas as pd # data processing

#Importing the visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
#Ensuring that we don't see any warnings while running the cells
import warnings
warnings.filterwarnings('ignore') 

#
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Importing the counter
from collections import Counter

### 2 - Importing the Dataset 

In [356]:
#Reading the data from the given files and creating a training and test dataset
df = pd.read_csv("pima_diabetes_all.csv")

### 3 - Dataset Analysis

In [357]:
df.sample(20)

Unnamed: 0,preg_count,glucose_concentration,diastolic_bp,triceps_skin_fold_thickness,two_hr_serum_insulin,bmi,diabetes_pedi,age,diabetes_class
105,1.0,126.0,56.0,29.0,152.0,28.7,0.801,21.0,0.0
453,2.0,119.0,0.0,0.0,0.0,19.6,0.832,72.0,0.0
520,2.0,68.0,70.0,32.0,66.0,25.0,0.187,25.0,0.0
426,0.0,94.0,0.0,0.0,0.0,0.0,0.256,25.0,0.0
212,7.0,179.0,95.0,31.0,0.0,34.2,0.164,60.0,0.0
626,0.0,125.0,68.0,0.0,0.0,24.7,0.206,21.0,0.0
344,8.0,95.0,72.0,0.0,0.0,36.8,0.485,57.0,0.0
19,,,,,,,,,
339,7.0,178.0,84.0,0.0,0.0,39.9,0.331,41.0,1.0
523,9.0,130.0,70.0,0.0,0.0,34.2,0.652,45.0,1.0


In [358]:
df.head()

Unnamed: 0,preg_count,glucose_concentration,diastolic_bp,triceps_skin_fold_thickness,two_hr_serum_insulin,bmi,diabetes_pedi,age,diabetes_class
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,1.0
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,1.0
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,1.0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1.0


In [359]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   preg_count                   764 non-null    float64
 1   glucose_concentration        764 non-null    float64
 2   diastolic_bp                 764 non-null    float64
 3   triceps_skin_fold_thickness  764 non-null    float64
 4   two_hr_serum_insulin         764 non-null    float64
 5   bmi                          764 non-null    float64
 6   diabetes_pedi                764 non-null    float64
 7   age                          764 non-null    float64
 8   diabetes_class               764 non-null    float64
dtypes: float64(9)
memory usage: 54.1 KB


In [360]:
df.shape

(768, 9)

#### Checking for and Dropping Null Values

In [361]:
df.isnull().sum()

preg_count                     4
glucose_concentration          4
diastolic_bp                   4
triceps_skin_fold_thickness    4
two_hr_serum_insulin           4
bmi                            4
diabetes_pedi                  4
age                            4
diabetes_class                 4
dtype: int64

In [362]:
df = df.dropna()

In [363]:
df.shape

(764, 9)

In [364]:
df.describe(include="all")

Unnamed: 0,preg_count,glucose_concentration,diastolic_bp,triceps_skin_fold_thickness,two_hr_serum_insulin,bmi,diabetes_pedi,age,diabetes_class
count,764.0,764.0,764.0,764.0,764.0,764.0,764.0,764.0,764.0
mean,3.811636,121.541505,69.12801,20.857068,83.724869,31.980969,0.468038,33.35445,0.344241
std,3.353474,32.778996,19.121333,16.022464,121.215003,7.777042,0.331167,11.852418,0.475431
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,64.0,0.0,0.0,27.375,0.238,24.0,0.0
50%,3.0,117.0,72.0,23.0,36.5,32.0,0.3655,29.0,0.0
75%,6.0,142.0,80.0,33.0,130.0,36.5,0.6245,41.0,1.0
max,17.0,199.0,122.0,99.0,744.0,67.1,2.42,81.0,1.0


<b>Observations</b>
* There are instances where certain columns have a value of 0.0 which is not physically possible and is thus a measurement error:
* These features are:
    * diastolic_bp
    * triceps_skin_fold_thickness
    * two_hr_serum_insulin
    * bmi
    
<b>Decisions</b>
* We will impute this data using an imputer technique

### 4 - Feature Engineering

#### Imputing Zero Values with Mean of Column

In [365]:
import numpy as np
from sklearn.impute import SimpleImputer

class ZeroImputer(SimpleImputer):
    def __init__(self, missing_values=0, **kwargs):
        super().__init__(missing_values=missing_values, **kwargs)
        self.missing_values = missing_values

    def fit(self, X, y=None):
        if self.strategy == 'constant':
            self.fill_value = self.fill_value if self.fill_value != 0 else np.nan
        return super().fit(X, y)

    def transform(self, X):
        if self.strategy == 'constant':
            return np.where(X == self.missing_values, self.fill_value, X)
        else:
            return super().transform(X)


In [366]:
imputer = ZeroImputer(strategy='mean')
# To impute only zero values:
features_to_impute = ["diastolic_bp", "triceps_skin_fold_thickness", 
                      "two_hr_serum_insulin", "bmi", "glucose_concentration"]

for col in features_to_impute:
    val = df[col].mean()
    print(val)
    df[col] = df[col].replace(0.0, round(val, 1))

69.12801047120419
20.857068062827224
83.72486910994765
31.980968586387434
121.54150523560209


In [367]:
df.sample(20)

Unnamed: 0,preg_count,glucose_concentration,diastolic_bp,triceps_skin_fold_thickness,two_hr_serum_insulin,bmi,diabetes_pedi,age,diabetes_class
643,4.0,90.0,69.1,20.9,83.7,28.0,0.61,31.0,0.0
736,0.0,126.0,86.0,27.0,120.0,27.4,0.515,21.0,0.0
487,0.0,173.0,78.0,32.0,265.0,46.5,1.159,58.0,0.0
519,6.0,129.0,90.0,7.0,326.0,19.6,0.582,60.0,0.0
738,2.0,99.0,60.0,17.0,160.0,36.6,0.453,21.0,0.0
196,1.0,105.0,58.0,20.9,83.7,24.3,0.187,21.0,0.0
624,2.0,108.0,64.0,20.9,83.7,30.8,0.158,21.0,0.0
582,12.0,121.0,78.0,17.0,83.7,26.5,0.259,62.0,0.0
277,0.0,104.0,64.0,23.0,116.0,27.8,0.454,23.0,0.0
538,0.0,127.0,80.0,37.0,210.0,36.3,0.804,23.0,0.0


In [368]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 764 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   preg_count                   764 non-null    float64
 1   glucose_concentration        764 non-null    float64
 2   diastolic_bp                 764 non-null    float64
 3   triceps_skin_fold_thickness  764 non-null    float64
 4   two_hr_serum_insulin         764 non-null    float64
 5   bmi                          764 non-null    float64
 6   diabetes_pedi                764 non-null    float64
 7   age                          764 non-null    float64
 8   diabetes_class               764 non-null    float64
dtypes: float64(9)
memory usage: 59.7 KB


In [369]:
# Function to detect outliers using Z-score
def detect_outliers(data, score=3):
    z_scores = (data - data.mean()) / data.std()
    outliers = (z_scores > score) | (z_scores < -score)
    return outliers

for col in df.columns:
    # Detect outliers in column 'A' using Z-score
    outliers = df[col][detect_outliers(df[col])]
    print(f"\nOutliers in column {col}\n", outliers)



Outliers in column preg_count
 88     15.0
159    17.0
298    14.0
455    14.0
Name: preg_count, dtype: float64

Outliers in column glucose_concentration
 Series([], Name: glucose_concentration, dtype: float64)

Outliers in column diastolic_bp
 43     110.0
106    122.0
125     30.0
177    110.0
549    110.0
597     24.0
691    114.0
Name: diastolic_bp, dtype: float64

Outliers in column triceps_skin_fold_thickness
 57     60.0
445    63.0
579    99.0
Name: triceps_skin_fold_thickness, dtype: float64

Outliers in column two_hr_serum_insulin
 8      543.0
9      543.1
10     543.1
11     543.1
12     543.1
13     543.1
14     543.1
15     543.1
16     543.1
111    495.0
153    485.0
186    495.0
220    478.0
228    744.0
247    680.0
286    545.0
370    465.0
409    579.0
415    474.0
486    480.0
584    600.0
645    440.0
655    540.0
695    480.0
753    510.0
Name: two_hr_serum_insulin, dtype: float64

Outliers in column bmi
 120    53.2
125    55.0
177    67.1
445    59.4
673    57.

<b>Observations</b>
* There are multiple columns with multiple outliers
* Let's remove the outliers

In [370]:
import pandas as pd

# Assuming df is your DataFrame

# Function to remove outliers from DataFrame
def remove_outliers(df, threshold=3):
    outliers = detect_outliers(df)
    df_clean = df[~outliers.any(axis=1)]
    return df_clean

# Remove outliers from DataFrame
df = remove_outliers(df)

#### Checking and Correcting for Class imbalance

In [371]:
target_column = 'diabetes_class'
# Count the class frequencies
class_counts = df[target_column].value_counts()

# Display class frequencies
print("\nClass Frequencies:")
print(class_counts)


Class Frequencies:
0.0    471
1.0    239
Name: diabetes_class, dtype: int64


In [372]:
from imblearn.over_sampling import RandomOverSampler

# Assuming X is your feature matrix and y is your target vector
X = df.drop(columns=['diabetes_class'])
y = df['diabetes_class']

# Instantiate RandomOverSampler
oversampler = RandomOverSampler()

# Perform oversampling
X, y = oversampler.fit_resample(X, y)

df = pd.concat([X, y], axis=1)

In [373]:
# Count the class frequencies
class_counts = df[target_column].value_counts()

# Display class frequencies
print("\nClass Frequencies:")
print(class_counts)


Class Frequencies:
0.0    471
1.0    471
Name: diabetes_class, dtype: int64


In [374]:
df.sample(10)

Unnamed: 0,preg_count,glucose_concentration,diastolic_bp,triceps_skin_fold_thickness,two_hr_serum_insulin,bmi,diabetes_pedi,age,diabetes_class
516,0.0,198.0,66.0,32.0,274.0,41.3,0.502,28.0,1.0
610,8.0,167.0,106.0,46.0,231.0,37.6,0.165,43.0,1.0
636,2.0,121.0,70.0,32.0,95.0,39.1,0.886,23.0,0.0
513,11.0,103.0,68.0,40.0,83.7,46.2,0.126,42.0,0.0
371,8.0,197.0,74.0,20.9,83.7,25.9,1.191,39.0,1.0
410,1.0,82.0,64.0,13.0,95.0,21.2,0.415,23.0,0.0
259,0.0,107.0,62.0,30.0,74.0,36.6,0.757,25.0,1.0
34,1.0,103.0,80.0,11.0,82.0,19.4,0.491,22.0,0.0
585,13.0,104.0,72.0,20.9,83.7,31.2,0.465,38.0,1.0
465,8.0,120.0,78.0,20.9,83.7,25.0,0.409,64.0,0.0


#### Splitting into Train and Test

In [375]:
X, y = df.drop("diabetes_class", axis=1), df["diabetes_class"]

In [376]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [377]:
X_train.head()

Unnamed: 0,preg_count,glucose_concentration,diastolic_bp,triceps_skin_fold_thickness,two_hr_serum_insulin,bmi,diabetes_pedi,age
933,11.0,143.0,94.0,33.0,146.0,36.6,0.254,51.0
834,7.0,150.0,78.0,29.0,126.0,35.2,0.692,54.0
358,4.0,158.0,78.0,20.9,83.7,32.9,0.803,31.0
2,8.0,183.0,64.0,20.9,83.7,23.3,0.672,32.0
420,5.0,88.0,78.0,30.0,83.7,27.6,0.258,37.0


#### Scaling the Features

In [378]:
#Feature Scaling
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### 5 - Model Training

In [379]:
#Importing sci-kit learn libraries that we will need for this project
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV,StratifiedKFold, cross_val_score, learning_curve
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

#StratifiedKFold aims to ensure each class is (approximately) equally represented across each test fold
k_fold = StratifiedKFold(n_splits=5)


diabetes_classifiers = []
# Creating objects of each classifier
LG_classifier = LogisticRegression(random_state=0)
diabetes_classifiers.append(LG_classifier)
SVC_classifier = SVC(kernel="rbf", random_state=0)
diabetes_classifiers.append(SVC_classifier)
KNN_classifier = KNeighborsClassifier()
diabetes_classifiers.append(KNN_classifier)
NB_classifier = GaussianNB()
diabetes_classifiers.append(NB_classifier)
DT_classifier = DecisionTreeClassifier(criterion="gini", random_state=0)
diabetes_classifiers.append(DT_classifier)
RF_classifier = RandomForestClassifier(n_estimators=200, criterion="gini", random_state=0)
diabetes_classifiers.append(RF_classifier)
AB_classifier = AdaBoostClassifier(DecisionTreeClassifier(random_state=0), random_state=0, learning_rate=0.1)

diabetes_classifiers.append(AB_classifier)
ET_classifier = ExtraTreesClassifier(random_state=0)
diabetes_classifiers.append(ET_classifier)
GB_classifier = GradientBoostingClassifier(random_state=0)
diabetes_classifiers.append(GB_classifier)

#This dictionary is just to grad the name of each classifier
classifier_dict = {
    0: "Logistic Regression",
    1: "Support Vector Classfication",
    2: "K Nearest Neighbor Classification",
    3: "Naive bayes Classifier",
    4: "Decision Trees Classifier",
    5: "Random Forest Classifier",
    6: "AdaBoost Classifier",
    7: "Extra Trees Classifier",
    8: "Gradient Boosting Classifier"
}

diabetes_results = pd.DataFrame({'Model': [],'Mean Accuracy': []})

#Iterating over each classifier and getting the result
for i, classifier in enumerate(diabetes_classifiers):
    classifier_scores = cross_val_score(classifier, X_train, y_train, cv=k_fold, n_jobs=4, scoring="accuracy")
    diabetes_results.loc[len(diabetes_results.index)] = [classifier_dict[i], classifier_scores.mean()]

In [380]:
diabetes_results

Unnamed: 0,Model,Mean Accuracy
0,Logistic Regression,0.73313
1,Support Vector Classfication,0.780848
2,K Nearest Neighbor Classification,0.741095
3,Naive bayes Classifier,0.741086
4,Decision Trees Classifier,0.804742
5,Random Forest Classifier,0.83668
6,AdaBoost Classifier,0.788795
7,Extra Trees Classifier,0.85132
8,Gradient Boosting Classifier,0.81011


<b>Observations</b>
* The best two models are the Random Forest Classifier and the Extra Trees Classifier

#### Hyperparameter Tuning ExtraTrees Classifier

In [381]:
# Define the Extra Trees Classifier model
ETC = ExtraTreesClassifier()

# Define hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],              # Num. trees
    'max_depth': [None, 10, 20, 30],              # Max. depth
    'min_samples_split': [2, 5, 10],              # Min. samples for split
    'min_samples_leaf': [1, 2, 4]                 # Min. samples for leaf node
}

best_ETC = GridSearchCV(estimator=ETC, param_grid=param_grid, 
                             cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

# Fit the GridSearchCV object to the training data
best_ETC.fit(X_train, y_train)

# Get the best hyperparameters
best_params = best_ETC.best_params_
print("Best hyperparameters:", best_params)

# Evaluate the model performance on the test set
y_pred = best_ETC.predict(X_test)

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}

Confusion Matrix:
[[82  8]
 [14 85]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.85      0.91      0.88        90
         1.0       0.91      0.86      0.89        99

    accuracy                           0.88       189
   macro avg       0.88      0.88      0.88       189
weighted avg       0.89      0.88      0.88       189



#### Hyperparameter Tuning RandomForest Classifier

In [382]:
# Define the RandomForestClassifier model
RFC = RandomForestClassifier()

# Define hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],              # Num. trees
    'max_depth': [None, 10, 20, 30],              # Max. depth
    'min_samples_split': [2, 5, 10],              # Min. samples for split
    'min_samples_leaf': [1, 2, 4]                 # Min. samples for leaf node
}

# Create GridSearchCV object
best_RFC = GridSearchCV(estimator=RFC, param_grid=param_grid, cv=5, 
                             scoring='accuracy', verbose=1, n_jobs=-1)

# Fit the GridSearchCV object to the training data
best_RFC.fit(X_train, y_train)

# Get the best hyperparameters
best_params = best_RFC.best_params_
print("Best hyperparameters:", best_params)

# Evaluate the model performance on the test set
y_pred = best_RFC.predict(X_test)

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}

Confusion Matrix:
[[73 17]
 [10 89]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.88      0.81      0.84        90
         1.0       0.84      0.90      0.87        99

    accuracy                           0.86       189
   macro avg       0.86      0.86      0.86       189
weighted avg       0.86      0.86      0.86       189



#### Training an Ensemble Model of the Best Models

## Conclusion
* Both the Extra Trees Classifier and the Random Forest Classifier give the great results
* From the original dataset, without any feature engineering the precision and recalls were very low
* After the following feature engineering steps:
    * Imputing Zero values of a column with the mean of the column
    * Correction of Class Imbalance
    * Removing Outliers that are above a Z-score of 3 and below a Z-score of -3
    * Scaling the data
* After performing a gridsearchcv on the best 2 models, we conclude the best model
* The best model for our problem is the ExtraTreesClassifer with the following parameters:
    * Best hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}