IMPORTING THE DEPENDENCIES

In [185]:
# Importing necessary libraries for data manipulation, preprocessing, model building, and evaluation.
import numpy as np                                # For numerical operations
import pandas as pd                                # For data manipulation and analysis
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split           # For splitting the data into training and testing sets
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score   # For model evaluation metrics

EXPLORATORY DATA ANALYSIS

In [186]:
# Read the diabetes CSV into a pandas DataFrame.
# NOTE(review): the absolute '/content/...' location looks environment-specific
# (presumably a Colab upload) - adjust the constant when running elsewhere.
DIABETES_CSV_PATH = '/content/diabetes.csv'
diabetes_dataset = pd.read_csv(DIABETES_CSV_PATH)

In [187]:
# Preview the first 5 rows to see which columns exist and what their values look like
preview_rows = diabetes_dataset.head(5)
print(preview_rows)

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [188]:
# Dataset dimensions as a (rows, columns) tuple
dataset_shape = diabetes_dataset.shape
print(dataset_shape)

(768, 9)


In [189]:
# Per-column summary statistics (count, mean, std, min, quartiles, max)
summary_statistics = diabetes_dataset.describe()
print(summary_statistics)

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000                  

In [190]:
# Class balance of the target column: how many rows carry each Outcome label
# (later cells treat 0 as "not diabetic" and any other value as "diabetic")
outcome_counts = diabetes_dataset['Outcome'].value_counts()
print(outcome_counts)

Outcome
0    500
1    268
Name: count, dtype: int64


In [191]:
# Mean of every feature within each Outcome group, to spot features whose
# averages differ noticeably between the two classes
mean_by_outcome = diabetes_dataset.groupby('Outcome').mean()
print(mean_by_outcome)

         Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
Outcome                                                                      
0           3.298000  109.980000      68.184000      19.664000   68.792000   
1           4.865672  141.257463      70.824627      22.164179  100.335821   

               BMI  DiabetesPedigreeFunction        Age  
Outcome                                                  
0        30.304200                  0.429734  31.190000  
1        35.142537                  0.550500  37.067164  


In [192]:
# Target vector: the 'Outcome' label column
Y = diabetes_dataset['Outcome']
# Feature matrix: every column except the label
# (columns= already selects the axis, so no separate axis argument is needed)
X = diabetes_dataset.drop(columns=['Outcome'])

In [193]:
# Standardizing the data to have a mean of 0 and a standard deviation of 1
# This is important for models like SVM and Logistic Regression that are sensitive to feature scales.
scaler = StandardScaler()
scaler.fit(X)
standardized_data = scaler.transform(X)
X = standardized_data                       # Updating X with standardized data

In [194]:
# Splitting the data into training and testing sets
# 80% of the data will be used for training, and 20% for testing
# Stratify ensures that the train/test split maintains the same proportion of classes as in the original dataset
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

In [195]:
# Report the dimensions of the full feature matrix and of each split
for shape_label, feature_matrix in (
    ("Shape of X:", X),
    ("Shape of X_train:", X_train),
    ("Shape of X_test:", X_test),
):
    print(shape_label, feature_matrix.shape)

Shape of X: (768, 8)
Shape of X_train: (614, 8)
Shape of X_test: (154, 8)


TRAINING AND EVALUATION

In [196]:
# Build a Support Vector Machine classifier with a linear kernel,
# then fit it on the training features and labels
svm_classifier = svm.SVC(kernel='linear')
svm_classifier.fit(X=X_train, y=Y_train)

In [197]:
# SVM accuracy on both splits: predict first, then score each set of
# predictions against the matching true labels
X_train_prediction_svm = svm_classifier.predict(X_train)
X_test_prediction_svm = svm_classifier.predict(X_test)
training_data_accuracy_svm = accuracy_score(y_true=Y_train, y_pred=X_train_prediction_svm)
test_data_accuracy_svm = accuracy_score(y_true=Y_test, y_pred=X_test_prediction_svm)

In [198]:
# Score the SVM test-set predictions with each metric in turn;
# the results unpack in the same order the metric functions are listed
accuracy_svm, precision_svm, recall_svm, f1_svm = (
    metric_fn(Y_test, X_test_prediction_svm)
    for metric_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

In [199]:
# Show the SVM accuracy for both splits.
# NOTE(review): the header text mentions only training data although the
# test accuracy is printed as well.
svm_accuracy_lines = (
    ('Accuracy score of the training data :', training_data_accuracy_svm),
    ('Accuracy score of the test data :', test_data_accuracy_svm),
)
print('SVM Model on training data:')
for line_label, accuracy_value in svm_accuracy_lines:
    print(line_label, accuracy_value)

SVM Model on training data:
Accuracy score of the training data : 0.7866449511400652
Accuracy score of the test data : 0.7727272727272727


In [200]:
# Print the SVM test-set metrics, one labelled value per line
svm_metric_lines = (
    ('Accuracy:', accuracy_svm),
    ('Precision:', precision_svm),
    ('Recall:', recall_svm),
    ('F1 Score:', f1_svm),
)
print('SVM Model Evaluation Metrics:')
for metric_label, metric_value in svm_metric_lines:
    print(metric_label, metric_value)

SVM Model Evaluation Metrics:
Accuracy: 0.7727272727272727
Precision: 0.7567567567567568
Recall: 0.5185185185185185
F1 Score: 0.6153846153846154


In [201]:
# Build a Logistic Regression classifier and fit it on the training split.
# max_iter is raised to 1000 to give the solver more iterations to converge.
log_reg_classifier = LogisticRegression(max_iter=1000)
log_reg_classifier.fit(X=X_train, y=Y_train)

In [202]:
# Logistic Regression accuracy on both splits: predict first, then score
# each set of predictions against the matching true labels
X_train_prediction_log_reg = log_reg_classifier.predict(X_train)
X_test_prediction_log_reg = log_reg_classifier.predict(X_test)
training_data_accuracy_log_reg = accuracy_score(y_true=Y_train, y_pred=X_train_prediction_log_reg)
test_data_accuracy_log_reg = accuracy_score(y_true=Y_test, y_pred=X_test_prediction_log_reg)

In [203]:
# Show the Logistic Regression accuracy for the training and test splits
log_reg_accuracy_lines = (
    ('Accuracy score of the training data :', training_data_accuracy_log_reg),
    ('Accuracy score of the test data :', test_data_accuracy_log_reg),
)
print('Logistic Regression Model on training and testing:')
for line_label, accuracy_value in log_reg_accuracy_lines:
    print(line_label, accuracy_value)

Logistic Regression Model on training and testing:
Accuracy score of the training data : 0.7850162866449512
Accuracy score of the test data : 0.7597402597402597


In [204]:
# Calculating additional evaluation metrics for the Logistic Regression Model on test data
accuracy_log_reg = accuracy_score(Y_test, X_test_prediction_log_reg)
precision_log_reg = precision_score(Y_test, X_test_prediction_log_reg)
recall_log_reg = recall_score(Y_test, X_test_prediction_log_reg)
f1_log_reg = f1_score(Y_test, X_test_prediction_log_reg)

# Report the metrics in the same layout used for the SVM model; previously they were
# computed but never displayed, so the two models could only be compared on accuracy.
print('Logistic Regression Model Evaluation Metrics:')
print('Accuracy:', accuracy_log_reg)
print('Precision:', precision_log_reg)
print('Recall:', recall_log_reg)
print('F1 Score:', f1_log_reg)

PREDICTIVE SYSTEM

In [205]:
# One patient's measurements to run through the trained models.
# Values are listed in the same order as the feature columns of the dataset.
input_data = (
    0,      # Pregnancies
    141,    # Glucose
    0,      # BloodPressure
    0,      # SkinThickness
    0,      # Insulin
    42.4,   # BMI
    0.205,  # DiabetesPedigreeFunction
    29,     # Age
)

In [206]:
# Turn the patient tuple into a 1-D NumPy array (one entry per feature)
input_data_as_numpy_array = np.array(input_data)

In [207]:
# The scaler and models expect a 2-D array of samples, so present the single
# patient as one row; -1 lets NumPy infer the number of feature columns
input_data_reshaped = np.reshape(input_data_as_numpy_array, (1, -1))

In [208]:
# Standardize the input data (important since the models were trained on standardized data).
# When the scaler was fitted on a DataFrame it records the column names; passing a bare
# ndarray then triggers scikit-learn's "X does not have valid feature names" warning and
# skips the column check. Wrapping the sample in a DataFrame with those names avoids both.
# Falls back to the plain array when the scaler carries no feature names
# (fitted on an array, or an older scikit-learn release).
scaler_feature_names = getattr(scaler, 'feature_names_in_', None)
if scaler_feature_names is not None:
    input_features = pd.DataFrame(input_data_reshaped, columns=scaler_feature_names)
else:
    input_features = input_data_reshaped
std_data = scaler.transform(input_features)



In [209]:
# Classify the standardized patient sample with the SVM model.
# predict returns one label per input row; label 0 is reported as
# "not diabetic", anything else as "diabetic".
svm_prediction = svm_classifier.predict(std_data)
print('SVM Prediction:', svm_prediction)
svm_verdict = (
    'SVM: The person is not diabetic'
    if svm_prediction[0] == 0
    else 'SVM: The person is diabetic'
)
print(svm_verdict)

SVM Prediction: [1]
SVM: The person is diabetic


In [210]:
# Classify the standardized patient sample with the Logistic Regression model.
# predict returns one label per input row; label 0 is reported as
# "not diabetic", anything else as "diabetic".
log_reg_prediction = log_reg_classifier.predict(std_data)
print('Logistic Regression Prediction:', log_reg_prediction)
log_reg_verdict = (
    'Logistic Regression: The person is not diabetic'
    if log_reg_prediction[0] == 0
    else 'Logistic Regression: The person is diabetic'
)
print(log_reg_verdict)

Logistic Regression Prediction: [1]
Logistic Regression: The person is diabetic
