In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.metrics import accuracy_score
import pickle
import sklearn
import plotly.express as px
import plotly.graph_objects as go

In [5]:
print('Numpy Version:', np.__version__)
print('Pandas Version:', pd.__version__)
print('Sklearn Version:', sklearn.__version__)

Numpy Version: 1.26.4
Pandas Version: 2.2.2
Sklearn Version: 1.5.2


In [6]:
# loading the diabetes dataset to a pandas DataFrame
diabetes_dataset = pd.read_csv('diabetes.csv')

In [7]:
# printing the first 5 rows of the dataset
diabetes_dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [8]:
# number of rows and Columns in this dataset
diabetes_dataset.shape

(768, 9)

In [9]:
min_value = diabetes_dataset['Glucose'].min()
max_value = diabetes_dataset['Glucose'].max()

print("Minimum value:", min_value)
print("Maximum value:", max_value)

Minimum value: 0
Maximum value: 199


In [10]:
min_value = diabetes_dataset['BloodPressure'].min()
max_value = diabetes_dataset['BloodPressure'].max()

print("Minimum value:", min_value)
print("Maximum value:", max_value)

Minimum value: 0
Maximum value: 122


In [11]:
min_value = diabetes_dataset['SkinThickness'].min()
max_value = diabetes_dataset['SkinThickness'].max()

print("Minimum value:", min_value)
print("Maximum value:", max_value)

Minimum value: 0
Maximum value: 99


In [12]:
min_value = diabetes_dataset['Insulin'].min()
max_value = diabetes_dataset['Insulin'].max()

print("Minimum value:", min_value)
print("Maximum value:", max_value)

Minimum value: 0
Maximum value: 846


In [13]:
min_value = diabetes_dataset['BMI'].min()
max_value = diabetes_dataset['BMI'].max()

print("Minimum value:", min_value)
print("Maximum value:", max_value)

Minimum value: 0.0
Maximum value: 67.1


In [14]:
min_value = diabetes_dataset['DiabetesPedigreeFunction'].min()
max_value = diabetes_dataset['DiabetesPedigreeFunction'].max()

print("Minimum value:", min_value)
print("Maximum value:", max_value)

Minimum value: 0.078
Maximum value: 2.42


In [15]:
min_value = diabetes_dataset['Insulin'].min()
max_value = diabetes_dataset['Insulin'].max()

print("Minimum value:", min_value)
print("Maximum value:", max_value)

Minimum value: 0
Maximum value: 846


In [16]:
diabetes_dataset.fillna(diabetes_dataset.mean(), inplace=True)
# Scale the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(diabetes_dataset)
pca = PCA(n_components=2)  # Choose the desired number of components
pca.fit(scaled_data)
principal_components = pca.transform(scaled_data)
print(principal_components)

[[ 1.75694707  1.11174258]
 [-1.50742148 -0.55940565]
 [ 0.65082201  1.92957633]
 ...
 [-0.57408846  0.03381634]
 [-0.17269908  1.32289215]
 [-1.32126715 -1.02748861]]


In [17]:
Age_Outcome = diabetes_dataset.groupby(["Age", "Outcome"]).size().reset_index(name="count")

# Map Outcome to Disease and No Disease
Age_Outcome["Outcome"] = Age_Outcome["Outcome"].map({0: "No Disease", 1: "Disease"})

# Plot the scatter plot for Age vs Outcome
fig = px.scatter(
    Age_Outcome,
    x="Age",
    y="count",
    color="Outcome",
    title="Age vs Diabetes Outcome Distribution",
    labels={"Age": "Age", "count": "Count", "Outcome": "Diabetes Outcome"},
    color_discrete_map={"No Disease": "green", "Disease": "red"}  # Custom colors for Outcome
)
fig.show()

In [18]:

# Group by BloodPressure and Outcome
BloodPressure_Outcome = diabetes_dataset.groupby(["BloodPressure", "Outcome"]).size().reset_index(name="count")

# Map Outcome to Disease and No Disease
BloodPressure_Outcome["Outcome"] = BloodPressure_Outcome["Outcome"].map({0: "No Disease", 1: "Disease"})

# Plot the pie chart for Outcome distribution
fig = px.pie(
    BloodPressure_Outcome,
    names="Outcome",
    values="count",
    title="Diabetes Outcome Distribution by Blood Pressure",
    color="Outcome",
    color_discrete_map={"No Disease": "green", "Disease": "red"}  # Custom colors for Outcome
)

fig.show()


In [19]:
# getting the statistical measures of the data
diabetes_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [20]:
diabetes_dataset['Outcome'].value_counts()

Unnamed: 0_level_0,count
Outcome,Unnamed: 1_level_1
0,500
1,268


In [21]:
diabetes_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [22]:
# Data Preprocessing - Replace zero values with NaN, then fill with column median
for column in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    diabetes_dataset[column].replace(0, np.nan, inplace=True)
    diabetes_dataset[column].fillna(diabetes_dataset[column].median(), inplace=True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





0 --> Non-Diabetic

1 --> Diabetic

In [23]:
diabetes_dataset.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,110.682,70.92,27.726,127.792,30.8856,0.429734,31.19
1,4.865672,142.130597,75.123134,31.686567,164.701493,35.383582,0.5505,37.067164


In [24]:
# separating the data and labels
X = diabetes_dataset.drop(columns = 'Outcome', axis=1)
Y = diabetes_dataset['Outcome']

In [25]:
print(X)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6    148.0           72.0           35.0    125.0  33.6   
1              1     85.0           66.0           29.0    125.0  26.6   
2              8    183.0           64.0           29.0    125.0  23.3   
3              1     89.0           66.0           23.0     94.0  28.1   
4              0    137.0           40.0           35.0    168.0  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10    101.0           76.0           48.0    180.0  32.9   
764            2    122.0           70.0           27.0    125.0  36.8   
765            5    121.0           72.0           23.0    112.0  26.2   
766            1    126.0           60.0           29.0    125.0  30.1   
767            1     93.0           70.0           31.0    125.0  30.4   

     DiabetesPedigreeFunction  Age  
0                       0.627   50  
1                       0.351   31  


In [26]:
print(Y)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


In [27]:
scaler = StandardScaler()

In [28]:
scaler.fit(X)

In [29]:
standardized_data = scaler.transform(X)

In [30]:
print(standardized_data)

[[ 0.63994726  0.86604475 -0.03198993 ...  0.16661938  0.46849198
   1.4259954 ]
 [-0.84488505 -1.20506583 -0.5283186  ... -0.85219976 -0.36506078
  -0.19067191]
 [ 1.23388019  2.01666174 -0.69376149 ... -1.33250021  0.60439732
  -0.10558415]
 ...
 [ 0.3429808  -0.02157407 -0.03198993 ... -0.910418   -0.68519336
  -0.27575966]
 [-0.84488505  0.14279979 -1.02464727 ... -0.34279019 -0.37110101
   1.17073215]
 [-0.84488505 -0.94206766 -0.19743282 ... -0.29912651 -0.47378505
  -0.87137393]]


In [31]:
X = standardized_data
Y = diabetes_dataset['Outcome']

In [32]:
print(X)
print(Y)

[[ 0.63994726  0.86604475 -0.03198993 ...  0.16661938  0.46849198
   1.4259954 ]
 [-0.84488505 -1.20506583 -0.5283186  ... -0.85219976 -0.36506078
  -0.19067191]
 [ 1.23388019  2.01666174 -0.69376149 ... -1.33250021  0.60439732
  -0.10558415]
 ...
 [ 0.3429808  -0.02157407 -0.03198993 ... -0.910418   -0.68519336
  -0.27575966]
 [-0.84488505  0.14279979 -1.02464727 ... -0.34279019 -0.37110101
   1.17073215]
 [-0.84488505 -0.94206766 -0.19743282 ... -0.29912651 -0.47378505
  -0.87137393]]
0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


Train Test Split

In [33]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, stratify=Y, random_state=2)

In [34]:
print(X.shape, X_train.shape, X_test.shape)

(768, 8) (614, 8) (154, 8)


Training the Model

In [35]:
classifier = svm.SVC(kernel='linear')

In [36]:
#training the support vector Machine Classifier
classifier.fit(X_train, Y_train)

In [37]:
# accuracy score on the training data
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [38]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.7801302931596091


In [39]:
# accuracy score on the test data
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [40]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.7727272727272727


In [41]:
from sklearn.metrics import confusion_matrix

# Predict the labels for the test set
Y_pred = classifier.predict(X_test)

# Compute the confusion matrix
cm = confusion_matrix(Y_test, Y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)


Confusion Matrix:
[[91  9]
 [26 28]]


In [42]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate accuracy
accuracy = accuracy_score(Y_test, Y_pred)

# Calculate precision
precision = precision_score(Y_test, Y_pred)

# Calculate recall
recall = recall_score(Y_test, Y_pred)

# Calculate F1 score
f1 = f1_score(Y_test, Y_pred)

# Calculate specificity (Specificity = TN / (TN + FP))
# First, we need to extract the confusion matrix components
TN, FP, FN, TP = cm.ravel()
specificity = TN / (TN + FP)

# Print the metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Specificity: {specificity}")


Accuracy: 0.7727272727272727
Precision: 0.7567567567567568
Recall: 0.5185185185185185
F1 Score: 0.6153846153846154
Specificity: 0.91


Making a Predictive System

In [43]:
input_data = (2,141,58,34,128,25.4,0.699,24)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# standardize the input data
std_data = scaler.transform(input_data_reshaped)
print(std_data)

prediction = classifier.predict(std_data)
print(prediction)

if (prediction[0] == 0):
  print('The person is not diabetic')
else:
  print('The person is diabetic')

[[-0.54791859  0.63592135 -1.19009016  0.55681856 -0.14678958 -1.02685447
   0.68594052 -0.78628618]]
[0]
The person is not diabetic



X does not have valid feature names, but StandardScaler was fitted with feature names



In [44]:
with open('diabetes_model.pkl', 'wb') as file:
    pickle.dump(classifier, file)

In [45]:
# Using RandomForestClassifier with hyperparameter tuning
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10]
}

grid_rf = GridSearchCV(RandomForestClassifier(random_state=2), param_grid_rf, refit=True, cv=5, verbose=1)
grid_rf.fit(X_train, Y_train)

# Best Random Forest model after tuning
classifier_rf = grid_rf.best_estimator_

# Model evaluation with Random Forest
X_train_prediction_rf = classifier_rf.predict(X_train)
training_data_accuracy_rf = accuracy_score(X_train_prediction_rf, Y_train)
print('Random Forest - Accuracy score of the training data: ', training_data_accuracy_rf)

X_test_prediction_rf = classifier_rf.predict(X_test)
test_data_accuracy_rf = accuracy_score(X_test_prediction_rf, Y_test)
print('Random Forest - Accuracy score of the test data: ', test_data_accuracy_rf)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Random Forest - Accuracy score of the training data:  0.993485342019544
Random Forest - Accuracy score of the test data:  0.7272727272727273
