<a href="https://colab.research.google.com/github/vrindaBindal2712/Diabetes-Prediction-using-Machine-Learning-with-Python-/blob/main/Copy_of_Untitled6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

In [None]:
diabetes_dataset = pd.read_csv('/content/diabetes.csv')

In [None]:
print(diabetes_dataset.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [None]:
print("Dataset shape:", diabetes_dataset.shape)

Dataset shape: (768, 9)


In [None]:
print(diabetes_dataset.describe())

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000                  

In [None]:
print(diabetes_dataset['Outcome'].value_counts())

Outcome
0    500
1    268
Name: count, dtype: int64


In [None]:
X = diabetes_dataset.drop(columns='Outcome', axis=1)
Y = diabetes_dataset['Outcome']

In [None]:
# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Handle Imbalanced Dataset using SMOTE
smote = SMOTE(random_state=42)  # Added SMOTE for balancing the dataset
X, Y = smote.fit_resample(X, Y)

In [None]:
# Splitting the dataset into training and testing data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)
print("Training set shape:", X_train.shape, "Test set shape:", X_test.shape)

Training set shape: (800, 8) Test set shape: (200, 8)


In [None]:
# Hyperparameter Tuning using GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf'],
    'gamma': [1, 0.1, 0.01, 0.001]
}
grid = GridSearchCV(svm.SVC(), param_grid, refit=True, verbose=3, cv=5)
grid.fit(X_train, Y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.731 total time=   0.1s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.775 total time=   0.1s
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.725 total time=   0.0s
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.744 total time=   0.0s
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.769 total time=   0.0s
[CV 1/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.738 total time=   0.2s
[CV 2/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.713 total time=   0.2s
[CV 3/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.744 total time=   0.1s
[CV 4/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.750 total time=   0.3s
[CV 5/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.738 total time=   0.4s
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.625 total time=   0.1s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf

In [None]:
print("Best Parameters:", grid.best_params_)

Best Parameters: {'C': 10, 'gamma': 1, 'kernel': 'rbf'}


In [None]:
classifier = grid.best_estimator_

In [None]:
classifier.fit(X_train, Y_train)


In [None]:
# Model Evaluation
# Accuracy score on training data
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy score of the training data:', training_data_accuracy)

Accuracy score of the training data: 1.0


In [None]:
# Accuracy score on test data
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy score of the test data:', test_data_accuracy)


Accuracy score of the test data: 0.82


In [None]:
# Detailed evaluation using classification report
print("Classification Report:\n", classification_report(Y_test, X_test_prediction))

# Making a Predictive System
input_data = (5, 166, 72, 19, 175, 25.8, 0.587, 51)

# Changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# Reshaping the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Standardizing the input data
std_data = scaler.transform(input_data_reshaped)

# Making a prediction
prediction = classifier.predict(std_data)
print(prediction)

if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.79      0.81       100
           1       0.80      0.85      0.83       100

    accuracy                           0.82       200
   macro avg       0.82      0.82      0.82       200
weighted avg       0.82      0.82      0.82       200

[1]
The person is diabetic


