In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

# loading the diabetes dataset to a pandas DataFrame
diabetes_dataset = pd.read_csv('/Users/astha/Desktop/Diabetes Prediction/Datasets/Dataset1/diabetes.csv')

# printing the first 5 rows of the dataset
print("First 5 rows of the dataset:")
print(diabetes_dataset.head())

# number of rows and Columns in this dataset
print("\nShape of the dataset (rows, columns):")
print(diabetes_dataset.shape)

# getting the statistical measures of the data
print("\nStatistical summary of the data:")
print(diabetes_dataset.describe())

# Count of Outcome values
print("\nValue counts of Outcome:")
print(diabetes_dataset['Outcome'].value_counts())

# Group by Outcome and compute mean
print("\nGroup by Outcome and compute mean:")
print(diabetes_dataset.groupby('Outcome').mean())

# separating the data and labels
X = diabetes_dataset.drop(columns = 'Outcome', axis=1)
Y = diabetes_dataset['Outcome']

# Data Standardization
scaler = StandardScaler()
scaler.fit(X)
standardized_data = scaler.transform(X)
X = standardized_data

# Train Test Split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, stratify=Y, random_state=2)
print(f"Shape of X_train: {X_train.shape}, Shape of X_test: {X_test.shape}, Shape of Y_train: {Y_train.shape}, Shape of Y_test: {Y_test.shape}")

First 5 rows of the dataset:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

Shape of the dataset (rows, columns):
(768, 9)

Statistical summary of the data:
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.1

In [3]:
# Train the Support Vector Machine Classifier
classifier = svm.SVC(kernel='linear')
classifier.fit(X_train, Y_train)

# Accuracy score on the training data
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction) * 100
print(f'Accuracy score of the training data: {training_data_accuracy:.2f}%')

# Accuracy score on the test data
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction) * 100
print(f'Accuracy score of the test data: {test_data_accuracy:.2f}%')

Accuracy score of the training data: 78.66%
Accuracy score of the test data: 77.27%
