In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Read the diabetes prediction CSV from disk into a DataFrame
diabetes_dataset = pd.read_csv(
    '/Users/astha/Desktop/Diabetes Prediction/Datasets/Dataset2/diabetes_prediction_dataset.csv'
)

# Quick look at the data: leading rows, then the (rows, columns) shape
print(diabetes_dataset.head())
print(f"Dataset Shape: {diabetes_dataset.shape}")

# Summary statistics as reported by DataFrame.describe()
print("\nStatistical Measures:", diabetes_dataset.describe(), sep="\n")

# How many rows carry each value of the 'diabetes' target column
print(
    "\nDiabetes Outcome Value Counts:",
    diabetes_dataset['diabetes'].value_counts(),
    sep="\n",
)

# Integer-encode 'gender' into a new 'gender_encoded' column on the same
# frame, then show the original and encoded values side by side
label_encoder = LabelEncoder()
diabetes_dataset['gender_encoded'] = label_encoder.fit_transform(diabetes_dataset['gender'])
print(
    "\nGender Encoding:",
    diabetes_dataset[['gender', 'gender_encoded']].head(),
    sep="\n",
)

# Mean of every remaining column for each (gender, smoking_history) pair
grouped_data = diabetes_dataset.groupby(['gender', 'smoking_history']).mean()
print("\nGrouped Data:", grouped_data, sep="\n")

# Separating the data (features) and labels (target variable).
# `columns=` already selects the axis, so no extra `axis` argument is needed.
feature_frame = diabetes_dataset.drop(columns='diabetes')
Y = diabetes_dataset['diabetes']

# Displaying the features and target variable
print("\nFeatures (X):")
print(feature_frame.head())
print("\nTarget (Y):")
print(Y.head())

# Splitting features by dtype: numeric columns are what the model consumes,
# non-numeric columns are kept only for the combined display frame below.
numeric_features = feature_frame.select_dtypes(include=['number'])
categorical_features = feature_frame.select_dtypes(exclude=['number']).reset_index(drop=True)

# Train-test split on the RAW numeric features. The split is done BEFORE
# fitting the scaler so that test-set statistics never influence the
# transformation applied to the training data (avoids data leakage).
# Same test_size / stratify / random_state as before, so the same rows land
# in each partition.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    numeric_features, Y, test_size=0.2, stratify=Y, random_state=2
)

# Data Standardization: learn mean/variance from the training partition only,
# then reuse that fitted scaler for the test partition.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full standardized matrix (train-fitted scaler applied to every row), kept
# for inspection and for the names the rest of the notebook expects.
standardized_data = scaler.transform(numeric_features)

# Creating a DataFrame with standardized data
X_standardized = pd.DataFrame(standardized_data, columns=numeric_features.columns)

# Combining standardized numeric data with categorical data.
# NOTE(review): X_final is only displayed; the model below is trained on the
# numeric columns alone, so the non-numeric columns (e.g. 'smoking_history')
# do not reach the classifier. One-hot encode them if they should be used.
X_final = pd.concat([X_standardized, categorical_features], axis=1)
print("\nFinal Feature Set (Standardized + Categorical):")
print(X_final.head())

# Final feature matrix: standardized numeric features for all rows
X = standardized_data

print(f"\nX shape: {X.shape}")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  
Dataset Shape: (100000, 9)

Statistical Measures:
                 age  hypertension  heart_disease            bmi  \
count  100000.000000  100000.00000  100000.000000  100000.000000   
mean       41.885856       0.07485       0.039420      27.320767   
std        22.516840       0.26

In [8]:
# Build a linear-kernel support vector classifier and fit it on the
# training partition in one step (fit returns the fitted estimator)
classifier = svm.SVC(kernel='linear').fit(X_train, Y_train)

# Score the model on the data it was trained on
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_prediction)
print(f'\nAccuracy score of the training data: {training_data_accuracy * 100:.2f}%')

# Score the model on the held-out test partition
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_prediction)
print(f'Accuracy score of the test data: {test_data_accuracy * 100:.2f}%')


Accuracy score of the training data: 96.04%
Accuracy score of the test data: 96.14%
