In [3]:
# SVM for Diabetes Dataset
#Step 1: Load the Dataset

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



# Load the dataset 

file_path = "F:\\AWFERA\\Machine learning\\AwferaMachineLearningProjects\\diabetes.csv"
df = pd.read_csv(file_path)

# Display basic information
print("Dataset Information")
print(df.info())
print("\nFirst 5 rows")
print(df.head())


Dataset Information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None

First 5 rows
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29    

In [4]:
#Step 2: Handle Missing Values
print("\nChecking for missing values:")
print(df.isnull().sum())

#Fill missing numerical values with the median

df.fillna(df.median(numeric_only = True), inplace=True)

# Fill missing categorical values with the mode (if any)
for col in df.select_dtypes(include = ['object']):
    df[col].fillna(df[col].mode()[0], inplace = true)


Checking for missing values:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [5]:
#Step 3: Prepare Data
#Separate feature and target variable

x= df.drop(columns = ['Outcome'])
y = df['Outcome']

# Step 4: Apply Standard Scaling
scaler = StandardScaler()

x_scaled = scaler.fit_transform(x)

# Step 5: Split Data into Traning and Testing sets
x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size = 0.3, random_state = 42)

# Step 6: (Modified): Train a SVM Classifier
from sklearn.svm import SVC

print("\nTraining SVM Classifier................")
svm_model = SVC(kernel = 'linear', random_state= 42)
svm_model.fit(x_train, y_train)

#Step 7: Model Evaluation for SVM

y_pred_svm = svm_model.predict(x_test)

# Calculate accuracy
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print("\nSVM Model Accuracy: {svm accuracy:.2f}")

# Classification Report
print("\nSVM Classification Report:")
print(classification_report(y_test, y_pred_svm))

#Confusion Matrix
print("\nSVM Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))









Training SVM Classifier................

SVM Model Accuracy: {svm accuracy:.2f}

SVM Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.81      0.81       151
           1       0.64      0.62      0.63        80

    accuracy                           0.75       231
   macro avg       0.72      0.72      0.72       231
weighted avg       0.75      0.75      0.75       231


SVM Confusion Matrix:
[[123  28]
 [ 30  50]]
