In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [105]:
df = pd.read_csv('diabetes_data_upload.csv')

In [106]:
df.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 520 entries, 0 to 519
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Age                 520 non-null    int64 
 1   Gender              520 non-null    object
 2   Polyuria            520 non-null    object
 3   Polydipsia          520 non-null    object
 4   sudden weight loss  520 non-null    object
 5   weakness            520 non-null    object
 6   Polyphagia          520 non-null    object
 7   Genital thrush      520 non-null    object
 8   visual blurring     520 non-null    object
 9   Itching             520 non-null    object
 10  Irritability        520 non-null    object
 11  delayed healing     520 non-null    object
 12  partial paresis     520 non-null    object
 13  muscle stiffness    520 non-null    object
 14  Alopecia            520 non-null    object
 15  Obesity             520 non-null    object
 16  class               520 no

In [108]:
df.columns

Index(['Age', 'Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss',
       'weakness', 'Polyphagia', 'Genital thrush', 'visual blurring',
       'Itching', 'Irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'Alopecia', 'Obesity', 'class'],
      dtype='object')

In [109]:

def preprocess_data(df):
    # Convert 'Gender' column to binary (1 for female, 0 for male)
    df['Gender'] = df['Gender'].apply(lambda x: 1 if str(x).lower() == 'female' else 0)

    # Convert other binary columns to 1s and 0s
    binary_columns = ['Polyuria', 'Polydipsia', 'sudden weight loss', 'weakness',
                      'Polyphagia', 'Genital thrush', 'visual blurring', 'Itching',
                      'Irritability', 'delayed healing', 'partial paresis',
                      'muscle stiffness', 'Alopecia', 'Obesity']


    for col in binary_columns:
        # Convert 'yes' to 1 and 'no' to 0
        df[col] = df[col].apply(lambda x: 1 if str(x).lower() == 'yes' else 0)

    # Convert 'class' column to binary (1 for positive, 0 for negative)
    df['class'] = df['class'].apply(lambda x: 1 if str(x).lower() == 'positive' else 0)

    return df

# Example usage:
# Assuming df is your DataFrame containing the dataset
# df = preprocess_data(df)


df = preprocess_data(df)
df.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,0,0,1,0,1,0,0,0,1,0,1,0,1,1,1,1
1,58,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,1
2,41,0,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1
3,45,0,0,0,1,1,1,1,0,1,0,1,0,0,0,0,1
4,60,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1


In [110]:
x = df.drop('class', axis = 1)
y = df.iloc[:,-1]

In [111]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [112]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [113]:
pipeline_knn = Pipeline([('scaler_1', StandardScaler()), ('knn_classifier', KNeighborsClassifier())])
pipeline_lr = Pipeline([('scaler_2', StandardScaler()), ('lr_classifier', LogisticRegression())])
pipeline_svc =Pipeline ([('scaler_3', StandardScaler()), ('svc_classifier', SVC())])
pipeline_nb = Pipeline([('scaler_4', StandardScaler()), ('nb_classifier', GaussianNB())])
pipeline_dt = Pipeline([('dt_classifier', DecisionTreeClassifier())])
pipeline_rf = Pipeline([('rf_classifier', RandomForestClassifier())])
pipeline_gb = Pipeline([('gb_classifier', GradientBoostingClassifier())])

In [114]:
pipelines = [pipeline_knn,pipeline_lr,pipeline_svc,pipeline_nb,pipeline_dt,pipeline_rf,pipeline_gb]
pipeline_names = ['pipeline_knn','pipeline_lr','pipeline_svc','pipeline_nb','pipeline_dt','pipeline_rf','pipeline_gb']
pipelines

[Pipeline(steps=[('scaler_1', StandardScaler()),
                 ('knn_classifier', KNeighborsClassifier())]),
 Pipeline(steps=[('scaler_2', StandardScaler()),
                 ('lr_classifier', LogisticRegression())]),
 Pipeline(steps=[('scaler_3', StandardScaler()), ('svc_classifier', SVC())]),
 Pipeline(steps=[('scaler_4', StandardScaler()),
                 ('nb_classifier', GaussianNB())]),
 Pipeline(steps=[('dt_classifier', DecisionTreeClassifier())]),
 Pipeline(steps=[('rf_classifier', RandomForestClassifier())]),
 Pipeline(steps=[('gb_classifier', GradientBoostingClassifier())])]

In [115]:
for pipe in pipelines:
    pipe.fit(x_train, y_train)

In [116]:
for pipeline_name, pipeline in zip(pipeline_names, pipelines):
    print(f"classification report for pipeline {pipeline_name}:")
    y_pred = pipeline.predict(x_test)
    print(classification_report(y_test, y_pred))
    print("="*60)

classification report for pipeline pipeline_knn:
              precision    recall  f1-score   support

           0       0.79      0.91      0.85        33
           1       0.95      0.89      0.92        71

    accuracy                           0.89       104
   macro avg       0.87      0.90      0.88       104
weighted avg       0.90      0.89      0.90       104

classification report for pipeline pipeline_lr:
              precision    recall  f1-score   support

           0       0.90      0.85      0.88        33
           1       0.93      0.96      0.94        71

    accuracy                           0.92       104
   macro avg       0.92      0.90      0.91       104
weighted avg       0.92      0.92      0.92       104

classification report for pipeline pipeline_svc:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98        33
           1       0.99      1.00      0.99        71

    accuracy                        