In [1]:
import os

import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

# Load the dataset
# The CSV location can be overridden through the DIABETES_DATASET_PATH
# environment variable so the notebook also runs on machines other than the
# author's. When the variable is unset (or empty) the original hard-coded
# location is used, so existing behaviour is unchanged.
DATASET_PATH = os.environ.get('DIABETES_DATASET_PATH') or (
    '/Users/astha/Desktop/Diabetes Prediction/Datasets/Dataset2/diabetes_prediction_dataset.csv'
)
print("Loading the dataset from the specified path.")
df = pd.read_csv(DATASET_PATH)

# Display the first few rows of the dataset
print("\nDisplaying the first 5 rows of the dataset to understand its structure.")
print(df.head())

# Replace every object-dtype (text) column with numeric codes so that the
# whole frame is numeric before it is handed to the classifier.
print("\nEncoding categorical columns into numeric values.")
categorical_columns = df.select_dtypes(include=['object']).columns
if categorical_columns.empty:
    print("No categorical columns found.")
else:
    print(f"Categorical columns found: {categorical_columns}")
    for column in categorical_columns:
        # A fresh encoder per column: each one is fitted on that column only.
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])

# Column-wise separation: the leading N_FEATURE_COLUMNS columns are the model
# inputs and the column immediately after them is the target.
print("\nSeparating the dataset into features (first 8 columns) and labels (last column).")
N_FEATURE_COLUMNS = 8
features = df.iloc[:, :N_FEATURE_COLUMNS]
labels = df.iloc[:, N_FEATURE_COLUMNS]

# Report the dimensions of both pieces as a quick sanity check
print("\nDisplaying the shape of the features (rows, columns) and labels (rows).")
print("Features shape:", features.shape)
print("Labels shape:", labels.shape)

# Hold out 20% of the rows for testing; random_state is fixed so the same
# partition is produced on every run.
print("\nSplitting the dataset into training and testing sets (80% training, 20% testing).")
split_parts = train_test_split(features, labels, test_size=0.2, random_state=0)
features_train, features_test, label_train, label_test = split_parts

# Show the size of each of the four resulting pieces
print("\nDisplaying the shapes of training and testing sets for features and labels.")
print("Features training set shape:", features_train.shape)
print("Labels training set shape:", label_train.shape)
print("Features testing set shape:", features_test.shape)
print("Labels testing set shape:", label_test.shape)

# Fit a Gaussian Naive Bayes model on the training split.
# The class priors are supplied explicitly (class 0 first, then class 1)
# instead of being left for the estimator to derive from the training labels.
print("\nTraining the Naive Bayes classifier with specified priors (80% class 0, 20% class 1).")
CLASS_PRIORS = [0.8, 0.2]
naive_class = GaussianNB(priors=CLASS_PRIORS).fit(features_train, label_train)

def _predict_and_report(model, split_features, split_labels, split_name):
    """Predict on one data split, print the predictions and the accuracy.

    The test and training evaluations previously duplicated this logic line
    for line; they now share this single helper.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    split_features : feature rows of the split to score.
    split_labels : true labels matching ``split_features``.
    split_name : lower-case name of the split used in the printed messages
        (e.g. ``"test"`` or ``"training"``).

    Returns
    -------
    tuple
        ``(predictions, accuracy)`` where ``accuracy`` is the value returned
        by ``accuracy_score`` for this split.
    """
    print(f"\nPredicting the labels for the {split_name} dataset.")
    predictions = model.predict(split_features)
    print(f"Predicted labels for the {split_name} dataset:")
    print(predictions)

    print(f"\nCalculating the accuracy of the model on the {split_name} dataset.")
    accuracy = accuracy_score(split_labels, predictions)
    # Reported as a percentage rounded to two decimals.
    print(f"{split_name.capitalize()} Accuracy of the model: " + str(round(accuracy * 100, 2)) + "%")
    return predictions, accuracy


# Evaluate on the held-out test split first, then on the training split
labels_test_pred, test_accuracy = _predict_and_report(
    naive_class, features_test, label_test, "test")

labels_train_pred, train_accuracy = _predict_and_report(
    naive_class, features_train, label_train, "training")

Loading the dataset from the specified path.

Displaying the first 5 rows of the dataset to understand its structure.
   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  

Encoding categorical columns into numeric values.
Categorical columns found: Index(['gender', 'smoking_history'], dtype='object')

Separating the dataset into feature