In [19]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif

from sklearn.metrics import classification_report, f1_score, confusion_matrix, accuracy_score, ConfusionMatrixDisplay, roc_curve, auc
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Machine learning estimators used in this notebook
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn deprecation warnings
# raised by the cells below — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [33]:
# Load the dataset into a DataFrame; the path is relative to the current
# working directory.
db = pd.read_csv("./diabetes.csv")

In [34]:
# Encode the two categorical columns as integers.
#
# The encoded Series is assigned back to the column instead of calling
# Series.replace(..., inplace=True) on `db[...]`: that form is chained in-place
# assignment, which pandas deprecates and which leaves `db` unchanged once
# Copy-on-Write is active. Using an explicit map also guarantees a numeric
# dtype rather than relying on replace() to downcast an object column.
def _encode_labels(labels, codes, column):
    """Map string labels to integer codes.

    Missing values stay missing. Any other label that has no entry in
    `codes` raises ValueError instead of silently turning into NaN.
    """
    unknown = set(labels.dropna().unique()) - set(codes)
    if unknown:
        raise ValueError("Unexpected {} labels: {}".format(column, sorted(unknown)))
    return labels.map(codes)


# Strip stray whitespace around the labels; upper-casing Gender folds the
# lower-case 'f' entries (and any 'm') onto the same keys.
db["Gender"] = _encode_labels(
    db["Gender"].str.strip().str.upper(), {'M': 1, 'F': 0}, "Gender"
)
db["CLASS"] = _encode_labels(
    db["CLASS"].str.strip(), {'N': 0, 'Y': 1, 'P': 2}, "CLASS"
)
db.head()

Unnamed: 0,ID,No_Pation,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,CLASS
0,502,17975,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,0
1,735,34221,1,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,0
2,420,47975,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,0
3,680,87656,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,0
4,504,34223,1,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,0


In [35]:
# Show the column labels of the loaded frame as the cell output.
db.columns

Index(['ID', 'No_Pation', 'Gender', 'AGE', 'Urea', 'Cr', 'HbA1c', 'Chol', 'TG',
       'HDL', 'LDL', 'VLDL', 'BMI', 'CLASS'],
      dtype='object')

In [36]:
# Drop the 'ID' and 'No_Pation' columns (presumably record / patient
# identifiers — confirm) so they are not offered to the models as features.
# Rebinding `db` with errors='ignore' instead of dropping in place lets this
# cell be re-run without a KeyError once the columns are already gone.
db = db.drop(columns=['ID', 'No_Pation'], errors='ignore')

In [37]:
# Impute missing values with the mean of each column.
# numeric_only=True restricts the means to numeric columns: in pandas >= 2.0
# DataFrame.mean() raises TypeError when a non-numeric column is present.
# Non-numeric columns, if any, are left untouched by fillna.
db = db.fillna(db.mean(numeric_only=True))

In [38]:
# Pull out the target column (y); every other column is a feature (X)
y = db['CLASS']
X = db.drop('CLASS', axis=1)

# Feature Selection using information gain

In [39]:
# Feature selection with information gain (mutual information between each
# feature and the class label).
num_features_to_select = 10  # keep the 10 highest-scoring features


def _seeded_mutual_info(features, target):
    """Score features with mutual_info_classif using a fixed random seed.

    mutual_info_classif adds small random noise to continuous features, so
    without a random_state the scores — and therefore the selected columns —
    can differ from one run to the next.
    """
    return mutual_info_classif(features, target, random_state=42)


selector = SelectKBest(score_func=_seeded_mutual_info, k=num_features_to_select)
X_new = selector.fit_transform(X, y)

# Positions of the retained columns, then their names in the original frame
selected_indices = selector.get_support(indices=True)
selected_features = X.columns[selected_indices]

# Display the selected features
print("Selected Features:")
print(selected_features)

Selected Features:
Index(['AGE', 'Urea', 'Cr', 'HbA1c', 'Chol', 'TG', 'HDL', 'LDL', 'VLDL',
       'BMI'],
      dtype='object')


In [40]:
# Split the dataset into training and testing sets first. The scaler is then
# fitted on the training rows only, so the test rows do not contribute to the
# mean / standard deviation used for scaling (fitting on the full data before
# splitting leaks test information into training).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_new, y, test_size=0.2, random_state=42
)

# Scale the selected features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix scaled with the training statistics; kept under its original
# name in case other cells refer to it.
X_scaled = scaler.transform(X_new)

# Decision Tree Classifier

In [41]:
# Fit a decision tree on the training split (fit() returns the estimator)
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict on the training split and on the held-out split
dt_train_pred, dt_test_pred = (
    dt_model.predict(split) for split in (X_train, X_test)
)

# Accuracy on each split
dt_train_accuracy = accuracy_score(y_train, dt_train_pred)
dt_test_accuracy = accuracy_score(y_test, dt_test_pred)

# Report both figures as percentages
print("Decision Tree Training Accuracy: {:.2f}%".format(dt_train_accuracy * 100))
print("Decision Tree Test Accuracy: {:.2f}%".format(dt_test_accuracy * 100))

Decision Tree Training Accuracy: 100.00%
Decision Tree Test Accuracy: 98.00%


# Naive Bayes

In [42]:
# Gaussian naive Bayes fitted on the training split
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Training split: predictions, accuracy, report
nb_train_pred = nb_model.predict(X_train)
nb_train_accuracy = accuracy_score(y_train, nb_train_pred)
print("Naive Bayes Training Accuracy: {:.2f}%".format(nb_train_accuracy * 100))

# Test split: predictions, accuracy, report
nb_test_pred = nb_model.predict(X_test)
nb_test_accuracy = accuracy_score(y_test, nb_test_pred)
print("Naive Bayes Test Accuracy: {:.2f}%".format(nb_test_accuracy * 100))

Naive Bayes Training Accuracy: 95.00%
Naive Bayes Test Accuracy: 94.50%


# Stacking: Decision Tree and Naive Bayes

In [43]:
# Base learners: a decision tree and a Gaussian naive Bayes model
base_classifiers = [
    ('decision_tree', DecisionTreeClassifier(random_state=42)),
    ('gaussian_nb', GaussianNB()),
]

# Stacking ensemble whose final estimator is another Gaussian naive Bayes
stacking_model = StackingClassifier(
    estimators=base_classifiers,
    final_estimator=GaussianNB(),
)

# Train the ensemble on the training split
stacking_model.fit(X_train, y_train)

# Training split: predictions, accuracy, report
stacking_train_pred = stacking_model.predict(X_train)
stacking_train_accuracy = accuracy_score(y_train, stacking_train_pred)
print("Stacking Training Accuracy: {:.2f}%".format(stacking_train_accuracy * 100))

# Test split: predictions, accuracy, report
stacking_test_pred = stacking_model.predict(X_test)
stacking_test_accuracy = accuracy_score(y_test, stacking_test_pred)
print("Stacking Test Accuracy: {:.2f}%".format(stacking_test_accuracy * 100))

Stacking Training Accuracy: 96.50%
Stacking Test Accuracy: 95.50%


In [44]:
# Export the trained stacking model using pickle
with open('db_model.pkl', 'wb') as file:
    pickle.dump(stacking_model, file)

print("Model exported successfully to 'db_model.pkl'")

# The model was trained on the selected, standardised feature columns, so raw
# inputs have to go through the same column selection and scaling before
# predict() gives meaningful results. Persist those artefacts next to the
# model; 'db_model.pkl' itself is unchanged for existing consumers.
with open('db_preprocessing.pkl', 'wb') as file:
    pickle.dump(
        {'selected_features': list(selected_features), 'scaler': scaler},
        file,
    )

print("Preprocessing exported successfully to 'db_preprocessing.pkl'")

Model exported successfully to 'db_model.pkl'
