In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer , StandardScaler,LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import joblib
import pickle 
import matplotlib.pyplot as plt
import tensorflow as tf




In [2]:
# Loading the datasets and visualising 
df = pd.read_csv("Final_Augmented_dataset_Diseases_and_Symptoms.csv")
df.head()

Unnamed: 0,diseases,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,...,stuttering or stammering,problems with orgasm,nose deformity,lump over jaw,sore in nose,hip weakness,back swelling,ankle stiffness or tightness,ankle weakness,neck weakness
0,panic disorder,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,panic disorder,0,0,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,panic disorder,1,1,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,panic disorder,1,0,0,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,panic disorder,1,1,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Cleaning column names
df.columns = df.columns.str.strip() 

#Ensuring that 'Disease' column exists
assert 'Disease' in df.columns or 'diseases' in df.columns, "No 'Disease' column found."
if 'diseases' in df.columns:
    df.rename(columns={'diseases': 'Disease'}, inplace=True)

In [4]:
# Removing duplicates
df = df.drop_duplicates()



In [5]:
# Removing rare diseases (less than 5 samples)
min_count = 5
disease_counts = df['Disease'].value_counts()
df = df[df['Disease'].isin(disease_counts[disease_counts >= min_count].index)]


In [6]:

# Rebuild X and y AFTER filtering
#taken feature in diff variable
feature_columns = [col for col in df.columns if col != 'Disease']
X = df[feature_columns]
y_raw = df['Disease']



In [7]:
#  Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y_raw)
num_classes = len(np.unique(y_encoded))


In [8]:
# Save it as label_encoder.pkl
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

In [9]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42)

In [10]:
# One-hot encode y for DNN
y_train_oh = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
y_test_oh = tf.keras.utils.to_categorical(y_test, num_classes=num_classes)

In [11]:
# Train Random Forest
model = RandomForestClassifier(n_estimators=10, random_state=42)
model.fit(X_train, y_train)

In [12]:
#DNN Model
# Build model
dnn_model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax')
    ])

dnn_model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])




  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Train
dnn_model.fit(X_train, y_train_oh, epochs=20, batch_size=64, validation_split=0.1)

Epoch 1/20
[1m2131/2131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.4792 - loss: 2.7544 - val_accuracy: 0.7975 - val_loss: 0.6119
Epoch 2/20
[1m2131/2131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8092 - loss: 0.5498 - val_accuracy: 0.8128 - val_loss: 0.5100
Epoch 3/20
[1m2131/2131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8246 - loss: 0.4655 - val_accuracy: 0.8150 - val_loss: 0.4753
Epoch 4/20
[1m2131/2131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8298 - loss: 0.4394 - val_accuracy: 0.8199 - val_loss: 0.4654
Epoch 5/20
[1m2131/2131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8339 - loss: 0.4218 - val_accuracy: 0.8191 - val_loss: 0.4611
Epoch 6/20
[1m2131/2131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8361 - loss: 0.4107 - val_accuracy: 0.8207 - val_loss: 0.4588
Epoch 7/20
[1m2

In [None]:
# Getting prediction and evaluation from DNN
y_pred_probs = dnn_model.predict(X_test)

# Convert probabilities to class indices
y_pred_dnn = np.argmax(y_pred_probs, axis=1)

# Accuracy score :)
dnn_accuracy = accuracy_score(y_test, y_pred_dnn)
print(f" DNN Accuracy: {dnn_accuracy * 100:.2f}%")

# Classification report
print("\n=== DNN Classification Report ===")
print(classification_report(y_test, y_pred_dnn, target_names=le.classes_))

[1m1184/1184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 701us/step
 DNN Accuracy: 81.89%

=== DNN Classification Report ===
                                                          precision    recall  f1-score   support

                               abdominal aortic aneurysm       1.00      0.88      0.93         8
                                        abdominal hernia       0.94      0.92      0.93        53
                                         abscess of nose       0.73      0.73      0.73        30
                                     abscess of the lung       1.00      1.00      1.00         1
                                  abscess of the pharynx       0.70      0.79      0.74        33
                                    acanthosis nigricans       0.50      1.00      0.67         2
                                               acariasis       1.00      0.50      0.67         2
                                               achalasia       0.50      0.40  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
#  Prediction using Random Forest model
y_pred_rf = model.predict(X_test)

# Accuracy of the model :)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"\n Random Forest Accuracy: {rf_accuracy * 100:.2f}%")

#  Classification Report of the model
print("\n Classification Report (Random Forest):")
print(classification_report(y_test, y_pred_rf, target_names=le.classes_))



 Random Forest Accuracy: 78.10%

 Classification Report (Random Forest):
                                                          precision    recall  f1-score   support

                               abdominal aortic aneurysm       1.00      0.62      0.77         8
                                        abdominal hernia       0.96      0.92      0.94        53
                                         abscess of nose       0.51      0.67      0.58        30
                                     abscess of the lung       0.50      1.00      0.67         1
                                  abscess of the pharynx       0.71      0.67      0.69        33
                                    acanthosis nigricans       1.00      0.50      0.67         2
                                               acariasis       1.00      0.50      0.67         2
                                               achalasia       0.12      0.20      0.15         5
                                           

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
with open("random_forest_model.pkl", "wb") as f:
    pickle.dump(model, f)

dnn_model.save("dnn_model.keras")
