In [24]:
import pandas as pd, numpy as np, joblib, random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.metrics import SparseTopKCategoricalAccuracy
from tensorflow.keras.models import load_model
import tensorflow as tf

In [2]:
df = pd.read_csv("Disease_Final_cleaned.csv")

In [3]:
df.head()

Unnamed: 0,disease,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,...,back swelling,ankle stiffness or tightness,ankle weakness,neck weakness,gender_enc,smoking_enc,alcohol_enc,age_scaled,height_scaled,weight_scaled
0,panic disorder,1,0,1,1,0,0,0,0,1,...,0,0,0,0,0,2,2,-0.685567,0.582079,1.634411
1,panic disorder,0,0,1,1,0,1,1,0,0,...,0,0,0,0,0,2,2,-0.685567,0.582079,1.634411
2,panic disorder,1,1,1,1,0,1,1,0,0,...,0,0,0,0,0,2,2,-0.685567,0.582079,1.634411
3,panic disorder,1,0,0,1,0,1,1,1,0,...,0,0,0,0,0,2,2,-0.685567,0.582079,1.634411
4,panic disorder,1,1,0,0,0,0,1,1,1,...,0,0,0,0,0,2,2,-0.685567,0.582079,1.634411


In [4]:
df["disease"].value_counts()

Unnamed: 0_level_0,count
disease,Unnamed: 1_level_1
cystitis,1219
nose disorder,1218
vulvodynia,1218
complex regional pain syndrome,1217
spondylosis,1216
...,...
uterine cancer,2
open wound of the abdomen,2
heart contusion,2
cushing syndrome,2


In [5]:
X = df.drop(columns=['disease'])
y = df['disease']

In [6]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

pd.Series(y_encoded).to_csv("disease_labels.csv", index=False)

In [7]:
joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']

In [25]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y_encoded, test_size=0.3, stratify=y_encoded, random_state=42
)

In [26]:
# Define dimensions
num_features = X.shape[1]
num_classes = len(np.unique(y_encoded))

# Build deeper, wider model
model = Sequential([
    Dense(2048, input_shape=(num_features,), activation='relu'),
    BatchNormalization(),
    Dropout(0.5),

    Dense(1024, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),

    Dense(512, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    Dense(num_classes, activation='softmax')
])

# Compile with Adam optimizer and learning rate scheduler
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=0.0005),  # slightly reduced to stabilize
    metrics=[
        'accuracy',
        SparseTopKCategoricalAccuracy(k=3, name='top_3_accuracy'),
        SparseTopKCategoricalAccuracy(k=5, name='top_5_accuracy')
    ]
)

# Callbacks
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=6,
    restore_best_weights=True,
    verbose=1
)

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=1e-6,
    verbose=1
)

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=512,  # reduced for finer weight updates
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 35ms/step - accuracy: 0.3619 - loss: 4.5353 - top_3_accuracy: 0.4506 - top_5_accuracy: 0.4881 - val_accuracy: 0.1987 - val_loss: 3.7311 - val_top_3_accuracy: 0.3846 - val_top_5_accuracy: 0.4846 - learning_rate: 5.0000e-04
Epoch 2/100
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8955 - loss: 0.7672 - top_3_accuracy: 0.9361 - top_5_accuracy: 0.9472 - val_accuracy: 0.9622 - val_loss: 0.2687 - val_top_3_accuracy: 0.9760 - val_top_5_accuracy: 0.9791 - learning_rate: 5.0000e-04
Epoch 3/100
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.9613 - loss: 0.2993 - top_3_accuracy: 0.9780 - top_5_accuracy: 0.9818 - val_accuracy: 0.9858 - val_loss: 0.0777 - val_top_3_accuracy: 0.9914 - val_top_5_accuracy: 0.9933 - learning_rate: 5.0000e-04
Epoch 4/100
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0

In [28]:
# Evaluate the model on validation data
val_metrics = model.evaluate(X_val, y_val, verbose=0)

# Print evaluation results
print(f"\nValidation Accuracy (Top-1): {val_metrics[1]:.4f}")
print(f"Validation Accuracy (Top-3): {val_metrics[2]:.4f}")
print(f"Validation Accuracy (Top-5): {val_metrics[3]:.4f}")

# Save the model in HDF5 format
model.save("disease_model.h5")
print("\nModel saved to 'disease_model.h5'")

# Save the model in native Keras format
model.save("disease_model.keras")
print("Model also saved to 'disease_model.keras'")




Validation Accuracy (Top-1): 0.9992
Validation Accuracy (Top-3): 0.9999
Validation Accuracy (Top-5): 1.0000

Model saved to 'disease_model.h5'
Model also saved to 'disease_model.keras'


In [29]:
pd.set_option('display.max_rows', None)
input_columns = X.columns.tolist()

In [30]:
model = load_model("disease_model.h5")
label_encoder = joblib.load("label_encoder.pkl")
scaler = joblib.load("scaler.pkl")
le_gender = joblib.load("le_gender.pkl")
le_smoking = joblib.load("le_smoking.pkl")
le_alcohol = joblib.load("le_alcohol.pkl")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [31]:
non_symptom_cols = [
    'gender_enc', 'smoking_enc', 'alcohol_enc',
    'age_scaled', 'height_scaled', 'weight_scaled'
]

sample_input = {
    col: random.randint(0, 1)
    for col in input_columns if col not in non_symptom_cols
}

In [32]:
gender = "M"
smoking = "Never"
alcohol = "Never"

gender_encoded = le_gender.transform([gender])[0]
smoking_encoded = le_smoking.transform([smoking])[0]
alcohol_encoded = le_alcohol.transform([alcohol])[0]

age = 37
height_cm = 175
weight_kg = 70

scaled_values = scaler.transform([[age, height_cm, weight_kg]])
age_scaled, height_scaled, weight_scaled = scaled_values[0]



In [33]:
sample_input.update({
    'gender_enc': gender_encoded,
    'smoking_enc': smoking_encoded,
    'alcohol_enc': alcohol_encoded,
    'age_scaled': age_scaled,
    'height_scaled': height_scaled,
    'weight_scaled': weight_scaled
})

In [34]:
sample_input

{'anxiety and nervousness': 0,
 'depression': 1,
 'shortness of breath': 1,
 'depressive or psychotic symptoms': 0,
 'sharp chest pain': 0,
 'dizziness': 0,
 'insomnia': 0,
 'abnormal involuntary movements': 0,
 'chest tightness': 1,
 'palpitations': 1,
 'irregular heartbeat': 0,
 'breathing fast': 1,
 'hoarse voice': 1,
 'sore throat': 1,
 'difficulty speaking': 1,
 'cough': 1,
 'nasal congestion': 0,
 'throat swelling': 0,
 'diminished hearing': 1,
 'lump in throat': 0,
 'throat feels tight': 1,
 'difficulty in swallowing': 0,
 'skin swelling': 1,
 'retention of urine': 0,
 'groin mass': 1,
 'leg pain': 0,
 'hip pain': 1,
 'suprapubic pain': 1,
 'blood in stool': 1,
 'lack of growth': 0,
 'emotional symptoms': 0,
 'elbow weakness': 1,
 'back weakness': 0,
 'pus in sputum': 1,
 'symptoms of the scrotum and testes': 1,
 'swelling of scrotum': 0,
 'pain in testicles': 1,
 'flatulence': 0,
 'pus draining from ear': 0,
 'jaundice': 1,
 'mass in scrotum': 0,
 'white discharge from eye': 0,

In [35]:
input_df = pd.DataFrame([sample_input])
input_df.head()

Unnamed: 0,anxiety and nervousness,depression,shortness of breath,depressive or psychotic symptoms,sharp chest pain,dizziness,insomnia,abnormal involuntary movements,chest tightness,palpitations,...,back swelling,ankle stiffness or tightness,ankle weakness,neck weakness,gender_enc,smoking_enc,alcohol_enc,age_scaled,height_scaled,weight_scaled
0,0,1,1,0,0,0,0,0,1,1,...,1,1,1,0,2,2,2,-0.725525,0.798173,-0.087499


In [36]:
input_array = np.array(input_df).reshape(1, -1)
print(input_array.shape)

(1, 383)


In [37]:
# Get predicted probabilities (1, 727)
predicted_probs = model.predict(input_array)[0]  # Shape: (num_classes,)

# Get top 3 class indices (sorted by probability descending)
top_3_indices = np.argsort(predicted_probs)[-3:][::-1]  # Last 3, reversed

# Decode the class labels
top_3_labels = label_encoder.inverse_transform(top_3_indices)

# Print top 3 results with their confidence scores
print("\nTop 3 Predicted Diseases:")
for i, idx in enumerate(top_3_indices):
    print(f"{i+1}. {top_3_labels[i]} — Confidence: {predicted_probs[idx]*100:.2f}%")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 615ms/step

Top 3 Predicted Diseases:
1. impetigo — Confidence: 55.75%
2. stye — Confidence: 5.97%
3. deviated nasal septum — Confidence: 5.51%
