#TO PREDICT WATERBORNE DISEASES BASED ON THE MEDICAL REPORT USING DEEP LEARNING


In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D,LSTM, Flatten
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam


In [26]:
# Load the lab-report CSV into a DataFrame and preview its last rows.
DATA_PATH = '/content/2.waterborne_diseases_lab_10k.csv'
dataset = pd.read_csv(DATA_PATH)
dataset.tail(20)

Unnamed: 0,Patient_ID,Age,Gender,Symptoms_Text,Sodium_mmol_L,Potassium_mmol_L,Chloride_mmol_L,WBC_109_per_L,Hemoglobin_g_dL,Platelets_109_per_L,Urea_mg_dL,Creatinine_mg_dL,Bilirubin_mg_dL,ALT_U_L,AST_U_L,Water_Source,Hygiene_Score,Disease
9980,P09981,20,Male,"Patient reports routine checkup, normal appeti...",137.0,5.0,103.0,4.5,14.3,214,18,0.99,0.4,11,12,Tap,1,Healthy
9981,P09982,79,Female,No fever. Patient reports symptoms for 2 days.,135.6,3.7,98.8,8.9,13.3,251,18,0.66,1.0,37,14,River,4,Healthy
9982,P09983,70,Male,"No significant complaints, normal appetite. Pa...",141.0,3.8,106.6,8.8,14.7,327,22,0.85,0.6,12,26,Tap,1,Healthy
9983,P09984,61,Female,"No significant complaints, no fever. Patient r...",138.8,4.1,106.8,4.9,12.1,271,15,0.96,0.5,44,26,Tap,4,Healthy
9984,P09985,70,Female,"Patient reports routine checkup, normal appeti...",143.3,4.5,99.9,6.2,15.3,269,23,0.69,0.2,20,44,Bottled,1,Healthy
9985,P09986,74,Female,No fever. Patient reports symptoms for 1 day.,138.5,4.2,104.3,4.3,12.3,223,17,0.91,0.4,41,22,Well,5,Healthy
9986,P09987,83,Female,"Patient reports no fever, no significant compl...",135.8,4.1,103.6,7.6,14.8,165,19,0.99,0.8,21,36,Well,4,Healthy
9987,P09988,25,Female,"Patient reports routine checkup, recent exposu...",137.2,4.4,104.9,8.9,13.3,260,21,0.73,0.9,17,16,Tap,1,Healthy
9988,P09989,42,Female,"Patient reports normal appetite, no significan...",136.7,4.7,99.0,8.4,13.7,240,27,0.95,0.5,43,44,River,1,Healthy
9989,P09990,74,Male,"Patient reports no significant complaints, nor...",144.1,4.4,99.0,6.1,14.8,290,18,0.89,0.7,43,35,Tap,4,Healthy


In [27]:
# Print the column names, dtypes and non-null counts of the loaded frame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Patient_ID           10000 non-null  object 
 1   Age                  10000 non-null  int64  
 2   Gender               10000 non-null  object 
 3   Symptoms_Text        10000 non-null  object 
 4   Sodium_mmol_L        10000 non-null  float64
 5   Potassium_mmol_L     10000 non-null  float64
 6   Chloride_mmol_L      10000 non-null  float64
 7   WBC_109_per_L        10000 non-null  float64
 8   Hemoglobin_g_dL      10000 non-null  float64
 9   Platelets_109_per_L  10000 non-null  int64  
 10  Urea_mg_dL           10000 non-null  int64  
 11  Creatinine_mg_dL     10000 non-null  float64
 12  Bilirubin_mg_dL      10000 non-null  float64
 13  ALT_U_L              10000 non-null  int64  
 14  AST_U_L              10000 non-null  int64  
 15  Water_Source         10000 non-null  

In [28]:
# Count rows per target label to check how balanced the classes are.
dataset['Disease'].value_counts()

Unnamed: 0_level_0,count
Disease,Unnamed: 1_level_1
Healthy,1112
Cholera,1111
Typhoid,1111
Dysentery,1111
Hepatitis A,1111
Giardiasis,1111
E. coli Infection,1111
Leptospirosis,1111
Shigellosis,1111


In [29]:
# Count missing values per column.
dataset.isna().sum()

Unnamed: 0,0
Patient_ID,0
Age,0
Gender,0
Symptoms_Text,0
Sodium_mmol_L,0
Potassium_mmol_L,0
Chloride_mmol_L,0
WBC_109_per_L,0
Hemoglobin_g_dL,0
Platelets_109_per_L,0


In [30]:
# (rows, columns) of the loaded frame.
dataset.shape

(10000, 18)

In [31]:
import re
def clean_text(s):
    """Normalise a free-text symptom description for tokenisation.

    The value is converted to ``str``, lowercased, and every run of
    characters outside ``a-z`` / ``0-9`` (punctuation, whitespace, any
    non-ASCII character) is replaced by a single space. Leading and
    trailing spaces are removed.

    Args:
        s: The raw text. Non-string values are stringified first.
            ``None`` and float NaN are treated as "no text".

    Returns:
        The cleaned, space-separated string; ``""`` for missing input.
    """
    # Without this guard a missing value would be stringified to the
    # literal token "none" / "nan" and end up in the vocabulary.
    # `s != s` is true only for NaN.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    lowered = str(s).lower()
    # One pass: any run of non-alphanumerics (whitespace included)
    # collapses to a single space.
    return re.sub(r"[^a-z0-9]+", " ", lowered).strip()

# Add a cleaned copy of the symptom text; the tokenizer is fitted on this column.
dataset['text'] = dataset['Symptoms_Text'].map(clean_text)


In [32]:

# Stratified 80/20 split on the disease label; each part gets a fresh
# 0..n-1 index so later positional operations line up.
train_dataset, test_dataset = (
    part.reset_index(drop=True)
    for part in train_test_split(
        dataset,
        test_size=0.20,
        stratify=dataset['Disease'],
        random_state=42,
    )
)


In [33]:
# Text branch: fit a word-level tokenizer on the training text only, then
# turn both splits into fixed-length integer sequences.

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 20000
max_len = 60

tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(train_dataset['text'])


def _encode_texts(texts):
    """Map texts to token-id sequences, post-padded/truncated to max_len."""
    token_ids = tokenizer.texts_to_sequences(texts)
    return pad_sequences(token_ids, maxlen=max_len, padding='post', truncating='post')


X_train_text = _encode_texts(train_dataset['text'])
X_test_text = _encode_texts(test_dataset['text'])


In [34]:
# Feature groups for the tabular branch.
num_cols = [
    'Sodium_mmol_L', 'Potassium_mmol_L', 'Chloride_mmol_L', 'WBC_109_per_L',
    'Hemoglobin_g_dL', 'Platelets_109_per_L', 'Urea_mg_dL', 'Creatinine_mg_dL',
    'Bilirubin_mg_dL', 'ALT_U_L', 'AST_U_L', 'Age', 'Hygiene_Score',
]
cat_cols = ['Gender', 'Water_Source']

# Impute missing numeric values with the TRAINING medians for both splits,
# so no statistic is computed from the test data.
train_medians = train_dataset[num_cols].median()
train_num = train_dataset[num_cols].fillna(train_medians)
test_num = test_dataset[num_cols].fillna(train_medians)


In [35]:
# Scale numeric features and one-hot encode categorical features.
# Every statistic and the column layout come from the training split only.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_num = scaler.fit_transform(train_num)
X_test_num  = scaler.transform(test_num)

# The training split defines the dummy columns (first level of each
# categorical dropped as the baseline). dtype=float keeps the frame
# homogeneous so `.values` never degrades to an object array.
X_train_cat = pd.get_dummies(train_dataset[cat_cols], drop_first=True, dtype=float)

# Encode the test split WITHOUT drop_first and then project onto the training
# columns. Calling drop_first on the test split independently would drop a
# different level whenever a category is absent from that split, silently
# miscoding rows. Levels unseen in training are discarded; training columns
# missing from the test split are filled with 0.
X_test_cat = (
    pd.get_dummies(test_dataset[cat_cols], dtype=float)
    .reindex(columns=X_train_cat.columns, fill_value=0.0)
)


In [36]:
# Tabular feature matrix: scaled numeric columns followed by the one-hot columns.
X_train_tab = np.concatenate([X_train_num, X_train_cat.to_numpy()], axis=1)
X_test_tab = np.concatenate([X_test_num, X_test_cat.to_numpy()], axis=1)

In [37]:
# Encode disease names as integer class ids; the mapping is learned from
# the training labels and reused for the test labels.
le = LabelEncoder().fit(train_dataset['Disease'])
y_train = le.transform(train_dataset['Disease'])
y_test = le.transform(test_dataset['Disease'])
num_classes = le.classes_.shape[0]


#MAKING NEURAL NETWORK

In [38]:
# Build the BiLSTM + tabular fusion model.
#
# Two inputs are processed by separate branches and concatenated:
#   * text_input: padded token-id sequences -> Embedding -> BiLSTM -> Dense
#   * tab_input : scaled numeric + one-hot features -> Dense/BatchNorm/Dense
# The fused vector feeds a softmax over the disease classes.

from tensorflow.keras import layers, models
import tensorflow as tf

# max_words / max_len are the values the tokenizer and pad_sequences were
# configured with earlier; they are reused here rather than redefined so the
# model cannot drift out of sync with the preprocessing.
# +1 accounts for index 0, which pad_sequences uses as the padding value.
vocab_size = min(max_words, len(tokenizer.word_index) + 1)
embed_dim = 128
tab_input_dim = X_train_tab.shape[1]

# --- text branch ---
text_input = layers.Input(shape=(max_len,), name='text_input')
# `input_length` is deprecated on Embedding and is not needed: the sequence
# length is already fixed by the Input shape.
# NOTE(review): padding ids are not masked (mask_zero is left at its default);
# consider enabling it if padded positions should be ignored by the LSTM.
text_branch = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)(text_input)
text_branch = layers.Bidirectional(layers.LSTM(128, return_sequences=False))(text_branch)
text_branch = layers.Dropout(0.4)(text_branch)
text_branch = layers.Dense(64, activation='relu')(text_branch)

# --- tabular branch ---
tab_input = layers.Input(shape=(tab_input_dim,), name='tab_input')
tab_branch = layers.Dense(128, activation='relu')(tab_input)
tab_branch = layers.BatchNormalization()(tab_branch)
tab_branch = layers.Dropout(0.3)(tab_branch)
tab_branch = layers.Dense(64, activation='relu')(tab_branch)

# --- fusion head ---
combined = layers.concatenate([text_branch, tab_branch])
fused = layers.Dense(64, activation='relu')(combined)
fused = layers.Dropout(0.3)(fused)
output = layers.Dense(num_classes, activation='softmax')(fused)

model = models.Model(inputs=[text_input, tab_input], outputs=output)
# Sparse loss because the labels are integer class ids, not one-hot vectors.
model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()




In [39]:
# Fit the fusion model with early stopping, LR decay on plateau,
# best-model checkpointing and balanced class weights.
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.utils import class_weight

# 'balanced' weights, keyed by integer class id.
cw_vals = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(enumerate(cw_vals))

# All three callbacks watch the validation loss.
es = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)
rlr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)
mc = ModelCheckpoint('best_multimodal_model.h5', monitor='val_loss', save_best_only=True)

train_inputs = {'text_input': X_train_text, 'tab_input': X_train_tab}
history = model.fit(
    train_inputs,
    y_train,
    epochs=30,
    batch_size=64,
    validation_split=0.15,  # fraction of the training arrays held out for validation
    class_weight=class_weights,
    callbacks=[es, rlr, mc],
)


Epoch 1/30
[1m106/107[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 48ms/step - accuracy: 0.4426 - loss: 1.5141



[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 78ms/step - accuracy: 0.4466 - loss: 1.5033 - val_accuracy: 0.9308 - val_loss: 0.2254 - learning_rate: 0.0010
Epoch 2/30
[1m103/107[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.9367 - loss: 0.1698



[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - accuracy: 0.9372 - loss: 0.1686 - val_accuracy: 0.9750 - val_loss: 0.0652 - learning_rate: 0.0010
Epoch 3/30
[1m106/107[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.9652 - loss: 0.0892



[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.9652 - loss: 0.0891 - val_accuracy: 0.9758 - val_loss: 0.0544 - learning_rate: 0.0010
Epoch 4/30
[1m106/107[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.9748 - loss: 0.0661



[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9747 - loss: 0.0661 - val_accuracy: 0.9792 - val_loss: 0.0516 - learning_rate: 0.0010
Epoch 5/30
[1m106/107[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.9744 - loss: 0.0720



[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.9745 - loss: 0.0719 - val_accuracy: 0.9767 - val_loss: 0.0493 - learning_rate: 0.0010
Epoch 6/30
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.9761 - loss: 0.0607 - val_accuracy: 0.9783 - val_loss: 0.0497 - learning_rate: 0.0010
Epoch 7/30
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step - accuracy: 0.9817 - loss: 0.0525 - val_accuracy: 0.9758 - val_loss: 0.0556 - learning_rate: 0.0010
Epoch 8/30
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.9778 - loss: 0.0488 - val_accuracy: 0.9792 - val_loss: 0.0517 - learning_rate: 5.0000e-04
Epoch 9/30
[1m105/107[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.9791 - loss: 0.0464



[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9792 - loss: 0.0463 - val_accuracy: 0.9808 - val_loss: 0.0488 - learning_rate: 5.0000e-04
Epoch 10/30
[1m106/107[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.9810 - loss: 0.0457



[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.9810 - loss: 0.0456 - val_accuracy: 0.9792 - val_loss: 0.0476 - learning_rate: 5.0000e-04
Epoch 11/30
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9841 - loss: 0.0401 - val_accuracy: 0.9775 - val_loss: 0.0516 - learning_rate: 5.0000e-04
Epoch 12/30
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9857 - loss: 0.0364 - val_accuracy: 0.9775 - val_loss: 0.0503 - learning_rate: 5.0000e-04
Epoch 13/30
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.9845 - loss: 0.0360 - val_accuracy: 0.9792 - val_loss: 0.0537 - learning_rate: 2.5000e-04
Epoch 14/30
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.9836 - loss: 0.0390 - val_accuracy: 0.9775 - val_loss: 0.0479 - le

In [40]:
 #evaluation
from sklearn.metrics import classification_report, confusion_matrix

# Reload the checkpointed weights, then score the held-out test split.
model.load_weights('best_multimodal_model.h5')
test_inputs = {'text_input': X_test_text, 'tab_input': X_test_tab}
y_pred_probs = model.predict(test_inputs)
y_pred = np.argmax(y_pred_probs, axis=1)  # most probable class id per row

# Per-class precision/recall/F1, reported with the original disease names.
print(classification_report(y_test, y_pred, target_names=le.classes_))

# Confusion matrix of y_test against y_pred.
cm = confusion_matrix(y_test, y_pred)
print(cm)


63/63 ━━━━━━━━━━━━━━━━━━━━ 1s 10ms/step
                   precision    recall  f1-score   support

          Cholera       1.00      1.00      1.00       222
        Dysentery       0.95      0.97      0.96       222
E. coli Infection       0.96      0.96      0.96       222
       Giardiasis       1.00      1.00      1.00       222
          Healthy       1.00      1.00      1.00       223
      Hepatitis A       1.00      1.00      1.00       222
    Leptospirosis       1.00      0.99      1.00       222
      Shigellosis       0.93      0.91      0.92       223
          Typhoid       1.00      1.00      1.00       222

         accuracy                           0.98      2000
        macro avg       0.98      0.98      0.98      2000
     weighted avg       0.98      0.98      0.98      2000

[[222   0   0   0   0   0   0   0   0]
 [  0 216   0   0   0   0   0   6   0]
 [  0   0 213   0   0   0   0   9   0]
 [  0   0   0 222   0   0   0   0   0]


In [41]:
# Placeholder cell: prints an empty line.
print()




In [42]:
# Predict the disease for one new patient, using the preprocessing objects
# fitted on the training split (tokenizer, medians, scaler, dummy layout,
# label encoder).

# Example new patient data (replace with actual data)
new_patient_data = {
    'Age': 45,
    'Gender': 'Female',
    'Symptoms_Text': 'Patient reports fever and abdominal pain',
    'Sodium_mmol_L': 135.0,
    'Potassium_mmol_L': 4.0,
    'Chloride_mmol_L': 100.0,
    'WBC_109_per_L': 9.0,
    'Hemoglobin_g_dL': 14.0,
    'Platelets_109_per_L': 250,
    'Urea_mg_dL': 20,
    'Creatinine_mg_dL': 0.8,
    'Bilirubin_mg_dL': 0.5,
    'ALT_U_L': 30,
    'AST_U_L': 25,
    'Water_Source': 'Tap',
    'Hygiene_Score': 3
}

# One-row DataFrame so the same column-wise preprocessing as training applies.
new_patient_df = pd.DataFrame([new_patient_data])

# Text branch: clean, tokenize and pad exactly as for the training text.
new_patient_df['text'] = new_patient_df['Symptoms_Text'].apply(clean_text)
X_new_patient_text = pad_sequences(
    tokenizer.texts_to_sequences(new_patient_df['text']),
    maxlen=max_len, padding='post', truncating='post',
)

# Numeric branch: fill gaps with the training medians, then apply the fitted scaler.
new_patient_num = new_patient_df[num_cols].fillna(train_dataset[num_cols].median())
X_new_patient_num = scaler.transform(new_patient_num)

# Categorical branch: encode WITHOUT drop_first and project onto the training
# dummy columns. With a single row every categorical column has exactly one
# level, so drop_first=True would drop all of them and the patient would
# always be encoded as the baseline level (all zeros), e.g. 'Tap' would be
# lost. Levels not seen in training fall back to all zeros for that feature.
# astype(float) keeps `.values` numeric when bool and filled columns are mixed.
new_patient_cat = (
    pd.get_dummies(new_patient_df[cat_cols])
    .reindex(columns=X_train_cat.columns, fill_value=0)
    .astype(float)
)

# Combine numerical and categorical features in the training column order.
X_new_patient_tab = np.hstack([X_new_patient_num, new_patient_cat.values])

# Make a prediction and take the most probable class id.
y_new_patient_pred_probs = model.predict({'text_input': X_new_patient_text, 'tab_input': X_new_patient_tab})
y_new_patient_pred = y_new_patient_pred_probs.argmax(axis=1)

# Decode the predicted class id back to the disease name.
predicted_disease = le.inverse_transform(y_new_patient_pred)

print(f"Predicted Disease: {predicted_disease[0]}")

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 34ms/step
Predicted Disease: Typhoid


In [43]:
import joblib

# Save preprocessing tools
joblib.dump(tokenizer, "tokenizer.joblib")
joblib.dump(scaler, "scaler.joblib")

# The fitted objects alone cannot reproduce the inference preprocessing:
# it also needs the feature lists, the training medians used for imputation,
# the one-hot column layout and the padded sequence length. Persist them too.
joblib.dump(
    {
        "num_cols": list(num_cols),
        "cat_cols": list(cat_cols),
        "num_medians": train_dataset[num_cols].median(),
        "cat_feature_columns": list(X_train_cat.columns),
        "max_len": max_len,
    },
    "preprocessing_meta.joblib",
)

joblib.dump(le, "label_encoder.joblib")


['label_encoder.joblib']