In [2]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense


In [3]:
# Raw dataset: one row per case with a 'Disease' label and Symptom_1..Symptom_17 columns.
DATASET_URL = 'https://raw.githubusercontent.com/waffleliew/Medical_Diagnosis_Deep_Learning/main/dataset.csv'

data = pd.read_csv(DATASET_URL)
data.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,
1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,,,,,,,,,,,,,
3,Fungal infection,itching,skin_rash,dischromic _patches,,,,,,,,,,,,,,
4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,,,,,,,,,,,,,


In [4]:

# for patient in data['Disease']:
#     for symptom_col in ['Symptom_' + str(i+1) for i in range(0, 5)]:
#         symptom_data = data[symptom_col].astype(str)  # Convert to string type
#         symptom_data = symptom_data.fillna("a")  # Replace NaN with the placeholder string "a"


In [31]:
# Turn each row's symptoms into one whitespace-separated "sentence" and map
# every symptom token to an integer id.

# Missing symptoms become empty strings; rebinding keeps the cell free of
# in-place mutation while downstream cells still see the filled frame.
data = data.fillna("")

# Collect the symptom columns in file order instead of spelling out all 17 names,
# so the cell keeps working if the dataset gains or loses symptom columns.
symptom_columns = [col for col in data.columns if str(col).startswith('Symptom_')]

# Join with an explicit space: plain string concatenation only stays tokenizable
# when every raw value happens to start with a blank. Collapsing whitespace
# afterwards removes the padding left behind by empty (missing) symptoms.
texts = (
    data[symptom_columns]
    .astype(str)
    .agg(' '.join, axis=1)
    .str.split()
    .str.join(' ')
)

# NOTE(review): raw values such as 'dischromic _patches' contain an internal
# space, so they end up as two separate tokens — confirm this is intended.
# Only ',' is filtered so the underscores inside symptom names survive.
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>", filters=',')
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Print the results
print("Texts:", texts)
print("Tokenizer word index:", tokenizer.word_index)
print("Sequences:", sequences)



Texts: 0       itching skin_rash nodal_skin_eruptions dischro...
1        skin_rash nodal_skin_eruptions dischromic _pa...
2        itching nodal_skin_eruptions dischromic _patches
3                   itching skin_rash dischromic _patches
4                  itching skin_rash nodal_skin_eruptions
                              ...                        
4915     vomiting headache nausea spinning_movements l...
4916     skin_rash pus_filled_pimples blackheads scurring
4917     burning_micturition bladder_discomfort foul_s...
4918     skin_rash joint_pain skin_peeling silver_like...
4919     skin_rash high_fever blister red_sore_around_...
Length: 4920, dtype: object
Tokenizer word index: {'<OOV>': 1, 'fatigue': 2, 'vomiting': 3, 'high_fever': 4, 'loss_of_appetite': 5, 'nausea': 6, 'headache': 7, 'abdominal_pain': 8, 'yellowish_skin': 9, 'yellowing_of_eyes': 10, 'chills': 11, 'skin_rash': 12, 'malaise': 13, 'chest_pain': 14, 'joint_pain': 15, 'itching': 16, 'sweating': 17, 'dark_urine': 1

In [7]:
# Right-pad every id sequence with zeros up to the length of the longest one,
# producing a rectangular integer matrix for the model.
max_length = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_length)
print(padded_sequences)

[[ 16  12 114 ...   0   0   0]
 [ 12 114 115 ...   0   0   0]
 [ 16 114 115 ...   0   0   0]
 ...
 [ 52 105 134 ...   0   0   0]
 [ 12  15 107 ...   0   0   0]
 [ 12   4 111 ...   0   0   0]]


In [8]:
# Disease names -> integer class ids -> one-hot rows.
label_encoder_disease = LabelEncoder()
label_encoder_disease.fit(data['Disease'])

disease_labels = label_encoder_disease.transform(data['Disease'])
disease_labels_categorical = to_categorical(disease_labels)

# Show the integer ids first, then their one-hot encoding.
print(disease_labels)
print(disease_labels_categorical)


[15 15 15 ... 38 35 27]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [9]:
# Number of distinct tokens seen while fitting the tokenizer; used as a guide
# for choosing the Embedding layer's input_dim.
# NOTE(review): the word index also holds the '<OOV>' entry and id 0 is used
# for padding, so input_dim presumably needs to exceed this count by at least
# two — confirm.
word_counts = tokenizer.word_counts
num_unique_words = len(word_counts.keys())
print(num_unique_words)

134


In [53]:

# Network graph: token ids -> embeddings -> LSTM summary vector -> softmax over diseases.

# `shape` has to be a tuple. `(max_length)` is just the bare int, which only
# works on Keras versions that silently coerce ints, so use `(max_length,)`.
input_layer = Input(shape=(max_length,))

# input_dim must be larger than the highest token id the tokenizer can emit.
embedding = Embedding(input_dim=200, output_dim=41)(input_layer)
lstm_layer = LSTM(40)(embedding)

# One softmax unit per disease class known to the label encoder.
disease_output = Dense(
    len(label_encoder_disease.classes_),
    activation='softmax',
    name='disease_output',
)(lstm_layer)


In [54]:
# Wrap the layer graph in a Model and configure training for the single
# softmax head; loss and metrics are keyed by the output layer's name.
model = Model(inputs=input_layer, outputs=[disease_output])

loss_by_output = {'disease_output': 'categorical_crossentropy'}
metrics_by_output = {'disease_output': ['accuracy']}

model.compile(
    optimizer='adam',
    loss=loss_by_output,
    metrics=metrics_by_output,
)

model.summary()


Model: "model_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_11 (InputLayer)       [(None, 17)]              0         
                                                                 
 embedding_10 (Embedding)    (None, 17, 41)            8200      
                                                                 
 lstm_10 (LSTM)              (None, 40)                13120     
                                                                 
 disease_output (Dense)      (None, 41)                1681      
                                                                 
Total params: 23001 (89.85 KB)
Trainable params: 23001 (89.85 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [55]:
# Train the classifier with early stopping.
# Import from tensorflow.keras like the rest of the notebook, so the callback
# and the model come from the same Keras implementation.
from tensorflow.keras.callbacks import EarlyStopping

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Keep the History object so the loss/accuracy curves can be inspected later.
# validation_split=0.2 holds out a fifth of the rows for validation.
history = model.fit(
    padded_sequences,
    {'disease_output': disease_labels_categorical},
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7b39b9bb0e80>

In [64]:
def make_prediction(text):
    """Predict a disease from a symptom string and print the result.

    Relies on the notebook-level ``tokenizer``, ``max_length``, ``model`` and
    ``label_encoder_disease`` objects created in earlier cells.

    Args:
        text: Symptom names in a single string (e.g. "vomiting, fatigue").
            Commas are removed by the tokenizer's filter, so comma- and/or
            space-separated input both work.

    Returns:
        None. The predicted class index, the decoded disease name and the
        intermediate values are printed.
    """
    # Preprocessing the input: same tokenization and post-padding as the training data
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post')

    # Making prediction
    prediction = model.predict(padded_sequence)

    # Decoding the prediction: index of the highest probability per input row
    disease_index = np.argmax(prediction, axis=1)
    print(disease_index)

    # argmax already yields a 1-D array of class ids; wrapping it in a list
    # made it 2-D and triggered sklearn's column_or_1d conversion warning.
    disease_predicted = label_encoder_disease.inverse_transform(disease_index)

    print(f"Predicted Disease: {disease_predicted}")

    # for checking
    print('Symptoms:',text)
    print('Sequence:',sequence)
    print('Prediction table:',prediction)


# Example query: a comma-separated list of symptoms for one patient.
patient_input = (
    "vomiting, fatigue, anxiety, sweating, headache, nausea, "
    "blurred_and_distorted_vision, excessive_hunger, "
    "drying_and_tingling_lips, slurred_speech, irritability, palpitations"
)
make_prediction(patient_input)


[25]
Predicted Disease: ['Hypoglycemia']
Symptoms: vomiting, fatigue, anxiety, sweating, headache, nausea, blurred_and_distorted_vision, excessive_hunger, drying_and_tingling_lips, slurred_speech, irritability, palpitations
Sequence: [[3, 2, 99, 17, 7, 6, 30, 23, 100, 71, 21, 72]]
Prediction table: [[3.8763872e-09 6.2807981e-09 7.5510613e-12 1.2554065e-07 7.1105228e-09
  1.0169329e-10 1.7185408e-11 2.4487679e-10 1.8192213e-10 1.1869146e-06
  1.3967036e-10 6.1545249e-09 3.7815582e-06 2.5959392e-09 2.5320223e-11
  4.8396971e-09 3.1542612e-11 3.7018470e-12 3.0004561e-08 3.6165170e-07
  3.1463812e-09 6.7368353e-07 1.8061516e-07 1.9470110e-09 9.0540755e-07
  9.9999058e-01 7.2274825e-10 1.1811787e-07 4.1198653e-09 4.3611834e-09
  5.1872362e-10 1.3232221e-07 7.1630762e-10 7.5961792e-11 3.4506773e-09
  1.8992325e-06 9.9074038e-10 4.8632050e-11 2.1105679e-09 3.2046507e-10
  1.9749212e-08]]


  y = column_or_1d(y, warn=True)
