## Deeplearning Classifier

In [33]:
# Import necessary libraries
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import numpy as np

In [12]:
# Load the dataset
df = pd.read_csv('Symptom2Disease.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,Psoriasis,I have been experiencing joint pain in my fing...
3,3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,Psoriasis,"My nails have small dents or pits in them, and..."


In [14]:
num_categories = df['label'].nunique()
print("The number of categories of diseases are:", num_categories)

The number of categories of diseases are: 24


In [15]:
# Split the dataset into input (text) and output (label) columns
text_data = df['text'].values
labels = df['label'].values

# Encode the labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

In [16]:
# Split the dataset into training and testing sets
text_train, text_test, labels_train, labels_test = train_test_split(text_data, labels, test_size=0.2, random_state=42)

# Create a tokenizer and fit it on the training text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_train)

# Convert the text data to sequences of tokens
sequences_train = tokenizer.texts_to_sequences(text_train)
sequences_test = tokenizer.texts_to_sequences(text_test)

# Pad the sequences to have the same length
max_sequence_length = max(len(seq) for seq in sequences_train)
sequences_train = pad_sequences(sequences_train, maxlen=max_sequence_length)
sequences_test = pad_sequences(sequences_test, maxlen=max_sequence_length)

# Determine the number of classes
num_classes = len(label_encoder.classes_)

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length),
    tf.keras.layers.Dropout(0.5),  # Add dropout layer
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(0.5),  # Add dropout layer
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),  # Add dropout layer
    tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),  # Add L2 regularization
    tf.keras.layers.Dropout(0.5),  # Add dropout layer
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),  # Add dropout layer
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

# Train the model
model.fit(sequences_train, labels_train, epochs=50, batch_size=32)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x2c93fd41310>

In [36]:
_, accuracy = model.evaluate(sequences_test, labels_test)
print("Accuracy:", accuracy)

Accuracy: 0.8708333373069763


In [34]:
# Predict on the test set
predictions = model.predict(sequences_test)
predicted_labels = np.argmax(predictions, axis=1)

# Decode the predicted labels
predicted_labels = label_encoder.inverse_transform(predicted_labels)

# Generate the classification report
report = classification_report(label_encoder.inverse_transform(labels_test), predicted_labels)
print(report)


                                 precision    recall  f1-score   support

                           Acne       1.00      1.00      1.00         7
                      Arthritis       1.00      1.00      1.00        10
               Bronchial Asthma       1.00      1.00      1.00        11
           Cervical spondylosis       1.00      1.00      1.00         7
                    Chicken pox       0.55      0.50      0.52        12
                    Common Cold       1.00      0.83      0.91        12
                         Dengue       0.56      0.75      0.64        12
          Dimorphic Hemorrhoids       1.00      1.00      1.00         7
               Fungal infection       1.00      0.85      0.92        13
                   Hypertension       1.00      1.00      1.00        10
                       Impetigo       0.83      0.91      0.87        11
                       Jaundice       1.00      1.00      1.00        11
                        Malaria       1.00      1.

In [37]:
# Create a function for prediction
def predict_disease(text):
    # Load the trained model
    model = tf.keras.models.load_model('disease_model.h5')

    # Preprocess the input text
    sequence = tokenizer.texts_to_sequences([text])
    sequence = pad_sequences(sequence, maxlen=max_sequence_length)

    # Make the prediction
    predicted_probs = model.predict(sequence)[0]
    predicted_class = tf.argmax(predicted_probs).numpy()
    predicted_label = label_encoder.inverse_transform([predicted_class])[0]
    # Calculate the confidence score
    predicted_prob = predicted_probs[predicted_class]

    # Return the predicted class and confidence score
    return predicted_label, predicted_prob

In [101]:
# Test the predict_disease function
input_text = "My nails have small dents or pits in them, and they often feel inflammatory and tender to the touch. Even there are minor rashes on my arms."
predicted_class = predict_disease(input_text)
print(predicted_class)

('Psoriasis', 0.9839914)


## Random Forest Classifier

In [22]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [23]:
# Read the data from the CSV file
data = pd.read_csv('Symptom2Disease.csv')

In [24]:
data.head()

Unnamed: 0.1,Unnamed: 0,label,text
0,0,Psoriasis,I have been experiencing a skin rash on my arm...
1,1,Psoriasis,"My skin has been peeling, especially on my kne..."
2,2,Psoriasis,I have been experiencing joint pain in my fing...
3,3,Psoriasis,"There is a silver like dusting on my skin, esp..."
4,4,Psoriasis,"My nails have small dents or pits in them, and..."


In [28]:

# Extract symptom texts and disease names from the dataframe
symptoms = data['text'].tolist()
diseases = data['label'].tolist()

# Convert symptom texts into numerical feature vectors using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(symptoms)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, diseases, test_size=0.2, random_state=42)

# Initialize a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

# Evaluate the classifier
accuracy = rf_classifier.score(X_test, y_test)
print("Accuracy:", accuracy)


Accuracy: 0.9708333333333333


In [29]:
# Calculate precision, recall, and F1-score
classification_report = classification_report(y_test, y_pred, digits=4)
print("Classification Report:\n", classification_report)

Classification Report:
                                  precision    recall  f1-score   support

                           Acne     1.0000    1.0000    1.0000         7
                      Arthritis     1.0000    1.0000    1.0000        10
               Bronchial Asthma     1.0000    1.0000    1.0000        11
           Cervical spondylosis     1.0000    1.0000    1.0000         7
                    Chicken pox     0.8462    0.9167    0.8800        12
                    Common Cold     1.0000    0.9167    0.9565        12
                         Dengue     0.9000    0.7500    0.8182        12
          Dimorphic Hemorrhoids     1.0000    1.0000    1.0000         7
               Fungal infection     1.0000    1.0000    1.0000        13
                   Hypertension     1.0000    1.0000    1.0000        10
                       Impetigo     1.0000    1.0000    1.0000        11
                       Jaundice     1.0000    1.0000    1.0000        11
                        Ma

In [51]:
# Export the trained model to a file
import pickle
model_filename = 'random_forest_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(rf_classifier, file)

# Function to predict disease label from input text
def predict_disease(text):
    # Load the saved model
    with open(model_filename, 'rb') as file:
        model = pickle.load(file)
    
    # Transform the input text into numerical features
    input_vector = vectorizer.transform([text])
    
    # Predict the disease label
    predicted_label = model.predict(input_vector)
    confidence_scores = model.predict_proba(input_vector)
    return predicted_label[0], confidence_scores[0][predicted_label[0]]

In [None]:
predict_disease("I have been experiencing a skin rash on my arms, legs, and torso for the past few weeks. It is red, itchy, and covered in dry, scaly patches.")