In [6]:
import pandas as pd
import os
import numpy as np

# Path of the scraped symptom data, relative to the notebook's working directory.
file = 'scraped_data.csv'

# Keep the untouched frame around so the cleaning step below can be re-run safely.
raw_df = pd.read_csv(file)

# DataFrame.dropna() returns a NEW frame instead of modifying the original, so
# its result must be assigned; calling it as a bare statement drops nothing.
# The index is rebuilt so positions stay contiguous after rows are removed.
df = raw_df.dropna().reset_index(drop=True)

# Last expression of the cell: show the cleaned frame.
df


Unnamed: 0,Disease,Keyword,Website,Symptoms
0,Benign Tumors,Seborrheic Keratosis,https://www.mayoclinic.org/diseases-conditions...,"A round or oval-shaped waxy or rough bump, typ..."
1,Benign Tumors,Epidermal Cyst,https://www.mayoclinic.org/diseases-conditions...,"\nA small, round bump under the skin, usually ..."
2,Benign Tumors,Sebaceous Hyperplasia,https://www.healthline.com/health/skin-lumps/s...,yellowish or flesh-colored bumps on the skin. ...
3,Benign Tumors,Keloid,https://www.mayoclinic.org/diseases-conditions...,"\nThick, irregular scarring, typically on the ..."
4,Lupus and other Connective Tissue diseases,Chilblains Perniosis,https://dermnetnz.org/topics/chilblains,Itch and/or burning pain\nLocalised swelling\n...
...,...,...,...,...
441,Atopic Dermatitis,diaper Derm,https://www.nhs.uk/conditions/baby/caring-for-...,Symptoms of nappy rash can include:red or raw ...
442,Atopic Dermatitis,hyper Linear Crease,https://www.nhs.uk/conditions/joint-hypermobil...,You or your child may have joint hypermobility...
443,Atopic Dermatitis,kerPilaris Florid,https://www.nhs.uk/conditions/keratosis-pilaris/,Symptoms of keratosis pilaris may include:You ...
444,Atopic Dermatitis,keratosis Pilaris,https://www.nhs.uk/conditions/keratosis-pilaris/,Symptoms of keratosis pilaris may include:You ...


# BERT MODEL

In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Keep only rows that have BOTH a symptom description and a disease label.
# Missing values read by pandas are NaN (not None), so an `is not None` filter
# would let them through and str() would turn them into the literal text "nan".
# Filtering the frame once, before extracting either column, also guarantees
# that texts and labels can never get out of step with each other.
labelled_df = df.dropna(subset=["Symptoms", "Disease"])

# Convert symptoms column to a list of plain Python strings
texts = labelled_df["Symptoms"].astype(str).tolist()

# Convert the disease column to integer class ids (0 .. n_classes - 1)
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labelled_df["Disease"].tolist())

assert len(texts) == len(labels), "texts and labels must stay aligned"

# Split data into train and validation sets
#train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)

# For now, train and validation sets are the same.
# NOTE: any metric computed on the "validation" data is therefore a
# training-set metric and says nothing about generalisation.
train_texts = list(texts)
val_texts = list(texts)
train_labels = list(labels)
val_labels = list(labels)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the tokenizer that belongs to the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization settings shared by both splits: truncation and padding are
# enabled and sequences are capped at 256 tokens.
encode_options = dict(truncation=True, padding=True, max_length=256)

# Encode each split with identical settings
train_encodings = tokenizer(text=train_texts, **encode_options)
val_encodings = tokenizer(text=val_texts, **encode_options)

# Create a torch dataset
class SkinDiseaseDataset(torch.utils.data.Dataset):
    """Pairs tokenizer encodings with integer class labels for training.

    Args:
        encodings: Mapping from feature name to a sequence holding one entry
            per example (anything exposing ``.items()`` whose values support
            ``len()`` and integer indexing).
        labels: Sequence of integer class ids, one per example.

    Raises:
        ValueError: If any feature in ``encodings`` does not contain exactly
            one entry per label.
    """

    def __init__(self, encodings, labels):
        # Fail fast on misaligned inputs: __len__ follows the labels, so a
        # length mismatch would otherwise pair texts with the wrong label or
        # leave some examples unused without any error.
        for key, values in encodings.items():
            if len(values) != len(labels):
                raise ValueError(
                    f"Encoding '{key}' has {len(values)} examples "
                    f"but {len(labels)} labels were given"
                )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features of example ``idx`` as tensors, plus its label under 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Force int64: the cross-entropy loss requires long targets, and label
        # arrays can arrive as int32 depending on platform / NumPy version.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

train_dataset = SkinDiseaseDataset(train_encodings, train_labels)
val_dataset = SkinDiseaseDataset(val_encodings, val_labels)

# Human-readable class names, stored in the model config so the saved
# checkpoint can translate predicted ids back to disease names on its own.
id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

# Initialize BERT for sequence classification (the classification head is
# newly initialised and is what gets trained here).
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),
    id2label=id2label,
    label2id=label2id,
)

# Define training arguments and train
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    logging_steps=10,           # log the training loss every 10 optimizer steps
    evaluation_strategy="steps",
    eval_steps=50,              # run evaluation on eval_dataset every 50 steps
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

# Save the model together with its tokenizer so the directory is
# self-contained for later inference.
model.save_pretrained("./skin_disease_model")
tokenizer.save_pretrained("./skin_disease_model")

# Also keep the raw label array for existing consumers of this file.
# NOTE: ndarray.dump() writes a pickle; only load it from a trusted location.
label_encoder.classes_.dump("./label_classes.pkl")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  6%|▌         | 10/168 [01:19<22:12,  8.44s/it]

{'loss': 1.3686, 'learning_rate': 4.7023809523809525e-05, 'epoch': 0.18}


 12%|█▏        | 20/168 [02:44<19:44,  8.00s/it]

{'loss': 1.3083, 'learning_rate': 4.404761904761905e-05, 'epoch': 0.36}


 18%|█▊        | 30/168 [04:01<17:38,  7.67s/it]

{'loss': 1.1515, 'learning_rate': 4.107142857142857e-05, 'epoch': 0.54}


 24%|██▍       | 40/168 [05:28<18:48,  8.82s/it]

{'loss': 1.1909, 'learning_rate': 3.809523809523809e-05, 'epoch': 0.71}


 30%|██▉       | 50/168 [07:13<20:03, 10.20s/it]

{'loss': 0.9999, 'learning_rate': 3.511904761904762e-05, 'epoch': 0.89}


                                                
 30%|██▉       | 50/168 [10:01<20:03, 10.20s/it]

{'eval_loss': 0.7350618243217468, 'eval_runtime': 168.2429, 'eval_samples_per_second': 2.651, 'eval_steps_per_second': 0.333, 'epoch': 0.89}


 36%|███▌      | 60/168 [11:31<20:28, 11.37s/it]  

{'loss': 0.7254, 'learning_rate': 3.2142857142857144e-05, 'epoch': 1.07}


 42%|████▏     | 70/168 [13:08<16:08,  9.89s/it]

{'loss': 0.6498, 'learning_rate': 2.916666666666667e-05, 'epoch': 1.25}


 48%|████▊     | 80/168 [14:51<15:57, 10.88s/it]

{'loss': 0.4298, 'learning_rate': 2.6190476190476192e-05, 'epoch': 1.43}


 54%|█████▎    | 90/168 [16:38<14:26, 11.11s/it]

{'loss': 0.5885, 'learning_rate': 2.3214285714285715e-05, 'epoch': 1.61}


 60%|█████▉    | 100/168 [18:37<12:37, 11.14s/it]

{'loss': 0.4036, 'learning_rate': 2.023809523809524e-05, 'epoch': 1.79}


                                                 
 60%|█████▉    | 100/168 [21:06<12:37, 11.14s/it]

{'eval_loss': 0.2914796471595764, 'eval_runtime': 149.7727, 'eval_samples_per_second': 2.978, 'eval_steps_per_second': 0.374, 'epoch': 1.79}


 65%|██████▌   | 110/168 [22:51<12:57, 13.40s/it]  

{'loss': 0.3758, 'learning_rate': 1.7261904761904763e-05, 'epoch': 1.96}


 71%|███████▏  | 120/168 [24:40<07:58,  9.97s/it]

{'loss': 0.2811, 'learning_rate': 1.4285714285714285e-05, 'epoch': 2.14}


 77%|███████▋  | 130/168 [26:27<05:32,  8.75s/it]

{'loss': 0.1989, 'learning_rate': 1.130952380952381e-05, 'epoch': 2.32}


 83%|████████▎ | 140/168 [28:21<05:36, 12.01s/it]

{'loss': 0.2027, 'learning_rate': 8.333333333333334e-06, 'epoch': 2.5}


 89%|████████▉ | 150/168 [30:17<02:57,  9.84s/it]

{'loss': 0.3419, 'learning_rate': 5.357142857142857e-06, 'epoch': 2.68}


                                                 
 89%|████████▉ | 150/168 [37:01<02:57,  9.84s/it]

{'eval_loss': 0.19987738132476807, 'eval_runtime': 403.4105, 'eval_samples_per_second': 1.106, 'eval_steps_per_second': 0.139, 'epoch': 2.68}


 95%|█████████▌| 160/168 [38:10<01:33, 11.73s/it] 

{'loss': 0.3645, 'learning_rate': 2.3809523809523808e-06, 'epoch': 2.86}


100%|██████████| 168/168 [39:07<00:00, 13.98s/it]


{'train_runtime': 2347.9098, 'train_samples_per_second': 0.57, 'train_steps_per_second': 0.072, 'train_loss': 0.6362936667033604, 'epoch': 3.0}


In [7]:
## Evaluation of training dataset

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Run the fine-tuned model over the evaluation dataset
predictions = trainer.predict(val_dataset)

# Highest-scoring class per example becomes the predicted label
pred_labels = np.argmax(predictions.predictions, axis=1)

# Accuracy needs no averaging; the other three are support-weighted averages
accuracy = accuracy_score(val_labels, pred_labels)
precision, recall, f1 = (
    score_fn(val_labels, pred_labels, average='weighted')
    for score_fn in (precision_score, recall_score, f1_score)
)

# Report every metric with four decimals
metric_report = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
)
for metric_name, metric_value in metric_report:
    print(f"{metric_name}: {metric_value:.4f}")


100%|██████████| 56/56 [01:55<00:00,  2.06s/it]

Accuracy: 0.9395
Precision: 0.9420
Recall: 0.9395
F1 Score: 0.9395





In [None]:
## Classification model that uses BERT as the base, with a final layer for multi-class classification, corresponding to the various skin diseases.

from transformers import TFBertModel
import tensorflow as tf

# Load the pre-trained BERT model
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Define the maximum sequence length for your inputs. This will depend on your dataset.
# NOTE: inputs fed to this model must be tokenized/padded to exactly this
# length; the PyTorch cell above tokenizes with max_length=256 instead.
max_length = 64  # You should adjust this value based on your actual data.

# The number of possible skin disease labels
num_labels = len(set(labels))  # 'labels' should be your list of diseases. Each disease is a possible label.

# Model construction: Input layers
input_ids_layer = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
attention_masks_layer = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

# BERT layer: take element 1 of BERT's output
# (presumably the pooled sentence representation - confirm against the
# TFBertModel output documentation).
bert_output = bert_model(input_ids_layer, attention_mask=attention_masks_layer)[1]

# The output layer for classification: one softmax probability per disease
output_layer = tf.keras.layers.Dense(num_labels, activation='softmax')(bert_output)

# Combining everything into a Keras model
model = tf.keras.Model(inputs=[input_ids_layer, attention_masks_layer], outputs=output_layer)

# Compile the model with the optimizer, loss, and metrics you want to train with.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
# The sparse loss expects integer class ids as targets (not one-hot vectors).
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Now, your model is ready for training with 'input_ids', 'attention_masks', and 'labels' (the actual diseases)


In [None]:
## Evaluation of the Keras BERT classifier on a held-out test set

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import numpy as np

# Assuming test_symptom_descriptions and test_labels are your inputs and true
# labels for the test set, and that your model has been trained with the name 'model'

test_input_ids, test_attention_masks = encode_texts(test_symptom_descriptions)  # use your encode_texts function

# Generate predictions
predictions = model.predict([test_input_ids, test_attention_masks])

# Each prediction row holds one score per class (probabilities, assuming
# 'model' is the softmax classifier built above), so the index of the
# largest score is the predicted class id.
predicted_labels = np.argmax(predictions, axis=1)

# Accept either label format: one-hot / probability rows (2-D) are reduced to
# class ids, while integer class ids (1-D, which is what the sparse
# categorical loss trains on) are used as they are.
true_labels = np.asarray(test_labels)
if true_labels.ndim > 1:
    true_labels = np.argmax(true_labels, axis=1)

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='weighted')  # use 'micro', 'macro', 'weighted', or 'samples'
recall = recall_score(true_labels, predicted_labels, average='weighted')  # use 'micro', 'macro', 'weighted', or 'samples'
f1 = f1_score(true_labels, predicted_labels, average='weighted')  # use 'micro', 'macro', 'weighted', or 'samples'

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Strategies for Improvement:

Data Quality:
Cleaning and Preprocessing: Ensure that the data is well-preprocessed. This process includes removing irrelevant information, correcting spelling mistakes, and possibly balancing the dataset.

Data Augmentation: Use techniques to artificially augment your data (e.g., paraphrasing sentences) to increase the variety of wording and context, which helps the model generalize better.

Model Fine-Tuning and Architecture:
Learning Rate and Epochs: Adjust the learning rate and the number of epochs. Sometimes, less aggressive learning rates with more epochs or learning rate scheduling can help.
Custom Layers: Consider adding additional layers on top of the BERT model or customizing the head layers to better suit the specific task.
Different Pre-trained Models: Try other BERT variations or other transformer models (e.g., RoBERTa, DistilBERT for faster training, or GPT-3 for diverse pre-training).

Hyperparameter Tuning:
Systematically tune hyperparameters using approaches like grid search, random search, or Bayesian optimization to find the optimal configuration.


Examples: 
Regularization through Weight Decay: Use an optimizer that applies a form of L2 regularization/weight decay (e.g., AdamW) to reduce overfitting.
Handling Imbalanced Data: If your data is imbalanced, pass class weights during training (e.g., the class_weight argument of Keras model.fit). This adjusts the weight given to different classes during training, addressing the imbalance issue.
Learning Rate: Set a specific learning rate, which might be different from the default. Fine-tuning the learning rate is often essential for achieving the best performance.
Data Truncation and Padding: Ensure that the input sequences are uniformly shaped by truncating/padding all text sequences to a certain number of tokens.

# GPT model

In [8]:
from transformers import GPT2Tokenizer, TFGPT2ForSequenceClassification
import tensorflow as tf

# Load a pre-trained GPT-2 tokenizer.
# GPT-2 is distributed without a padding token, so padding=True would raise
# unless one is assigned; the end-of-sequence token is reused for padding.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# One output unit per disease class known to the label encoder.
num_diseases = len(label_encoder.classes_)
model = TFGPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=num_diseases)
# The classification head needs the pad id to locate the last real token of
# each padded sequence.
model.config.pad_token_id = tokenizer.pad_token_id

# train_texts / val_texts hold the symptom descriptions and
# train_labels / val_labels the matching integer class ids.

# Preparing the tokenized input for GPT-2 from your dataset of symptom descriptions
# (max_length matches the 256-token cap used for BERT above).
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=256)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=256)

# Convert to TensorFlow Datasets
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels  # this should be numerical IDs, not the disease names in string
))
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))

# Training the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# The datasets are batched explicitly, so `batch_size` must not also be passed
# to fit(). The shuffle buffer covers the whole training set so every epoch
# sees a full shuffle rather than a sliding 100-example window.
model.fit(
    train_dataset.shuffle(len(train_texts)).batch(16),
    epochs=3,
    validation_data=val_dataset.batch(16),
)

# After training, you can save the model
model.save_pretrained("./gpt_finetuned_skin_diseases/")


2023-11-03 22:33:57.328277: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Downloading (…)olve/main/vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 4.08MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 42.9MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 43.9MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 665/665 [00:00<00:00, 158kB/s]


ImportError: 
TFGPT2ForSequenceClassification requires the TensorFlow library but it was not found in your environment.
However, we were able to find a PyTorch installation. PyTorch classes do not begin
with "TF", but are otherwise identically named to our TF classes.
If you want to use PyTorch, please use those classes instead!

If you really do want to use TensorFlow, please follow the instructions on the
installation page https://www.tensorflow.org/install that match your environment.


# FastText

In [None]:
# Write the training file for fastText's supervised mode: one example per line,
# formatted as "__label__<class> <text>".
# Example line: "__label__Atopic_Dermatitis The skin is itchy..."

# Rows without a description or a label cannot form a training example.
fasttext_rows = df.dropna(subset=["Symptoms", "Disease"])

with open("fasttext_train_data.txt", "w", encoding="utf-8") as f:
    # Iterating a DataFrame directly yields its column NAMES, so the two
    # columns are zipped explicitly to get (description, disease) pairs.
    for description, disease in zip(fasttext_rows["Symptoms"], fasttext_rows["Disease"]):
        # Collapse every run of whitespace (including the newlines embedded in
        # the scraped text) into a single space: a line break inside a
        # description would split it into several malformed examples.
        clean_description = " ".join(str(description).split())
        # Labels must not contain spaces, otherwise the rest of the disease
        # name would be read as part of the text.
        label = str(disease).replace(' ', '_')
        f.write(f"__label__{label} {clean_description}\n")  # FastText expects labels in this '__label__<class>' format.



In [None]:
import fasttext

# Training settings: 25 passes over the data, using word n-grams up to length 2
train_options = {"epoch": 25, "wordNgrams": 2}

# Fit the supervised classifier on the labelled text file
model = fasttext.train_supervised("fasttext_train_data.txt", **train_options)

# Persist the trained classifier to disk
model.save_model("disease_prediction_model.ftz")


In [None]:
# Predicting with the model
# (the input must be a single line of text - no newline characters)
result = model.predict("The skin is red and itchy...")  # insert a real symptom description here

# 'result' will contain the labels and associated probabilities
top_label = result[0][0]

# Undo the training-file encoding to recover the disease name: drop the
# '__label__' prefix first (it contains underscores itself), then turn the
# underscores that replaced spaces back into spaces.
disease_predicted = top_label.replace("__label__", "").replace("_", " ")
disease_predicted
