# BERT MODEL

In [None]:
## Data Preparation and Tokenization:

from transformers import BertTokenizer

# Load pre-trained model tokenizer
# NOTE: encode_texts() below reads this module-level `tokenizer` at call time.
# The GPT cell further down rebinds the same name to a GPT-2 tokenizer, so
# re-run this cell before calling encode_texts() again after that cell.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_texts(texts, max_length=64, return_tensors='np'):
    """Tokenize a collection of texts into fixed-length model inputs.

    Uses the module-level `tokenizer`. All texts are encoded in one batched
    call, with special tokens added and every sequence truncated or padded to
    exactly `max_length` tokens.

    Args:
        texts: Iterable of strings to encode (e.g. symptom descriptions).
        max_length: Target sequence length in tokens. It must match the
            `shape` of the Keras Input layers the result is fed to.
        return_tensors: Tensor framework passed through to the tokenizer:
            'np' (default, accepted directly by Keras `fit`/`predict`),
            'tf', or 'pt'.

    Returns:
        Tuple `(input_ids, attention_masks)`, each with one row per text and
        `max_length` columns.

    Raises:
        ValueError: If `texts` is empty.
    """
    # Materialise once so generators / pandas Series are handled uniformly.
    texts = list(texts)
    if not texts:
        raise ValueError("encode_texts() needs at least one text to encode")

    # A single batched call replaces the per-text encode_plus loop and the
    # torch.cat step (torch was never imported in this notebook, and the
    # downstream model is a TensorFlow/Keras model).
    encoded = tokenizer(
        texts,
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors=return_tensors,
    )

    return encoded['input_ids'], encoded['attention_mask']

# symptom_descriptions: list of descriptions of symptoms
# labels: list of diseases corresponding to each symptom description
# NOTE(review): neither `symptom_descriptions` nor `labels` is defined in the
# visible cells of this notebook -- they must be loaded before this cell runs.
input_ids, attention_masks = encode_texts(symptom_descriptions)


In [None]:
## Classification model that uses BERT as the base, with a final layer for multi-class classification, corresponding to the various skin diseases.

from transformers import TFBertModel
import tensorflow as tf

# Load the pre-trained BERT encoder that will be fine-tuned.
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Sequence length of the model inputs. It must equal the length that
# encode_texts() pads/truncates to (64 there as well).
max_length = 64  # You should adjust this value based on your actual data.

# One output unit per distinct disease label.
# NOTE(review): the sparse loss below expects labels to be integer IDs in
# [0, num_labels) -- confirm `labels` has been encoded that way.
num_labels = len(set(labels))  # 'labels' should be your list of diseases. Each disease is a possible label.

# Input layers. The `name` values are the dictionary keys that model.fit()
# uses to route the arrays to the right input.
input_ids_layer = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
attention_masks_layer = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

# BERT layer: element [1] of the model output is the pooled sentence-level
# representation (element [0] would be the per-token hidden states).
bert_output = bert_model(input_ids_layer, attention_mask=attention_masks_layer)[1]

# Classification head: softmax over the disease classes.
output_layer = tf.keras.layers.Dense(num_labels, activation='softmax')(bert_output)

# Combine everything into a Keras model.
model = tf.keras.Model(inputs=[input_ids_layer, attention_masks_layer], outputs=output_layer)

# Compile. `learning_rate` is the supported argument name; the old `lr` alias
# is deprecated and rejected by newer Keras releases. The head outputs
# probabilities (softmax), so the string loss with its default
# from_logits=False is the matching choice.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Now, your model is ready for training with 'input_ids', 'attention_masks', and 'labels' (the actual diseases)


In [None]:
## Model Training
# Fine-tune the classifier. The dictionary keys must match the `name=` values
# given to the Keras Input layers ('input_ids' and 'attention_mask').
history = model.fit(
    x={'input_ids': input_ids, 'attention_mask': attention_masks},
    y=labels,  # integer class IDs, one per training text
    validation_split=0.1,  # fraction held out for validation; prefer a dedicated validation set if you have one
    batch_size=16,  # lower this if you run out of memory
    epochs=4,  # tune as needed
)


In [None]:
## Evaluation 

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import numpy as np

# Evaluate the trained `model` on a held-out test set.
# NOTE(review): `test_symptom_descriptions` and `test_labels` are not defined
# in the visible cells -- they must be prepared before this cell runs.

test_input_ids, test_attention_masks = encode_texts(test_symptom_descriptions)  # use your encode_texts function

# Generate predictions: one row of class probabilities per test example.
predictions = model.predict([test_input_ids, test_attention_masks])

# Pick the most probable class for each example.
predicted_labels = np.argmax(predictions, axis=1)

# The model is trained with a sparse loss on integer class IDs, so test labels
# are normally a 1-D vector of IDs. Only collapse with argmax when they are
# actually one-hot (2-D); an unconditional argmax(axis=1) fails on 1-D input.
true_labels = np.asarray(test_labels)
if true_labels.ndim > 1:
    true_labels = np.argmax(true_labels, axis=1)

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='weighted')  # use 'micro', 'macro', 'weighted', or 'samples'
recall = recall_score(true_labels, predicted_labels, average='weighted')  # use 'micro', 'macro', 'weighted', or 'samples'
f1 = f1_score(true_labels, predicted_labels, average='weighted')  # use 'micro', 'macro', 'weighted', or 'samples'

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Strategies for Improvement:

Data Quality:
Cleaning and Preprocessing: Ensure that the data is well-preprocessed. This process includes removing irrelevant information, correcting spelling mistakes, and possibly balancing the dataset.

Data Augmentation: Use techniques to artificially augment your data (e.g., paraphrasing sentences) to increase the variety of wording and context, which helps the model generalize better.

Model Fine-Tuning and Architecture:
Learning Rate and Epochs: Adjust the learning rate and the number of epochs. Sometimes, less aggressive learning rates with more epochs or learning rate scheduling can help.
Custom Layers: Consider adding additional layers on top of the BERT model or customizing the head layers to better suit the specific task.
Different Pre-trained Models: Try other BERT variations or other transformer models (e.g., RoBERTa, DistilBERT for faster training, or GPT-3 for diverse pre-training).

Hyperparameter Tuning:
Systematically tune hyperparameters using approaches like grid search, random search, or Bayesian optimization to find the optimal configuration.


Examples: 
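Regularization through Weight Decay: Use an optimizer that applies weight decay (a form of L2-style regularization), such as AdamW. Note that the plain Adam optimizer used in the code above applies no weight decay.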
Handling Imbalanced Data: If your data is imbalanced, pass the class_weight argument to model.fit during training. It adjusts the weight given to each class in the loss, so under-represented classes contribute more and the imbalance is partly compensated.
Learning Rate: A specific learning rate is set, which might be different from the default. Fine-tuning the learning rate is often essential for achieving the best performance.
Data Truncation and Padding: Ensuring that the input sequences are uniformly shaped by truncating/padding all text sequences to a certain number of tokens.

# GPT model

In [None]:
from transformers import GPT2Tokenizer, TFGPT2ForSequenceClassification
import tensorflow as tf

# Load a pre-trained GPT-2 tokenizer and sequence-classification model.
# NOTE: this rebinds the notebook-level names `tokenizer` and `model` that the
# BERT cells above also use; re-run those cells before going back to BERT.
# NOTE(review): `num_diseases` is not defined in the visible cells -- it must
# be set to the number of distinct disease classes before this cell runs.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 has no padding token. Without one, `padding=True` below raises, so
# reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

model = TFGPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=num_diseases)
# The classification head needs the pad id to locate the last real token of
# each sequence; without it, batches larger than 1 are rejected.
model.config.pad_token_id = tokenizer.pad_token_id

# Assume you have your dataset loaded in the `symptom_texts` and `labels` (disease names in string)
# The dataset is split into training and testing datasets

# Preparing the tokenized input for GPT-2 from your dataset of symptom descriptions
train_encodings = tokenizer(train_symptom_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_symptom_texts, truncation=True, padding=True)

# Convert to TensorFlow Datasets of (features dict, label) pairs
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels  # this should be numerical IDs, not the disease names in string
))
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))

# Training the model. The model outputs raw logits, hence from_logits=True.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
# The datasets are already batched with .batch(16). Keras does not accept a
# `batch_size` argument together with a tf.data.Dataset, so it is not passed.
model.fit(train_dataset.shuffle(100).batch(16), epochs=3, validation_data=val_dataset.batch(16))

# After training, you can save the model
model.save_pretrained("./gpt_finetuned_skin_diseases/")


# FastText

In [None]:
# Assume 'combined_data' is a list of tuples (or similar) with the symptom description and the disease name
# Example: [("The skin is itchy...", "Eczema"), (...)]

# Write one training example per line in fastText's supervised format:
#   __label__<class> <text>
# The encoding is set explicitly so the file does not depend on the platform
# default.
with open("fasttext_train_data.txt", "w", encoding="utf-8") as f:
    for description, disease in combined_data:
        # Labels must be a single token, so spaces become underscores.
        label = disease.replace(' ', '_')
        # Collapse all whitespace runs (including embedded newlines) to single
        # spaces; a newline inside a description would otherwise split one
        # example across two lines of the training file.
        text = " ".join(description.split())
        f.write(f"__label__{label} {text}\n")  # FastText expects labels in this '__label__<class>' format.



In [None]:
import fasttext

# Fit a supervised fastText classifier on the training file written above.
# NOTE: this rebinds the notebook-level name `model` used by earlier cells.
model = fasttext.train_supervised(
    input="fasttext_train_data.txt",
    wordNgrams=2,
    epoch=25,
)

# Persist the trained classifier to disk.
model.save_model("disease_prediction_model.ftz")


In [None]:
# Classify a single symptom description with the trained fastText model.
result = model.predict("The skin is red and itchy...")  # insert a real symptom description here

# `result` holds the predicted labels and their probabilities; take the first
# label and strip the '__label__' prefix to get the disease name.
top_label = result[0][0]
disease_predicted = top_label.replace("__label__", "")
