In [1]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd

# Provide the path to your dataset in Google Drive
file_path = '/content/drive/My Drive/textual_dataset/cleaned_skindiseasesdataset.csv'

# Load the CSV dataset with a specified encoding
dataset = pd.read_csv(file_path, encoding='ISO-8859-1')

# Display the dataset structure
print(dataset.head())


              Output                                              Input
0           Vitiligo  "I've had these light patches on my neck and f...
1           Vitiligo                                 "I've patchy skin"
2            Scabies  "Doctor, I've noticed these small, red bumps o...
3           Vitiligo  "Doctor, I noticed a pale patch around my knee...
4  Hives (Urticaria)  Hives, also known as urticaria, typically pres...


In [5]:
ffrom sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from transformers import GPT2Tokenizer

# Initialize the tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set the pad token to be the same as the eos token

# Tokenization function
def tokenize_function(inputs, outputs):
    input_encodings = tokenizer(inputs, truncation=True, padding='max_length', max_length=256)
    output_encodings = tokenizer(outputs, truncation=True, padding='max_length', max_length=256)
    return input_encodings, output_encodings

# Tokenize the dataset
input_texts = dataset['Input'].tolist()
output_texts = dataset['Output'].tolist()

tokenized_inputs, tokenized_outputs = tokenize_function(input_texts, output_texts)

# Verify the shape of tokenized texts
print(f"Number of tokenized input texts: {len(tokenized_inputs['input_ids'])}")
print(f"Number of tokenized output texts: {len(tokenized_outputs['input_ids'])}")

# Determine the number of unique labels
unique_labels = list(set(dataset['Output'].str.strip()))
num_labels = len(unique_labels)

# Create a mapping from label to ID
label_to_id = {label: idx for idx, label in enumerate(unique_labels)}

# Prepare the dataset for training
def format_labels(labels):
    return [label_to_id[label.strip()] for label in labels]

# Create labels
formatted_labels = format_labels(dataset['Output'].tolist())

# Create a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create a train-validation split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    tokenized_inputs['input_ids'], formatted_labels, test_size=0.1
)

# Create datasets
train_encodings = {
    'input_ids': train_texts,
    'attention_mask': tokenized_inputs['attention_mask'][:len(train_texts)]
}
val_encodings = {
    'input_ids': val_texts,
    'attention_mask': tokenized_inputs['attention_mask'][len(train_texts):]
}

train_dataset = CustomDataset(encodings=train_encodings, labels=train_labels)
val_dataset = CustomDataset(encodings=val_encodings, labels=val_labels)



Number of tokenized input texts: 483
Number of tokenized output texts: 483


In [10]:
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset

# Determine the number of unique labels
unique_labels = list(set(dataset['Output'].str.strip()))
num_labels = len(unique_labels)

# Create a mapping from label to ID
label_to_id = {label: idx for idx, label in enumerate(unique_labels)}

# Prepare the dataset for training
def format_labels(labels):
    return [label_to_id[label.strip()] for label in labels]

# Create labels
formatted_labels = format_labels(dataset['Output'].tolist())

# Create a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [12]:
# Create a train-validation split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    tokenized_inputs['input_ids'], formatted_labels, test_size=0.1
)

# Create datasets
train_encodings = {
    'input_ids': train_texts,
    'attention_mask': tokenized_inputs['attention_mask'][:len(train_texts)]
}
val_encodings = {
    'input_ids': val_texts,
    'attention_mask': tokenized_inputs['attention_mask'][len(train_texts):]
}

train_dataset = CustomDataset(encodings=train_encodings, labels=train_labels)
val_dataset = CustomDataset(encodings=val_encodings, labels=val_labels)


In [15]:
from transformers import GPT2ForSequenceClassification

# Define the model and set padding token
model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
model.config.pad_token_id = tokenizer.pad_token_id

# Freeze all layers except the last one
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last transformer layer
for param in model.transformer.h[-1].parameters():  # Unfreeze the last transformer block
    param.requires_grad = True

# Unfreeze the classification head (score layer)
for param in model.score.parameters():  # Unfreeze the score layer
    param.requires_grad = True

# Verify that only the last layers are unfrozen
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"Training parameter: {name}")


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training parameter: transformer.h.11.ln_1.weight
Training parameter: transformer.h.11.ln_1.bias
Training parameter: transformer.h.11.attn.c_attn.weight
Training parameter: transformer.h.11.attn.c_attn.bias
Training parameter: transformer.h.11.attn.c_proj.weight
Training parameter: transformer.h.11.attn.c_proj.bias
Training parameter: transformer.h.11.ln_2.weight
Training parameter: transformer.h.11.ln_2.bias
Training parameter: transformer.h.11.mlp.c_fc.weight
Training parameter: transformer.h.11.mlp.c_fc.bias
Training parameter: transformer.h.11.mlp.c_proj.weight
Training parameter: transformer.h.11.mlp.c_proj.bias
Training parameter: score.weight


In [16]:
from transformers import Trainer, TrainingArguments

# Training arguments
training_args = TrainingArguments(
    output_dir='/content/drive/My Drive/gpt_fine_tune',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Experiment with smaller batch size
    num_train_epochs=200,  # Set the number of epochs to 200
    logging_dir='/content/drive/My Drive/logs_gpt',
    report_to='none',
    logging_steps=10  # Log every 10 steps
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)




In [17]:
# Train the model
trainer.train()


Epoch,Training Loss,Validation Loss
1,4.931,4.343917
2,3.1291,3.052096
3,2.9967,2.914535
4,2.9035,2.85258
5,2.605,2.820974
6,2.6018,2.792178
7,2.6583,2.759446
8,2.6368,2.738565
9,2.5671,2.722962
10,2.6221,2.701558


TrainOutput(global_step=11000, training_loss=0.6806682673692703, metrics={'train_runtime': 1838.2548, 'train_samples_per_second': 47.219, 'train_steps_per_second': 5.984, 'total_flos': 1.13415076970496e+16, 'train_loss': 0.6806682673692703, 'epoch': 200.0})

In [18]:
from sklearn.metrics import accuracy_score

# Evaluate the model on the validation dataset
eval_results = trainer.evaluate()

# Predict on the validation dataset
predictions, labels, _ = trainer.predict(val_dataset)

# Get predicted labels (class with the highest probability)
predicted_labels_val = predictions.argmax(axis=1)

# Calculate validation accuracy
validation_accuracy = accuracy_score(labels, predicted_labels_val)

# Predict on the training dataset (you may need to pass train_dataset for this)
predictions_train, labels_train, _ = trainer.predict(train_dataset)

# Get predicted labels for training (class with the highest probability)
predicted_labels_train = predictions_train.argmax(axis=1)

# Calculate training accuracy
training_accuracy = accuracy_score(labels_train, predicted_labels_train)

# Print the evaluation results (loss and other metrics for validation)
print("Evaluation Results:", eval_results)

# Print both training and validation accuracies in percentage format
print(f"Training Accuracy: {training_accuracy * 100:.2f}%")
print(f"Validation Accuracy: {validation_accuracy * 100:.2f}%")


Evaluation Results: {'eval_loss': 1.4926520586013794, 'eval_runtime': 0.7737, 'eval_samples_per_second': 63.332, 'eval_steps_per_second': 9.047, 'epoch': 200.0}
Training Accuracy: 99.77%
Validation Accuracy: 61.22%


In [19]:
# Save the model to Google Drive
model.save_pretrained('/content/drive/My Drive/saved_model_gpt')
tokenizer.save_pretrained('/content/drive/My Drive/saved_model_gpt')


('/content/drive/My Drive/saved_model_gpt/tokenizer_config.json',
 '/content/drive/My Drive/saved_model_gpt/special_tokens_map.json',
 '/content/drive/My Drive/saved_model_gpt/vocab.json',
 '/content/drive/My Drive/saved_model_gpt/merges.txt',
 '/content/drive/My Drive/saved_model_gpt/added_tokens.json')

In [25]:
import torch

# Function to test the model with a custom input
def test_model_with_input(input_text):
    # Ensure the model is on the correct device (GPU if available, otherwise CPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)  # Move the model to the correct device

    # Tokenize the input text and move the tensors to the same device as the model
    inputs = tokenizer(input_text, truncation=True, padding='max_length', max_length=256, return_tensors="pt")
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move input tensors to the same device

    # Set the model to evaluation mode
    model.eval()

    # Pass the input through the model
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted class (the index of the highest logit)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()

    # Map the predicted class ID to the label
    predicted_label = unique_labels[predicted_class]

    # Print the input and the predicted label
    print(f"Input Text: {input_text}")
    print(f"Predicted Label: {predicted_label}")

# Example test
input_example = "I barely get enough sleep between classes and work, and I think it's taking a toll on my skin. My acne seems to be getting worse, and I have these dark circles under my eyes. I know I need to get more sleep, but it's hard to find the time. Do you have any advice?"
#"I have these red, itchy patches on my elbows and knees that won't go away. They're flaky and sometimes bleed when I scratch. I'm really self-conscious about them, especially in the summer."
#"I noticed red circles on my feet after walking barefoot in the park."
#"I feel embarrassed showing my skin; the hives make it look really bad."
#"Doctor, I've noticed these small, red bumps on my wrists and elbows. They itch like crazy, and I'm starting to worry about bed bugs or something. Could it be something more serious?"
test_model_with_input(input_example)


Input Text: I barely get enough sleep between classes and work, and I think it's taking a toll on my skin. My acne seems to be getting worse, and I have these dark circles under my eyes. I know I need to get more sleep, but it's hard to find the time. Do you have any advice?
Predicted Label: Acne
