Set up PyTorch to use the best available hardware option

In [None]:
import torch

# Turn on the TF32 flag for CUDA matrix multiplications.
torch.backends.cuda.matmul.allow_tf32 = True

# Pick the compute device in order of preference: MPS first, then CUDA,
# and finally the CPU when neither accelerator is reported as available.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(device)

In [52]:
# Base directory (relative path) under which the notebook looks up its artifacts,
# e.g. the on-disk datasets loaded below.
ARTIFACTS_BASE = '../../../artifacts'

In [53]:
from os import path
from datasets import load_from_disk

# Seed for the train/eval partition so that every run sees the same split.
SPLIT_SEED = 42
# Upper bound on the number of rows kept from each of the train and eval splits.
MAX_SAMPLES_PER_SPLIT = 1000

dataset_path = path.join(ARTIFACTS_BASE, 'datasets', 'jayavibhav/prompt-injection')


def _prepare_split(split, max_samples=MAX_SAMPLES_PER_SPLIT):
    """Rename the columns of ``split`` and keep at most ``max_samples`` rows.

    ``text`` becomes ``prompt`` and ``label`` becomes ``labels``. The row count
    is capped with ``min`` so that a split shorter than ``max_samples`` is kept
    whole instead of raising an out-of-range error in ``select``.
    """
    row_count = min(max_samples, len(split))
    return (
        split
        .rename_column('text', 'prompt')
        .rename_column('label', 'labels')
        .select(range(row_count))
    )


# Carve an evaluation set (20%) out of the on-disk training data.
train_dataset_split = load_from_disk(path.join(dataset_path, 'train')).train_test_split(
    test_size=0.2,
    seed=SPLIT_SEED,
)
train_dataset = _prepare_split(train_dataset_split['train'])
eval_dataset = _prepare_split(train_dataset_split['test'])

# The held-out test set is loaded in full; only its text column is renamed.
test_dataset = load_from_disk(path.join(dataset_path, 'test'))
test_dataset = test_dataset.rename_column('text', 'prompt')


In [None]:
# Display the training split as the cell output.
train_dataset

In [None]:
# Display the evaluation split as the cell output.
eval_dataset

In [None]:
# Display the held-out test split as the cell output.
test_dataset

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint used for both the tokenizer and the classification model.
model_name = "distilbert/distilroberta-base"

# `model_max_length` is the tokenizer's construction-time length limit; a plain
# `max_length` keyword is not one of the tokenizer's documented init parameters.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    clean_up_tokenization_spaces=False,
    use_fast=True,
    model_max_length=512,
)

# Two output labels for the binary classification task; the model is moved to
# the device selected earlier in the notebook.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

# Last expression of the cell: puts the model in eval mode and shows it as output.
model.eval()

In [55]:
# Tokenization helper applied to batches of dataset rows
def tokenize_batch(batch):
    """Tokenize the ``prompt`` entries of ``batch`` with the notebook's tokenizer.

    The tokenizer is asked to pad to ``max_length`` and to truncate, with
    ``max_length`` set to 512. The tokenizer's result is returned as is.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(batch['prompt'], **tokenizer_options)

In [64]:
# Run the batch tokenizer over the training split first, then the evaluation split
prompts_train_tokenized, prompts_eval_tokenized = (
    split.map(tokenize_batch, batched=True)
    for split in (train_dataset, eval_dataset)
)

In [67]:
# Import TrainingArguments to handle the various training configurations
from transformers import TrainingArguments

# Define training arguments for fine-tuning
# GPU NVIDIA RTX A6000

# TF32 needs a CUDA GPU of compute capability 8.0 or newer (Ampere+).
# TrainingArguments rejects tf32=True on hardware without TF32 support, so the
# flag is enabled only where it can work; this keeps the notebook usable on the
# MPS and CPU devices it may also select.
tf32_supported = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./results/logs",
    # Save and evaluate on the same schedule (once per epoch), as needed by
    # load_best_model_at_end.
    save_strategy="epoch",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    tf32=tf32_supported,
    num_train_epochs=3,
    torch_compile=True,
    load_best_model_at_end=True,
    dataloader_num_workers=4
)

In [16]:
import pandas as pd

# Empty table that collects one row of metrics per evaluation pass
metric_columns = ["epoch", "accuracy", "precision", "recall", "f1"]
results_df = pd.DataFrame(columns=metric_columns)

In [None]:
# Import accuracy metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Metric callback used while fine-tuning
def evaluate_model(trainer, epoch):
    """Score one evaluation pass and record it in the global ``results_df``.

    Args:
        trainer: Object exposing ``predictions`` and ``label_ids``. Despite the
            name, only these two attributes are read; in this notebook the
            function is called with the argument handed to ``compute_metrics``.
        epoch: Value stored in the ``epoch`` column of the appended row.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    global results_df

    # The predicted class is the index of the largest score along axis 1.
    predicted_classes = trainer.predictions.argmax(axis=1)
    true_labels = trainer.label_ids

    acc = accuracy_score(true_labels, predicted_classes)
    prec, rec, f1_value, _ = precision_recall_fscore_support(
        true_labels, predicted_classes, average="binary"
    )

    metrics = {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1_value,
    }

    # Append one row; the dict order matches the column order of results_df.
    results_df.loc[len(results_df)] = [epoch, *metrics.values()]

    return metrics

In [69]:
# Import the Trainer class
from transformers import Trainer

# Metrics callback: forwards the evaluation output together with the trainer's
# current epoch. `trainer` is looked up when the callback runs, i.e. after the
# Trainer below has been created.
def _compute_epoch_metrics(eval_output):
    return evaluate_model(eval_output, trainer.state.epoch)


# Build the trainer from the model, arguments and tokenized splits
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=prompts_train_tokenized,
    eval_dataset=prompts_eval_tokenized,
    tokenizer=tokenizer,
    compute_metrics=_compute_epoch_metrics,
)

In [None]:
# Fine-tune the model using the trainer configured above.
# The value returned by train() is shown as the cell output.
trainer.train()

In [None]:
# Display the metrics collected by evaluate_model.
results_df

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Line chart of the recorded accuracy against the epoch

# One 10x5 figure with a single set of axes
fig, ax = plt.subplots(figsize=(10, 5))

# Draw the accuracy curve onto those axes
sns.lineplot(data=results_df, x="epoch", y="accuracy", ax=ax)

# Title and axis labels
ax.set_title("Model accuracy in each epoch")
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy")

# Render the figure
plt.show()

In [None]:
# Run one more evaluation with the trainer and print the metrics it returns.
# NOTE(review): presumably this also invokes the compute_metrics callback, which
# would append an extra row to results_df — confirm before re-plotting.
final_results = trainer.evaluate()
print(final_results)

In [None]:
# Write the model and the tokenizer into sibling directories under output_dir.
# NOTE(review): the directory names mention
# "distilbert-base-uncased-finetuned-sst-2-english", whereas this notebook loads
# "distilbert/distilroberta-base" — confirm the naming is intentional.
output_dir = "./final_results"
model_dir = output_dir + "/distilbert-base-uncased-finetuned-sst-2-english-model"
tokenizer_dir = output_dir + "/distilbert-base-uncased-finetuned-sst-2-english-tokenizer"

model.save_pretrained(model_dir)
tokenizer.save_pretrained(tokenizer_dir)