Set up PyTorch to use the best available hardware option

In [None]:
import torch

# Allow TF32 matrix multiplications on CUDA backends.
torch.backends.cuda.matmul.allow_tf32 = True

# Pick the compute device in priority order: Apple MPS, then CUDA, then CPU.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(device)

In [2]:
import pandas as pd

# Parquet shards of the dataset, addressed relative to the Hugging Face hub root below.
DATASET_ROOT = "hf://datasets/jayavibhav/prompt-injection/"
splits = {
    'train': 'data/train-00000-of-00001.parquet',
    'test': 'data/test-00000-of-00001.parquet',
}

# Read each split into its own DataFrame.
train_df = pd.read_parquet(DATASET_ROOT + splits["train"])
test_df = pd.read_parquet(DATASET_ROOT + splits["test"])

In [3]:
# Rename the "text" column to "prompt" in both splits (modifies the frames in place).
for frame in (test_df, train_df):
    frame.rename(columns={"text": "prompt"}, inplace=True)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint used for both the tokenizer and the classification model.
model_name = "microsoft/deberta-v3-base"

# `model_max_length` is the tokenizer option that caps sequence length; a bare
# `max_length` passed to `from_pretrained` is not a tokenizer init option and
# is silently ignored.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    clean_up_tokenization_spaces=False,
    use_fast=True,
    model_max_length=512,
)

# Two output labels for binary classification; the model is moved to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

In [34]:
# Helper that tokenizes the prompts contained in a batch of data
def tokenize_batch(batch):
    """Tokenize the ``'prompt'`` entries of ``batch``.

    Args:
        batch: Mapping with a ``'prompt'`` key holding the text(s) to encode.

    Returns:
        The tokenizer output, with every sequence padded to and truncated at
        512 tokens.
    """
    prompts = batch['prompt']
    return tokenizer(prompts, truncation=True, max_length=512, padding="max_length")

In [35]:
# Tokenize the prompts of the training split first, then the testing split.
# Each DataFrame is converted to a {column: list-of-values} dict before tokenizing.
prompts_train_tokenized, prompts_test_tokenized = (
    tokenize_batch(frame.to_dict(orient='list'))
    for frame in (train_df, test_df)
)

In [37]:
from torch.utils.data import Dataset


class PromptDataset(Dataset):
    """Map-style dataset pairing tokenized prompts with their labels.

    The Trainer expects datasets (not DataLoaders) and builds its own loaders;
    batch size, worker count and memory pinning are configured through
    TrainingArguments rather than here.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to a sequence
            holding one entry per example, as returned by the tokenizer.
        labels: Iterable with one integer class label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Materialize to a list so positional indexing works regardless of
        # the index carried by a pandas Series.
        self.labels = list(labels)

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the features and label of example ``idx`` as tensors."""
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        # The key must be "labels" for the model to compute the loss.
        item["labels"] = torch.tensor(int(self.labels[idx]))
        return item


train_dataset = PromptDataset(prompts_train_tokenized, train_df['label'])
test_dataset = PromptDataset(prompts_test_tokenized, test_df['label'])

In [38]:
# Import TrainingArguments to handle the various training configurations
from transformers import TrainingArguments

# Define training arguments for fine-tuning

# TF32 is only supported on Ampere-or-newer CUDA GPUs; requesting it on other
# hardware (CPU, MPS, older GPUs) makes TrainingArguments raise, so it is
# enabled only when the hardware supports it.
use_tf32 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8

training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./results/logs",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    tf32=use_tf32,
    num_train_epochs=3,
    torch_compile=True,
    # load_best_model_at_end requires the save strategy to match the
    # evaluation strategy, so checkpoints are written once per epoch.
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
)

In [39]:
# Empty table that accumulates one row of evaluation metrics per epoch
METRIC_COLUMNS = ("epoch", "accuracy", "precision", "recall", "f1")
results_df = pd.DataFrame(columns=list(METRIC_COLUMNS))

In [40]:
# Import accuracy metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Metric callback used to score the model during fine-tuning
def evaluate_model(trainer, epoch):
    """Compute classification metrics and log them to ``results_df``.

    Args:
        trainer: Despite its name, this is the evaluation-prediction object
            handed to ``compute_metrics``; it must expose ``predictions``
            (per-class scores, reduced with argmax over axis 1) and
            ``label_ids``.
        epoch: Value stored in the ``epoch`` column of the logged row.

    Returns:
        A dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    global results_df

    # Turn per-class scores into predicted class indices
    y_pred = trainer.predictions.argmax(axis=1)
    y_true = trainer.label_ids

    acc = accuracy_score(y_true, y_pred)
    # The fourth element (support) is not needed
    prec, rec, f_score = precision_recall_fscore_support(y_true, y_pred, average="binary")[:3]

    metrics = {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f_score,
    }

    # Append a new row after the last existing one: epoch first, then the metrics
    results_df.loc[len(results_df)] = [epoch, *metrics.values()]

    return metrics

In [41]:
# Import the Trainer class
from transformers import Trainer

# Metric callback: scores one evaluation pass and tags it with the current epoch
def _metrics_for_current_epoch(eval_prediction):
    """Forward the evaluation output to evaluate_model with the trainer's epoch."""
    # `trainer` is looked up when the callback runs, i.e. after it is assigned below.
    return evaluate_model(eval_prediction, trainer.state.epoch)


# Build the trainer from the model, arguments, datasets and tokenizer
trainer = Trainer(
    compute_metrics=_metrics_for_current_epoch,
    tokenizer=tokenizer,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

In [None]:
# Fine-tune the model
import os

from transformers.trainer_utils import get_last_checkpoint

# resume_from_checkpoint=True raises when the output directory holds no
# checkpoint (e.g. on a first run), so resume only if one actually exists
# and otherwise start training from scratch.
checkpoint_root = training_args.output_dir
last_checkpoint = get_last_checkpoint(checkpoint_root) if os.path.isdir(checkpoint_root) else None

trainer.train(resume_from_checkpoint=last_checkpoint)

In [None]:
# Display the metric rows that evaluate_model appended to results_df
results_df

In [None]:
# Persist the fine-tuned model and its tokenizer in separate sub-directories
output_dir = "./training_results"
for artifact, subdir in (
    (model, "/ms-deberta-v3-model"),
    (tokenizer, "/ms-deberta-v3-tokenizer"),
):
    artifact.save_pretrained(output_dir + subdir)