Set up PyTorch to use the best available hardware option

In [1]:
import torch

# Permit TF32 for matmuls on the CUDA backend (this flag lives under torch.backends.cuda).
torch.backends.cuda.matmul.allow_tf32 = True

# Backend priority: Apple MPS first, then CUDA, with CPU as the fallback.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(device)

mps


In [2]:
# Root directory for stored artifacts, given relative to the notebook's working directory.
ARTIFACTS_BASE = "../../../artifacts"

In [7]:
from os import path
from datasets import load_from_disk

# Directory of the saved 'jayavibhav/prompt-injection' dataset under the artifacts root.
dataset_path = path.join(ARTIFACTS_BASE, 'datasets', 'jayavibhav/prompt-injection')

# Load the on-disk test split and expose its 'text' column as 'prompt',
# which is the column name the tokenization step reads.
test_dataset = (
    load_from_disk(path.join(dataset_path, 'test'))
    .rename_column('text', 'prompt')
)

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# NOTE(review): going by its name, this checkpoint is DistilBERT fine-tuned for
# SST-2 sentiment classification, not for prompt-injection detection. Presumably
# it is evaluated here as an untrained baseline — confirm that is the intent
# before reading anything into the metrics.
model_name = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"

# `model_max_length` is the tokenizer init argument that sets the length limit.
# The previous `max_length=512` is a per-call encoding option and is not a
# documented `from_pretrained`/init parameter, so it did not configure the limit.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    clean_up_tokenization_spaces=False,
    use_fast=True,
    model_max_length=512,
)

# Load the 2-label classification head and move the model to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

# Evaluation mode for inference; as the cell's last expression this also
# displays the module structure.
model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [5]:
def tokenize_batch(batch):
    """Tokenize the ``prompt`` entries of ``batch`` with the notebook-level ``tokenizer``.

    Args:
        batch: Mapping with a ``"prompt"`` key holding the texts to encode.

    Returns:
        Whatever ``tokenizer`` returns for those texts when called with
        ``padding='max_length'``, ``truncation=True`` and ``max_length=512``.
    """
    prompt_texts = batch["prompt"]
    # Fixed-length encoding: pad to, and truncate at, 512 tokens.
    encoding_options = {
        "padding": 'max_length',
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(prompt_texts, **encoding_options)

In [10]:
# Tokenize the whole test split; batched=True hands tokenize_batch many rows per call.
prompts_test_tokenized = test_dataset.map(function=tokenize_batch, batched=True)

Map:   0%|          | 0/65416 [00:00<?, ? examples/s]

In [11]:
from tqdm import tqdm

batch_size = 8
predictions = []

# Ceiling division so that a trailing partial batch still counts as one progress step.
num_examples = len(prompts_test_tokenized)
total_batches = (num_examples + batch_size - 1) // batch_size

# Only the columns passed to the model are kept; the labels stay in
# prompts_test_tokenized for scoring afterwards.
inference_batches = (
    prompts_test_tokenized
    .select_columns(["input_ids", "attention_mask"])
    .with_format("torch")
    .iter(batch_size=batch_size)
)

# NOTE(review): tokenize_batch pads every example to a fixed max_length, so each
# batch is processed at full length — dynamic padding would likely cut runtime.
with torch.no_grad():
    # Using the bar as a context manager closes it even if inference raises
    # part-way through, instead of relying on reaching a trailing close() call.
    with tqdm(total=total_batches, desc="Running inference", unit="batch") as progress_bar:
        for batch in inference_batches:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            # Predicted class = index of the largest logit; moved to CPU as plain ints.
            predictions.extend(outputs.logits.argmax(dim=-1).cpu().tolist())
            progress_bar.update(1)

Running inference: 100%|██████████| 8177/8177 [25:27<00:00,  5.35batch/s]


In [12]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Ground-truth labels, in the same row order the predictions were produced in.
true_labels = prompts_test_tokenized["label"]
accuracy = accuracy_score(true_labels, predictions)

# average='binary' scores the positive class only (scikit-learn's default pos_label).
# Only the first three returned values are used. Slicing, rather than unpacking the
# fourth into `_`, avoids rebinding IPython's last-output variable `_` in the
# notebook namespace.
precision, recall, f1 = precision_recall_fscore_support(
    true_labels, predictions, average='binary'
)[:3]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
import pandas as pd

# Raw scores in the column order they should be displayed.
raw_scores = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
}

# One-row summary table; every value is rendered as a string with 4 decimal places.
metrics_df = pd.DataFrame(
    {label: [f"{value:.4f}"] for label, value in raw_scores.items()}
)

metrics_df

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
0,0.33,0.3406,0.3816,0.3599
