Set up PyTorch to use the best available hardware

In [None]:
import torch

# Probe the accelerators in priority order (Apple MPS, then CUDA) and fall
# back to the CPU when neither backend reports itself as available.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(device)

Dataset download

In [2]:
import pandas as pd
import swifter

# Parquet file of each split, relative to the root of the dataset repository.
splits = dict(
    train='data/train-00000-of-00001.parquet',
    test='data/test-00000-of-00001.parquet',
)

# Load both splits of the jayavibhav/prompt-injection dataset into DataFrames.
train_df = pd.read_parquet(f"hf://datasets/jayavibhav/prompt-injection/{splits['train']}")
test_df = pd.read_parquet(f"hf://datasets/jayavibhav/prompt-injection/{splits['test']}")

In [3]:
# Expose the raw text under the name `prompt`. Rebinding the result of
# `rename` (instead of mutating with inplace=True) keeps the cell idempotent
# and avoids hidden in-place changes to frames shared across cells.
test_df = test_df.rename(columns={"text": "prompt"})
train_df = train_df.rename(columns={"text": "prompt"})

In [None]:
# Preview the first rows of the test split (e.g. to check the renamed `prompt` column).
test_df.head()

In [None]:
# Preview the first rows of the train split (e.g. to check the renamed `prompt` column).
train_df.head()

Get model and tokenizer from Hugging Face

In [None]:
from transformers import AutoModel, AutoTokenizer

# The checkpoint is a DistilBERT model, so it must not be loaded through the
# DeBERTa-v2 classes: their tokenizer expects a different vocabulary format and
# their model cannot map the DistilBERT weights. The Auto* factories pick the
# matching tokenizer and encoder classes from the checkpoint's own config.
model_name = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=False, use_fast=True)
# AutoModel loads the bare encoder, whose output exposes `last_hidden_state`.
model = AutoModel.from_pretrained(model_name).to(device)

In [7]:
def get_embedding(prompt):
    """Embed text by mean-pooling the model's last hidden state over real tokens.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: Text to embed; it is passed straight to the tokenizer, which
            truncates inputs to at most 512 tokens. When several prompts are
            tokenized together they are padded to a common length, and the
            padding positions are excluded from the average.

    Returns:
        numpy.ndarray: The pooled embedding on the CPU. ``squeeze()`` drops
        every size-1 dimension, so a single prompt yields a 1-D vector of
        hidden size, while a batch of several prompts yields a
        ``(batch, hidden)`` array.
    """
    tokens = tokenizer(prompt, return_tensors='pt', truncation=True, padding=True, max_length=512).to(device)

    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        outputs = model(**tokens)

    last_hidden_states = outputs.last_hidden_state

    # Weight every position by its attention mask so padding tokens do not
    # dilute the average; for an unpadded single prompt the mask is all ones
    # and this reduces to a plain mean over the sequence dimension.
    mask = tokens["attention_mask"].unsqueeze(-1).to(last_hidden_states.dtype)
    summed = (last_hidden_states * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1.0)  # guard against division by zero
    embedding_vector = (summed / counts).squeeze().cpu().numpy()

    return embedding_vector

In [None]:
# Compute one embedding per training prompt and store it in a new `embedding`
# column. NOTE(review): swifter presumably wraps pandas `apply` and picks the
# execution strategy itself — confirm it does not fork the model across processes.
train_df['embedding'] = train_df['prompt'].swifter.apply(get_embedding)

In [None]:
# Compute one embedding per test prompt and store it in a new `embedding`
# column. NOTE(review): swifter presumably wraps pandas `apply` and picks the
# execution strategy itself — confirm it does not fork the model across processes.
test_df['embedding'] = test_df['prompt'].swifter.apply(get_embedding)

Save the generated embeddings as artifacts for reuse

In [None]:
# Base location (relative to the notebook's working directory) used when
# pickling the embedded train/test DataFrames; named after the embedding model.
ARTIFACTS_BASE = '../../artifacts/step-1-classic-ml/distilbert-base-uncased-finetuned-sst-2-english'

In [None]:
import os

# Join the file names onto ARTIFACTS_BASE as path components: plain string
# concatenation would glue them onto the directory name (no separator) and
# write the pickles into the parent folder instead.
# NOTE(review): any downstream code that loads these files must use the same paths.
os.makedirs(ARTIFACTS_BASE, exist_ok=True)  # to_pickle does not create missing directories
train_df.to_pickle(os.path.join(ARTIFACTS_BASE, 'train_df.pkl'))
test_df.to_pickle(os.path.join(ARTIFACTS_BASE, 'test_df.pkl'))