Set up PyTorch to use the best available hardware backend

In [75]:
import torch

# Choose the compute backend in priority order: Apple MPS, then CUDA, then CPU.
# The conditional expression short-circuits, so CUDA is only probed when MPS
# is unavailable.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(device)

mps


In [76]:
# Relative path to the shared artifacts directory. The input dataset is read
# from beneath it and the generated embeddings are written beneath it.
ARTIFACTS_BASE = '../../../artifacts'

Dataset load

In [77]:
from os import path
from datasets import load_from_disk

# Location of the previously downloaded prompt-injection dataset on disk.
dataset_path = path.join(ARTIFACTS_BASE, 'datasets', 'jayavibhav', 'prompt-injection')
dataset = load_from_disk(dataset_path)

# Pull out the two splits used by the rest of the notebook.
train_dataset, test_dataset = dataset['train'], dataset['test']

In [78]:
# Rename the raw 'text' column to 'prompt' on both splits.
train_dataset, test_dataset = (
    split.rename_column('text', 'prompt')
    for split in (train_dataset, test_dataset)
)

In [79]:
# Sanity check: (rows, columns) of the training split.
train_dataset.shape

(261738, 2)

In [80]:
# Sanity check: (rows, columns) of the test split.
test_dataset.shape

(65416, 2)

In [81]:
# Select the first 10 test rows. The displayed repr lists only the feature
# names and the row count, not the row contents.
test_dataset.select(range(10))

Dataset({
    features: ['prompt', 'label'],
    num_rows: 10
})

In [82]:
# Select the first 10 training rows. The displayed repr lists only the feature
# names and the row count, not the row contents.
train_dataset.select(range(10))

Dataset({
    features: ['prompt', 'label'],
    num_rows: 10
})

Get model and tokenizer from Hugging Face

In [66]:
from transformers import AutoTokenizer, AutoModel

# Checkpoint identifier shared by the tokenizer and the encoder.
model_name = "distilbert/distilroberta-base"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    clean_up_tokenization_spaces=False,
    use_fast=True,
)

# Load the encoder, move it to the selected device and switch it to
# inference mode (disables dropout).
model = AutoModel.from_pretrained(model_name)
model = model.to(device)
model.eval()

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-5): 6 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout)

In [67]:
def get_embedding(batch):
    """Compute one mean-pooled embedding per prompt in a batch.

    Prompts are tokenized together (padded to the longest prompt in the batch,
    truncated to 512 tokens), passed through ``model`` without gradient
    tracking, and the final hidden states are averaged over the *real* tokens
    of each prompt. Padding positions are excluded through the attention mask,
    so a prompt's embedding does not depend on which other prompts share its
    batch.

    Args:
        batch: Mapping with a ``'prompt'`` entry holding the list of prompt
            strings for this batch (as supplied by ``Dataset.map`` with
            ``batched=True``).

    Returns:
        dict with a single key ``'embedding'`` whose value is a list holding
        one list of floats per prompt.
    """
    tokens = tokenizer(batch['prompt'], return_tensors='pt', truncation=True, padding=True, max_length=512).to(device)

    with torch.no_grad():
        outputs = model(**tokens)

        hidden = outputs.last_hidden_state
        # Shape (batch, seq, 1): 1 for real tokens, 0 for padding.
        mask = tokens['attention_mask'].unsqueeze(-1).to(hidden.dtype)

        # Sum only over real tokens, then divide by the number of real tokens.
        # A plain .mean(dim=1) would also average the padding positions.
        summed = (hidden * mask).sum(dim=1)
        # clamp guards against division by zero for an all-padding row.
        counts = mask.sum(dim=1).clamp(min=1)
        embeddings = summed / counts

    return { 'embedding': embeddings.cpu().numpy().tolist() }

In [70]:
# Add an 'embedding' column to the training split, 32 prompts per forward pass.
train_dataset = train_dataset.map(
    get_embedding,
    batched=True,
    batch_size=32,
)

Map:   0%|          | 0/261738 [00:00<?, ? examples/s]

In [72]:
# Add an 'embedding' column to the test split, 32 prompts per forward pass.
test_dataset = test_dataset.map(
    get_embedding,
    batched=True,
    batch_size=32,
)

Map:   0%|          | 0/65416 [00:00<?, ? examples/s]

Save the generated embeddings to artifacts for reuse

In [73]:
import os

# Directory that will hold the embedded train/test splits.
dataset_embeddings_path = path.join(ARTIFACTS_BASE, 'step-1-classic-ml', 'distilroberta-base')
# Create the output directory itself (including intermediate directories),
# not just the artifacts base directory.
os.makedirs(dataset_embeddings_path, exist_ok=True)

In [74]:
# Persist each embedded split in its own sub-directory of the output path.
for split_name, split in (('train', train_dataset), ('test', test_dataset)):
    split.save_to_disk(path.join(dataset_embeddings_path, split_name))

Saving the dataset (0/4 shards):   0%|          | 0/261738 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/65416 [00:00<?, ? examples/s]