Set up PyTorch to use the best available hardware option

In [2]:
import torch

# Choose the compute backend in priority order: Apple MPS first, then CUDA,
# and fall back to the CPU when neither accelerator is available.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

print(device)

mps


In [3]:
# Root directory for stored artifacts, given as a relative path (it is resolved
# against the kernel's current working directory). The dataset is read from
# below this directory and the generated embeddings are written below it.
ARTIFACTS_BASE = '../../../artifacts'

Dataset load

In [4]:
from os import path
from datasets import load_from_disk

# Location of the prompt-injection dataset underneath the artifacts root.
dataset_path = path.join(ARTIFACTS_BASE, 'datasets', 'jayavibhav', 'prompt-injection')

# Read the dataset from disk and unpack its 'train' and 'test' splits.
dataset = load_from_disk(dataset_path)
train_dataset, test_dataset = dataset['train'], dataset['test']

In [5]:
# Rename the 'text' column to 'prompt' in both splits.
# The membership guard makes this cell safe to re-run: once the column has been
# renamed, 'text' no longer exists and an unguarded rename_column would raise.
if 'text' in train_dataset.column_names:
    train_dataset = train_dataset.rename_column('text', 'prompt')
if 'text' in test_dataset.column_names:
    test_dataset = test_dataset.rename_column('text', 'prompt')

In [6]:
# Display the (rows, columns) shape of the training split.
train_dataset.shape

(261738, 2)

In [7]:
# Display the (rows, columns) shape of the test split.
test_dataset.shape

(65416, 2)

In [8]:
# Take the first 10 rows of the test split. The displayed repr lists only the
# feature names and the row count, not the row contents.
test_dataset.select(range(10))

Dataset({
    features: ['prompt', 'label'],
    num_rows: 10
})

In [9]:
# Take the first 10 rows of the training split. The displayed repr lists only
# the feature names and the row count, not the row contents.
train_dataset.select(range(10))

Dataset({
    features: ['prompt', 'label'],
    num_rows: 10
})

Get the model and tokenizer from Hugging Face

In [10]:
from transformers import DebertaV2Tokenizer, DebertaV2Model

# Checkpoint shared by the tokenizer and the encoder.
model_name = "microsoft/deberta-v3-base"

# NOTE(review): `use_fast=True` is passed to the concrete DebertaV2Tokenizer
# class; it presumably has no effect here (the flag selects a backend when
# loading through AutoTokenizer) -- confirm before relying on a fast tokenizer.
tokenizer = DebertaV2Tokenizer.from_pretrained(
    model_name,
    clean_up_tokenization_spaces=False,
    use_fast=True,
)

# Load the bare encoder, move it to the selected device, and switch it to
# evaluation mode; the final expression displays the module structure.
model = DebertaV2Model.from_pretrained(model_name)
model = model.to(device)
model.eval()

DebertaV2Model(
  (embeddings): DebertaV2Embeddings(
    (word_embeddings): Embedding(128100, 768, padding_idx=0)
    (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
    (dropout): StableDropout()
  )
  (encoder): DebertaV2Encoder(
    (layer): ModuleList(
      (0-11): 12 x DebertaV2Layer(
        (attention): DebertaV2Attention(
          (self): DisentangledSelfAttention(
            (query_proj): Linear(in_features=768, out_features=768, bias=True)
            (key_proj): Linear(in_features=768, out_features=768, bias=True)
            (value_proj): Linear(in_features=768, out_features=768, bias=True)
            (pos_dropout): StableDropout()
            (dropout): StableDropout()
          )
          (output): DebertaV2SelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
            (dropout): StableDropout()
          )
        )
        (intermedia

In [11]:
def get_embedding(batch):
    """Compute one mean-pooled embedding per prompt in ``batch``.

    The prompts are tokenized together (truncated to 512 tokens and padded to
    the longest prompt in the batch), run through ``model`` without gradient
    tracking, and the last hidden states are averaged over the sequence axis.

    The average is weighted by the attention mask so that padding positions do
    not contribute. A plain ``mean(dim=1)`` would mix padding hidden states
    into every prompt shorter than the longest one in its batch, making the
    embedding depend on which other prompts share the batch.

    Args:
        batch: Mapping with a ``'prompt'`` entry holding a list of strings.

    Returns:
        ``{'embedding': [...]}`` with one list of floats per prompt.
    """
    tokens = tokenizer(
        batch['prompt'],
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512,
    ).to(device)

    with torch.no_grad():
        outputs = model(**tokens)

        hidden = outputs.last_hidden_state
        # Add a trailing axis to the mask so it broadcasts over the hidden size.
        mask = tokens['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(dim=1)
        # Clamp guards against division by zero for an all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1)
        embeddings = summed / token_counts

    return {'embedding': embeddings.cpu().numpy().tolist()}

In [12]:
# Run get_embedding over the training split in batches of 32 prompts and keep
# the result (the returned 'embedding' values become a dataset column).
# NOTE(review): this rebinds train_dataset in place, and the recorded run of
# this cell ended in a KeyboardInterrupt -- consider caching the result on disk.
train_dataset = train_dataset.map(get_embedding, batched=True, batch_size=32)

Map:   0%|          | 0/261738 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [None]:
# Run get_embedding over the test split in batches of 32 prompts and keep the
# result (the returned 'embedding' values become a dataset column).
# NOTE(review): this rebinds test_dataset in place; no execution is recorded
# for this cell.
test_dataset = test_dataset.map(get_embedding, batched=True, batch_size=32)

Save the generated embeddings to artifacts for reuse

In [6]:
import os

# Directory that will hold the embedded train/test splits.
dataset_embeddings_path = path.join(ARTIFACTS_BASE, 'step-1-classic-ml', 'deberta-v3-base')
# Create the embeddings directory itself (including missing parents); creating
# only ARTIFACTS_BASE would leave the actual target directory absent.
os.makedirs(dataset_embeddings_path, exist_ok=True)

In [None]:
# Persist each split in its own sub-directory of the embeddings path,
# training split first, then the test split.
for split_name, split in (('train', train_dataset), ('test', test_dataset)):
    split.save_to_disk(path.join(dataset_embeddings_path, split_name))