In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModel, AutoTokenizer
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
test_labels_df = pd.read_csv('data/test_labels.csv')

train_df = train_df[['id', 'comment_text', 'toxic']]
negative_sample_train = train_df[train_df['toxic'] == 0].sample(frac=0.1)
positive_sample_train = train_df[train_df['toxic'] == 1]
train_df = pd.concat([negative_sample_train, positive_sample_train])
test_labels_df = test_labels_df[['id', 'toxic']]

test_df = pd.merge(test_df, test_labels_df, on='id', how='inner')
test_df = test_df[test_df['toxic'] != -1]

In [None]:
class TextClassifier(nn.Module):
    def __init__(self, transformer_model, freeze_transformer=True):
        super(TextClassifier, self).__init__()
        self.model = AutoModel.from_pretrained(transformer_model)
        # Freeze the transformer model
        if freeze_transformer:
            for param in self.model.parameters():
                param.requires_grad = False
        self.tokenizer = AutoTokenizer.from_pretrained(transformer_model)
        self.fc = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
    
    def get_static_embeddings_matrix(self):
        return self.model.get_input_embeddings().weight
    
    def forward(self, input_ids=None, attention_mask=None, inputs_embeds=None):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds)
        cls_token = outputs.last_hidden_state[:, 0]
        cls_token = self.fc(cls_token)
        return self.sigmoid(cls_token)

## Example Embedding Manipulation

In [31]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TextClassifier('bert-base-uncased').to(device)

# Example input
suffix_len = 20
text = "Example input text"
suffix = "!"*suffix_len
text = text + suffix
inputs = model.tokenizer(text, return_tensors='pt').to(device)

In [32]:
# go back to the original text
decoded_text = model.tokenizer.decode(inputs['input_ids'][0])
print(decoded_text)

[CLS] example input text!!!!!!!!!!!!!!!!!!!! [SEP]


In [25]:
# Step 1: Access static embeddings matrix
embeddings_matrix = model.get_static_embeddings_matrix()

# Step 2: Manually embed tokens by indexing
with torch.no_grad():
    inputs_embeds = embeddings_matrix[inputs['input_ids']]

before_embeds = inputs_embeds.clone()
# Add perturbation to the embeddings of the suffix
perturbation = torch.randn((1, suffix_len, 768)).to(device)
inputs_embeds[:, -(suffix_len+1):-1] += perturbation

# Step 3: Feed manually created embeddings into model
outputs = model(inputs_embeds=inputs_embeds, attention_mask=inputs['attention_mask'])

sub = inputs_embeds - before_embeds
print(sub)

tensor([[[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [ 0.0947, -1.0562,  0.2119,  ..., -2.8292,  0.5945,  1.0166],
         [-0.5731,  1.5620, -0.7489,  ..., -0.9496, -2.4846,  1.6685],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]],
       device='cuda:0')


In [39]:
# now create the pert using gradient step
# Step 1: Access static embeddings matrix
embeddings_matrix = model.get_static_embeddings_matrix()

# Step 2: Manually embed tokens by indexing
with torch.no_grad():
    inputs_embeds = embeddings_matrix[inputs['input_ids']]
    inputs_embeds.requires_grad = True
before_embeds = inputs_embeds.clone()

# Step 3: Feed manually created embeddings into model
outputs = model(inputs_embeds=inputs_embeds, attention_mask=inputs['attention_mask'])
loss = nn.BCELoss()(outputs, torch.tensor([[1.0]]).to(device))
loss.backward()

In [40]:
print(f"P: {embeddings_matrix.shape}")
PP_T = embeddings_matrix.T @ embeddings_matrix
# normalize using frobenius norm
PP_T = PP_T / torch.norm(PP_T, p='fro')
print(f"PP_T: {PP_T.shape}")

P: torch.Size([30522, 768])


PP_T: torch.Size([768, 768])


In [41]:
# Step 4: Update embeddings with gradient using PGD
epsilon = 0.1
alpha = 0.01
pert = inputs_embeds.grad
# take only the perturbation for the suffix
pert = pert[:, -(suffix_len+1):-1]
# clamp the perturbation
pert = torch.clamp(pert, -epsilon, epsilon)

# pert input embeddings with gradient
with torch.no_grad():
    inputs_embeds[:, -(suffix_len+1):-1] += alpha * pert
    # Project into vocabulary space using PP_T
    inputs_embeds[:, -(suffix_len+1):-1] = torch.einsum(
    'ij,bsj->bsi', PP_T, inputs_embeds[:, -(suffix_len+1):-1])

In [42]:
sub = inputs_embeds - before_embeds
print(sub)

tensor([[[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         ...,
         [-0.0365,  0.0260,  0.0249,  ..., -0.0246, -0.0293, -0.0253],
         [-0.0365,  0.0260,  0.0249,  ..., -0.0246, -0.0293, -0.0253],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]],
       device='cuda:0', grad_fn=<SubBackward0>)


In [8]:
class TextDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        text = self.data['comment_text'].iloc[idx]
        label = self.data['toxic'].iloc[idx]
        inputs = self.tokenizer(text, return_tensors='pt', padding='max_length', max_length=128, truncation=True)
        inputs['label'] = torch.tensor(label)
        return inputs

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TextClassifier('bert-base-uncased').to(device)


train_dataset = TextDataset(train_df, model.tokenizer)
test_dataset = TextDataset(test_df, model.tokenizer)

train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False)


criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

In [10]:
for epoch in range(5):
    model.train()
    epoch_loss = 0
    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        input_ids = input_ids.squeeze(1)
        attention_mask = batch['attention_mask'].to(device)
        attention_mask = attention_mask.view(input_ids.shape)
        labels = batch['label'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs.squeeze(), labels.float())
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f'Epoch {epoch+1}/{5}, Loss: {epoch_loss/len(train_loader)}')

100%|██████████| 59/59 [00:33<00:00,  1.78it/s]


Epoch 1/5, Loss: 0.7076968116275335


100%|██████████| 59/59 [00:34<00:00,  1.70it/s]


Epoch 2/5, Loss: 0.6995430750361944


100%|██████████| 59/59 [00:34<00:00,  1.70it/s]


Epoch 3/5, Loss: 0.6921232272002656


100%|██████████| 59/59 [00:34<00:00,  1.71it/s]


Epoch 4/5, Loss: 0.6852594927205877


100%|██████████| 59/59 [00:34<00:00,  1.69it/s]

Epoch 5/5, Loss: 0.6793299610331908





In [11]:
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        input_ids = input_ids.squeeze(1)
        attention_mask = batch['attention_mask'].to(device)
        attention_mask = attention_mask.view(input_ids.shape)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask)
        predicted = torch.round(outputs.squeeze())
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    print(f'Accuracy: {correct/total}')

100%|██████████| 125/125 [01:14<00:00,  1.67it/s]

Accuracy: 0.29766482228265967



