In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import torch

# 1. Introduction

We will be using https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english for sentiment analysis.

This model is a fine-tuned checkpoint of DistilBERT-base-uncased, trained on SST-2, a dataset for binary sentiment classification. SST-2 is composed of sentences extracted from movie reviews, each annotated with a sentiment label. The task is to predict the sentiment of a given sentence. This model reaches an accuracy of 91.3 on the dev set (for comparison, the BERT bert-base-uncased version reaches an accuracy of 92.7).

In [3]:
from datasets import load_dataset, load_dataset_builder

# Build the dataset builder for SST-2 and display the features it declares.
ds_builder = load_dataset_builder('stanfordnlp/sst2')
ds_builder.info.features

{'idx': Value(dtype='int32', id=None),
 'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None)}

In [4]:
# Load every split of SST-2; the displayed DatasetDict lists the splits and row counts.
initial_dataset = load_dataset('stanfordnlp/sst2')
initial_dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 872
    })
    test: Dataset({
        features: ['idx', 'sentence', 'label'],
        num_rows: 1821
    })
})

In [5]:
initial_dataset['train'][0]

{'idx': 0,
 'sentence': 'hide new secretions from the parental units ',
 'label': 0}

In [6]:
# Keep the first training sentence as the running example for the cells below.
sentence = initial_dataset['train'][0]['sentence']
sentence

'hide new secretions from the parental units '

In [7]:
from transformers import DistilBertTokenizer

# Checkpoint id of the DistilBERT model fine-tuned on SST-2. Despite the
# variable name, later cells also pass it to `from_pretrained` / `pipeline`
# to load model weights.
tokenizer_ckpt = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_ckpt)

# 2. Tokenization

In [8]:
tokenizer.model_input_names

['input_ids', 'attention_mask']

In [9]:
# Split the sentence into word-piece tokens (strings, not ids); the output
# below contains no [CLS]/[SEP] markers.
tokenized_sentence = tokenizer.tokenize(sentence)
tokenized_sentence

['hide', 'new', 'secret', '##ions', 'from', 'the', 'parental', 'units']

# 3. DistilBERT (original)

DistilBERT, like BERT, uses a hidden size of 768 dimensions, meaning that the internal representations of words and tokens in the model have a dimensionality of 768.

The hidden size in a transformer-based model like BERT or DistilBERT determines the dimension of the model's embeddings and the dimension of the hidden states as the model processes input data. A higher hidden size allows the model to capture more complex patterns and relationships in the data but also makes the model larger and more computationally intensive.

For classification tasks, it is common practice to use only the hidden state corresponding to the first token of the input sequence, which is the special token [CLS] (for classification). This is because the hidden state of this token has access to the entire sequence through the attention mechanism. This means that during pre-training and fine-tuning, the model has learned to aggregate information from all tokens in the input sequence into this [CLS] token's hidden state.

### **REWRITE USING PYTORCH**

In [10]:
from transformers import AutoModel

# Base DistilBERT checkpoint (not the SST-2 fine-tuned one used for the tokenizer).
model_ckpt = 'distilbert/distilbert-base-uncased'

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained(model_ckpt).to(device)

In [11]:
# Tokenize the example sentence into PyTorch tensors, then move every tensor
# to the device the model lives on.
inputs = tokenizer(sentence, return_tensors='pt')
inputs = {k: v.to(device) for k, v in inputs.items()}
inputs

{'input_ids': tensor([[  101,  5342,  2047,  3595,  8496,  2013,  1996, 18643,  3197,   102]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [12]:
# Convert the ids of the single sequence in the batch back to token strings;
# unlike `tokenizer.tokenize`, this view includes [CLS] and [SEP].
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
tokens

['[CLS]',
 'hide',
 'new',
 'secret',
 '##ions',
 'from',
 'the',
 'parental',
 'units',
 '[SEP]']

In [13]:
# encode: tokens -> hash | input_ids
# decode: ? -> tokens or tags | hash -> tokens or tags
# Forward pass under inference mode, so no autograd state is recorded.
with torch.inference_mode():
  outputs = model(**inputs)
outputs

BaseModelOutput(last_hidden_state=tensor([[[-1.6527e-01, -2.0267e-01, -3.8784e-01,  ..., -3.4770e-02,
           1.7683e-01,  4.9799e-01],
         [ 1.9158e-01,  2.9310e-03, -1.0192e-01,  ...,  9.4399e-02,
          -4.0690e-02,  4.1901e-01],
         [-1.3455e-01, -2.6318e-01,  1.8665e-01,  ..., -8.0812e-02,
          -2.1052e-01, -6.9526e-02],
         ...,
         [ 2.3456e-01, -6.1193e-02, -1.0886e-02,  ..., -8.5200e-02,
          -1.6539e-01,  1.3718e-01],
         [-9.2496e-04, -2.2951e-01, -1.4710e-01,  ...,  5.5233e-02,
          -9.3542e-02,  3.7744e-01],
         [ 1.0679e+00,  2.4419e-01, -3.1690e-01,  ...,  1.2886e-01,
          -5.1978e-01, -1.9382e-01]]]), hidden_states=None, attentions=None)

In [14]:
# output shape: [batch_size, n_tokens, hidden_dims]
outputs.last_hidden_state.shape

torch.Size([1, 10, 768])

In [15]:
len(tokens), tokens

(10,
 ['[CLS]',
  'hide',
  'new',
  'secret',
  '##ions',
  'from',
  'the',
  'parental',
  'units',
  '[SEP]'])

In [16]:
# hidden state corresponding to the first token of the input sequence [CLS]
outputs.last_hidden_state[:, 0].shape

torch.Size([1, 768])

# 3. DistilBERT finetuned on SST-2 (English)

In [17]:
# Rebind `model` to the SST-2 fine-tuned checkpoint, still loaded through
# AutoModel (the output below is a BaseModelOutput of hidden states).
model = AutoModel.from_pretrained(tokenizer_ckpt).to(device)

In [18]:
# Same inputs as before, now through the fine-tuned weights; `outputs` is overwritten.
with torch.inference_mode():
  outputs = model(**inputs)
outputs

BaseModelOutput(last_hidden_state=tensor([[[-0.0309,  0.4670, -0.7988,  ..., -0.0455, -0.4391,  0.0750],
         [ 0.0161,  0.5940, -0.2869,  ...,  0.1686, -0.5022,  0.3976],
         [-0.2654,  0.4771, -0.2804,  ..., -0.0954, -0.6223, -0.0153],
         ...,
         [ 0.1277,  0.6889, -0.4992,  ..., -0.3389, -0.3758,  0.3130],
         [-0.0790,  0.4412, -0.4479,  ..., -0.2395, -0.2491,  0.2796],
         [ 0.4794,  0.5617, -0.4332,  ..., -0.0540, -0.2366, -0.1133]]]), hidden_states=None, attentions=None)

# 4. Using Hugging Face SDK

In [None]:
from transformers import pipeline

text = 'I am positive about this.'

# Text-classification pipeline backed by the SST-2 fine-tuned checkpoint,
# reusing the tokenizer instance loaded earlier.
pipe = pipeline('text-classification', model=tokenizer_ckpt, tokenizer=tokenizer)

result = pipe(text)
result

In [18]:
from transformers import DistilBertForSequenceClassification

# Rebind `model` again: same fine-tuned checkpoint, now loaded as a
# sequence-classification model instead of through AutoModel.
model = DistilBertForSequenceClassification.from_pretrained(tokenizer_ckpt).to(device)

In [None]:
# Run the classifier on the example inputs; `outputs` is overwritten with
# the classification model's result.
with torch.inference_mode():
  outputs = model(**inputs)
outputs

In [None]:
# Build the ids by hand from the word-piece tokens. `tokenized_sentence` came
# from `tokenizer.tokenize`, so no [CLS]/[SEP] ids are included here.
input_ids = tokenizer.convert_tokens_to_ids(tokenized_sentence)
# Add a leading batch dimension.
input_ids = torch.tensor(input_ids).unsqueeze(0)
input_ids

In [None]:
# All-ones mask with the same shape as `input_ids`.
attention_mask = torch.ones_like(input_ids)
attention_mask

In [None]:
from transformers import pipeline

# Use the SST-2 fine-tuned checkpoint here. `model_ckpt` is the base
# 'distilbert/distilbert-base-uncased' checkpoint, which ships no trained
# classification head, so a text-classification pipeline built on it would
# score sentences with freshly initialised weights.
pipe = pipeline('text-classification', model=tokenizer_ckpt, tokenizer=tokenizer)

result = pipe(sentence)
result

In [None]:
# `outputs` currently holds the result of DistilBertForSequenceClassification,
# which exposes logits rather than `last_hidden_state`, so request the hidden
# states explicitly instead of reading them off `outputs`.
with torch.inference_mode():
  encoder_outputs = model(**inputs, output_hidden_states=True)

# selects the final-layer embedding for the first token in the sequence [CLS]
# it is used to represent the entire sequence for classification tasks
pooled_outputs = encoder_outputs.hidden_states[-1][:, 0]
pooled_outputs.shape

In [None]:
# from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# model_ckpt = 'distilbert/distilbert-base-uncased-finetuned-sst-2-english'

# model = DistilBertForSequenceClassification.from_pretrained(model_ckpt)
# tokenizer = DistilBertTokenizer.from_pretrained(model_ckpt)

# 2. Tokenization

The first step is to tokenize the full SST-2 dataset. We will use the tokenizer provided by the model to tokenize the dataset.

NOTES:
- max_position_embeddings (int, optional, defaults to 512) — we will use the default value, thus we need to cap any inputs at this length.

## 2.1. Tokenization of the SST-2 dataset

In [None]:
def encode(example):
    """Tokenize the ``sentence`` field of ``example`` with the notebook tokenizer.

    Truncation is enabled and padding uses the ``"max_length"`` strategy.
    """
    tokenizer_options = dict(truncation=True, padding="max_length")
    return tokenizer(example['sentence'], **tokenizer_options)


# Tokenize the training split, handing `encode` batches of rows at a time.
tokenized_dataset = initial_dataset['train'].map(encode, batched=True)
tokenized_dataset

In [None]:
tokenizer.decode(tokenized_dataset[0]['input_ids'])

In [None]:
# Mutates `tokenized_dataset` in place: from here on it yields only the
# 'sentence' and 'input_ids' columns, in torch format.
tokenized_dataset.set_format(type='torch', columns=['sentence', 'input_ids'])
dataloader = torch.utils.data.DataLoader(tokenized_dataset, batch_size=32)

## 2.2. Create mapping dataset for first-layer network encryption

The idea here is to first tokenize and then encrypt the tokens, so that we have a mapping of plain tokens to encrypted tokens. This mapping will be used to train the first-layer network.

In [None]:
# Simple Caesar cipher encryption just for testing purposes
def encrypt_tokens(example, shift=3, vocab_size=None):
    """Add a Caesar-shifted copy of ``input_ids`` to an example or a batch.

    Handles a single example (flat sequence of ids), the batched form passed
    by ``Dataset.map(..., batched=True)`` (one sequence of ids per row), and
    tensor-valued ``input_ids``. Nested Python lists previously failed with a
    ``TypeError`` because a whole row was added to ``shift``.

    Args:
        example: Mapping with an ``'input_ids'`` entry.
        shift: Offset added to every token id before wrapping.
        vocab_size: Modulus used for wrapping. When omitted, the
            ``vocab_size`` of the notebook-level ``tokenizer`` is used.

    Returns:
        The same mapping, with an added ``'encrypted_input_ids'`` entry that
        mirrors the nesting of ``'input_ids'``.
    """
    if vocab_size is None:
        vocab_size = tokenizer.vocab_size

    def _shift_ids(ids):
        # Python sequences are walked recursively so batched rows work;
        # ints and tensors support the arithmetic directly.
        if isinstance(ids, (list, tuple)):
            return [_shift_ids(item) for item in ids]
        return (ids + shift) % vocab_size

    example['encrypted_input_ids'] = _shift_ids(example['input_ids'])
    return example


In [None]:
# Add the 'encrypted_input_ids' column; `encrypt_tokens` receives batches of rows.
encrypted_dataset = tokenized_dataset.map(encrypt_tokens, batched=True)

In [None]:
# Expose only the cipher ids (model input) and the plain ids (targets), in torch format.
encrypted_dataset.set_format(type='torch', columns=['encrypted_input_ids', 'input_ids'])

In [None]:
# Shuffle once and carve out disjoint slices. Taking range(200) from the same
# seeded shuffle would make every validation row a training row as well.
shuffled_dataset = encrypted_dataset.shuffle(seed=42)
train_dataset = shuffled_dataset.select(range(1000))
val_dataset = shuffled_dataset.select(range(1000, 1200))

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader

# Notebook-level hyperparameters read by TokenTranslator and the training helpers.
vocab_size = tokenizer.vocab_size  # size of the embedding table and of the output layer
embedding_dim = 128  # width of each token embedding
hidden_dim = 256  # LSTM hidden size (encoder and decoder)
num_layers = 2  # stacked LSTM layers in each of encoder and decoder

class TokenTranslator(nn.Module):
    """Two-stage LSTM that maps a sequence of token ids to per-position vocabulary logits.

    The input ids are embedded, passed through an encoder LSTM, and the
    encoder's outputs and final state are fed into a decoder LSTM whose
    outputs are projected onto the vocabulary.

    Args:
        vocab_size: Number of token ids (embedding rows and output logits).
        embedding_dim: Width of the token embeddings.
        hidden_dim: Hidden size of both LSTMs.
        num_layers: Number of stacked layers in each LSTM.

    Any argument left as ``None`` falls back to the notebook-level variable
    of the same name, so ``TokenTranslator()`` behaves as before.
    """

    def __init__(self, vocab_size=None, embedding_dim=None, hidden_dim=None, num_layers=None):
        super().__init__()

        # Resolve omitted sizes from the module (notebook) namespace; the
        # parameters shadow the globals, hence the explicit lookup.
        module_namespace = globals()

        def _resolve(name, value):
            return module_namespace[name] if value is None else value

        vocab_size = _resolve('vocab_size', vocab_size)
        embedding_dim = _resolve('embedding_dim', embedding_dim)
        hidden_dim = _resolve('hidden_dim', hidden_dim)
        num_layers = _resolve('num_layers', num_layers)

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.decoder = nn.LSTM(hidden_dim, hidden_dim, num_layers, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, src):
        """Return logits of shape ``(*src.shape, vocab_size)`` for integer ids ``src``."""
        embedded = self.embedding(src)
        encoded, encoder_state = self.encoder(embedded)
        # The decoder starts from the encoder's final (hidden, cell) state.
        decoded, _ = self.decoder(encoded, encoder_state)
        return self.fc_out(decoded)


In [None]:
batch_size = 64

# Only the training loader reshuffles; validation keeps its stored order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [None]:
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Rebinds `model`, which previously referred to the Hugging Face classifier.
model = TokenTranslator().to(device)
# Positions whose target equals the tokenizer's padding id do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train_epoch(model, data_loader, optimizer, criterion):
    """Run one optimisation pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Module mapping a batch of ids to per-position logits.
        data_loader: Iterable of batches, each providing
            ``'encrypted_input_ids'`` (inputs) and ``'input_ids'`` (targets).
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss called as ``criterion(flat_logits, flat_targets)``.

    Returns:
        The average of the per-batch losses as a float, or ``nan`` when the
        loader yields no batches (previously a ``ZeroDivisionError``).
    """
    model.train()
    # Follow the model's own placement instead of relying on a notebook global.
    model_device = next(model.parameters()).device

    total_loss = 0.0
    n_batches = 0
    for batch in data_loader:
        src = batch['encrypted_input_ids'].to(model_device)
        trg = batch['input_ids'].to(model_device)

        optimizer.zero_grad()
        logits = model(src)

        # Collapse every leading dimension so each position is one sample;
        # the class count is read from the logits themselves.
        flat_logits = logits.reshape(-1, logits.size(-1))
        flat_targets = trg.reshape(-1)

        loss = criterion(flat_logits, flat_targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        n_batches += 1

    if n_batches == 0:
        return float('nan')
    return total_loss / n_batches


In [None]:
def evaluate(model, data_loader, criterion):
    """Return the mean batch loss of ``model`` over ``data_loader`` without updating it.

    Args:
        model: Module mapping a batch of ids to per-position logits.
        data_loader: Iterable of batches, each providing
            ``'encrypted_input_ids'`` (inputs) and ``'input_ids'`` (targets).
        criterion: Loss called as ``criterion(flat_logits, flat_targets)``.

    Returns:
        The average of the per-batch losses as a float, or ``nan`` when the
        loader yields no batches (previously a ``ZeroDivisionError``).
    """
    model.eval()
    # Follow the model's own placement instead of relying on a notebook global.
    model_device = next(model.parameters()).device

    total_loss = 0.0
    n_batches = 0
    with torch.no_grad():
        for batch in data_loader:
            src = batch['encrypted_input_ids'].to(model_device)
            trg = batch['input_ids'].to(model_device)

            logits = model(src)
            # Collapse every leading dimension so each position is one sample;
            # the class count is read from the logits themselves.
            flat_logits = logits.reshape(-1, logits.size(-1))
            flat_targets = trg.reshape(-1)

            total_loss += criterion(flat_logits, flat_targets).item()
            n_batches += 1

    if n_batches == 0:
        return float('nan')
    return total_loss / n_batches


In [None]:
num_epochs = 5

# One training pass followed by one validation pass per epoch; both helpers
# return the mean per-batch loss.
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    val_loss = evaluate(model, val_loader, criterion)
    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')
