In [None]:
import torch as t
from torch import einsum
from einops import rearrange, reduce, repeat
import math
import torch
import einops
import re
from collections import OrderedDict
import random

import bert_utils

In [None]:
def raw_attention_pattern(token_activations, num_heads, project_query, project_key):
    """
    Compute scaled dot-product attention scores (before softmax).

    token_activations: tensor whose last dimension can be split into
        `num_heads` equal chunks (the rearrange pattern requires 3 dims:
        batch, seq_length, hidden).
    project_query / project_key: callables applied to token_activations.

    Returns a tensor laid out as (batch, num_heads, key_position, query_position).
    """
    split_heads = 'b s (n d) -> b n s d'
    k_heads = rearrange(project_key(token_activations), split_heads, n=num_heads)
    q_heads = rearrange(project_query(token_activations), split_heads, n=num_heads)

    # Contract over the per-head feature axis d: scores[b, n, k, q] = sum_d K[b,n,k,d] * Q[b,n,q,d]
    scores = einsum('b n k d, b n q d -> b n k q', k_heads, q_heads)

    # Scale by sqrt(head_size), where head_size is the last axis of the split keys
    scale = math.sqrt(k_heads.shape[-1])
    return scores / scale

In [None]:
def bert_attention(token_activations, 
                    num_heads: int, 
                    attention_pattern, 
                    project_value, 
                    project_output):
    """
    Apply an attention pattern to the value vectors and project the result.

    attention_pattern: raw scores laid out as (batch, num_heads, key_position,
        query_position); they are normalised over the key axis (dim 2).
    project_value / project_output: callables applied to the activations and
        to the merged per-head outputs respectively.

    Returns project_output applied to a (batch, query_position,
    num_heads * head_size) tensor.
    """
    # Each query's weights over all keys sum to one
    attn_probs = attention_pattern.softmax(dim=2)

    v_heads = rearrange(project_value(token_activations), 'b s (n d) -> b n s d', n=num_heads)

    # Weighted sum of values over the key axis k, moving the query axis in front of heads
    mixed = einsum('b n k q, b n k d -> b q n d', attn_probs, v_heads)

    # Concatenate the heads back into a single hidden dimension
    merged = rearrange(mixed, 'b q n d -> b q (n d)')
    return project_output(merged)


In [None]:
class MultiHeadedSelfAttention(torch.nn.Module):
    """
    Multi-headed self-attention: four hidden_size -> hidden_size Linear
    projections (query, key, value, output), each with the default bias.
    """

    def __init__(self, num_heads: int, hidden_size:int):
        super().__init__()
        self.num_heads = num_heads
        self.hidden_size = hidden_size
        # Registration order (query, key, value, output) fixes both the
        # state_dict key order and the order of random initialisation.
        for projection in ("project_query", "project_key", "project_value", "project_output"):
            setattr(self, projection, torch.nn.Linear(hidden_size, hidden_size))

    def forward(self, input: torch.Tensor):
        """Compute raw attention scores from `input`, then attend and project."""
        scores = raw_attention_pattern(input, self.num_heads, self.project_query, self.project_key)
        return bert_attention(input, self.num_heads, scores, self.project_value, self.project_output)

In [None]:
def bert_mlp(token_activations: torch.Tensor, linear_1: torch.nn.Module, linear_2: torch.nn.Module
) -> torch.Tensor:
    """
    Feed-forward block: linear_1, then GELU (exact, non-approximate form),
    then linear_2.
    """
    hidden = torch.nn.functional.gelu(linear_1(token_activations))
    return linear_2(hidden)

In [None]:
class BertMLP(torch.nn.Module):
    """
    Two-layer feed-forward network: input_size -> intermediate_size ->
    input_size. Both Linear layers carry a bias; bert_mlp supplies the
    activation between them.
    """

    def __init__(self, input_size: int, intermediate_size: int):
        super().__init__()
        # Expand to the intermediate width, then contract back
        self.linear_1 = torch.nn.Linear(in_features=input_size, out_features=intermediate_size)
        self.linear_2 = torch.nn.Linear(in_features=intermediate_size, out_features=input_size)

    def forward(self, x):
        """Run x through bert_mlp using this module's two Linear layers."""
        return bert_mlp(x, self.linear_1, self.linear_2)

In [None]:
class LayerNorm(torch.nn.Module):
    """
    Layer normalisation over the last dimension, with a learnable
    per-feature scale (`weight`, initialised to ones) and shift (`bias`,
    initialised to zeros), both of shape [normalized_dim].
    """

    def __init__(self, normalized_dim: int, eps = 1e-05):
        super().__init__()
        self.normalized_dim = normalized_dim
        self.weight = torch.nn.Parameter(torch.ones(normalized_dim))
        self.bias = torch.nn.Parameter(torch.zeros(normalized_dim))
        self.eps = eps

    def forward(self, x):
        """
        Normalise each vector x[..., :] to zero mean and unit (biased)
        variance, then apply weight and bias.
        """
        # NOTE(review): both statistics are detached, so no gradient flows
        # through the mean/std (unlike torch.nn.LayerNorm) -- confirm intended.
        centre = x.mean(dim=-1, keepdim=True).detach()
        spread = x.std(dim=-1, unbiased=False, keepdim=True).detach()

        # eps is added to the variance (std squared) before the square root
        denominator = (spread.pow(2) + self.eps).sqrt()
        normalised = (x - centre) / denominator
        return normalised * self.weight + self.bias


In [None]:
class BertBlock(t.nn.Module):
    """
    One transformer block: self-attention with a residual connection and
    LayerNorm, followed by an MLP (+ dropout) with a residual connection
    and a second LayerNorm.
    """

    def __init__(self, hidden_size: int, intermediate_size: int, num_heads: int, dropout: float):
        super().__init__()
        # Attribute names are part of the state_dict keys; keep them stable.
        self.layer1 = MultiHeadedSelfAttention(num_heads, hidden_size)
        self.layer_norm1 = LayerNorm(hidden_size)
        feed_forward = BertMLP(hidden_size, intermediate_size)
        self.layer2 = t.nn.Sequential(feed_forward, t.nn.Dropout(dropout))
        self.layer_norm2 = LayerNorm(hidden_size)

    def forward(self, x):
        """Post-norm residual form: norm(sublayer(h) + h) for each sublayer."""
        attended = self.layer_norm1(self.layer1(x) + x)
        return self.layer_norm2(self.layer2(attended) + attended)

In [None]:
class Embedding(torch.nn.Module):
    """Lookup table of shape (vocab_size, embed_size), initialised from a standard normal."""

    def __init__(self, vocab_size, embed_size):
        super().__init__()
        initial_table = torch.randn(vocab_size, embed_size)
        self.token_embedding_matrix = torch.nn.Parameter(initial_table)

    def forward(self, input: torch.LongTensor) -> torch.FloatTensor:
        """Index the table with `input`; the result has shape input.shape + (embed_size,)."""
        return self.token_embedding_matrix[input]

In [None]:
def bert_embedding(
        input_ids, 
        token_type_ids, 
        position_embedding: "Embedding",
        token_embedding: "Embedding", 
        token_type_embedding: "Embedding", 
        layer_norm: "LayerNorm", 
        dropout: torch.nn.Dropout):
    """
    Sum position, token and token-type embeddings, then apply layer norm
    and dropout.

    input_ids: integer tensor of shape (batch_size, seq_length).
    token_type_ids: integer tensor indexable into token_type_embedding;
        it must broadcast against the (batch_size, seq_length, hidden) sum.
    position_embedding / token_embedding / token_type_embedding: callables
        mapping an index tensor to embeddings.
    layer_norm, dropout: callables applied, in that order, to the summed
        embeddings.

    Position indices are created on the same device as input_ids, so the
    function works on CPU as well as on any CUDA device (the previous
    version called .cuda() unconditionally).
    """
    batch_size, seq_length = input_ids.shape[0], input_ids.shape[-1]

    # Positions 0..seq_length-1, repeated for every row of the batch
    position_inds = (
        torch.arange(seq_length, device=input_ids.device)
        .unsqueeze(0)
        .expand(batch_size, -1)
    )

    # Each embedding is a table lookup; the three results are summed
    out = position_embedding(position_inds)
    out = out + token_embedding(input_ids)
    out = out + token_type_embedding(token_type_ids)
    return dropout(layer_norm(out))

In [None]:
class BertEmbedding(t.nn.Module):
    """
    Embedding stage: token, position and token-type lookup tables plus the
    LayerNorm and Dropout that bert_embedding applies to their sum.
    """

    def __init__(self, vocab_size: int, hidden_size: int, max_position_embeddings: int, type_vocab_size: int, dropout: float):
        super().__init__()
        self.vocab_size = vocab_size
        # Creation order (token, position, token type) determines the
        # state_dict key order and the order of random initialisation.
        table_rows = (
            ("token_embedding", vocab_size),
            ("position_embedding", max_position_embeddings),
            ("token_type_embedding", type_vocab_size),
        )
        for attr_name, num_rows in table_rows:
            setattr(self, attr_name, Embedding(num_rows, hidden_size))
        self.layer_norm = LayerNorm(hidden_size)
        self.dropout = t.nn.Dropout(dropout)

    def forward(self, input_ids, token_type_ids):
        """Delegate to bert_embedding with this module's sub-layers."""
        return bert_embedding(
            input_ids,
            token_type_ids,
            self.position_embedding,
            self.token_embedding,
            self.token_type_embedding,
            self.layer_norm,
            self.dropout,
        )

In [None]:
class Bert(t.nn.Module):
    """
    BERT-style encoder with a language-modelling head.

    Pipeline: BertEmbedding -> num_layers x BertBlock -> Linear -> GELU ->
    LayerNorm -> Linear to vocab_size logits.
    """

    def __init__(
            self, vocab_size: int, hidden_size: int, 
            max_position_embeddings: int, type_vocab_size: int, 
            dropout: float, intermediate_size: int, num_heads: int, 
            num_layers: int
        ):
        super().__init__()
        self.embedding = BertEmbedding(vocab_size, hidden_size, max_position_embeddings, type_vocab_size, dropout)
        self.bertblocks = t.nn.Sequential(
            *[BertBlock(hidden_size, intermediate_size, num_heads, dropout) for _ in range(num_layers)]
        )

        # Language-modelling head
        self.lin = t.nn.Linear(hidden_size, hidden_size)
        self.norm = LayerNorm(hidden_size)
        self.unembed = t.nn.Linear(hidden_size, vocab_size)

    def forward(self, input_ids):
        """
        input_ids: integer tensor of shape (batch_size, seq_length).
        Returns logits of shape (batch_size, seq_length, vocab_size).

        All tokens are assigned token type 0. The type ids are created on
        the same device as input_ids, so no CUDA device context is needed
        and CPU inputs are handled as well.
        """
        token_type_ids = torch.zeros(input_ids.shape, dtype=torch.long, device=input_ids.device)
        out = self.embedding(input_ids, token_type_ids)
        out = self.bertblocks(out)
        # Last layers to map to output
        out = self.lin(out)
        out = t.nn.functional.gelu(out)
        out = self.norm(out)
        return self.unembed(out)

In [None]:
# Build our own model. NOTE(review): these sizes are presumably chosen to
# match the pretrained checkpoint loaded below -- confirm against bert_utils.
my_bert = Bert(
    vocab_size=28996, hidden_size=768, max_position_embeddings=512, 
    type_vocab_size=2, dropout=.1, intermediate_size=3072, 
    num_heads=12, num_layers=12
)
# Reference model whose weights are copied into my_bert further down
pretrained_bert = bert_utils.get_pretrained_bert()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# my_bert

In [None]:
# pretrained_bert

In [None]:
# Print parameter names of the two models side by side (paired by position,
# not by meaning) to help write the name mapping. Only the first transformer
# layer of the pretrained model is shown; the others are skipped.
for param1, param2 in zip(pretrained_bert.state_dict(), my_bert.state_dict()):
    if param1.startswith('transformer') and not(param1.startswith('transformer.0')):
        continue
    print(param1.ljust(55), param2)

embedding.token_embedding.weight                        embedding.token_embedding.token_embedding_matrix
embedding.position_embedding.weight                     embedding.position_embedding.token_embedding_matrix
embedding.token_type_embedding.weight                   embedding.token_type_embedding.token_embedding_matrix
embedding.layer_norm.weight                             embedding.layer_norm.weight
embedding.layer_norm.bias                               embedding.layer_norm.bias
transformer.0.layer_norm.weight                         bertblocks.0.layer1.project_query.weight
transformer.0.layer_norm.bias                           bertblocks.0.layer1.project_query.bias
transformer.0.attention.pattern.project_query.weight    bertblocks.0.layer1.project_key.weight
transformer.0.attention.pattern.project_query.bias      bertblocks.0.layer1.project_key.bias
transformer.0.attention.pattern.project_key.weight      bertblocks.0.layer1.project_value.weight
transformer.0.attention.pattern.pr

In [None]:
# Name-translation table: each line is "<regex for pretrained key> <replacement template>".
# Raw string so that the regex escape \d and the template escape \g<1> are
# passed to `re` verbatim instead of being (invalid) Python string escapes.
params = r"""embedding.token_embedding.weight embedding.token_embedding.token_embedding_matrix
embedding.position_embedding.weight embedding.position_embedding.token_embedding_matrix
embedding.token_type_embedding.weight embedding.token_type_embedding.token_embedding_matrix
embedding.layer_norm.weight embedding.layer_norm.weight
embedding.layer_norm.bias embedding.layer_norm.bias
lm_head.mlp.weight lin.weight
lm_head.mlp.bias lin.bias
lm_head.unembedding.weight unembed.weight
lm_head.unembedding.bias unembed.bias
lm_head.layer_norm.weight norm.weight
lm_head.layer_norm.bias norm.bias
transformer.(\d+).layer_norm.weight bertblocks.\g<1>.layer_norm1.weight
transformer.(\d+).layer_norm.bias bertblocks.\g<1>.layer_norm1.bias
transformer.(\d+).attention.pattern.project_query.weight bertblocks.\g<1>.layer1.project_query.weight
transformer.(\d+).attention.pattern.project_query.bias bertblocks.\g<1>.layer1.project_query.bias
transformer.(\d+).attention.pattern.project_key.weight bertblocks.\g<1>.layer1.project_key.weight
transformer.(\d+).attention.pattern.project_key.bias bertblocks.\g<1>.layer1.project_key.bias
transformer.(\d+).attention.project_value.weight bertblocks.\g<1>.layer1.project_value.weight
transformer.(\d+).attention.project_value.bias bertblocks.\g<1>.layer1.project_value.bias
transformer.(\d+).attention.project_out.weight bertblocks.\g<1>.layer1.project_output.weight
transformer.(\d+).attention.project_out.bias bertblocks.\g<1>.layer1.project_output.bias
transformer.(\d+).residual.mlp1.weight bertblocks.\g<1>.layer2.0.linear_1.weight
transformer.(\d+).residual.mlp1.bias bertblocks.\g<1>.layer2.0.linear_1.bias
transformer.(\d+).residual.mlp2.weight bertblocks.\g<1>.layer2.0.linear_2.weight
transformer.(\d+).residual.mlp2.bias bertblocks.\g<1>.layer2.0.linear_2.bias
transformer.(\d+).residual.layer_norm.weight bertblocks.\g<1>.layer_norm2.weight
transformer.(\d+).residual.layer_norm.bias bertblocks.\g<1>.layer_norm2.bias"""

# Parse the table once: a list of (pattern, replacement) pairs in file order.
# Lines without at least two whitespace-separated fields are ignored.
_KEY_RULES = [
    tuple(fields[:2])
    for fields in (line.split() for line in params.split('\n'))
    if len(fields) >= 2
]


def mapkey(key):
    """
    Translate a pretrained state_dict key into the matching key of our model.

    The rules are tried in order; the first pattern that matches at the
    start of `key` is applied with re.sub. If no rule matches, a message is
    printed and None is returned.
    """
    for pattern, replacement in _KEY_RULES:
        if re.match(pattern, key):
            return re.sub(pattern, replacement, key)
    print(f"{key} does not have corresponding name")
    return None
        
# Rename every pretrained parameter (except those whose name contains
# 'classification_head') with mapkey, then load the result into my_bert.
# strict=False means missing/unexpected keys are reported rather than raised.
new_state_dict  = OrderedDict({mapkey(k): v for k, v in pretrained_bert.state_dict().items() if 'classification_head' not in k})
my_bert.load_state_dict(new_state_dict, strict=False)

<All keys matched successfully>

In [None]:
bert_utils.test_same_output(my_bert, pretrained_bert)

In [None]:
# Get state_dict from pretrained BERT

# Create new dictionary with the new names for our BERT (this uses mapkey)

# my_bert.load_state_dict(new_dictionary)

In [None]:
# class MLP1(torch.nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.linear1 = torch.nn.Linear(5, 5)
#     def forward(self, x):
#         return self.linear1(x)
# class MLP2(torch.nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.linear2 = torch.nn.Linear(5, 5)
#     def forward(self, x):
#         return self.linear2(x)

In [None]:
# mlp1 = MLP1()
# mlp2 = MLP2()
# x = torch.randn((5,5))
# mlp1(x)
# mlp2(x)
# print(mlp1.state_dict())
# print(mlp2.state_dict())

In [None]:
# mlp1.load_state_dict(mlp2.state_dict(), strict=False)

# D2

In [None]:
import transformers
# Two tokenizers with different vocabularies; `tokenizer` (cased) is the one
# used by the rest of the notebook.
tokenizer = transformers.AutoTokenizer.from_pretrained("bert-base-cased")
tokenizer_uncased = transformers.AutoTokenizer.from_pretrained("bert-base-uncased")

text = "Hi, my name is bert"

# Encode with the *uncased* vocabulary
tokenized_text = tokenizer_uncased.encode(text)
print(tokenized_text)

import os
# So Jupyter doesn't crash
os.environ["TOKENIZERS_PARALLELISM"]='false'

# Decoding with the tokenizer that produced the ids round-trips the text
# tokenizer.decode(): https://huggingface.co/docs/transformers/main_classes/tokenizer
print(f"Uncased: {tokenizer_uncased.decode(tokenized_text, errors='ignore')}")

# Decoding the same ids with the cased tokenizer looks them up in a different
# vocabulary, so the output is unrelated text
# tokenizer.decode(): https://huggingface.co/docs/transformers/main_classes/tokenizer
print(f"Cased: {tokenizer.decode(tokenized_text, errors='ignore')}")

[101, 7632, 1010, 2026, 2171, 2003, 14324, 102]
Uncased: [CLS] hi, my name is bert [SEP]
Cased: [CLS] colleges 天 largest happened smile donation [SEP]


In [None]:
def ascii_art_probs(s):
    """
    Print the five most probable tokens (with probabilities) that my_bert
    predicts at every [MASK] position of the sentence `s`.

    Relies on the module-level `tokenizer` and `my_bert`. Also prints the
    id of the [MASK] token first.

    NOTE(review): this does not switch my_bert to eval mode, so dropout is
    active unless the caller has already called my_bert.eval() -- confirm.
    """
    # tokenizer.encode wraps the text as [CLS] ... [SEP], so index 1 is [MASK] itself
    masked_token = tokenizer.encode('[MASK]')[1]
    print(masked_token)

    token_ids = tokenizer.encode(s)
    masked_inds = {i for i, token in enumerate(token_ids) if token == masked_token}

    # my_bert expects a leading batch dimension: (1, seq_length)
    input_ids = t.tensor(token_ids, dtype=t.long).unsqueeze(0)

    # Inference only: no autograd graph is needed
    with t.no_grad():
        logits = my_bert(input_ids)

    # Normalise over the vocabulary axis and take the top 5 once
    probabilities = t.softmax(logits, dim=2)
    top = t.topk(probabilities, k=5, dim=2)

    for position, (ids, probs) in enumerate(zip(top.indices[0], top.values[0])):
        if position not in masked_inds:
            continue
        answer = [tokenizer.decode(token_id) for token_id in ids]
        print(f"MASK at index {position}: {[(a, float(p)) for a, p in zip(answer, list(probs))]}")
        
ascii_art_probs("The firetruck was painted bright [MASK].")

103
MASK at index 8: [('red', 0.5602869391441345), ('yellow', 0.12122765183448792), ('white', 0.07108739763498306), ('blue', 0.06910934299230576), ('green', 0.05439632758498192)]


In [None]:
def ascii_art_probs2(s):
    """
    Print my_bert's top-5 predictions for *every* position of sentence `s`
    as five rows: row r holds the rank-r prediction for each position,
    separated by spaces.

    Relies on the module-level `tokenizer` and `my_bert`. Also prints the
    id of the [MASK] token first.
    """
    # tokenizer.encode wraps the text as [CLS] ... [SEP], so index 1 is [MASK] itself
    masked_token = tokenizer.encode('[MASK]')[1]
    print(masked_token)

    # my_bert expects a leading batch dimension: (1, seq_length)
    input_ids = t.tensor(tokenizer.encode(s), dtype=t.long).unsqueeze(0)

    # Inference only: no autograd graph is needed
    with t.no_grad():
        logits = my_bert(input_ids)

    # Normalise over the vocabulary axis; only the indices are needed here
    probabilities = t.softmax(logits, dim=2)
    top_ids = t.topk(probabilities, k=5, dim=2).indices[0]

    # Decode every candidate once: decoded[position][rank]
    decoded = [[tokenizer.decode(token_id) for token_id in ids] for ids in top_ids]

    for rank in range(5):
        for answer in decoded:
            print(answer[rank], end=' ')
        print()
        
ascii_art_probs2("The firetruck was painted bright [MASK].")

103
. . fire ##tr ##uck was painted bright red . . 
, " fireplace ##dr ##ucks is stained brightly yellow ; ; 
the the Fire ##cre ##out wasn a dark white ! ! 
) , fires ##fo ##ub were burned yellow blue ? ? 
" ; flames ##be ##wing been still black green : " 


In [None]:
# pretrained_bert

In [None]:
class BertClassifier(t.nn.Module):
    """
    BERT-style encoder with two heads that share the encoder output:
    a language-modelling head (Linear -> GELU -> LayerNorm -> Linear to
    vocab_size) and a classification head (Dropout -> Linear to
    num_classes) read from the first token of each sequence.
    """

    def __init__(
            self, vocab_size: int, hidden_size: int, 
            max_position_embeddings: int, type_vocab_size: int, 
            dropout: float, intermediate_size: int, num_heads: int, 
            num_layers: int, num_classes: int
        ):
        super().__init__()
        self.embedding = BertEmbedding(vocab_size, hidden_size, max_position_embeddings, type_vocab_size, dropout)
        self.bertblocks = t.nn.Sequential(
            *[BertBlock(hidden_size, intermediate_size, num_heads, dropout) for _ in range(num_layers)]
        )

        # This maps embeddings to word logits -- word_logits_map
        self.lin = t.nn.Linear(hidden_size, hidden_size)
        self.norm = LayerNorm(hidden_size)
        self.unembed = t.nn.Linear(hidden_size, vocab_size)

        # This maps embeddings to classification logits -- classification_logits_map
        self.dropout = torch.nn.Dropout(dropout)
        self.classification_layer = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids):
        """
        input_ids: integer tensor of shape (batch_size, seq_length).

        Returns (word_logits, classification_logits) with shapes
        (batch_size, seq_length, vocab_size) and (batch_size, num_classes).

        All tokens are assigned token type 0. The type ids are created on
        the same device as input_ids, so CPU and CUDA inputs both work.
        """
        token_type_ids = torch.zeros(input_ids.shape, dtype=torch.long, device=input_ids.device)
        out = self.embedding(input_ids, token_type_ids)
        embeddings = self.bertblocks(out)

        # This maps embeddings to word logits -- word_logits_map
        out = self.lin(embeddings)
        out = t.nn.functional.gelu(out)
        out = self.norm(out)
        word_logits = self.unembed(out)

        # This maps embeddings to classification logits -- classification_logits_map
        # Dropout is applied to the full tensor (as before), but only the
        # first token ([CLS]) of each sequence is projected to class logits.
        cls_embedding = self.dropout(embeddings)[:, 0]
        classification_logits = self.classification_layer(cls_embedding)
        return word_logits, classification_logits
        


In [None]:
import torchtext
# Load the IMDB train/test splits (stored under ./.data) and materialise them
# as lists so they can be iterated more than once.
data_train, data_test = torchtext.datasets.IMDB(root='.data', split=('train', 'test'))
data_train = list(data_train)
data_test = list(data_test)

In [None]:
# tokenizer.encode(saved_data[0][1])

In [None]:
def get_data_list(data, max_seq_len):
    """
    Turn an iterable of (label, review) pairs into a shuffled list of
    (token_ids, sentiment) pairs.

    Reviews are tokenized with the module-level `tokenizer`, cut to at most
    max_seq_len tokens, and right-padded with the pad token to the length of
    the longest (truncated) review. Sentiment is 0 for the label 'neg' and
    1 for anything else.
    """
    samples = list(data)
    assert len(samples), "No data left!"

    # Split the (label, review) pairs into two parallel lists
    labels, reviews = [], []
    for label, review in samples:
        labels.append(label)
        reviews.append(review)

    # Tokenize everything in one call, then truncate each sequence
    token_lists = [ids[:max_seq_len] for ids in tokenizer(reviews)['input_ids']]

    # Pad every sequence up to the longest one
    longest_length = max(map(len, token_lists))
    pad_id = tokenizer.pad_token_id

    data_list = []
    for ids, label in zip(token_lists, labels):
        padded = ids + [pad_id] * (longest_length - len(ids))
        data_list.append((padded, 0 if label == 'neg' else 1))

    random.shuffle(data_list)
    return data_list
    
def get_batches(data, batch_size, max_seq_len):
    """
    Build a list of (reviews, sentiments) LongTensor pairs from `data`.

    The examples come from get_data_list (already shuffled and padded) and
    are cut into consecutive chunks of batch_size; the last chunk may be
    smaller.
    """
    examples = get_data_list(data, max_seq_len)
    num_batches = math.ceil(len(examples) / batch_size)

    batches = []
    for batch_idx in range(num_batches):
        start = batch_idx * batch_size
        chunk = examples[start:start + batch_size]
        # Transpose the list of (review, sentiment) pairs into two sequences
        reviews, sentiments = zip(*chunk)
        batches.append((
            torch.tensor(reviews, dtype=torch.long),
            torch.tensor(sentiments, dtype=torch.long),
        ))
    return batches

# Train the model!


In [None]:
# Load weights with strict=False
# Two-class classifier with the same encoder sizes as my_bert
classifier_bert = BertClassifier(
    vocab_size=28996, hidden_size=768, max_position_embeddings=512, 
    type_vocab_size=2, dropout=.1, intermediate_size=3072, 
    num_heads=12, num_layers=12, num_classes=2
)

# Copy the renamed pretrained weights (skipping any 'classification_head'
# entries). strict=False lets the new classification_layer, which has no
# pretrained counterpart, stay at its random initialisation.
new_state_dict  = OrderedDict({mapkey(k): v for k, v in pretrained_bert.state_dict().items() if 'classification_head' not in k})
classifier_bert.load_state_dict(new_state_dict, strict=False)


_IncompatibleKeys(missing_keys=['classification_layer.weight', 'classification_layer.bias'], unexpected_keys=[])

In [None]:
# pretrained_bert

In [None]:
# Fine-tuning configuration and pre-built batches for the IMDB training split
max_seq_len = 256  # reviews are truncated to this many tokens
batch_size = 16
# Tokenizes, pads and shuffles the whole training set up front
train_dataloader = get_batches(data_train, batch_size, max_seq_len)
lr = 1e-05  # learning rate passed to the optimizer in the training cell

Token indices sequence length is longer than the specified maximum sequence length for this model (521 > 512). Running this sequence through the model will result in indexing errors


In [None]:
gradient_steps = 0

In [None]:
# module.training
# module.train() --> module.training = True
# module.eval() --> module.training = False

# def forward(...):
#     if self.training:
#         ....
#     else:
#         ....

# Fine-tune classifier_bert on the sentiment batches (one pass over
# train_dataloader). Requires a CUDA device.
device = 'cuda'
# with t.cuda.device(0):
optimizer = t.optim.Adam(classifier_bert.parameters(), lr = lr)
loss_fn = torch.nn.CrossEntropyLoss()
classifier_bert.train()  # enable dropout
classifier_bert.to(device)
for X, Y in train_dataloader:
    # gradient_steps is a notebook-level counter; it keeps growing if this
    # cell is re-run without resetting it
    gradient_steps += 1
    optimizer.zero_grad()
    # Only the classification logits feed the loss; word_logits is unused here
    word_logits, classification_logits = classifier_bert(X.to(device))
    # Get predicted classes from logits
    # predictions = t.nn.SoftMax(dim = -1)(classification_logits)
    # CrossEntropyLoss takes raw logits and integer class labels
    loss = loss_fn(classification_logits, Y.to(device))
    loss.backward()
    optimizer.step()
    # Report the loss of the current batch every 100 steps
    if gradient_steps % 100 == 0:
        print(gradient_steps, loss)


In [None]:
tmp_params = torch.nn.Linear(1,1)
# tmp_params.to(device)
with torch.cuda.device(0):
    print(tmp_params.weight.get_device())

In [None]:
# Hand-written reviews used as a qualitative check of the fine-tuned classifier
examples =  ['This movie sucked. I hated it.', 
             'I fell asleep while watching this movie because it was super boring',
             'My friends and I loved it.',
             'I was super scared by the movie',
             'This was my favorite movie ever!',
             'This was good and bad',
             'This was bad and good',
             'I must applaud the creators of this movie. It\'s incredible that they\'ve managed to write such an uninspired piece.']
classifier_bert.eval()  # disable dropout for inference
# NOTE(review): input_ids below is created on the CPU and not moved to the
# model's device, and no torch.no_grad() is used -- confirm this is intended.
for example in examples:
    # Encode
    encoded_tokens = tokenizer.encode(example)
    # Convert to tensor
    input_ids = torch.tensor(encoded_tokens, dtype=torch.long).unsqueeze(0)
    # Predict
    word_logits, classification_logits = classifier_bert(input_ids)
    # softmax
    softmaxed_classification_logits = torch.nn.Softmax(dim=-1)(classification_logits)
    # Argmax
    prediction = torch.argmax(softmaxed_classification_logits, dim=-1)
    # Print
    # Class 0 is negative and class 1 is positive; show the winning class and its probability
    neg, pos = softmaxed_classification_logits[0]
    print(f'neg:{neg:.2}' if prediction.item()==0 else f'pos:{pos:.2}', example)

neg:0.97 This movie sucked. I hated it.
neg:0.68 I fell asleep while watching this movie because it was super boring
pos:0.97 My friends and I loved it.
pos:0.71 I was super scared by the movie
pos:0.99 This was my favorite movie ever!
pos:0.68 This was good and bad
neg:0.53 This was bad and good
neg:0.76 I must applaud the creators of this movie. It's incredible that they've managed to write such an uninspired piece.


## Training from Scratch on Masked Language Modeling

In [None]:
wikitext2_dataset = torchtext.datasets.WikiText2(root='.data', split=('train', 'valid', 'test'))

In [None]:
train_wiki_data, val_wiki_data, test_wiki_data = wikitext2_dataset

In [None]:
train_wiki_data = list(train_wiki_data)
val_wiki_data = list(val_wiki_data)
test_wiki_data = list(test_wiki_data)

In [None]:
train_wiki_data[:20]

In [None]:
tokenizer.encode("their lives [UNK] . \n")

[101, 1147, 2491, 100, 119, 102]

In [None]:
tokenizer_with_unk = transformers.AutoTokenizer.from_pretrained("bert-base-cased", unk_token='<unk>')
tokenizer_with_unk.encode("their lives <unk> . \n")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[101, 1147, 2491, 28996, 119, 102]

In [None]:
t.rand((2,))


tensor([0.9672, 0.5151])

In [None]:
'sldkfj <unk> sldfj<unk>ldk'.replace('<unk>', '[UNK]')

'sldkfj [UNK] sldfj[UNK]ldk'

In [None]:
torch.tensor(1).lt(2)

tensor(True)

In [None]:
def pad_seq(tokens, max_seq_len):
    """
    Right-pad `tokens` with the tokenizer's pad id up to max_seq_len.

    A tensor input is first converted to a list of Python scalars. Inputs
    already at least max_seq_len long are returned without padding (they are
    not truncated).
    """
    if isinstance(tokens, torch.Tensor):
        as_list = []
        for token in tokens:
            as_list.append(token.item())
        tokens = as_list
    num_missing = max_seq_len - len(tokens)
    padding = [tokenizer.pad_token_id] * num_missing
    return tokens + padding

In [None]:
[x.item() for x in torch.rand((2,))]

[0.3207430839538574, 0.8816239833831787]

In [None]:
tmp = list(zip([1,2,3], [4,5,6]))
list(zip(*tmp))

[(1, 2, 3), (4, 5, 6)]

In [None]:
# Try doing this on a sequence first
# Then try modifying get_data_list to do this
# Arguments
sentence = 'hello this is a sentence <unk> here something something.'
max_seq_len = 512  # TODO: Replace this
# Find id of token[MASK]


def get_wiki_data_list(sentences: list, max_seq_len: int, min_seq_len: int = 10, vocab_size: int = 28996):
    """
    Build shuffled masked-language-modelling examples from raw sentences.

    For every sentence: '<unk>' is rewritten to '[UNK]', the text is
    tokenized with the module-level `tokenizer`, sentences with fewer than
    min_seq_len tokens are dropped, and the rest are truncated to
    max_seq_len tokens. 15% of the tokens are selected as prediction
    targets; of those, 80% are replaced by [MASK], 10% by a random token id
    in [0, vocab_size) and 10% are left unchanged. [CLS] and [SEP] are never
    selected. Everything is right-padded to max_seq_len with pad_seq.

    Returns a list of three tuples:
        (masked_sentences, masks, unmasked_sentences)
    where each element is a list of length max_seq_len; a mask is truthy at
    the selected positions. If no sentence is long enough, three empty
    tuples are returned so callers can still unpack the result.
    """
    masked_token = tokenizer.mask_token_id
    cls_token, sep_token = tokenizer.cls_token_id, tokenizer.sep_token_id

    examples = []
    for sentence in sentences:
        token_ids = tokenizer.encode(sentence.replace('<unk>', '[UNK]'))

        # Skip sentences below the minimum number of tokens
        if len(token_ids) < min_seq_len:
            continue

        original = torch.tensor(token_ids[:max_seq_len], dtype=torch.long)
        seq_len = len(original)

        # One uniform draw per token decides its fate; special tokens get a
        # value that can never fall below any threshold.
        probs = torch.rand(seq_len)
        probs[(original == cls_token) | (original == sep_token)] = 1.0

        selected = probs < 0.15                 # prediction targets
        use_mask = probs < 0.8 * 0.15           # 80% of targets -> [MASK]
        use_random = (probs < 0.9 * 0.15) & (probs >= 0.8 * 0.15)  # next 10% -> random token
        # the remaining 10% of targets keep their original token

        corrupted = original.clone()
        corrupted[use_mask] = masked_token
        corrupted[use_random] = torch.randint(vocab_size, (int(use_random.sum()),))

        mask = selected.tolist() + [0] * (max_seq_len - seq_len)
        examples.append((
            pad_seq(corrupted, max_seq_len),
            mask,
            pad_seq(original, max_seq_len),
        ))

    # Shuffle the three outputs together so they stay aligned
    random.shuffle(examples)

    if not examples:
        return [(), (), ()]
    # Transpose the list of triples into three parallel tuples
    return list(zip(*examples))

def get_wiki_batches(data, batch_size, max_seq_len):
    """Build masked-LM batches from raw sentences.

    Args:
        data: raw sentences, forwarded to get_wiki_data_list.
        batch_size: number of examples per batch; the last batch may be smaller.
        max_seq_len: length every example is truncated/padded to.

    Returns:
        A list of batches. Each batch is a list of three long tensors
        ``[masked_sentences, masks, unmasked_sentences]`` whose first dimension
        is the number of examples in the batch. Empty when no sentence survives
        the minimum-length filter.
    """
    data_lists = get_wiki_data_list(data, max_seq_len)
    if not data_lists:
        # Nothing survived filtering: there is nothing to unpack or batch.
        return []
    masked_sentences, masks, unmasked_sentences = data_lists

    res = []
    for batch_idx in range(math.ceil(len(masks) / batch_size)):
        i, j = batch_idx * batch_size, (batch_idx + 1) * batch_size
        res.append([t.tensor(x[i:j], dtype=torch.long)
                    for x in (masked_sentences, masks, unmasked_sentences)])
    return res

# Smoke test on the first ten training sentences. With max_seq_len equal to the
# default min_seq_len (10), every kept sentence is truncated to exactly 10 tokens.
print(get_wiki_batches(train_wiki_data[:10], batch_size=3, max_seq_len=10))

In [None]:
# Small two-layer BERT used for the masked-LM experiment below.
tiny_bert = Bert(
    vocab_size=28996,
    hidden_size=256,
    max_position_embeddings=512,
    type_vocab_size=1,
    dropout=.1,
    intermediate_size=1024,
    num_heads=8,
    num_layers=2,
)
# Running count of optimizer steps, incremented by the training loop.
gradient_steps = 0
# Pre-built list of training batches: 16 sentences each, 64 tokens per sentence.
train_dataloader = get_wiki_batches(train_wiki_data, batch_size=16, max_seq_len=64)

In [None]:
# Fall back to CPU so the cell still runs on machines without a GPU.
device = 'cuda' if t.cuda.is_available() else 'cpu'
lr = 3e-4
# Move the model first so the optimizer is built over the on-device parameters.
tiny_bert.to(device)
tiny_bert.train()
optimizer = t.optim.Adam(tiny_bert.parameters(), lr=lr)
# Default reduction='mean': the loss is already averaged over the selected tokens.
loss_fn = torch.nn.CrossEntropyLoss()

# NOTE(review): epoch numbering starts at 100, presumably continuing an earlier
# run of this cell — confirm.
for epoch in range(100, 500):
    for masked_sentences, masks, unmasked_sentences in train_dataloader:
        masked_sentences = masked_sentences.to(device)
        masks = masks.to(device).to(torch.bool)
        unmasked_sentences = unmasked_sentences.to(device)

        if not masks.any():
            # No position was selected in this batch: the mean over zero
            # targets would be NaN and poison the weights.
            continue

        gradient_steps += 1
        optimizer.zero_grad()

        # word_logits.shape: batch_size, seq_length, vocab_size
        # masks.shape:       batch_size, seq_length
        word_logits = tiny_bert(masked_sentences)

        # Boolean indexing keeps only the selected positions and yields
        # (num_masks, vocab_size) directly, with no separately defined vocab size.
        masked_logits = word_logits[masks]
        target = unmasked_sentences[masks]

        # No extra division by the batch size: the criterion already averages
        # over the selected tokens, and dividing again would under-report the
        # loss and scale the (smaller) final batch differently.
        loss = loss_fn(masked_logits, target)

        loss.backward()
        optimizer.step()
    # Loss of the last processed batch of the epoch.
    print(f"epoch: {epoch}", f"loss: {loss.item():.5}")

epoch: 100 loss: 0.16195
epoch: 101 loss: 0.14493
epoch: 102 loss: 0.16115
epoch: 103 loss: 0.12537
epoch: 104 loss: 0.1354
epoch: 105 loss: 0.13421
epoch: 106 loss: 0.1174
epoch: 107 loss: 0.21031
epoch: 108 loss: 0.14917
epoch: 109 loss: 0.21695
epoch: 110 loss: 0.18115
epoch: 111 loss: 0.15084
epoch: 112 loss: 0.15101
epoch: 113 loss: 0.20933
epoch: 114 loss: 0.55942
epoch: 115 loss: 0.51484
epoch: 116 loss: 0.52889
epoch: 117 loss: 0.49307
epoch: 118 loss: 0.47989
epoch: 119 loss: 0.46775
epoch: 120 loss: 0.46776
epoch: 121 loss: 0.44455
epoch: 122 loss: 0.47442
epoch: 123 loss: 0.4953
epoch: 124 loss: 0.47437
epoch: 125 loss: 0.48615
epoch: 126 loss: 0.46294
epoch: 127 loss: 0.44543
epoch: 128 loss: 0.41689
epoch: 129 loss: 0.38697
epoch: 130 loss: 0.38991
epoch: 131 loss: 0.36754
epoch: 132 loss: 0.32371
epoch: 133 loss: 0.30861
epoch: 134 loss: 0.31518
epoch: 135 loss: 0.28165
epoch: 136 loss: 0.30309
epoch: 137 loss: 0.50926
epoch: 138 loss: 0.50041
epoch: 139 loss: 0.49314
epo

In [None]:
# Sanity check of the broadcasting used for the logits: a (rows, 1) boolean mask
# keeps whole rows, and masked_select returns them flattened into one dimension.
a = t.tensor([[1., 2., 3.], [4., 5., 6.], [4., 5., 6.]])
a.masked_select(t.tensor([[False], [True], [True]]))

tensor([4., 5., 6., 4., 5., 6.])

In [None]:
# Held-out batches, built with the same batch size and sequence length as training.
test_dataloader = get_wiki_batches(test_wiki_data, batch_size=16, max_seq_len=64)

In [None]:
# Qualitative check: decode the model's predictions for the first test batch.
tiny_bert.eval()
# Inference only: without no_grad the (batch, seq, vocab) logits would keep a
# full autograd graph alive for no benefit.
with torch.no_grad():
    for masked_sentences, masks, unmasked_sentences in test_dataloader:
        masked_sentences = masked_sentences.to(device)
        masks = masks.to(device).to(torch.bool)
        unmasked_sentences = unmasked_sentences.to(device)

        # word_logits.shape: batch_size, seq_length, vocab_size
        word_logits = tiny_bert(masked_sentences)
        print(word_logits.shape)
        # Most likely token at every position, not only at the masked ones.
        predictions = torch.argmax(word_logits, dim=-1)
        for i in range(len(predictions)):
            print("INPUTS     ", tokenizer.decode(masked_sentences[i]))
            print("PREDICTIONS", tokenizer.decode(predictions[i]))
            print("ACTUAL     ", tokenizer.decode(unmasked_sentences[i]))
            print()
        # Only the first batch is inspected.
        break

torch.Size([16, 64, 28996])
INPUTS      [CLS] The [MASK] earned the Bulldogs a trip back to Indianapolis for the first Final [MASK] appearance [MASK] school and Horizon League performed. The win made Stevens, at [MASK] 33 [MASK] the youngest coach to lead a team to the Final Four since Bob Knight made his [MASK] Final Four appearance at age [MASK] in 1973. Butler became the [MASK] school
PREDICTIONS [CLS] The show started the had a set back to continued for the social daydale appearance had Florida and had T City. The day the recording, at thero for the S'to be a team to the position singles since men Upon made his plan international would appearance at four game in 15. An became the five At
ACTUAL      [CLS] The win earned the Bulldogs a trip back to Indianapolis for the first Final Four appearance in school and Horizon League history. The win made Stevens, at age 33, the youngest coach to lead a team to the Final Four since Bob Knight made his first Final Four appearance at age 32 in

In [None]:
# Qualitative check: decode the model's predictions for the first test batch.
tiny_bert.eval()
# Inference only: without no_grad the (batch, seq, vocab) logits would keep a
# full autograd graph alive for no benefit.
with torch.no_grad():
    for masked_sentences, masks, unmasked_sentences in test_dataloader:
        masked_sentences = masked_sentences.to(device)
        masks = masks.to(device).to(torch.bool)
        unmasked_sentences = unmasked_sentences.to(device)

        # word_logits.shape: batch_size, seq_length, vocab_size
        word_logits = tiny_bert(masked_sentences)
        print(word_logits.shape)
        # Most likely token at every position, not only at the masked ones.
        predictions = torch.argmax(word_logits, dim=-1)
        for i in range(len(predictions)):
            print("INPUTS     ", tokenizer.decode(masked_sentences[i]))
            print("PREDICTIONS", tokenizer.decode(predictions[i]))
            print("ACTUAL     ", tokenizer.decode(unmasked_sentences[i]))
            print()
        # Only the first batch is inspected.
        break

torch.Size([16, 64, 28996])
INPUTS      [CLS] The [MASK] earned the Bulldogs a trip back to Indianapolis for the first Final [MASK] appearance [MASK] school and Horizon League performed. The win made Stevens, at [MASK] 33 [MASK] the youngest coach to lead a team to the Final Four since Bob Knight made his [MASK] Final Four appearance at age [MASK] in 1973. Butler became the [MASK] school
PREDICTIONS [CLS] The new performed the record a A began to record for the first the ( American and record and American Division record. The 9 four Cup, at the 9 in the head continued with make a Star with the the Tech American the record with the record the Tech 1995 the American as in record. record became the new record
ACTUAL      [CLS] The win earned the Bulldogs a trip back to Indianapolis for the first Final Four appearance in school and Horizon League history. The win made Stevens, at age 33, the youngest coach to lead a team to the Final Four since Bob Knight made his first Final Four appearan

In [None]:
def get_data_list(data: list, max_seq_len: int, min_seq_len: int = 10):
    """Turn (label, review) pairs into a shuffled list of (token_ids, sentiment).

    Reviews are tokenized, truncated to ``max_seq_len`` tokens and right-padded
    with the tokenizer's pad id to the longest truncated review. The label
    'neg' maps to 0 and every other label to 1.

    NOTE(review): ``min_seq_len`` is accepted but never used in this function.
    """
    pairs = list(data)
    assert len(pairs), "No data left!"

    labels, reviews = [], []
    for label, review in pairs:
        labels.append(label)
        reviews.append(review)

    # Tokenize all reviews in one call, then cut each to the maximum length.
    truncated = [ids[:max_seq_len] for ids in tokenizer(reviews)['input_ids']]
    longest_length = max(map(len, truncated))

    data_list = []
    for ids, label in zip(truncated, labels):
        padding = [tokenizer.pad_token_id] * (longest_length - len(ids))
        sentiment = 0 if label == 'neg' else 1
        data_list.append((ids + padding, sentiment))

    random.shuffle(data_list)
    return data_list
    
def get_batches(data, batch_size, max_seq_len):
    """Split the shuffled output of get_data_list into (reviews, sentiments) batches.

    Returns a list of tuples of two long tensors; the final batch holds the
    remainder and may be smaller than ``batch_size``.
    """
    examples = get_data_list(data, max_seq_len)
    num_batches = math.ceil(len(examples) / batch_size)

    batches = []
    for n in range(num_batches):
        chunk = examples[n * batch_size:(n + 1) * batch_size]
        token_rows = torch.tensor([tokens for tokens, _ in chunk], dtype=torch.long)
        sentiment_labels = torch.tensor([label for _, label in chunk], dtype=torch.long)
        batches.append((token_rows, sentiment_labels))
    return batches