In [28]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
from transformers import BertModel, BertTokenizer, BertConfig

# We will use chapters from Mary Shelley's Frankenstein as example data

In [29]:
# Load the two chapters as text. The files are read as raw bytes and decoded
# explicitly as UTF-8 so the result does not depend on the platform's default
# text encoding. The `with` block closes each file on exit, so no explicit
# close() call is needed.
with open('../data/frankenstein/chapter_1.txt', 'rb') as file:
    ch1 = file.read().decode('utf-8')

with open('../data/frankenstein/chapter_2.txt', 'rb') as file:
    ch2 = file.read().decode('utf-8')

# Preview the first 1000 characters of chapter 1.
ch1[0:1000]

'I am by birth a Genevese, and my family is one of the most distinguished of that republic. My ancestors had been for many years counsellors and syndics, and my father had filled several public situations with honour and reputation. He was respected by all who knew him for his integrity and indefatigable attention to public business. He passed his younger days perpetually occupied by the affairs of his country; a variety of circumstances had prevented his marrying early, nor was it until the decline of life that he became a husband and the father of a family. As the circumstances of his marriage illustrate his character, I cannot refrain from relating them. One of his most intimate friends was a merchant who, from a flourishing state, fell, through numerous mischances, into poverty. This man, whose name was Beaufort, was of a proud and unbending disposition and could not bear to live in poverty and oblivion in the same country where he had formerly been distinguished for his rank and m

In [30]:
# Load the pretrained 'bert-base-cased' tokenizer and print its `max_len`
# attribute, which is used below as the row width of the token matrix.
# NOTE(review): newer transformers releases appear to expose this value as
# `model_max_length` — confirm against the installed version.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
print(f'Model Input Dimension: {tokenizer.max_len}')

I0102 21:27:27.861990  7748 tokenization_utils.py:380] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt from cache at C:\Users\altoz\.cache\torch\transformers\5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1


Model Input Dimension: 512


# Tokenizing using BertTokenizer provided by [HuggingFace BertTokenizer](https://huggingface.co/transformers/model_doc/bert.html#berttokenizer)

The text in Chapter 1 contains 2,195 tokens total.
The text in Chapter 2 contains 2,782 tokens total. 

We now want to find the smallest multiple of 512 (the expected input length of the BERT model) that is large enough to hold the longer of the two chapters.
This number divided by 512 gives us the `m` dimension of our soon-to-be `m` x `512` matrix.

In [36]:
# Row width and padding id both come from the tokenizer.
seqlen, pad_token_id = tokenizer.max_len, tokenizer.pad_token_id

# Encode the full text of each chapter into a sequence of token ids.
tok_ch1, tok_ch2 = (tokenizer.encode(chapter) for chapter in (ch1, ch2))

print(f'Chapter 1 contains {len(tok_ch1)} tokens')
print(f'Chapter 2 contains {len(tok_ch2)} tokens')

W0102 21:30:47.402800  7748 tokenization_utils.py:953] Token indices sequence length is longer than the specified maximum sequence length for this model (2193 > 512). Running this sequence through the model will result in indexing errors
W0102 21:30:47.435803  7748 tokenization_utils.py:953] Token indices sequence length is longer than the specified maximum sequence length for this model (2780 > 512). Running this sequence through the model will result in indexing errors


Chapter 1 contains 2195 tokens
Chapter 2 contains 2782 tokens


In [37]:
# Number of rows needed to hold the longer chapter: ceil(longest / seqlen).
# Ceiling division avoids allocating an extra all-padding row when the length
# is an exact multiple of seqlen; max(1, ...) keeps at least one row even for
# empty input.
m = max(1, (max(len(tok_ch1), len(tok_ch2)) + seqlen - 1) // seqlen)
print(f'm = {m}')

m = 6


# 2D-Padding

We now need to pad each token sequence of length n to m*512 tokens and reshape it into an m x 512 matrix

In [49]:
# Convert both token sequences to NumPy arrays and report their shapes.
tok_ch1, tok_ch2 = np.array(tok_ch1), np.array(tok_ch2)

print(f'tok_ch1.shape: {tok_ch1.shape}')
print(f'tok_ch2.shape: {tok_ch2.shape}')

tok_ch1.shape: (3072,)
tok_ch2.shape: (3072,)


In [50]:
def padding_2d(arr: np.array, 
               pad_token_id: int, 
               seqlen: int,
               seqdim: int) -> np.array:
    """
    Pads a token sequence to `seqlen * seqdim` entries, then reshapes it and
    returns a 2D array of shape `(seqdim, seqlen)`.

    The input is never modified. It may be a Python list or a NumPy array of
    any shape (it is flattened first), so calling this function again on its
    own output returns an equal array.

    :param arr: the token ids to be padded (list or array).
    :param pad_token_id: the token id to be used for padding.
    :param seqlen: the length of each row in the desired matrix.
    :param seqdim: the number of rows in the desired matrix.
    :return: a new `(seqdim, seqlen)` array, padded at the end with `pad_token_id`.
    :raises ValueError: if `arr` holds more than `seqlen * seqdim` tokens.
    """
    # Work on a flat copy: `arr += [...]` would mutate a caller's list in place
    # and would be interpreted as element-wise addition for an ndarray.
    tokens = np.asarray(arr).reshape(-1).tolist()

    target = seqlen * seqdim
    to_pad = target - len(tokens)
    if to_pad < 0:
        raise ValueError(
            f'cannot fit {len(tokens)} tokens into a {seqdim}x{seqlen} matrix '
            f'({target} slots)'
        )

    tokens.extend([pad_token_id] * to_pad)

    return np.array(tokens).reshape((seqdim, seqlen))


In [53]:
# Pad both chapters to m * seqlen tokens and fold each into an (m, seqlen) matrix.
tok_ch1, tok_ch2 = (
    padding_2d(tokens, pad_token_id, seqlen, m) for tokens in (tok_ch1, tok_ch2)
)

print(f'tok_ch1.shape: {tok_ch1.shape}')
print(f'tok_ch2.shape: {tok_ch2.shape}')

tok_ch1.shape: (6, 512)
tok_ch2.shape: (6, 512)


# Tensor-Shape

Treating the tensor as a:
* batch_size = 1
* channels = seqdim
* height = 1
* width = seqlen

Essentially treating each of the `seqdim` rows as a channel

In [118]:
# Reshape each (seqdim, seqlen) token matrix into a
# (batch=1, channels=seqdim, height=1, width=seqlen) float tensor.
ten_ch1 = torch.tensor(tok_ch1, dtype=torch.float)
ten_ch1 = ten_ch1.view(1, tok_ch1.shape[0], 1, tok_ch1.shape[1])

ten_ch2 = torch.tensor(tok_ch2, dtype=torch.float)
# View ten_ch2 itself; viewing ten_ch1 here would make ten_ch2 an alias of
# chapter 1's data.
ten_ch2 = ten_ch2.view(1, tok_ch2.shape[0], 1, tok_ch2.shape[1])

# A 1x1 convolution mixes the seqdim channels down to a single channel; the
# result is layer-normalised over its (1, seqlen) trailing dims and rectified.
# Sizes are taken from the tensor so the stack follows the data's shape.
model_layers = [
    nn.Conv2d(in_channels=ten_ch1.shape[1],
              out_channels=1,
              kernel_size=1),
    nn.LayerNorm((1, ten_ch1.shape[3])),
    nn.ReLU(inplace=True)
]

model = nn.Sequential(*model_layers)

x = model(ten_ch1)

In [119]:
# Compare the shape of the input tensor with the shape of `x`, the output of
# the conv / layer-norm / ReLU stack applied to it.
print(f'Original tensor shape: {ten_ch1.shape}')
print(f'Output tensor shape: {x.shape}')

Original tensor shape: torch.Size([1, 6, 1, 512])
Output tensor shape: torch.Size([1, 1, 1, 512])


In [137]:
# Load the pretrained 'bert-base-cased' model as a standalone BertModel.
# NOTE(review): `bert` is not referenced again in the visible notebook;
# BertModel2D loads its own copy in __init__ — confirm whether this cell is needed.
bert = BertModel.from_pretrained('bert-base-cased')

I0102 22:12:20.814961  7748 configuration_utils.py:157] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json from cache at C:\Users\altoz\.cache\torch\transformers\b945b69218e98b3e2c95acf911789741307dec43c698d35fad11c1ae28bda352.d7a3af18ce3a2ab7c0f48f04dc8daff45ed9a3ed333b9e9a79d012a0dedf87a6
I0102 22:12:20.820956  7748 configuration_utils.py:174] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 28996
}

I0102 22:12:21.0

In [174]:
class BertModel2D(nn.Module):
    """
    Collapses a stack of token rows into a single row and feeds it to BERT.

    The input is expected as a (batch, seqdim, 1, seqlen) tensor. A 1x1
    convolution mixes the `seqdim` channels down to one, the result is
    layer-normalised over its (1, seqlen) trailing dims, passed through ReLU,
    cast to integer ids and handed to a pretrained 'bert-base-cased' model.

    :param seqdim: number of input channels (rows of the padded token matrix).
    :param seqlen: length of each row; also the sequence length given to BERT.
    :param batch_size: nominal batch size. It is stored on the instance; the
        forward pass takes the actual batch size from its input.
    """

    def __init__(self, seqdim, seqlen, batch_size):
        super().__init__()

        self.seqdim = seqdim
        self.seqlen = seqlen
        self.batch_size = batch_size

        self.conv1 = nn.Conv2d(seqdim, 1, 1)
        self.layer_norm = nn.LayerNorm((1, seqlen))
        self.relu = nn.ReLU(inplace=True)
        self.bert = BertModel.from_pretrained('bert-base-cased')

    def forward(self, x, mask=None):
        """
        Run the conv / norm / ReLU reduction and pass the result to BERT.

        :param x: float tensor of shape (batch, seqdim, 1, seqlen).
        :param mask: optional attention mask forwarded unchanged to BERT.
        :return: whatever the wrapped BERT model returns.
        """
        x = self.conv1(x)
        x = self.layer_norm(x)
        x = self.relu(x)
        # Reshape with the instance's own seqlen (not a notebook-level global)
        # and the batch size of the actual input.
        # NOTE(review): .long() truncates the normalised activations to integer
        # ids, so no gradient can reach conv1/layer_norm through this path —
        # confirm this is the intended design.
        x = x.view(x.size(0), self.seqlen).long()
        x = self.bert(input_ids=x, attention_mask=mask)

        return x

In [175]:
# Size the model from the tensor it will be fed, (batch, seqdim, 1, seqlen),
# instead of repeating the literal values.
model = BertModel2D(seqdim=ten_ch1.shape[1],
                    seqlen=ten_ch1.shape[3],
                    batch_size=ten_ch1.shape[0])

I0102 22:22:46.741231  7748 configuration_utils.py:157] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json from cache at C:\Users\altoz\.cache\torch\transformers\b945b69218e98b3e2c95acf911789741307dec43c698d35fad11c1ae28bda352.d7a3af18ce3a2ab7c0f48f04dc8daff45ed9a3ed333b9e9a79d012a0dedf87a6
I0102 22:22:46.744235  7748 configuration_utils.py:174] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 28996
}

I0102 22:22:46.9

In [177]:
# Build a 0/1 integer mask: 1 wherever the lower triangle of the token matrix
# is non-zero, 0 elsewhere.
# NOTE(review): the mask has the shape of tok_ch1 and is not passed to the
# model in the call below — confirm the intended masking scheme.
mask = torch.tensor(np.where(np.tril(tok_ch1), 1, 0), dtype=torch.long)

# Forward chapter 1 through the model without an attention mask and show the
# shape of the second returned value.
hidden_states, output = model(ten_ch1)
output.shape

torch.Size([1, 768])