In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
from transformers import BertModel, BertTokenizer, BertConfig

# We will use chapters from Mary Shelley's Frankenstein as example data

In [5]:
def read_chapter(number: int) -> str:
    """
    Return the text of one Frankenstein chapter as a string.

    :param number: chapter number used to build the file name
        ``./data/frankenstein/chapters/chapter_<number>.txt``.
    """
    path = f'./data/frankenstein/chapters/chapter_{number}.txt'
    # The `with` block closes the file on exit, so no explicit close() is needed.
    with open(path, 'rb') as chapter_file:
        # Read raw bytes and decode them (UTF-8 is the default for bytes.decode()).
        return chapter_file.read().decode()


ch1 = read_chapter(1)
ch2 = read_chapter(2)

# Preview the first 1000 characters of chapter 1.
ch1[0:1000]

'I am by birth a Genevese, and my family is one of the most distinguished of that republic. My ancestors had been for many years counsellors and syndics, and my father had filled several public situations with honour and reputation. He was respected by all who knew him for his integrity and indefatigable attention to public business. He passed his younger days perpetually occupied by the affairs of his country; a variety of circumstances had prevented his marrying early, nor was it until the decline of life that he became a husband and the father of a family. As the circumstances of his marriage illustrate his character, I cannot refrain from relating them. One of his most intimate friends was a merchant who, from a flourishing state, fell, through numerous mischances, into poverty. This man, whose name was Beaufort, was of a proud and unbending disposition and could not bear to live in poverty and oblivion in the same country where he had formerly been distinguished for his rank and m

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# `max_len` was deprecated in favour of `model_max_length` and is absent from
# recent transformers releases; fall back to it only when the newer attribute
# does not exist (older releases).
model_input_dim = getattr(tokenizer, 'model_max_length', None) or tokenizer.max_len
print(f'Model Input Dimension: {model_input_dim}')

Model Input Dimension: 512


# Tokenizing using BertTokenizer provided by [HuggingFace BertTokenizer](https://huggingface.co/transformers/model_doc/bert.html#berttokenizer)

The text in Chapter 1 contains 2,195 tokens total.
The text in Chapter 2 contains 2,782 tokens total. 

We now want to find the smallest multiple of 512 (the maximum input length of the BERT model) that is at least as long as the longer of the two chapters.
Dividing that number by 512 gives us `m`, the number of rows in our soon-to-be `m` x `512` matrix.

In [7]:
# Row width of the 2D layout = the tokenizer's maximum sequence length.
# `max_len` is deprecated/removed in newer transformers releases, so prefer
# `model_max_length` and only fall back to `max_len` on old releases.
seqlen = getattr(tokenizer, 'model_max_length', None) or tokenizer.max_len
pad_token_id = tokenizer.pad_token_id

# encode() returns a flat list of token ids; both chapters are longer than
# `seqlen`, which is why the tokenizer emits a length warning here.
tok_ch1 = tokenizer.encode(ch1)
tok_ch2 = tokenizer.encode(ch2)

print(f'Chapter 1 contains {len(tok_ch1)} tokens')
print(f'Chapter 2 contains {len(tok_ch2)} tokens')

Token indices sequence length is longer than the specified maximum sequence length for this model (2193 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2780 > 512). Running this sequence through the model will result in indexing errors
Chapter 1 contains 2195 tokens
Chapter 2 contains 2782 tokens


In [8]:
# Number of `seqlen`-wide rows needed to hold the longer chapter.
# Ceiling division (instead of `// seqlen + 1`) avoids allocating an extra,
# all-padding row when the length is an exact multiple of `seqlen`.
longest = max(len(tok_ch1), len(tok_ch2))
m = max(1, -(-longest // seqlen))  # always keep at least one row
print(f'm = {m}')

m = 6


# 2D-Padding

We now need to pad each chapter's token sequence (of length n) to m*512 tokens

In [9]:
# Turn the token-id lists into 1D numpy arrays so they can be padded and reshaped.
tok_ch1, tok_ch2 = np.array(tok_ch1), np.array(tok_ch2)

# Sanity check: each array is still flat at this point.
print(f'tok_ch1.shape: {tok_ch1.shape}')
print(f'tok_ch2.shape: {tok_ch2.shape}')

tok_ch1.shape: (2195,)
tok_ch2.shape: (2782,)


In [10]:
def padding_2d(arr: np.ndarray,
               pad_token_id: int,
               seqlen: int,
               seqdim: int) -> np.ndarray:
    """
    Pads a 1D array to ``seqdim * seqlen`` elements, then reshapes it into
    ``seqdim`` rows of length ``seqlen`` with a singleton middle axis.

    :param arr: the 1D sequence of token ids to be padded.
    :param pad_token_id: the token id to be used for padding.
    :param seqlen: the length of each row in the desired matrix.
    :param seqdim: the number of rows in the desired matrix.
    :return: a new array of shape ``(seqdim, 1, seqlen)``; the input is not modified.
    :raises ValueError: if ``arr`` holds more than ``seqdim * seqlen`` elements.
    """
    # Copy so the result never aliases the caller's array.
    arr = np.array(arr)
    target_len = seqlen * seqdim

    # Fail with an explicit message instead of an opaque reshape error.
    if len(arr) > target_len:
        raise ValueError(
            f'sequence of length {len(arr)} does not fit into '
            f'{seqdim} rows of {seqlen} tokens ({target_len} total)'
        )

    to_pad = target_len - len(arr)
    if to_pad:
        arr = np.append(arr, np.full(to_pad, pad_token_id))

    return arr.reshape((seqdim, 1, seqlen))


In [11]:
# Pad both chapters to m * seqlen tokens and lay them out as (m, 1, seqlen).
tok_ch1, tok_ch2 = (
    padding_2d(arr=tokens, pad_token_id=pad_token_id, seqlen=seqlen, seqdim=m)
    for tokens in (tok_ch1, tok_ch2)
)

# Both chapters should now share the same shape.
print(f'tok_ch1.shape: {tok_ch1.shape}')
print(f'tok_ch2.shape: {tok_ch2.shape}')

tok_ch1.shape: (6, 1, 512)
tok_ch2.shape: (6, 1, 512)


# Tensor-Shape

Treating the tensor as a:
* batch_size = 1
* channels = seqdim
* height = 1
* width = seqlen

Essentially treating each of the `seqdim` rows as a channel

In [12]:
# Add a leading batch axis so each chapter becomes (1, seqdim, 1, seqlen);
# float dtype is required by the convolution below.
ten_ch1, ten_ch2 = (
    torch.tensor(tokens, dtype=torch.float)[None]
    for tokens in (tok_ch1, tok_ch2)
)

# A 1x1 convolution collapses the `seqdim` channels into a single channel,
# followed by layer normalisation over the (1, 512) trailing dims and a ReLU.
channel_mixer = nn.Conv2d(in_channels=ten_ch1.size(1), out_channels=1, kernel_size=1)
normaliser = nn.LayerNorm((1, 512))
activation = nn.ReLU(inplace=True)

model_layers = [channel_mixer, normaliser, activation]
model = nn.Sequential(*model_layers)

# Run chapter 1 through the stack.
x = model(ten_ch1)

In [13]:
# Compare the shape of the chapter tensor with the shape of `x`, the result of
# passing it through the conv / LayerNorm / ReLU stack.
print(f'Original tensor shape: {ten_ch1.shape}')
print(f'Output tensor shape: {x.shape}')

Original tensor shape: torch.Size([1, 6, 1, 512])
Output tensor shape: torch.Size([1, 1, 1, 512])


In [14]:
# Load the pretrained 'bert-base-cased' weights into a standalone BertModel.
# NOTE(review): `bert` is not referenced by any other cell visible here, and
# BertModel2D loads its own copy in __init__ — confirm this cell is still needed.
bert = BertModel.from_pretrained('bert-base-cased')

In [148]:
class BertModel2D(nn.Module):
    """
    Wraps a pretrained BERT model between a channel-collapsing encoder and a
    channel-expanding decoder.

    Encoder: a 1x1 ``Conv2d`` reduces the ``seqdim`` input channels to one,
    followed by ``LayerNorm`` and ``ReLU``. The result is reshaped to
    ``(batch, seqlen)``, cast to ``long`` and fed to BERT as ``input_ids``.

    Decoder: BERT's first output is flattened per sample, expanded back to
    ``seqdim`` channels with a 1x1 ``Conv2d``, max-pooled to width ``seqlen``,
    layer-normalised and passed through ``ReLU``.

    :param seqdim: number of stacked rows (input/output channels).
    :param seqlen: number of tokens per row.
    :param batch_size: stored on the instance for reference; ``forward`` reads
        the batch size from its input.
    """

    def __init__(self, seqdim, seqlen, batch_size):
        super().__init__()

        self.seqdim = seqdim
        self.seqlen = seqlen
        self.batch_size = batch_size

        # Encoder pieces.
        self.conv_enc = nn.Conv2d(seqdim, 1, 1)
        self.layer_norm_enc = nn.LayerNorm((1, seqlen))
        self.relu = nn.ReLU(inplace=True)
        self.bert = BertModel.from_pretrained('bert-base-cased')

        # Decoder pieces.
        self.conv_dec = nn.Conv2d(1, seqdim, 1)
        self.max_pool2d = nn.AdaptiveMaxPool2d((1, seqlen))
        self.layer_norm_dec = nn.LayerNorm((1, seqlen))

        self.encoder_layers = [
            self.conv_enc,
            self.layer_norm_enc,
            self.relu,
            self.bert
        ]

        self.decoder_layers = [
            self.conv_dec,
            self.max_pool2d,
            self.layer_norm_dec
        ]

        # Kept for backward compatibility; forward() calls the layers directly
        # because reshapes/casts are needed between them.
        self.encoder = nn.Sequential(*self.encoder_layers)
        self.decoder = nn.Sequential(*self.decoder_layers)

    def forward(self, x, mask=None):
        """
        :param x: float tensor of shape ``(batch, seqdim, 1, seqlen)``.
        :param mask: optional attention mask forwarded to BERT unchanged.
        :return: tensor of shape ``(batch, seqdim, 1, seqlen)``.
        """
        # Read the batch size from the input so the module is not tied to the
        # value given at construction time.
        batch = x.size(0)

        # Encoder
        x = self.conv_enc(x)
        x = self.layer_norm_enc(x)
        x = self.relu(x)
        # Use the instance's own seqlen rather than a notebook-level global.
        # NOTE(review): casting the normalised activations to long truncates
        # them to small integers before they are used as token ids.
        x = x.view(batch, self.seqlen).long()
        x = self.bert(input_ids=x, attention_mask=mask)[0]

        # Decoder
        # Flatten BERT's output per sample into a single-channel, height-1 "image".
        x = x.view(batch, 1, 1, -1)
        x = self.conv_dec(x)
        x = self.max_pool2d(x)
        x = self.layer_norm_dec(x)
        x = self.relu(x)

        return x

In [149]:
# Derive the dimensions from the tensor that will be fed to the model
# (batch, seqdim, 1, seqlen) instead of repeating the literals 1 / 6 / 512.
model = BertModel2D(seqdim=ten_ch1.shape[1],
                    seqlen=ten_ch1.shape[-1],
                    batch_size=ten_ch1.shape[0])

In [212]:
# Build a (1, seqlen) padding mask for BERT's `attention_mask`.
# The previous `np.tril(tok_ch1)` acted on the trailing (1, 512) axes of the
# (m, 1, 512) array, which zeroes everything except column 0 of each row, and
# the resulting mask was never passed to the model.
# A column of the collapsed sequence is treated as padding only when every
# stacked row holds the pad token there.
# NOTE(review): assumes a padding mask (not a causal one) was intended — confirm.
mask = np.where((tok_ch1 != pad_token_id).any(axis=0), 1, 0)
mask = torch.tensor(mask, dtype=torch.long)

output = model(ten_ch1, mask)
print(f'output: {output.shape}')

# The decoder should restore the input layout.
assert output.shape == ten_ch1.shape


output: torch.Size([1, 6, 1, 512])


# Reshaping to pass through tokenizer

In [213]:
# Collapse the model output into one flat sequence of values.
output = output.view(output.numel())
output.shape

torch.Size([3072])

In [214]:
# The model output is a float tensor attached to the autograd graph; decode()
# expects integer token ids, so detach, truncate to integers and hand over a
# plain Python list rather than relying on implicit float-to-int conversion.
tokenizer.decode(output.detach().long().tolist())

] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [PAD] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [unused1] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 