In [2]:
!which python

import sys
IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
  %pip install datasets


/Users/dan/miniconda3/envs/ml/bin/python


In [517]:
import math
import random
import string
import re
import os
import itertools
import functools
import requests
import contextlib
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
import torch.nn.functional as F
import torchtext
from torchtext.vocab import vocab
from collections import Counter, OrderedDict

device = 'cuda' if torch.cuda.is_available() else 'cpu'


class AttentionHead(nn.Module):
  def __init__(self, d_model, d_proj):
    super().__init__()
    self.q_proj = nn.Linear(d_model, d_proj)
    self.k_proj = nn.Linear(d_model, d_proj)
    self.v_proj = nn.Linear(d_model, d_proj)

  def forward(self, batch):
    # could we do the projections in parallel here? maybe using einsum?
    Q = self.q_proj(batch)
    K = self.k_proj(batch)
    V = self.v_proj(batch)
    scaled_QK = torch.div(Q @ K.transpose(1, 2), math.sqrt(K.shape[2]))
    mask = torch.triu(torch.full((Q.shape[1], K.shape[1]), float('-inf')), 1)
    masked_QK = torch.add(scaled_QK, mask)
    return  F.softmax(masked_QK, dim=2) @ V

class PositionalEmbedding(nn.Module):
  def __init__(self, tokenizer, vocab, d_model):
    super().__init__()
    self.tokenizer = tokenizer
    self.vocab = vocab
    self.d_model = d_model
    self.embedding = nn.Embedding(len(vocab), d_model)

  def get_positional_encoding(self, seq_length, scale=10000):
    # scale 10000 used in paper - the wavelengths will form a geometric progression from 2pi to scale * 2pi
    positions = np.zeros((seq_length, self.d_model)) #torch.zeros

    for pos in range(seq_length):
      for i in range(self.d_model):
        if i % 2 == 0:
          positions[pos][i] = np.sin(pos/(scale**((2 * i)/self.d_model)))
        else:
          positions[pos][i] = np.cos(pos/(scale**((2 * i)/self.d_model)))

    return torch.as_tensor(positions, dtype=torch.float32)
  

  def plot_positional_encodings(self, encodings):
    fig, ax = plt.subplots(figsize=(20,20))
    ax.imshow(encodings)
    plt.show()


  def forward(self, batch):
    tokens = [self.tokenizer(seq) for seq in batch]
    max_sequence_length = max([len(seq) for seq in tokens])
    indices_batch = torch.empty([len(tokens), max_sequence_length], dtype=torch.long)
    # TODO: find a better way to do padding masking, extra tensor here uses unecessary RAM
    padding_mask = torch.empty([len(tokens), max_sequence_length])
    for i, tokens in enumerate(tokens):
      indices = torch.tensor(self.vocab(tokens))
      padding_length = max_sequence_length - indices.shape[0]
      padding_mask[i] = torch.cat([torch.zeros(indices.shape[0]), torch.full([padding_length], -1e20)])
      indices_batch[i] = torch.cat([indices, torch.zeros(padding_length)])
    # print('mask', padding_mask)
    padding_mask.unsqueeze_(2)
    embeds = self.embedding(indices_batch)
    # print('embeds', embeds[0])
    # masked_embeds = torch.add(embeds, padding_mask)
    # print('masked_embeds', masked_embeds[0])
    positions = self.get_positional_encoding(max_sequence_length)
    return torch.add(embeds, positions)

class DecoderLayer(nn.Module):
  def __init__(self, d_model, n_head):
    super().__init__()
    self.d_model = d_model
    self.attention_heads = nn.ModuleList([AttentionHead(d_model, d_model // n_head) for _ in range(n_head)])
    self.attention_linear = nn.Linear(d_model, d_model)
    self.ffn = nn.Sequential(nn.Linear(d_model, d_model * 4), nn.ReLU(), nn.Linear(d_model * 4, d_model))


  def forward(self, batch):
    # Can we do heads in parallel?
    attentions = torch.cat([head(batch) for head in self.attention_heads], dim=2)
    attention_out = self.attention_linear(attentions)
    attention_with_residual = F.layer_norm(torch.add(attention_out, batch), [self.d_model])
    ffn_out = self.ffn(attention_with_residual)
    return F.layer_norm(torch.add(ffn_out, attention_with_residual), [self.d_model])


class Transformer(nn.Module):
  def __init__(self, vocab, tokenizer, d_model=64, n_head=4, n_layer=4):
    super().__init__()
    # add validation that d_model is a multiple of n_head otherwise bad things will happend after cat'ing attention heads
    self.vocab = vocab
    self.d_model = d_model
    self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, n_head) for _ in range(n_layer)])
    self.positional_embedding = PositionalEmbedding(tokenizer, vocab, d_model)

  def forward(self, batch):
    embeddings = self.positional_embedding(batch)

    decoder_output = embeddings
    for layer in self.decoder_layers:
      decoder_output = layer(decoder_output)

    # final linear layer uses same weights as embedding, but scaled
    logits = F.linear(decoder_output, torch.div(self.positional_embedding.embedding.weight, math.sqrt(self.d_model)))
    # Does deembedding layer need a bias? Probably?
    return F.log_softmax(logits, dim=2)



  

In [4]:
# maybe some way to annotate functions to not do gradients? some decorator thingy
def log_probs_to_tokens(model, log_probs, top_n=10):
  probs = torch.exp(log_probs)
  sorted_probs = torch.sort(probs, descending=True)
  predictions = []

  for token_probs, token_indices in zip(sorted_probs.values.numpy(force=True), sorted_probs.indices.numpy(force=True)):
    predictions.append([(token, prob) for token, prob in zip(model.vocab.lookup_tokens(token_indices[:top_n]), token_probs[:top_n])])

  return np.array(predictions)

In [584]:
class StringReverseDataset(torch.utils.data.Dataset):
  def __init__(self, n_items):
    # self.items = [self.generate_item() for i in range(n_items)]
    self.items = ['usieyyeisu', 'zqorvvroqz', 'usnzjjznsu', 'sbfhcchfbs', 'pesejjesep', 'iccwppwcci', 'ovqwppwqvo', 'ufaiggiafu', 'gchqjjqhcg', 'jfgyqqygfj']

  def __len__(self):
    return len(self.items)

  def __getitem__(self, idx):
    return self.items[idx]

  def generate_item(self):
    chars = random.choices(string.ascii_lowercase, k=5)
    return ''.join(chars) + ''.join(reversed(chars))

def create_letter_transformer():
  letter_tokenizer = lambda str : [*str]
  alphabet = vocab(OrderedDict([letter, 1] for letter in string.ascii_lowercase))
  torch.manual_seed(42)
  random.seed(42)
  np.random.seed(42)
  return Transformer(alphabet, letter_tokenizer, d_model=64, n_head=4, n_layer=4)

def train_string_reverser(model):
  batch_size=2
  train_dl = torch.utils.data.DataLoader(StringReverseDataset(batch_size*4), batch_size=batch_size, shuffle=False)
  loss_fn = nn.NLLLoss(reduction='none')

  # paper used variable LR, how to do that here?
  optimizer = torch.optim.Adam(model.parameters(), betas=(0.9, 0.98))

  for epoch in range(1):
    running_loss = 0.0
    for i, data in enumerate(train_dl):
      if i > 1: break
      optimizer.zero_grad()
      # don't care about the prediction for the next (non-existent) token
      # currently Seq, letter, class - needs to be letter, class, seq
      predictions = model(data)[:, :-1]
      # print('data', data)
      print('data', data)
    

      print('preds', predictions.shape)
      # currently Seq, class - needs to be class, seq
      targets = torch.tensor([model.vocab.lookup_indices([*seq])[1:] for seq in data])
      print('targets', targets.shape)
      # loss = loss_fn(predictions, targets)
      #bro wtf
      loss = loss_fn(predictions.reshape((batch_size, 26, 9)), targets)
      print('loss', loss)
      print('')
      loss.backward()
      optimizer.step()
      running_loss += loss.item()
      if i % 100 == 99:
          print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 100:.3f}')
          running_loss = 0.0


letter_tsfmr = create_letter_transformer()
train_string_reverser(letter_tsfmr)


data ['usieyyeisu', 'zqorvvroqz']
preds torch.Size([2, 9, 26])
targets torch.Size([2, 9])
loss tensor([[5.5586, 4.0289, 4.2850, 6.7268, 4.7493, 5.8713, 7.6786, 5.5969, 4.2874],
        [4.8683, 5.9740, 5.6608, 7.3265, 4.9921, 3.3302, 3.2923, 0.1374, 5.5527]],
       grad_fn=<ViewBackward0>)



RuntimeError: grad can be implicitly created only for scalar outputs

In [577]:
def test_string_reverser(model):
  test_dl = torch.utils.data.DataLoader(StringReverseDataset(10), batch_size=1, shuffle=True)
  loss_fn = nn.NLLLoss()

  for i, data in enumerate(test_dl):
    predictions = model(data)[:, :-1]
    targets = torch.tensor([model.vocab.lookup_indices([*seq])[1:] for seq in data])    # loss = loss_fn(predictions, targets)
    # print('preds', predictions.shape)
    # print('targets', targets.shape)
    token_preds = ''.join(log_probs_to_tokens(model, predictions[0], top_n=1)[:, 0, 0])

    # Predictions should be completely wrong for the first half of the sequence, and accurate for the second half. This verifies that masking is working.
    print(data[0][:-5], '', data[0][-5:])
    print('', token_preds[:-5], '', token_preds[-5:], '\n')

test_string_reverser(letter_tsfmr)

oyhba  abhyo
 eerl  llsrp 

btsll  llstb
 rllx  xxxij 

ftxuo  ouxtf
 tbbc  zcecc 

nwrto  otrwn
 uuuu  uzuuw 

flbgp  pgblf
 tbbc  jjkkq 

dlxzj  jzxld
 jjjc  iijtt 

yxsta  atsxy
 lglg  gggit 

zabuv  vubaz
 zlzg  kkgll 

nqlhw  whlqn
 uuur  wwuuu 

zyppc  cppyz
 zljj  ggiip 



In [285]:
class ShakespeareDataset(torch.utils.data.Dataset):
  def __init__(self, use_cached=True):
    if not os.path.isdir('./data'): os.mkdir('./data')
    self.data_path = './data/shakespeare.txt'

    if not (use_cached and os.path.isfile(self.data_path)):
      self.download_shakespeare()

    self.items = self.parse_shakespeare()

  def parse_shakespeare(self):
    START = '*** START OF THE PROJECT GUTENBERG EBOOK THE COMPLETE WORKS OF WILLIAM SHAKESPEARE ***'
    END = '*** END OF THE PROJECT GUTENBERG EBOOK THE COMPLETE WORKS OF WILLIAM SHAKESPEARE ***'
    PARAGRAPH_SEP = '\n'
    MAX_PARAGRAPH_LEN = 30 # characters
    paragraphs = []

    # TODO: consider making chunks larger than a paragraph - this approach leads to disconnected header lines
    with open(self.data_path) as file:
      after_preamble = False
      current_paragraph = ''
      for line in file:
        if line == PARAGRAPH_SEP or len(current_paragraph) > MAX_PARAGRAPH_LEN:
          if len(current_paragraph) > 0: paragraphs.append(current_paragraph)
          current_paragraph = ''
        else:
          if after_preamble: current_paragraph += line
          if line.strip() == START: after_preamble = True
          if line.strip() == END: break

    return paragraphs


  def download_shakespeare(self):
    with contextlib.suppress(FileNotFoundError):
      os.remove(self.data_path)
      
    url = 'https://www.gutenberg.org/files/100/100-0.txt'
    content = requests.get(url, stream=True).content
    
    with open(self.data_path, 'wb') as file:
      file.write(content) 

  def __len__(self):
    return len(self.items)

  def __getitem__(self, idx):
    return self.items[idx]


def create_vocab_from_dataset(dataset, tokenizer):
  unk_token = '<unk>'
  pad_token = '<pad>'
  token_counts = Counter()

  for item in dataset:
    tokens = tokenizer(item)
    for token in tokens:
      token_counts[token] += 1

  sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
  ordered_dict = OrderedDict(sorted_by_freq_tuples)
  voc = vocab(ordered_dict, specials=[pad_token, unk_token], min_freq=10)
  voc.set_default_index(voc[unk_token])

  return voc

word_tokenizer = lambda str : re.split(r"\b", str)
def create_shakespeare(ds):
  shakespeare_vocab = create_vocab_from_dataset(ds, word_tokenizer)
  return Transformer(shakespeare_vocab, word_tokenizer, d_model=16, n_head=2, n_layer=2)




shakespeare_ds = ShakespeareDataset()



In [334]:
def train_shakespeare(model, ds):
  dl = torch.utils.data.DataLoader(ds, batch_size=4, shuffle=False)
  loss_fn = nn.NLLLoss()
  optimizer = torch.optim.Adam(model.parameters(), betas=(0.9, 0.98))

  for epoch in range(1):
    running_loss = 0.0
    for i, data in enumerate(dl):
      if i > 64: break
      optimizer.zero_grad()
      # TODO: make transformer work with batches
      # don't care about the prediction for the next (non-existent) token
      predictions = model(data)[:-1]
      targets = torch.tensor(model.vocab.lookup_indices(word_tokenizer(data)[1:]))
      loss = loss_fn(predictions, targets)

      loss.backward()
      optimizer.step()
      running_loss += loss.item()
      if i % 8 == 7:    
          print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 100:.3f}')
          running_loss = 0.0
          
shakespeare = create_shakespeare(shakespeare_ds)
train_shakespeare(shakespeare, shakespeare_ds)


torch.Size([4, 17, 16])
decoder_attn tensor([[[ 0.4336,  0.6099,  0.4545,  ..., -0.0555, -0.1028,  0.0907],
         [ 0.2386,  0.5127,  0.3044,  ..., -0.0864, -0.1106, -0.0469],
         [ 0.3021,  0.5290,  0.3806,  ...,  0.0806,  0.1570, -0.2252],
         ...,
         [ 0.3210,  0.5198,  0.4095,  ...,  0.3733,  0.1974, -0.2632],
         [ 0.3207,  0.5237,  0.3938,  ...,  0.3529,  0.2011, -0.2556],
         [ 0.3227,  0.5294,  0.3837,  ...,  0.3400,  0.2042, -0.2450]],

        [[ 0.4336,  0.6099,  0.4545,  ..., -0.0555, -0.1028,  0.0907],
         [ 0.4396,  0.6162,  0.4047,  ...,  0.1172,  0.0562, -0.3373],
         [ 0.4726,  0.6357,  0.4646,  ...,  0.1800,  0.2438, -0.3969],
         ...,
         [ 0.3796,  0.5634,  0.2880,  ...,  0.1907,  0.2453, -0.1731],
         [ 0.3756,  0.5645,  0.2798,  ...,  0.1817,  0.2460, -0.1712],
         [ 0.3745,  0.5678,  0.2764,  ...,  0.1788,  0.2464, -0.1655]],

        [[-0.0423,  0.2014,  0.3625,  ...,  0.4434,  0.0166, -0.2692],
        

TypeError: 'NoneType' object is not subscriptable