# WATTS

### Imports

In [32]:
import os
import json

import matplotlib.pyplot as plt

import torch as pt
import torch.nn as nn
import torch.nn.functional as F
from torchinfo import summary

### Hyperparams

In [33]:
EPOCHS = 1000
LR = 0.0003
BATCH_SIZE = 64
CONTEXT_SIZE = 128
EVAL_EPOCHS = 100
DROPOUT=0.2
NUM_TRANSFORMER_BLOCKS = 6
EMBEDDING_SIZE = 384
NUM_HEADS = 6


### Utils

In [34]:
plt.style.use("dark_background")
def get_device():
  """
  Returns the appropriate device for PyTorch training/inference.
  Prioritizes CUDA, then MPS, and finally CPU.
  """
  # Check for CUDA availability
  if pt.cuda.is_available():
    return pt.device("cuda")
  # Check for MPS availability (Apple Silicon only)
  elif pt.backends.mps.is_available():
    return pt.device("mps")
  # Default to CPU
  else:
    return pt.device("cpu")

# Example usage
DEVICE = get_device()

pt.manual_seed(1999)

<torch._C.Generator at 0x120493c50>

In [35]:
def plot_hist(*args):
  for x in args:
    plt.plot(x[0], x[1])
  plt.title('Loss')
  plt.xlabel('epoch')
  plt.ylabel('loss')
  plt.legend(['train', 'validation'], loc='upper right')
  plt.show()

### Data loading

In [36]:
def load_dataset(path: str, train_frac=0.9, train=True):
  """
  To load all data, pass train_frac=1 and train=False
  otherwise pass the fraction of the dataset to be used for training, and specify train=True|False to get the first (train_frac) % or last (1 - train_frac) %
  This allows to load only the specific dataset when needed to avoid memory hogging
  """
  with open(path, "r") as f:
      talks = json.load(f)

      total_samples = len(talks) # total 117 talks
      train_samples = int(total_samples * train_frac)
      if train:
          data = talks[0:train_samples]
      else:
          data = talks[train_samples:total_samples]
      return data


In [37]:
def preprocess_dataset(dataset):
  """
  returns the entire dataset concatenated into a single string
  """
  merge_json_datapoint = lambda x: x["tag"] + " " + x["title"] + " " + x["body"] # combine each title, tag and body of a talk
  dataset = [merge_json_datapoint(x).strip() for x in dataset]
  # merge into one text block
  dataset = " ".join(dataset)
  return dataset

In [38]:
!mkdir data && curl https://raw.githubusercontent.com/Can-Sahin/alanwatts-transcripts/master/transcripts.json -o data/transcripts.json

mkdir: data: File exists


In [39]:
train_ds = load_dataset("./data/transcripts.json")
eval_ds = load_dataset("./data/transcripts.json", train=False)
len(train_ds), train_ds[0], len(eval_ds), eval_ds[0]

(105,
 {'title': 'Not What Should Be',
  'body': 'I wonder what you mean, when you use the word ‘I?’ I’ve been very interested in this problem for a long long time, and I’ve come to the conclusion, that what most civilized people mean by that word, is a hallucination. That is to say, a false sense of personal identity, that is at complete variance with the facts of nature. And as a result of having a false sense of identity we act in a way that is inappropriate to our natural environment. And when that inappropriate way of action is magnified by a very powerful technology, we swiftly begin to see the results of a profound discord between man and nature. As is well known we are now in the process of destroying our environment, as a result, of an attempt to conquer it and master it. And we have not realized, therefore, that our environment is not something other than ourselves. In assuming that it is we have made a great mistake. And are now paying the price for it.\n\n \n\nBut most peop

In [40]:
train_ds = preprocess_dataset(train_ds)
eval_ds = preprocess_dataset(eval_ds)
print(train_ds[:CONTEXT_SIZE])

Tao of Philosophy Not What Should Be I wonder what you mean, when you use the word ‘I?’ I’ve been very interested in this proble


In [41]:
# get vocab from train set
vocab = sorted(list(set(train_ds)))
VOCAB_SIZE = len(vocab)
VOCAB_SIZE, "".join(vocab)

(168,
 '\n !$&()+,-./0123456789:;=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz~\xadàáäæçéêìíïñòóöùúûĀāąīŌōŚśūǎǐǒ̥ΘΥάεμξορςστअकतथमलवशसािे्ḍṃṅṇṛṣṭἄἌἱὁὸῦ–—‘’‚“”…事念悩无無煩爲碍')

In [42]:
stoi = {ch:i for i,ch in enumerate (vocab)} # string to int, used for encoding
itos = {i:ch for i,ch in enumerate (vocab)} # int to string, used for decoding
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: "".join([itos[i] for i in l]) # decoder: take a list of integers, output a string

In [43]:
encode("Hello World!"), decode(encode("Hello World!"))

([34, 61, 68, 68, 71, 1, 49, 71, 74, 68, 60, 2], 'Hello World!')

In [44]:
# Convert to tensors
train_ds = pt.tensor(encode(train_ds), dtype=pt.long)
eval_ds = pt.tensor(encode(eval_ds), dtype=pt.long)

In [45]:
def get_batch(data, train=True):
    """
    generate a small batch of data of inputs x and targets y
    where x is a sequence of tokens, and y is the same sequence offset by 1 token.
    """
    # get random number between 0 and len(data) - CONTEXT_SIZE, because we will take CONTEXT_SIZE tokens. Repeat BATCH_SIZE times
    ix = pt.randint(len(data) - CONTEXT_SIZE, (BATCH_SIZE, ))
    x = pt.stack([data[i:i+CONTEXT_SIZE] for i in ix])
    y = pt.stack([data[i+1:i+CONTEXT_SIZE+1] for i in ix])
    return x.to(DEVICE), y.to(DEVICE)

In [46]:
# Display what the input to the model looks like at each step
x, y = get_batch(train_ds) # (BATCH_SIZE, CONTEXT_SIZE)
for b in range(BATCH_SIZE):
    for t in range(CONTEXT_SIZE):
        context = x[b, :t+1]
        target = y[b, t]
        print(f"when input is {context} the target: {target}")
    break
del x, y

when input is tensor([58], device='mps:0') the target: 71
when input is tensor([58, 71], device='mps:0') the target: 77
when input is tensor([58, 71, 77], device='mps:0') the target: 76
when input is tensor([58, 71, 77, 76], device='mps:0') the target: 1
when input is tensor([58, 71, 77, 76,  1], device='mps:0') the target: 76
when input is tensor([58, 71, 77, 76,  1, 76], device='mps:0') the target: 64
when input is tensor([58, 71, 77, 76,  1, 76, 64], device='mps:0') the target: 61
when input is tensor([58, 71, 77, 76,  1, 76, 64, 61], device='mps:0') the target: 1
when input is tensor([58, 71, 77, 76,  1, 76, 64, 61,  1], device='mps:0') the target: 75
when input is tensor([58, 71, 77, 76,  1, 76, 64, 61,  1, 75], device='mps:0') the target: 59
when input is tensor([58, 71, 77, 76,  1, 76, 64, 61,  1, 75, 59], device='mps:0') the target: 65
when input is tensor([58, 71, 77, 76,  1, 76, 64, 61,  1, 75, 59, 65], device='mps:0') the target: 61
when input is tensor([58, 71, 77, 76,  1, 

### Train utils

In [47]:
@pt.no_grad
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ('train', 'eval'):
        losses = pt.zeros(EVAL_EPOCHS)
        for k in range(EVAL_EPOCHS):
            X, Y = get_batch(train_ds if split == 'train' else eval_ds)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()

    return out

In [48]:
def train_model(model, optimizer):
  history = {"train_loss": [], "eval_loss": []}
  for epoch in range(EPOCHS):
      if epoch % EVAL_EPOCHS == 0:
          losses = estimate_loss(model)
          history["train_loss"].append(losses['train'])
          history["eval_loss"].append(losses['eval'])
          print(f"\rEpoch: {epoch}, train loss: {losses['train']:.4f}, validation loss: {losses['eval']:.4f}", end='')
      xb, yb = get_batch(train_ds)
      _, loss = model(xb, yb)
      optimizer.zero_grad(set_to_none=True)
      loss.backward()
      optimizer.step()

  return (model, optimizer, history)

### Models

#### GPT

In [49]:
class AttentionHead(nn.Module):
    def __init__(self, head_size, dropout=DROPOUT):
        super().__init__()
        self.key = nn.Linear(EMBEDDING_SIZE, head_size, bias=False)
        self.query = nn.Linear(EMBEDDING_SIZE, head_size, bias=False)
        self.value = nn.Linear(EMBEDDING_SIZE, head_size, bias=False)
        self.register_buffer('tril', pt.tril(pt.ones(CONTEXT_SIZE, CONTEXT_SIZE)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, C, E = x.shape # (BATCH_SIZE, CONTEXT_SIZE, EMBEDDING_SIZE)
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)
        w = q @ k.transpose(-2, -1) * E**-0.5 # scaled dot product
        w = w.masked_fill(self.tril[:C, :C] == 0, float("-inf")) # only take current context size tokens into consideration
        w = pt.softmax(w, dim=-1)
        w = self.dropout(w)
        return w @ v # (B, C, C) @ (B, C, E) -> (B, C, E)

In [50]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, num_heads, head_size, dropout=DROPOUT):
        super().__init__()
        self.heads = nn.ModuleList([AttentionHead(head_size) for _ in range(num_heads)])
        self.projection = nn.Linear(EMBEDDING_SIZE, EMBEDDING_SIZE)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        x = pt.cat([h(x) for h in self.heads], dim=-1)
        x = self.projection(x)
        x = self.dropout(x)
        return x

In [51]:
class FeedForward(nn.Module):
    def __init__(self, n_embed, dropout=DROPOUT):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

In [52]:
class TransformerBlock(nn.Module):
    def __init__(self, n_embed, n_head):
        super().__init__()
        head_size = n_embed // n_head
        self.self_attn = MultiHeadedAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embed)
        self.layer_norm1 = nn.LayerNorm(n_embed)
        self.layer_norm2 = nn.LayerNorm(n_embed)
    def forward(self, x):
        x = x + self.self_attn(self.layer_norm1((x))) # residual connection (+x), pre-layer normalization (deviation from the paper)
        x = x + self.ffwd(self.layer_norm2(x))
        return x


In [53]:
class GPT(nn.Module):
    """
    Decoder only transformer model
    """
    def __init__(self, num_blocks=NUM_TRANSFORMER_BLOCKS):
        super().__init__()
        self.embedding = nn.Embedding(VOCAB_SIZE, EMBEDDING_SIZE)
        self.pos_enc = nn.Embedding(CONTEXT_SIZE, EMBEDDING_SIZE)
        self.blocks = nn.Sequential(
            *[TransformerBlock(EMBEDDING_SIZE, 4) for _ in range(num_blocks)],
        )
        self.layer_norm = nn.LayerNorm(EMBEDDING_SIZE)
        self.lang_model_head = nn.Linear(EMBEDDING_SIZE, VOCAB_SIZE) # language model head

    def forward(self, context, targets=None):
        """
        @param idx: (pytorch tensor) of shape (BATCH_SIZE, CONTEXT_SIZE)
        @param target: (pytorch tensor) of shape (BATCH_SIZE, CONTEXT_SIZE)
        """
        B, C = context.shape

        token_embeds = self.embedding(context) # (BATCH_SIZE , CONTEXT_SIZE, EMBEDDING_SIZE)
        pos_embeds = self.pos_enc(pt.arange(C, device=DEVICE))
        x = token_embeds + pos_embeds
        x = self.blocks(x)
        x = self.layer_norm(x)
        logits = self.lang_model_head(x) # (BATCH_SIZE , CONTEXT_SIZE, VOCAB_SIZe)

        if targets is None:
            loss = None
        else:
            B, C, V = logits.shape
            logits = logits.view(B*C, V)
            targets = targets.view(B*C)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, context, max_sentence_len):
        """
        @param idx: (pytorch tensor) of shape (BATCH_SIZE, CONTEXT_SIZE), the context
        """
        for _ in range(max_sentence_len):
            logits, _ = self.forward(context[:, -CONTEXT_SIZE:])
            logits = logits[:,-1,:]
            probs = F.softmax(logits, dim=-1)
            idx_next = pt.multinomial(probs, num_samples=1)
            context = pt.cat((context, idx_next), dim=1)
        return context

print(summary(GPT()))


Layer (type:depth-idx)                        Param #
GPT                                           --
├─Embedding: 1-1                              64,512
├─Embedding: 1-2                              49,152
├─Sequential: 1-3                             --
│    └─TransformerBlock: 2-1                  --
│    │    └─MultiHeadedAttention: 3-1         590,208
│    │    └─FeedForward: 3-2                  1,181,568
│    │    └─LayerNorm: 3-3                    768
│    │    └─LayerNorm: 3-4                    768
│    └─TransformerBlock: 2-2                  --
│    │    └─MultiHeadedAttention: 3-5         590,208
│    │    └─FeedForward: 3-6                  1,181,568
│    │    └─LayerNorm: 3-7                    768
│    │    └─LayerNorm: 3-8                    768
│    └─TransformerBlock: 2-3                  --
│    │    └─MultiHeadedAttention: 3-9         590,208
│    │    └─FeedForward: 3-10                 1,181,568
│    │    └─LayerNorm: 3-11                   768
│    │    └─Lay

In [54]:
model = GPT().to(DEVICE)
pred = model.generate(pt.zeros((1,1), dtype=pt.long, device=DEVICE), 200)[0].tolist()
optimizer = pt.optim.Adam(model.parameters(), lr=LR)
print(decode(pred))


िæ
=`yæεἱ悩òŚHĀúiतε
Jp,P91XMpY9jōशfTΥICws$事8ṣ!cYññ+oǒ1m事PΥvUy念=zNμἄukûṭph–5R5‘:3ε4Qzἄ–̥ΘöNΘt’~lñἱὁNṭD`煩ì0ṇSlσZacxHNïPr事म事vρRwτἱṣ碍άGWMḍΘ”?Cj.äṣ[ḍρ/σἌथξ~e ्ö u-ि.Nṅ1~ṣμ_]ǐ7ρàrī碍`–F,ὁ–Θ’­無–τथ”PअCvéाτाḍû7-


In [None]:
model, optimizer, history = train_model(model, optimizer)

In [None]:
plot_hist(*[([i for i in range(len(v))], v) for v in history.values()])

In [None]:
pt.save(model.state_dict(), "./models/" + model.__class__.__name__ + ".pt")

In [60]:
def prompt(model: nn.Module, context):
  model.eval()
  encoded_str= encode(context)
  encoded_str = pt.tensor(encoded_str, dtype=pt.long, device=DEVICE)
  encoded_str=encoded_str.unsqueeze(0)
  pred=model.generate(encoded_str, 100)[0].tolist()
  model.train()
  print(decode(pred))

In [61]:
model = GPT().to(DEVICE)
model.load_state_dict(pt.load("./models/gpt.pt"))
summary(model)

Layer (type:depth-idx)                        Param #
GPT                                           --
├─Embedding: 1-1                              64,512
├─Embedding: 1-2                              49,152
├─Sequential: 1-3                             --
│    └─TransformerBlock: 2-1                  --
│    │    └─MultiHeadedAttention: 3-1         590,208
│    │    └─FeedForward: 3-2                  1,181,568
│    │    └─LayerNorm: 3-3                    768
│    │    └─LayerNorm: 3-4                    768
│    └─TransformerBlock: 2-2                  --
│    │    └─MultiHeadedAttention: 3-5         590,208
│    │    └─FeedForward: 3-6                  1,181,568
│    │    └─LayerNorm: 3-7                    768
│    │    └─LayerNorm: 3-8                    768
│    └─TransformerBlock: 2-3                  --
│    │    └─MultiHeadedAttention: 3-9         590,208
│    │    └─FeedForward: 3-10                 1,181,568
│    │    └─LayerNorm: 3-11                   768
│    │    └─Lay

In [63]:
prompt(model, 'can you make bread?')

can you make bread? Will. Where you in the equire of thing, you can a gransfvattation,  don’t low any existence: that’s
