# ESE-2000 Lab 6
TO DO add intro

We download a ~1MB file containing the entirety of Shakespeare's work. This is the dataset we will train our language model on.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import os
from IPython.core.display_functions import clear_output
import matplotlib.pyplot as plt
import math
import wandb
import re
device = "cpu"
if torch.backends.mps.is_available():
    device = "mps:0"
elif torch.cuda.is_available():
    device = "cuda:0" 
print(f"Using device: {device}")

Using device: cuda:0


## Task 1: Data Loading and encoding

### Data

###  Data Loading

In [2]:
with open('input.txt') as f:
    text = f.read()
    

print("----Sample Shakespeare----")
print(text[:250])

----Sample Shakespeare----
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.



### Tokenization

Tokenization converts raw sub-sequences of text (substrings) to sequences of integers. For example, `"ll." -> 208`. We will be developing a character level language model, so we will be converting each individual word into an integer. For example, `"Hello" -> 48`.

In [3]:
def split_to_words(text):
    return re.findall(r"\w+(?:'\w+)?|[.,!?;:\"()\[\]{}<>\\/\-—–…]|\n", text)

vocab = list(set(split_to_words(text)))
c = len(vocab)
print("Number of words: {}".format(len(split_to_words(text))))
print("Number of distinct words in text: {}".format(c))

Number of words: 292072
Number of distinct words in text: 14295


### Functions to encode and decode words into indices

In [4]:
stoi = {word:i for i, word in enumerate(vocab)}
itos = {i:word for i, word in enumerate(vocab)}

def words_to_tokens(words):
    """
    Convert a list of words to a list of tokens
    """
    return [stoi[w] for w in words]

def tokens_to_words(index_list):
    """
    Convert a list of tokens to a list of words
    """
    decoded = " ".join([itos[i] for i in index_list])
    return re.sub(r'\s+([.,!?;:"(){}\[\]<>\\/\-—–…])', r'\1', decoded)

# Checking that the word to token and back conversion works
sample_words = text[:36]
token_ids = words_to_tokens(split_to_words(sample_words))
recovered_words = tokens_to_words(token_ids)
print(f"Original text: {sample_words}\n")
print(f"Encoded text: {token_ids}\n")
print(f"Recovered text: {recovered_words}\n")

Original text: First Citizen:
Before we proceed any

Encoded text: [13326, 3653, 11563, 4707, 1062, 2388, 3015, 7313]

Recovered text: First Citizen: 
 Before we proceed any



### Converting dataset into tokens

In [5]:
tokenized_text = words_to_tokens(split_to_words(text))
print("Encoded text sample: {}".format(tokenized_text[:10]))
print(tokens_to_words(tokenized_text[:10]))

# The works of Shakespeare are now a sequence of integers representing the words in the text. Sorry, William.
tokenized_text = torch.tensor(tokenized_text)
tokenized_text.shape

Encoded text sample: [13326, 3653, 11563, 4707, 1062, 2388, 3015, 7313, 2879, 5692]
First Citizen: 
 Before we proceed any further,


torch.Size([292072])

## Task 2: Co-occurrence matrix

In [6]:
#TODO commented bc its slow
# Create co-occurrence matrix
# The co-occurrence matrix C is a c x c (c is our vocab size) symmetric matrix where C_ij is how many times the ith word appears within W words away from the jth word.
# W = 10
# C = torch.stack([torch.zeros(len(vocab)) for _ in range(len(vocab))])
# for t_idx in trange(len(tokenized_text)):
#     left_bound = max(t_idx-W//2,0)
#     right_bound = min(t_idx+W//2+1,len(tokenized_text))
#     context_words = tokenized_text[left_bound : right_bound]
#     for u_idx in range(left_bound, right_bound):
#         t = tokenized_text[t_idx]
#         u = tokenized_text[u_idx]
#         C[t, u] += 1.0
# C = C.to(device)
# # X should be a symmetric matrix
# torch.isclose(C, C.T, atol=1e-3).all()

# # Save C so that we dont have to compute it again
#torch.save(C, "C.pt")

# Load C from storage
C = torch.load("C.pt").to(device)

  C = torch.load("C.pt").to(device)


In [7]:
# size of C in GB: numel times 4 bytes per float / 1e9 which is GB
C.numel() * 4 / 1e9

0.8173881

In [8]:
torch.isclose(C, C.T, atol=1e-3).all()

tensor(True, device='cuda:0')

## Task 3: PCA Embeddings

In [9]:
# Torch has a bug on mps devices so this won't work on MacBooks
n = 256
# with torch.no_grad():
#     Z = C - C.mean(dim=1, keepdim=True)
#     Z /= Z.std(dim=1, keepdim=True)
#     cov = (Z @ Z.T)/(Z.shape[0] - 1)
#     L, Q = torch.linalg.eigh(cov)
#     principal_eigv = Q[:, -n:].T

#     # PCA embeddings for training
#     embeddings = Z @ principal_eigv.T # (c, n)
#     # Full embeddings if we need them to visualize
#     # In vector form would be Q.T @ x_n
#     full_embeddings = Z @ Q

# torch.save(embeddings, "embeddings.pt")
# Load embeddings
embeddings = torch.load("embeddings.pt").to(device)

  embeddings = torch.load("embeddings.pt").to(device)


## Visualize embeddings
Decide if this section goes into the notebook or not

### Average coefficients


In [10]:
# K=8192
# #average_coefficients = full_embeddings.mean(axis=0)
# sample_embeddings = full_embeddings[torch.randint(0,full_embeddings.shape[0],(1000,))]
# # Compute the expectation of the absolute value of the norm of each component.
# average_coefficients = sample_embeddings.norm(p=2,dim=0).cpu().numpy()[::-1]
# data = average_coefficients[:K]

# # Reverse the tensor:
# data = data

# # Normalize by sum?
# #data = data / data.sum()

# plt.figure(figsize=(12, 6))
# plt.title(f"Average Coefficients (k={K})")
# fig= plt.plot(range(K), data,marker='.',linestyle='')


# plt.show()

### Principal eigenvalues

In [11]:
# K=64
# L_plot = L[-K:]/L.sum()
# L_plot,_ = L_plot.sort(descending=True)
# L_plot = L_plot.cpu().numpy()
# plt.figure(figsize=(12, 6))
# plt.title("Top k eigenvalues (k=64)")
# markerline, stemlines, baseline = plt.stem(range(K), L_plot, linefmt='b-', markerfmt='o', basefmt='k-')
# plt.setp(markerline, marker='o', fillstyle='none')


### Co ocurrence matrix plot

In [12]:
# # Top 10 words
# top_10_words = C.sum(axis=0).sort(descending=True).indices[:10]
# top_10_words = [vocab[i] for i in top_10_words]
# print(top_10_words)

In [13]:
# import matplotlib.pyplot as plt
# import numpy as np

# # # Remove all the punctations and stop words from the matrix for visualization
# X_viz = C.clone()
# words_to_remove = [",", ":", ".", "the", "I", "to", "and", ";", "of", "you", "my", "a", "?", "!", "in", "that", "And", "not", "-", "is", "me", "be", "it", "with", "your", "for", "his", "have", "this", "thou", "as", "thy", "will", "so", "but", "The", "To", "all", "her", "thee", "by", "are", "our", "do", "we"]
# vocab_to_remove_indices = set(words_to_tokens(words_to_remove))
# idx_to_viz = [i for i, word in enumerate(vocab) if word not in vocab_to_remove_indices]
# X_viz = X_viz[idx_to_viz, :][:, idx_to_viz]

# # top 20 words not including stop words
# top_100_words = C.sum(axis=0).sort(descending=True).indices[:100].cpu().numpy()
# top_100_nostop = [word for word in top_100_words if word not in vocab_to_remove_indices]
# display(f"Top 100 words, excluding punctation and most common stop words: {tokens_to_words(top_100_nostop)}")

# # Create a custom colormap
# cmap = plt.cm.get_cmap('viridis').copy()
# cmap.set_over('green')

# # Plot the image with the custom colormap
# plt.figure(figsize=(10, 10))
# plt.imshow(X_viz.cpu().numpy(), cmap=cmap, vmax=3)

# # Add colorbar with custom settings
# cbar = plt.colorbar(extend='max')
# cbar.set_label('Value')

# plt.title('Co-occurrence Matrix')
# plt.show()
# # # Model

In [14]:
# test_loss = 0
# with torch.no_grad():
#     for t_idx, (E, y) in enumerate(test_loader):
#         logits = model(E)
#         B, _ = logits.shape
#         loss = F.cross_entropy(logits, y)
#         test_loss += loss.item()

# print("Test loss: ", test_loss / len(test_loader))

In [15]:
# initial = test[132:132+T].unsqueeze(0)
# generated_text = generate(model,initial, max_generate_tokens=100)
# print("\n===INPUT===\n")
# print(tokens_to_words(initial.reshape(-1).tolist()))
# print("\n===GENERATED TEXT===\n")
# print("".join(generated_text[:]))

## Task 4: Language Transformer

### Data Split

In [16]:
T = 64 # context size
split_factor = 0.9
split_index = int(split_factor * len(tokenized_text))

# Splitting into train and test sets
train = tokenized_text[:split_index].to(device)
test = tokenized_text[split_index:].to(device)

### Dataset

In [17]:
class WordIndexDataset(Dataset):
    """
    This Dataset class takes and encoded tensor of word indices and returns a tensor of context windows of size T.
    The tensors returned by this dataset are not yet one-hot encoded.
    """
    def __init__(self, text, T):
        self.text = text
        self.T = T
        assert self.T < len(text), "context_size (T) must be less than len(text)"

    def __len__(self):
        return len(self.text) - self.T

    def __getitem__(self, idx):
        """
        Return a single context window of size T. 
        The context window is a sequence of T words.

        During training, we will predict the next token of every word in the context window,
        so Y_item is the next word for every word in the context window.
        """
        X_item = self.text[idx:idx + self.T]
        Y_item = self.text[idx + 1:idx + self.T + 1]

        return X_item, Y_item

train_dataset = WordIndexDataset(train, T)
test_dataset = WordIndexDataset(test, T)


# Example of a batch
B = 64
train_loader = DataLoader(train_dataset, batch_size=B, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=B, shuffle=False)

E, y_idx = next(iter(train_loader))
print(f"X_idx shape: {E.shape}")
print(f"y_idx shape: {y_idx.shape}")

X_idx shape: torch.Size([64, 64])
y_idx shape: torch.Size([64, 64])


In [18]:
dd=torch.ones(5,5)
dddmask=torch.tril_indices(5,5, -1)
dd[dddmask[0], dddmask[1]] = float('-inf')
dd

tensor([[1., 1., 1., 1., 1.],
        [-inf, 1., 1., 1., 1.],
        [-inf, -inf, 1., 1., 1.],
        [-inf, -inf, -inf, 1., 1.],
        [-inf, -inf, -inf, -inf, 1.]])

## MultiheadLayer

In [19]:
class MultiheadLayer(nn.Module):
    def __init__(self, H, n=n):
        super(MultiheadLayer, self).__init__()
        
        assert n % H == 0, "n must be divisible by num_heads"
        self.H = H
        self.n = n
        #self.attn_heads = nn.ModuleList([HeadAttn(m = n // H, n=n) for _ in range(H)])
        self.m = n // H
        self.Q = nn.Parameter(torch.empty(H, n, self.m))
        self.K = nn.Parameter(torch.empty(H, n, self.m))
        self.V = nn.Parameter(torch.empty(H, n, self.m))

        self.W = nn.Parameter(torch.empty(H, self.m, n))

        self.norm1 = nn.LayerNorm(n)
        self.norm2 = nn.LayerNorm(n)

        self.W1 = nn.Parameter(torch.empty(n, 4 * n))
        self.W2 = nn.Parameter(torch.empty(4 * n, n))
        
        nn.init.kaiming_uniform_(self.W1, a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.W2, a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.Q, a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.K, a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.V, a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.W, a=math.sqrt(5))

    def forward(self, X0):
        B, T, n = X0.shape
        X = self.norm1(X0) # (B, T, n)
        
        # Multi-head attention
        #heads = torch.stack([attn(X, use_mask=True) for attn in self.attn_heads])

        X = X.unsqueeze(1) # (B, 1, T, n)

        QX = torch.matmul(X, self.Q) # (B, H, T, m)
        KX = torch.matmul(X, self.K) # (B, H, T, m)
        VX = torch.matmul(X, self.V) # (B, H, T, m)     
        
        B =  QX @ KX.transpose(-2, -1) * (self.m ** -0.5) # (B, H, T, T)

        mask = torch.tril_indices(T,T, -1)
        B[:,:, mask[0], mask[1]] = float('-inf')

        #A = F.softmax(B, dim=-1)
        A = F.softmax(B, dim=-1) 
        
        AVX = torch.matmul(A, VX) # (B, H, T, m)
        Y = torch.matmul(AVX, self.W) # (B, T, n)

        X = torch.sum(Y, dim=1) # (B, T, n)
        
        X = self.norm2(X)
        X2 = X0 + F.relu(X)
        return X2
    
    
# Test
model = MultiheadLayer(H=4, n=32).to(device)
#E = next(iter(train_loader))[0]
#X = embeddings[E].transpose(2,1)
X = torch.randn(1,5, 32).to(device)
print(f"X.shape: {X.shape}")
X = X.to(device)
readout = model(X)
print(f"out.shape: {readout.shape}")

X.shape: torch.Size([1, 5, 32])
out.shape: torch.Size([1, 5, 32])


## LLM

In [29]:
class LLM(nn.Module):   
    def __init__(self, L, H, n):
        super(LLM, self).__init__()

        self.position_embedding = nn.Embedding(T, n)
        # this is the PCA embeddings 
        self.token_embedding = embeddings

        self.decoder_layers = nn.Sequential(*[MultiheadLayer(H, n) for _ in range(L)])
        
        self.norm = nn.LayerNorm(n)
        
        #self.readout_weight = nn.Parameter(torch.empty(c, n))
        # row wise
        self.readout_weight = nn.Parameter(torch.empty(n, c))
        nn.init.kaiming_uniform_(self.readout_weight, a=math.sqrt(5))

    def forward(self, E):
        B, T = E.shape

        X = self.token_embedding[E]#.transpose(1,2) # (B, n, T)
        P = self.position_embedding(torch.arange(T, device=device))#.transpose(0,1) # (n, T)
        
        X = X + P

        X = self.decoder_layers(X) # (B, T, n)
        #X = self.norm(X.transpose(1,2)).transpose(1,2)
        X = self.norm(X)

        #Y = torch.matmul(self.readout_weight, X) # (B, T, c)
        # row wise
        Y = torch.matmul(X, self.readout_weight) 
        
        return Y

# Test
model = LLM(L=2, H=2, n=n).to(device)
initial = test[132:132+T].unsqueeze(0)
readout = model(initial)
print(f"readout.shape: {readout.shape}")
# generated_text = generate(model, initial, max_generate_tokens=100)
# print("\n===INPUT===\n")
# print(tokens_to_words(initial.reshape(-1).tolist()))
# print("\n===GENERATED TEXT===\n")
# print(generated_text)

readout.shape: torch.Size([1, 64, 14295])


In [30]:
def generate_reversed(model, input_tokens, max_generate_tokens=500):
    """
    Generate text from a model given an initial input token sequence.
    Args:
        model (nn.Module): The model to use for generation.
        input_tokens (torch.Tensor): The initial input token sequence.
        max_generate_tokens (int): The maximum number of tokens to generate.
    Returns:
        torch.Tensor: The generated token sequence.
    """
    with torch.no_grad():
        context = input_tokens.clone()
        generated_sequence = input_tokens.cpu().squeeze().tolist()  # Ensure it's a 1D list
        for _ in range(max_generate_tokens):
            logits = model(context)
            
            last_token_logits = logits[:,-1,:]
            probs = F.softmax(last_token_logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            
            # Slide context window

            context = torch.cat([context[:, 1:], next_token], dim=1)  
            generated_sequence.append(next_token.item())  # Add new token to generated sequence
        generated_words = tokens_to_words(generated_sequence)
        generation_string = "".join(generated_words)
        return generation_string
# Test generate
model = LLM(L=2, H=4, n=n).to(device)
initial = test[132:132+T].unsqueeze(0)
print(generate_reversed(model, initial, max_generate_tokens=100))

mountains are for winds, 
 That shake not, though they blow perpetually. 
 
 BAPTISTA: 
 How now, my friend! why dost thou look so pale? 
 
 HORTENSIO: 
 For fear, I promise you, if I look pale. 
 
 BAPTISTA: 
 What, will my daughter prove a good musician hanging players trumpet Pay easeful Mann'd winked broiled vast lion inevitable Fine unconstrain'd Mariners prattling glistering Bunch contemn Successively cipher Praise Italy bestow'd chidest civil Crassus Accountant hours grievances needless laugh MARGARET dally High ho According Allowing hypocrisy thither inquire depose taunt pirate drowsy unwillingly helpful Brakenbury prevailing violentest Hour spirit brothers misbehaved doers painter infer fowling largess conspires cymbals enter tale's pillow Ethiope's wait relent ends compell'd move Framed Among shamefast desolation resemble beadsmen crosses parson's taunted signs chapless sceptre backing casque heading wounded attorneyed pastures dauntless cobbled permit bribe withdrawn moveth 

## Train

In [31]:

L = 6
H = 8
model = LLM(L, H, n).to(device)
lr = 1e-4
num_epochs = 5

B = 32
train_loader = DataLoader(train_dataset, batch_size=B, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=B, shuffle=False)

num_parameters_str = str(sum(p.numel() for p in model.parameters())/1e6,) + 'M parameters'
print("Created new model with {}".format(num_parameters_str))
run = wandb.init(
    project="lab-9-llm",
    name="Varun's version",
    entity="ese-2000",
    config={
        "architecture": "Transformer",
        "dataset": "TinyShakespeare",
        "num_epochs": num_epochs,
        "learning_rate": lr,
        "num_blocks": L,
        "num_heads_per_block": H,
        "context_size": T,
        "model_summary": str(model),
        "num_parameters": num_parameters_str
    }
)

Created new model with 8.401152M parameters


In [32]:
from tqdm import trange, tqdm
opt = optim.AdamW(model.parameters(), lr=lr)
model.train()
num_parameters = str(sum(p.numel() for p in model.parameters())/1e6,) + 'M parameters'
print("Created new model with {} parameters".format(num_parameters))
train_loss_evolution = []
for epoch in trange(num_epochs):
    train_loss = 0
    for t_idx, (X, y) in enumerate(train_loader):
        logits = model(X)
        B, _, _ = logits.shape
        logits = logits.view(B * T, -1)
        y = y.view(B * T, -1).squeeze()
        loss = F.cross_entropy(logits, y)
        opt.zero_grad()
        loss.backward()
        opt.step()
        train_loss += loss.item()
        wandb.log({"train_loss": loss.item()})
    train_loss_evolution.append(train_loss/len(train_loader))
    clear_output()
    print(f"Epoch {epoch+1}, Loss {train_loss/len(train_loader)}")
    run.log({"epoch_train_loss": train_loss/len(train_loader)}) 
    wandb.config.update({"num_epochs": epoch+1})
    plt.plot(train_loss_evolution)
    plt.show()


Created new model with 8.401152M parameters parameters


  0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
initial = test[132:132+T].unsqueeze(0)
generated_text = generate_reversed(model,initial, max_generate_tokens=100)
print("\n===INPUT===\n")
print(tokens_to_words(initial.reshape(-1).tolist()))
print("\n===GENERATED TEXT===\n")
print("".join(generated_text[:]))


===INPUT===

mountains are for winds, 
 That shake not, though they blow perpetually. 
 
 BAPTISTA: 
 How now, my friend! why dost thou look so pale? 
 
 HORTENSIO: 
 For fear, I promise you, if I look pale. 
 
 BAPTISTA: 
 What, will my daughter prove a good musician

===GENERATED TEXT===

mountains are for winds, 
 That shake not, though they blow perpetually. 
 
 BAPTISTA: 
 How now, my friend! why dost thou look so pale? 
 
 HORTENSIO: 
 For fear, I promise you, if I look pale. 
 
 BAPTISTA: 
 What, will my daughter prove a good musician aside welcome; I would now now now your Hail tis that that so being prerogative storm murderer, therefore put EDWARD therefore good down harm 
 You lie love absolute 
 
 office 
 
 
 
 Give alive! 
 Let scale spared from how and and and at blind the accept for for member,, how advise thyself own the the the the, which Their oracle content why else not of sake has it it look mock'd to die gold whence, 
 AUTOLYCUS. 
 PETRUCHIO on on 
 midst son 
 Ha