# makemore 1 - Exercises

This notebook has the exercises from the first video.

In [3]:
import pandas as pd

def read_movie_names() -> list[str]:
    movies = pd.read_csv("../data/movies.csv")

    def has_special_chars(name: str) -> bool:
        AZ = set("ABCDEFGHIJKLMNOPQRSTUVWXYZ ")
        return len(set(name) - AZ) > 0

    return [
        n.upper()
        for n in list(movies.query("Language == 'hindi'")["Movie Name"])
        if not has_special_chars(n.upper())
    ]

MOVIES = read_movie_names()
print(f"Number of names: {len(MOVIES)}")
sample_names = "\n\t".join(MOVIES[5584:5589])
print(f"Sample names:\n\n\t{sample_names}")

Number of names: 12900
Sample names:

	BAAZIGAR
	PARDES
	ANURAG MAURYA ACT
	BLACK MARKET
	AKHIYON SE GOLI MAARE


In [4]:
VOCAB = sorted(list(set(''.join(MOVIES)))) + ['.'] 
STOI = {s:i for i,s in enumerate(VOCAB)}
ITOS = {i:s for s,i in STOI.items()}

In [5]:
import torch

def ngrams(corpus: list[str], n: int) -> torch.tensor:
    "Returns the n-grams present in the corpus as a tensor, one row per n-gram"
    # Add start/end tokens to each name
    padded_names = ["." * (n-1) + name + "." for name in corpus]
    
    # Initialize tensor to store all n-grams
    # Each n-gram will be represented as a row of n integers (STOI mappings)
    total_ngrams = sum(len(name) - n + 1 for name in padded_names)
    result = torch.zeros((total_ngrams, n), dtype=torch.long)
    
    # Fill the tensor with n-grams
    idx = 0
    for name in padded_names:
        for i in range(len(name) - n + 1):
            # Extract the n-gram and convert each character to its STOI index
            ngram = name[i:i+n]
            for j, char in enumerate(ngram):
                result[idx, j] = STOI[char]
            idx += 1
            
    return result


In [6]:
# Test the ngrams function with a small example
test_corpus = ["CAT", "DOG"]
trigrams = ngrams(test_corpus, n=3)
print("Shape of trigrams tensor:", trigrams.shape)
print("\nFirst few trigrams:")
for i in range(len(trigrams)):
    # Convert indices back to characters using ITOS
    chars = [ITOS[idx.item()] for idx in trigrams[i]]
    print(f"{''.join(chars)}")


Shape of trigrams tensor: torch.Size([8, 3])

First few trigrams:
..C
.CA
CAT
AT.
..D
.DO
DOG
OG.


In [31]:
import torch

def train_ngram(ngrams: torch.Tensor, n: int, r: float = 1.0) -> torch.tensor:
    """
    Train an n-gram model using pre-computed n-grams tensor.
    
    Args:
        ngrams: Tensor of shape (N, n) where N is number of n-grams and each row contains
               n STOI indices representing an n-gram
        n: The size of n-grams (e.g., 2 for bigrams, 3 for trigrams, etc.)
        r: regularization
    
    Returns:
        Tensor of shape (V, V, ..., V) containing normalized n-gram probabilities,
        where V is vocabulary size and the tensor has n dimensions
    """
    N_TOKENS = len(VOCAB)
    
    # Create a sparse tensor of counts using index_put_
    # Create a tuple of n dimensions, each of size N_TOKENS
    shape = tuple([N_TOKENS] * n)
    counts = torch.fill_(torch.zeros(shape), r)
    
    # Split the ngrams tensor into n columns for index_put_
    indices = tuple(ngrams[:, i] for i in range(n))
    counts.index_put_(
        indices,
        torch.ones(len(ngrams)),
        accumulate=True
    )
    
    # Normalize the counts into probabilities
    # Add a small epsilon to avoid division by zero
    # Sum over the last dimension for normalization
    return counts / (counts.sum(-1, keepdim=True))

# Generate trigrams from the corpus
trigrams_tensor = ngrams(MOVIES, n=3)
model = train_ngram(trigrams_tensor, n=3)

In [8]:
def ngram_loss(model: torch.Tensor, data: torch.Tensor, n: int) -> float:
    """
    Calculate negative log likelihood loss for n-grams using vectorized operations
    
    Args:
        model: Tensor of shape (V, V, ..., V) containing n-gram probabilities
        data: Tensor of shape (N, n) containing n-gram indices
        n: Size of n-grams (e.g., 2 for bigrams, 3 for trigrams)
        
    Returns:
        Average negative log likelihood across all n-grams
    """
    # Create index tuple for all n dimensions
    indices = tuple(data[:, i] for i in range(n))
    
    # Get probabilities for all n-grams at once using dynamic indexing
    probs = model[indices]
    
    # Calculate log probabilities (adding epsilon for numerical stability)
    logprobs = torch.log(probs + 1e-10)
    
    # Return average negative log likelihood
    return -logprobs.mean().item()


In [9]:
bigrams = ngrams(MOVIES, 2)
trigrams = ngrams(MOVIES, 3)

bmodel = train_ngram(bigrams, 2)
tmodel = train_ngram(trigrams, 3)

print(f"bigram loss = {ngram_loss(bmodel, bigrams, 2)}")
print(f"trigram loss = {ngram_loss(tmodel, trigrams, 3)}")

bigram loss = 2.5086231231689453
trigram loss = 2.1968438625335693


In [10]:
qgrams = ngrams(MOVIES, 4)
qmodel = train_ngram(qgrams, 4)

print(f"4-gram loss = {ngram_loss(qmodel, qgrams, 4)}")

4-gram loss = 1.7393230199813843


In [11]:
from typing import Tuple

def split_dataset(
        X: torch.tensor,
        train: float,
        dev: float,
        test: float
    ) -> Tuple[torch.tensor, torch.Tensor, torch.Tensor]:
    "Split X by row into the 3 datasets"
    assert abs(train + dev + test - 1.0) < 1e-5, "Proportions must sum to 1"
    n = len(X)
    
    # Calculate indices for splits
    train_idx = int(n * train)
    dev_idx = train_idx + int(n * dev)
    
    # Create random permutation of indices
    perm = torch.randperm(n)
    
    # Split the data using the permuted indices
    train_data = X[perm[:train_idx]]
    dev_data = X[perm[train_idx:dev_idx]]
    test_data = X[perm[dev_idx:]]
    
    return train_data, dev_data, test_data

In [23]:
train_trigrams, dev_trigrams, test_trigrams = split_dataset(trigrams_tensor, 0.8, 0.1, 0.1)

In [72]:
for r in [0.01, 0.1, 0.5, 1.0, 5.0, 10.0]:
    # Train model on training set only
    model = train_ngram(train_trigrams, n=3, r=r)

    # Calculate and print losses using the generic ngram_loss function
    train_loss = ngram_loss(model, train_trigrams, n=3)
    dev_loss = ngram_loss(model, dev_trigrams, n=3)

    print(f"r = {r:5.2f}  losses: [train: {train_loss:.4f}, dev: {dev_loss:.4f}]")


r =  0.01  losses: [train: 2.1891, dev: 2.2900]
r =  0.10  losses: [train: 2.1958, dev: 2.2773]
r =  0.50  losses: [train: 2.2183, dev: 2.2857]
r =  1.00  losses: [train: 2.2405, dev: 2.3014]
r =  5.00  losses: [train: 2.3547, dev: 2.3995]
r = 10.00  losses: [train: 2.4445, dev: 2.4817]


In [73]:
model = train_ngram(test_trigrams, n=3, r=0.10)
ngram_loss(model, test_trigrams, n=3)

2.1236484050750732

In [44]:
def train_dev_test(n: int):
    data = ngrams(MOVIES, n)
    train_set, dev_set, test_set = split_dataset(data, 0.8, 0.1, 0.1)

    model = train_ngram(train_set, n, r=0.1)

    train_loss = ngram_loss(model, train_set, n)
    dev_loss = ngram_loss(model, dev_set, n)
    test_loss = ngram_loss(model, test_set, n)

    print(f"Training set loss: {train_loss:.4f}")
    print(f"Development set loss: {dev_loss:.4f}")
    print(f"Test set loss: {test_loss:.4f}")

In [14]:
train_dev_test(2)

Training set loss: 2.5090
Development set loss: 2.5243
Test set loss: 2.5005


In [45]:
train_dev_test(3)

Training set loss: 2.2001
Development set loss: 2.2449
Test set loss: 2.2512
