In [28]:
import torch
import torch.nn as nn
import torch.optim as optim

## Data Loading and Transformation

In [29]:
with open("../data/wizard_of_oz.txt", "r", encoding="utf-8") as f:
    text = f.read()

print(len(text))
print(text[:300])

chars = sorted(set(text))
vocab_size = len(chars)
print(chars)

232309
﻿  DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW YORK


  [Illustration]


  COPYRIGHT 1908 BY L. FRANK BAUM

  ALL RIGHTS RESERVED


         *    
['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


## Encoding and Decoding

In [30]:
def encode(text: str):
    ans = []
    for t in text:
        ans.append(chars.index(t))
    return ans

encoded_hello = encode("hello")
encoded_hello

[61, 58, 65, 65, 68]

In [31]:
def decode(indices: list):
    ans = ""
    for i in indices:
        ans += chars[i]
    return ans

decoded_hello = decode(encoded_hello)
decoded_hello

'hello'

In [32]:
encoded_text = encode(text)
print(encoded_text[:100])
bigram_pairs = [(encoded_text[i], encoded_text[i + 1]) for i in range(len(encoded_text) - 1)]
bigram_pairs

[80, 1, 1, 28, 39, 42, 39, 44, 32, 49, 1, 25, 38, 28, 1, 44, 32, 29, 1, 47, 33, 50, 25, 42, 28, 1, 33, 38, 1, 39, 50, 0, 0, 1, 1, 26, 49, 0, 0, 1, 1, 36, 11, 1, 30, 42, 25, 38, 35, 1, 26, 25, 45, 37, 0, 0, 1, 1, 25, 45, 44, 32, 39, 42, 1, 39, 30, 1, 44, 32, 29, 1, 47, 33, 50, 25, 42, 28, 1, 39, 30, 1, 39, 50, 9, 1, 44, 32, 29, 1, 36, 25, 38, 28, 1, 39, 30, 1, 39, 50]


[(80, 1),
 (1, 1),
 (1, 28),
 (28, 39),
 (39, 42),
 (42, 39),
 (39, 44),
 (44, 32),
 (32, 49),
 (49, 1),
 (1, 25),
 (25, 38),
 (38, 28),
 (28, 1),
 (1, 44),
 (44, 32),
 (32, 29),
 (29, 1),
 (1, 47),
 (47, 33),
 (33, 50),
 (50, 25),
 (25, 42),
 (42, 28),
 (28, 1),
 (1, 33),
 (33, 38),
 (38, 1),
 (1, 39),
 (39, 50),
 (50, 0),
 (0, 0),
 (0, 1),
 (1, 1),
 (1, 26),
 (26, 49),
 (49, 0),
 (0, 0),
 (0, 1),
 (1, 1),
 (1, 36),
 (36, 11),
 (11, 1),
 (1, 30),
 (30, 42),
 (42, 25),
 (25, 38),
 (38, 35),
 (35, 1),
 (1, 26),
 (26, 25),
 (25, 45),
 (45, 37),
 (37, 0),
 (0, 0),
 (0, 1),
 (1, 1),
 (1, 25),
 (25, 45),
 (45, 44),
 (44, 32),
 (32, 39),
 (39, 42),
 (42, 1),
 (1, 39),
 (39, 30),
 (30, 1),
 (1, 44),
 (44, 32),
 (32, 29),
 (29, 1),
 (1, 47),
 (47, 33),
 (33, 50),
 (50, 25),
 (25, 42),
 (42, 28),
 (28, 1),
 (1, 39),
 (39, 30),
 (30, 1),
 (1, 39),
 (39, 50),
 (50, 9),
 (9, 1),
 (1, 44),
 (44, 32),
 (32, 29),
 (29, 1),
 (1, 36),
 (36, 25),
 (25, 38),
 (38, 28),
 (28, 1),
 (1, 39),
 (39, 30),
 (30

## Data Spliting

In [33]:
from torch.utils.data import Dataset, DataLoader, random_split

total_samples = len(bigram_pairs)
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

train_size = int(train_ratio * total_samples)
val_size = int(val_ratio * total_samples)
test_size = total_samples - train_size - val_size

torch.manual_seed(42)
train_data, val_data, test_data = random_split(bigram_pairs, [train_size, val_size, test_size])
# print(train_data[10])
batch_size = 64

# Create DataLoader instances
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

print(len(train_loader), len(val_loader), len(test_loader))

2904 363 363


## Bigram Model

In [34]:
class BigramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x):
        output = self.embeddings(x)
        output = self.linear(output)
        return output
    
    def generate(self, start_char, num_chars_to_generate):
        generated_chars = [start_char]
        for _ in range(num_chars_to_generate):
            x = torch.tensor([generated_chars[-1]])
            y = self.forward(x)
            p = nn.functional.softmax(y, dim=1)
            c = torch.multinomial(p, num_samples=1).item()
            generated_chars.append(c)
        return generated_chars
    
model = BigramModel(vocab_size, 32)

## Training Loop

In [35]:
import time

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

start_time = time.time()

num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for batch, (X, y) in enumerate(train_loader):
        y_pred = model(X)
        loss = loss_fn(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    train_loss /= len(train_loader)

    model.eval()
    val_loss = 0.0
    for batch, (X, y) in enumerate(val_loader):
        y_pred = model(X)
        loss = loss_fn(y_pred, y)
        val_loss += loss.item()

    val_loss /= len(val_loader)

    print(f"Epoch: {epoch + 1}/{num_epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

end_time = time.time()

print(f"Training time: {end_time - start_time:.2f}s")

Epoch: 1/100 | Train Loss: 2.6761 | Val Loss: 2.4828
Epoch: 2/100 | Train Loss: 2.4728 | Val Loss: 2.4576
Epoch: 3/100 | Train Loss: 2.4568 | Val Loss: 2.4516
Epoch: 4/100 | Train Loss: 2.4504 | Val Loss: 2.4443
Epoch: 5/100 | Train Loss: 2.4466 | Val Loss: 2.4427
Epoch: 6/100 | Train Loss: 2.4447 | Val Loss: 2.4423
Epoch: 7/100 | Train Loss: 2.4427 | Val Loss: 2.4408
Epoch: 8/100 | Train Loss: 2.4415 | Val Loss: 2.4401
Epoch: 9/100 | Train Loss: 2.4407 | Val Loss: 2.4414
Epoch: 10/100 | Train Loss: 2.4400 | Val Loss: 2.4389
Epoch: 11/100 | Train Loss: 2.4395 | Val Loss: 2.4380
Epoch: 12/100 | Train Loss: 2.4388 | Val Loss: 2.4402
Epoch: 13/100 | Train Loss: 2.4387 | Val Loss: 2.4383
Epoch: 14/100 | Train Loss: 2.4383 | Val Loss: 2.4361
Epoch: 15/100 | Train Loss: 2.4380 | Val Loss: 2.4374
Epoch: 16/100 | Train Loss: 2.4378 | Val Loss: 2.4377
Epoch: 17/100 | Train Loss: 2.4374 | Val Loss: 2.4391
Epoch: 18/100 | Train Loss: 2.4373 | Val Loss: 2.4376
Epoch: 19/100 | Train Loss: 2.4371 | 

## Text Generation

In [37]:
start_char = "a"
num_chars_to_generate = 500

generated_chars = model.generate(chars.index(start_char), num_chars_to_generate)

print(decode(generated_chars))

ato in al ZEuid avilaste?"Oze d umorto o liead alenthed

"Anthel s, ave witre auncous thas bof co us ed hemoconouc Wisirrerr. c jun y bssositthiza I yoto avere ins tier "


wsind the atoler ache a fuid armad bere.
oder utengud om weintand sof hid he as." woure Ozand "Ye t ast bo
wn mawave, indong Wit ke ld ne. outouremmo le Vapangn thand t, s. I be masuply wn fr,
oug co he OFotllde

" he, they shttt anto Ifrge; be y
wimpe stht thed acupon whin heshe s ked nghed

ANourre "

d boblle
wioroun g izar
