In [16]:
import time
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
device = torch.device("mps")

We save the book into text and try to get all the unique characters from it

In [2]:
with open("wizard_of_oz.txt", "r") as f:
    text = f.read()
chars = sorted(set(text))
print(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


We then build a simple encoder and decoder to convert any word into a set of numbers representing that word. In this simple case, we just attach each character to its index, and when we have a word "hello", we just look it up and see ok h = index 63, e = index 60...etc.

In [3]:
string_to_int = {character:index for index, character in enumerate(chars)}
int_to_string = {index:character for index, character in enumerate(chars)}
encoder = lambda text: [string_to_int[character] for character in text]
decoder = lambda list_num: "".join(int_to_string[num] for num in list_num)

we encode the entire book (text). For ease of manipulation and do all the maths, we need to change the data strucutre. PyTorch has the tensor data structure which can be easily manipulated with maths. Thus, we convert our normal list of numbers into a tensor (multi-dimension array)

In [4]:

data = torch.tensor(encoder(text), dtype=torch.long)
print(data[:100])

tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])


split up the data into: training and validation to avoid overfitting
setting the block size to 8 characters and creating a sliding window x and y
this mimic how the model guess the next character given the previous character

In [5]:
n = int(len(data)*0.8)
train_data = data[:n]
val_data = data[n:]

block_size = 8
x = train_data[:block_size]
y = train_data[1 : block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print("when input is", context, "target is", target)

when input is tensor([80]) target is tensor(1)
when input is tensor([80,  1]) target is tensor(1)
when input is tensor([80,  1,  1]) target is tensor(28)
when input is tensor([80,  1,  1, 28]) target is tensor(39)
when input is tensor([80,  1,  1, 28, 39]) target is tensor(42)
when input is tensor([80,  1,  1, 28, 39, 42]) target is tensor(39)
when input is tensor([80,  1,  1, 28, 39, 42, 39]) target is tensor(44)
when input is tensor([80,  1,  1, 28, 39, 42, 39, 44]) target is tensor(32)


the problem with the above setup is scalability. It is "sequential" (ie. it is run in order by CPU). It can't be run all at once, which is what GPU can do.
The first cell is for Window. The second cell is for using Apple Sillicon.

#### Picking device and speed

In [7]:
device_on_window = "cuda" if torch.cuda.is_available() else "cpu"
print(device_on_window)

cpu


In [6]:
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


In [14]:
batch_size = 4
device = torch.device("mps")
print(device)

mps


let's do some testing to see how fast GPU comparing to CPU. As we can see, when it comes to large operation, GPU is 100x faster than CPU.

In [11]:
a = 150
torch_rand1 = torch.rand(a, a, a, a).to(device)
toche_rand2 = torch.rand(a, a, a, a).to(device)
np_rand1 = torch.rand(a, a, a, a)
np_rand2 = torch.rand(a, a, a, a)

start_time = time.time()
rand = (torch_rand1 @ toche_rand2)
endtime = time.time()
elapsed_time = endtime - start_time
print(f"elapsed time is {elapsed_time:.8f}")

start_time = time.time()
rand = np.multiply(np_rand1, np_rand2)
endtime = time.time()
elapsed_time = endtime - start_time
print(f"elapsed time is {elapsed_time:.8f}")


elapsed time is 0.27686405
elapsed time is 33.59036207


#### Initializing Neural Net

How do we train the model?
We will continuously feed them a set of 8 characters, once at a time. The first sequence (x) is 1st to 8th (block_size), then we show them the result (y) --> that's how we "teach" them.
ix is just a set of random positions that we will creating a sequence (set). We set the batch = 4, it means we will feed the model 4 set of x , then it will try to predict the next character, then we feed it 4 set of y, which is like the "answer". Then we do the comparision between the model's prediction and the answer. The difference is the loss function that we try to minimize. 

In [22]:
n = int(len(data)*0.8)
train_data = data[:n]
val_data = data[n:]
block_size = 8
batch_size = 4

def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x , y = x.to(device), y.to(device)
    return x, y

x, y = get_batch("train")
print("input:")
print(x)
print("target:")
print(y)

tensor([99040,  2156, 12509, 85304])
input:
tensor([[54, 73,  1, 73, 61, 58,  1, 31],
        [ 1, 76, 62, 73, 61, 68, 74, 73],
        [ 0,  0, 44, 61, 58,  1, 72, 64],
        [78,  1, 59, 71, 68, 66,  1, 73]], device='mps:0')
target:
tensor([[73,  1, 73, 61, 58,  1, 31, 54],
        [76, 62, 73, 61, 68, 74, 73,  1],
        [ 0, 44, 61, 58,  1, 72, 64, 78],
        [ 1, 59, 71, 68, 66,  1, 73, 61]], device='mps:0')
