In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F




In [15]:
# Triangular matrix with ones on the lower and diagnol, 0 on the upper
a = torch.tril(torch.ones(3, 3))
# keepdim=False => generates [1,2,3] keepdim=True => generates [[1],[2],[3]]]
sum = torch.sum(a, 1, keepdim=True)
print(sum)
# Each row adds up to 1 with each element averaged
a = a / sum
print(a)
b = torch.randint(0, 10, (3, 2)).float()
print(b)
# This makes each row is sum of all previous row in the old matrix
# last row is the average of previous element of b
c = a @ b
print(c)




tensor([[1.],
        [2.],
        [3.]])
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[9., 9.],
        [5., 4.],
        [4., 1.]])
tensor([[9.0000, 9.0000],
        [7.0000, 6.5000],
        [6.0000, 4.6667]])


In [18]:
T = 8
weights = torch.tril(torch.ones(T, T))
weights = weights.sum(1, keepdim=True)

In [20]:
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
F.softmax(wei, dim=1)



tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [20]:
import numpy as np
import os


# device = 'cpu' # for later use in torch.autocast
block_size = 1024
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' # for later use in torch.autocast

data_dir = "shakespeare_char"
def get_batch(split, batch_size):
    # We recreate np.memmap every batch to avoid a memory leak, as per
    # https://stackoverflow.com/questions/45132940/numpy-memmap-memory-usage-want-to-iterate-once/61472122#61472122
    if split == 'train':
        data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
    else:
        data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')
    # Sample random contiguous block of text of length block_size, ix here is offset
    ix = torch.randint(len(data) - block_size, (batch_size,))
    # 
    x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
    # Char prediction, so y=target is x=input shifted by 1 as it is the next char.
    y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
    # if device_type == 'cuda':
        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        # x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    x, y = x.to(DEVICE), y.to(DEVICE)
    return x, y

In [6]:
xb, yb = get_batch('train')

In [15]:
m = BigramLangugageModel(65)
logits, loss = m(xb, yb)
print(logits)
print(loss)

m.generate(idx=torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()


tensor([[-0.2477, -0.0755, -1.0022,  ...,  1.0738, -0.5089,  1.1788],
        [ 0.6879, -0.4161,  0.7217,  ...,  0.2145, -0.5488, -0.0742],
        [-1.4251, -1.6420,  0.7253,  ...,  0.0152,  0.2248,  1.5000],
        ...,
        [ 0.6384, -2.6421, -0.2927,  ...,  1.6300,  0.0692, -0.1172],
        [-0.2019, -1.0738, -0.6054,  ...,  0.6918,  0.7287,  1.8285],
        [ 0.2242,  0.8994,  0.8124,  ...,  1.2630,  1.3209,  0.6855]],
       grad_fn=<ViewBackward0>)
tensor(5.0654, grad_fn=<NllLossBackward0>)


In [22]:
# Optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

batch_size = 32

for step in range(10000):
    xb, yb = get_batch('train', 32)
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    print(loss.item())

4.390140533447266
4.395011901855469
4.38952112197876
4.393494606018066
4.379246234893799
4.380944728851318
4.383433818817139
4.375387191772461
4.382012367248535
4.384686470031738
4.378200531005859
4.3716721534729
4.378630638122559
4.375410556793213
4.368130207061768
4.362637519836426
4.373534679412842
4.370299339294434
4.362607479095459
4.358072757720947
4.365843772888184
4.356269359588623
4.365509986877441
4.360950469970703
4.357570171356201
4.3654656410217285
4.357369422912598
4.353784561157227
4.356688022613525
4.355712890625
4.366136074066162
4.352844715118408
4.34678316116333
4.333167552947998
4.349677562713623
4.340488910675049
4.338107109069824
4.338043689727783
4.3375563621521
4.339995861053467
4.336388111114502
4.332207202911377
4.320216178894043
4.332622051239014
4.327049732208252
4.319908618927002
4.3228044509887695
4.322932720184326
4.318503379821777
4.317607879638672
4.333260536193848
4.3164191246032715
4.322883129119873
4.306053161621094
4.313893795013428
4.32227325439453

In [47]:
from importlib import reload

import src.data_util

reload(src.data_util)

from src.data_util import DataLoader
dl = DataLoader("shakespeare_char")



tensor([[21]])
tensor([[1]])


In [48]:
dl.stoi

{'\n': 0,
 ' ': 1,
 '!': 2,
 '$': 3,
 '&': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '3': 9,
 ':': 10,
 ';': 11,
 '?': 12,
 'A': 13,
 'B': 14,
 'C': 15,
 'D': 16,
 'E': 17,
 'F': 18,
 'G': 19,
 'H': 20,
 'I': 21,
 'J': 22,
 'K': 23,
 'L': 24,
 'M': 25,
 'N': 26,
 'O': 27,
 'P': 28,
 'Q': 29,
 'R': 30,
 'S': 31,
 'T': 32,
 'U': 33,
 'V': 34,
 'W': 35,
 'X': 36,
 'Y': 37,
 'Z': 38,
 'a': 39,
 'b': 40,
 'c': 41,
 'd': 42,
 'e': 43,
 'f': 44,
 'g': 45,
 'h': 46,
 'i': 47,
 'j': 48,
 'k': 49,
 'l': 50,
 'm': 51,
 'n': 52,
 'o': 53,
 'p': 54,
 'q': 55,
 'r': 56,
 's': 57,
 't': 58,
 'u': 59,
 'v': 60,
 'w': 61,
 'x': 62,
 'y': 63,
 'z': 64}