In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


### Create dataset: bigrams

## MLP optimisation idea: for lang modelling
- logits: interpreting final layer output, as a log of counts (before it is transformed, into counts, then probabilities)

- part 1: bigrams training
    - estimate parameters: probability distribution for sampling: 
        - method 1:by counting (statistical/feature method): 2.47
        - method 2: by NN: gradient based learning: also around 2.46
            - with one hot encoding, it acts as index, into weights (logits output)
    - model smoothing: creates a more uniform prob distribution, to prevent zero prob assigned to any token (but without causing too much loss in information)
        - method 1: by adding 1 / a constant, to smooth the probability distribution (make it more uniform), to prevent extremes (eg. zeros)
        - method 2:
            - if W is initialised to the same value, (eg. zero, then logits w be zero, prob = 1)
            - incentivising W to be near zero, 
            - by add W**2 as regularisation term to loss function; 
            - without overpowering W learning (loss term) by adding a constant to control the 'strength' of regularisation term
        
- step 2: MLP 
    - dataset: increase context length
    - model: 
        - improved embedding 
        - 
    - model training: 
        - overfitting
        - batch training
        - learning rate
        - train/val/test split

    - experiments: (w design features)
        - increase hidden layers, embed size..
        - viz character embeddings O_O

    - notes: hidden layers, internal of torch.tensor!

- step 3: 
    - 

In [24]:
xs = torch.tensor([0, 5, 13, 13, 1])

ys = torch.tensor([5, 13, 13, 1, 0])
ys.dtype

torch.int64

In [47]:
# INITIALISE NETWORK 
# random initialisation of 27 neuron weights 
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)
W.dtype

torch.float32

In [55]:
# forward pass 

# one hot encoding (embedding, project data to dimension 27)
xenc = F.one_hot(xs, num_classes=27).float()
# SIDE_NOTE: requires same dtype
print(xenc.dtype)
print(W.dtype)

# predict log counts (data * 27 @ 27 * 27)
logits = xenc @ W
# print(logits)
# softmax: log exp ; and normalise - for prob of next token
counts = logits.exp()
probs = counts / counts.sum()

# eval: avg neg log likelihood
loss = -probs[torch.arange(5), ys].log().mean()

torch.float32
torch.float32


tensor(5.4286, grad_fn=<NegBackward0>)

In [50]:
## backward : compute gradients for all weights wrt to loss  
W.grad = None
loss.backward()

In [54]:
# update parameters based on gradients, to min loss 
# with learning rate 
W.data += -0.1 * W.grad

In [38]:
# loss : by average neg log likelihood 
# maximise likelihood of correct ground truth data 
# (by mle)

# ground trutsh
probs[0, 5], probs[1, 13], probs[2, 13], probs[3, 1], probs[4, 0]


(tensor(0.0039),
 tensor(0.0041),
 tensor(0.0042),
 tensor(0.0116),
 tensor(0.0021))

In [40]:
torch.arange(5)

tensor([0, 1, 2, 3, 4])

In [41]:
# get average log likelihood
# access by array 
probs[torch.arange(5), ys]

tensor([0.0039, 0.0041, 0.0042, 0.0116, 0.0021])

In [42]:
probs[torch.arange(5), ys].log().mean()

tensor(-5.4286)

In [None]:
## compile into final training script 
## sample from 

## Application to makemore

In [4]:
words = open('data/names.txt', 'r').read().splitlines()
words[:8]


['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [11]:
# build 'tokenizer' mapping 
chars = sorted(list(set("".join(words))))
stoi = {s:i for i, s in enumerate(chars)}
# note end of line
stoi['.'] = 0
itos = {i:s for i, s in enumerate(chars)}
print(itos)


{0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e', 5: 'f', 6: 'g', 7: 'h', 8: 'i', 9: 'j', 10: 'k', 11: 'l', 12: 'm', 13: 'n', 14: 'o', 15: 'p', 16: 'q', 17: 'r', 18: 's', 19: 't', 20: 'u', 21: 'v', 22: 'w', 23: 'x', 24: 'y', 25: 'z'}


In [None]:
# build dataset

X, y = [], []

for w in words: 
    

In [None]:
# model 

In [None]:
# gather parameters 

In [None]:
# update and optimise 