In [9]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [10]:
# read in all the words
words = open('../data/names.txt', 'r').read().splitlines()
print(len(words))
print(max(len(w) for w in words))
print(words[:8])

32033
15
['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']


In [11]:
# build the vocabulary of characters and mapping to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


In [12]:
# build the dataset
block_size = 3 # context length : how many characters do we take to predict the next one?
def build_dataset(words):
    X,Y = [], []
    for w in words:
        # print(w)
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)  
            Y.append(ix)
            context = context[1:] + [ix] # crop and append
    X = torch.tensor(X) 
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X,Y

import random 
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1])      # 80% training set
Xdev, Ydev = build_dataset(words[n1:n2])  # 10% dev=validation set
Xte, Yte = build_dataset(words[n2:])      # 10% test set

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [13]:
# GPU 설정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device : {device}')

Xtr = Xtr.to(device)
Ytr = Ytr.to(device)
Xdev = Xdev.to(device)
Ydev = Ydev.to(device)
Xte = Xte.to(device)
Yte = Yte.to(device)

Using device : cpu


### ok biolerplate done, now we get to the action:

In [14]:
# utility function we will use later when comparing manual gradients to PyTorch gradients

def cmp(s, dt, t):
    ex = torch.all(dt == t.grad).item()
    app = torch.allclose(dt, t.grad)
    maxdiff = (dt - t.grad).abs().max().item()
    print(f'{s:15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | maxdiff: {maxdiff}')

In [15]:
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 200 # the number of neurons in the hidden layer of the MLP

seed_num = 2147483647
g = torch.Generator().manual_seed(seed_num)

# torch.randn
# size         : defining the shape of the output tensor   
C = torch.randn((vocab_size,n_embd), generator=g).to(device)

# Layer 1
W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3)/(n_embd * block_size) ** 0.5 
W1 = W1.to(device)
b1 = torch.randn(n_hidden,generator=g) * 0.001

# Layer 2
W2 = torch.randn((n_hidden,vocab_size), generator=g) * 0.01 # we want roughly zero at logits
W2 = W2.to(device)
b2 = torch.randn(vocab_size,generator=g) * 0 # at initialization, bias starts with 0, because we want roughly zero at logits.
b2 = b2.to(device)

# BatchNorm parameters
bngain = torch.ones((1, n_hidden)).to(device)
bnbias = torch.zeros((1, n_hidden)).to(device)


parameters = [C, W1, W2, b2, bngain, bnbias]
print(sum(p.nelement() for p in parameters)) # number of parameters in total

for p in parameters:
    p.requires_grad = True

12097


In [16]:
C.shape

torch.Size([27, 10])

In [17]:
batch_size = 32
n = batch_size # a shorter variable also, for convenience

# construct a minibatch

# torch.randint
# 0            : 뽑을 수 있는 가장 작은 정수 (포함)
# Xtr.shape[0] : 뽑을 수 있는 가장 큰 정수, 이 경우 데이터셋의 총 샘플 수 (미포함)
# (batch_size,): 결과 텐서의 크기, 이 경우 (32) 

# batch_size 만큼 랜덤하게 idx 번호를 뽑는다.
ix = torch.randint(0, Xtr.shape[0], (batch_size, ), generator=g) 
# 뽑은 idx 번호로 인덱싱해서 미니 배치를 만든다.
# 미니 배치 shape >>> ([32,3])
Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y

In [None]:
# forward pass, 'chunkated' into smaller steps that are possible to backward one at a time
emb = C[Xb] # embed the characters into vectors

# C : 임베딩 테이블, 27(문자 개수) * 10(임베딩 차원)
# Xb : 배치 데이터, 32(배치 사이즈) * 3(문자 인덱스)

# C[Xb]는 numpy의 'fancy indexing'처럼 동작
# 핵심은, C의 행이 문자 개수만큼, Xb의 열이 문자 인덱스라는 점

# Xb의 각 문자 인덱스를 행 번호로 해서, C에서 (1,10)의 tensor를 lookup
# 따라서 emb.shape는 ([32,3,10])

In [None]:
# emb.shape[0] = 32
# embcat.shape == ([32,30])
embcat = emb.view(emb.shape[0], -1) # concatenate the vectors

# Linear layer 1
# W1.shape == ([30,200]) == ([block_size * n_embd, n_hidden])
# b1.shape == ([200, 1])
# hprebn.shape == ([32,200])
hprebn = embcat @ W1 + b1 # hidden layer pre-activation

# BatchNorm layer
bnmeani = 1/n * hprebn.sum(0, keepdim=True)
bndiff = hprebn - bnmeani
bndiff2 = bndiff ** 2
bnvar = 1/(n-1)*(bndiff2).sum(0, keepdim = True) # note: Bessel's correction (dividing by n-1, not n)
bnvar_inv = (bnvar + 1e-5) ** -0.5
bnraw = bndiff * bnvar_inv
hpreact = bngain * bnraw + bnbias

# Non-linearity
h = torch.tanh(hpreact) # hidden layer

# Linear layer 2
logits = h @ W2 + b2 # output layer

### softmax에서 max 값을 빼는 이유

exp는 $y^{i} = e^{x_i}$이다.  
즉, logits를 승수로 한다.  
이때, logits가 아주 크면, 지수함수이므로 결과값이 무한대에 가까워질 수 있다.  

따라서 logit에서 max값을 빼준다.  

### logit에서 max값을 빼주어도, softmax 결과는 같은 이유.

softmax 수식으로 보면,
$$\dfrac{e^{x_i - m}}{\sum_j{e^{x_j - m}}} = \dfrac{e^{x_i} \cdot e^{-m}}{\sum_j{e^{x_j}} \cdot e^{-m}} = \dfrac{e^{x_i}}{\sum_j{e^{x_j}}}$$

즉, 분모 분자에 같은 값을 곱한 것과 같으므로, softmax 결과는 같다.

In [None]:
# cross entropy loss (same as F.cross_entropy(logits, Yb))
logit_maxes = logits.max(1, keepdim=True).values
norm_logits = logits - logit_maxes # subtract max for numerical stability
counts = norm_logits.exp()
counts_sum = counts.sum(1, keepdim=True)
counts_sum_inv = counts_sum ** -1 # if I use (1.0 / counts_sum) instead then I can't get backprop to be bit exact...
probs = counts * counts_sum_inv
logprobs = probs.log()
loss = -logprobs[range(n), Yb].mean()

# Pytorch backward pass
for p in parameters:
    p.grad = None

for t in [logprobs, probs, counts, counts_sum, counts_sum_inv, # afaik there is no cleaner way
          norm_logits, logit_maxes, logits, h, hpreact, bnraw,
         bnvar_inv, bnvar, bndiff2, bndiff, hprebn, bnmeani,
         embcat, emb]:
    t.retain_grad()

loss.backward()
loss

In [11]:
logprobs.shape

torch.Size([32, 27])

In [None]:
# Exercise 1 : backprop through the whole thing manually,
# backpropagating through exactly all of the variables
# as they are defined in the forward pass above, one by one

dlogprobs = 