<a href="https://colab.research.google.com/github/thomasshin/NLP_Study/blob/main/Andrej_Karpathy_Neural_Network_Zero_to_Hero/Exercises_from_lecture_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Mount Google Drive at /content/drive (Colab only) so files stored on Drive can be read below.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Sanity check: list the makemore folder on Drive before copying names.txt out of it.
!ls "/content/drive/My Drive/makemore-master"

LICENSE  makemore.py  names.txt  README.md


In [None]:
# Copy the dataset into the working directory so later cells can open it by the relative name 'names.txt'.
!cp "/content/drive/My Drive/makemore-master/names.txt" "names.txt"

In [None]:
# List the working directory to confirm names.txt was copied.
!ls

drive  names.txt  sample_data


In [None]:
# read in all the words (one name per line)
with open('names.txt', 'r') as f:  # context manager closes the file handle as soon as we are done
  words = f.read().splitlines()

In [None]:
# Build the character vocabulary and the mappings to/from integers.
# Letters get indices 1..N in sorted order; index 0 is reserved for the '.' token.
chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [None]:
# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words, context_len=None, char_to_ix=None):
  """Turn a list of words into (context, next-character) training pairs.

  Every word is terminated with '.', and each character of the terminated
  word yields one example: the input is the `context_len` preceding character
  indices (padded on the left with index 0) and the label is the index of the
  character itself.

  Args:
    words: iterable of strings.
    context_len: number of context characters per example. Defaults to the
      module-level `block_size`.
    char_to_ix: mapping from character to integer index (must contain '.').
      Defaults to the module-level `stoi`.

  Returns:
    (X, Y): X has one row of `context_len` indices per example, Y holds the
    matching target indices. Both shapes are printed as a side effect.
  """
  # Fall back to the notebook-level globals so existing one-argument calls keep working.
  if context_len is None:
    context_len = block_size
  if char_to_ix is None:
    char_to_ix = stoi

  X, Y = [], []
  for w in words:
    context = [0] * context_len
    for ch in w + '.':
      ix = char_to_ix[ch]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix] # crop and append (builds a new list, so rows in X never alias)

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(X.shape, Y.shape)
  return X, Y

import random

# Shuffle a copy with a private, seeded RNG. random.Random(42) produces the same
# permutation as random.seed(42) followed by random.shuffle, but `words` itself is
# left untouched, so re-running this cell always reproduces the same split
# (shuffling `words` in place would permute it again on every re-run).
shuffled_words = list(words)
random.Random(42).shuffle(shuffled_words)
n1 = int(0.8*len(shuffled_words))
n2 = int(0.9*len(shuffled_words))

Xtr, Ytr = build_dataset(shuffled_words[:n1])      # first 80% of the words: training
Xdev, Ydev = build_dataset(shuffled_words[n1:n2])  # next 10%: dev/validation
Xte, Yte = build_dataset(shuffled_words[n2:])      # last 10%: test

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


### ***E01***
Tune the hyperparameters of the training to beat my best validation loss of 2.2

In [None]:
# E01 model sizes: 27-symbol vocabulary, 40-d embeddings, 120 inputs to the
# hidden layer (the flattened context embeddings), 300 tanh units, 27 logits.
# Every tensor is drawn from the same seeded generator, in this fixed order.
g = torch.Generator().manual_seed(2147483647)
vocab_size, n_embd, n_in, n_hidden = 27, 40, 120, 300
C = torch.randn((vocab_size, n_embd), generator=g)
W1 = torch.randn((n_in, n_hidden), generator=g)
b1 = torch.randn(n_hidden, generator=g)
W2 = torch.randn((n_hidden, vocab_size), generator=g)
b2 = torch.randn(vocab_size, generator=g)
parameters = [C, W1, b1, W2, b2]

In [None]:
# Turn on gradient tracking for every parameter tensor before training.
for param in parameters:
  param.requires_grad = True

In [None]:
# Minibatch SGD with a one-step learning-rate decay.
max_steps = 300000
lr_decay_step = 250000  # lr is 0.1 before this step and 0.01 from it onwards
batch_size = 32

for i in range(max_steps):

  # minibatch construction
  # (indices come from the seeded generator `g`, not the global RNG, so the run is repeatable)
  ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)

  # forward pass
  emb = C[Xtr[ix]]
  h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)  # flatten the context embeddings per example
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits, Ytr[ix])

  # backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  # update
  lr = 0.1 if i < lr_decay_step else 0.01
  with torch.no_grad():  # in-place SGD step that autograd must not record
    for p in parameters:
      p -= lr * p.grad

In [None]:
#tr loss
with torch.no_grad():  # evaluation only: do not record an autograd graph over the whole split
  emb = C[Xtr]
  h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits, Ytr)
loss

tensor(1.9888, grad_fn=<NllLossBackward0>)

In [None]:
#dev/val loss
with torch.no_grad():  # evaluation only: do not record an autograd graph over the whole split
  emb = C[Xdev]
  h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits, Ydev)
loss

tensor(2.1290, grad_fn=<NllLossBackward0>)

In [None]:
#test loss
with torch.no_grad():  # evaluation only: do not record an autograd graph over the whole split
  emb = C[Xte]
  h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits, Yte)
loss

tensor(2.1346, grad_fn=<NllLossBackward0>)

### ***E02***
I was not careful with the initialization of the network in this video. (1) What is the loss you'd get if the predicted probabilities at initialization were perfectly uniform? What loss do we achieve? (2) Can you tune the initialization to get a starting loss that is much more similar to (1)?

In [None]:
# Cross-entropy of a perfectly uniform prediction over 27 classes: -log(1/27).
n_classes = 27
uniform_e = -torch.log(torch.tensor(1 / n_classes))
print("uniform initializatiom loss :", uniform_e.item())

uniform initializatiom loss : 3.295836925506592


In [None]:
# E02: re-initialize a smaller network so that the initial logits are near zero.
# Start from a freshly seeded generator (as the E01 and E03 cells do) so this cell
# does not depend on how much randomness earlier cells consumed, and draw every
# random tensor from it so the initialization is reproducible.
g = torch.Generator().manual_seed(2147483647)
C_ = torch.randn((27,10), generator=g)
W1_ = torch.randn((30,200), generator=g)
b1_ = torch.randn(200, generator=g)
W2_ = torch.randn((200,27), generator=g) * 0.001  # tiny output weights -> logits close to 0
b2_ = torch.zeros(27)                             # zero output bias -> no initial class preference
parameters_ = [C_, W1_, b1_, W2_, b2_]
for p in parameters_:
  p.requires_grad = True

In [None]:
# Measure the loss at initialization: run a single SGD step and report the loss
# from its forward pass, i.e. the value computed before any parameter is updated.
n_steps = 1
for i in range(n_steps):

  # minibatch construction (indices drawn from the generator `g` rather than the global RNG)
  ix = torch.randint(0, Xtr.shape[0], (32,), generator=g)

  # forward pass
  emb = C_[Xtr[ix]]
  h = torch.tanh(emb.view(emb.shape[0], -1) @ W1_ + b1_)  # flatten the context embeddings per example
  logits = h @ W2_ + b2_
  loss = F.cross_entropy(logits, Ytr[ix])

  # backward pass
  for p in parameters_:
    p.grad = None
  loss.backward()

  # update
  lr = 0.1
  with torch.no_grad():  # in-place SGD step that autograd must not record
    for p in parameters_:
      p -= lr * p.grad
print("loss from tuned initialization :", loss.item())

loss from tuned initialization : 3.2973854541778564


### ***E03***
Read the Bengio et al 2003 paper (link above), implement and try any idea from the paper. Did it work?



- Direct connections from the word features to the output

In [None]:
# E03 model: the E01 architecture plus W3__, a direct linear map from the
# flattened context embeddings (120 values) to the 27 output logits.
# Every tensor, including b2__, is drawn from the seeded generator so the
# initialization is reproducible.
g = torch.Generator().manual_seed(2147483647)
C__ = torch.randn((27,40), generator=g)
W1__ = torch.randn((120,300), generator=g)
b1__ = torch.randn(300, generator=g)
W2__ = torch.randn((300,27), generator=g)
b2__ = torch.randn(27, generator=g)
W3__ = torch.randn((120,27), generator=g)
parameters__ = [C__, W1__, b1__, W2__, b2__, W3__]

In [None]:
# Turn on gradient tracking for every E03 parameter tensor before training.
for param in parameters__:
  param.requires_grad = True

In [None]:
# Minibatch SGD with a one-step learning-rate decay (same schedule as E01).
max_steps = 300000
lr_decay_step = 250000  # lr is 0.1 before this step and 0.01 from it onwards
batch_size = 32

for i in range(max_steps):

  # minibatch construction (indices drawn from the seeded generator `g`, not the global RNG)
  ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)

  # forward pass
  emb = C__[Xtr[ix]]
  flat = emb.view(emb.shape[0], -1)  # flattened context embeddings, shared by both paths below
  h = torch.tanh(flat @ W1__ + b1__)
  logits = h @ W2__ + b2__ + flat @ W3__ # Direct connections from the word features to the output
  loss = F.cross_entropy(logits, Ytr[ix])

  # backward pass
  for p in parameters__:
    p.grad = None
  loss.backward()

  # update
  lr = 0.1 if i < lr_decay_step else 0.01
  with torch.no_grad():  # in-place SGD step that autograd must not record
    for p in parameters__:
      p -= lr * p.grad

In [None]:
#tr loss
with torch.no_grad():  # evaluation only: do not record an autograd graph over the whole split
  emb = C__[Xtr]
  flat = emb.view(emb.shape[0], -1)
  h = torch.tanh(flat @ W1__ + b1__)
  logits = h @ W2__ + b2__ + flat @ W3__
  loss = F.cross_entropy(logits, Ytr)
loss

tensor(1.9937, grad_fn=<NllLossBackward0>)

In [None]:
#dev/val loss
with torch.no_grad():  # evaluation only: do not record an autograd graph over the whole split
  emb = C__[Xdev]
  flat = emb.view(emb.shape[0], -1)
  h = torch.tanh(flat @ W1__ + b1__)
  logits = (h @ W2__ + b2__) + (flat @ W3__)
  loss = F.cross_entropy(logits, Ydev)
loss

tensor(2.1178, grad_fn=<NllLossBackward0>)

In [None]:
#test loss
with torch.no_grad():  # evaluation only: do not record an autograd graph over the whole split
  emb = C__[Xte]
  flat = emb.view(emb.shape[0], -1)
  h = torch.tanh(flat @ W1__ + b1__)
  logits = (h @ W2__ + b2__) + (flat @ W3__)
  loss = F.cross_entropy(logits, Yte)
loss

tensor(2.1147, grad_fn=<NllLossBackward0>)

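Compared to the original model from E01 (train 1.9888, val 2.1290, test 2.1346), adding direct connections from the word features to the output slightly decreased the validation and test losses (val 2.1178, test 2.1147), while the training loss stayed essentially the same (1.9937). The improvement is small (about 0.01–0.02), so it may be within run-to-run noise.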