<a href="https://colab.research.google.com/github/toussaintma/neuralnetworksfromzerotohero/blob/main/walkthrough_makemore_intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# data and code at https://github.com/karpathy/makemore
# course at https://www.youtube.com/watch?v=PaCmpygFfXo&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=2&t=11s

# Part 1: Next character probability

In [3]:
import pandas as pd
import torch

In [4]:
# Load one name per line into a single-column frame.
# keep_default_na=False (with dtype=str) stops pandas from silently converting names
# that look like missing-value markers (e.g. "nan", "null", "NA") into float NaN,
# which would later break the string concatenation '.' + w + '.'.
df_words = pd.read_csv('names.txt', header=None, names=['name'], dtype=str, keep_default_na=False)

In [5]:
# Dimensions of the loaded table as (rows, columns).
df_words.shape

(32033, 1)

In [6]:
# Preview the 'name' column (pandas abbreviates a long Series when displaying it).
df_words['name']

0            emma
1          olivia
2             ava
3        isabella
4          sophia
           ...   
32028       zylas
32029       zyran
32030       zyrie
32031       zyron
32032       zzyzx
Name: name, Length: 32033, dtype: object

In [7]:
# Count every adjacent character pair (bigram) over all names.
# Each name is wrapped in '.' so that word starts and word ends are counted as pairs too.
stat = {}
for name in df_words.loc[:, 'name']:
  padded = '.' + name + '.'
  for first, second in zip(padded, padded[1:]):
    pair = first + second
    stat[pair] = stat.get(pair, 0) + 1
len(stat)

627

In [8]:
# Spot check: how many times the bigram 'em' was counted.
stat['em']

769

In [9]:
# Ten most frequent bigrams. A stable descending sort keeps tied counts in
# insertion order, exactly like sorting ascending on the negated count.
sorted(stat.items(), key=lambda pair_count: pair_count[1], reverse=True)[:10]

[('n.', 6763),
 ('a.', 6640),
 ('an', 5438),
 ('.a', 4410),
 ('e.', 3983),
 ('ar', 3264),
 ('el', 3248),
 ('ri', 3033),
 ('na', 2977),
 ('.k', 2963)]

In [10]:
def get_index(c):
  """Map a symbol to its integer id: '.' -> 0, 'a' -> 1, ..., 'z' -> 26."""
  return 0 if c == '.' else ord(c) - ord('a') + 1

def get_char(i):
  """Inverse of get_index: 0 -> '.', 1 -> 'a', ..., 26 -> 'z'."""
  return '.' if i == 0 else chr(ord('a') + i - 1)

# quick round-trip check of both helpers
get_index('e'), get_index('m'), get_index(get_char(4)), get_index('.'), get_char(0)

(5, 13, 4, 0, '.')

In [11]:
# Counts of 'a' right after the start marker ('.a') and right before the end marker ('a.').
stat['.a'], stat['a.']

(4410, 6640)

In [12]:
# 27 x 27 grid of zero counts, built row by row so that no two rows share a list.
stats = [[0] * 27 for _ in range(27)]
len(stats)

27

In [13]:
# Copy the dictionary counts into the grid (row = first char, column = second char),
# then turn the grid into a float tensor.
for pair, count in stat.items():
  row, col = get_index(pair[0]), get_index(pair[1])
  stats[row][col] = count
t_stats = torch.tensor(stats, dtype=torch.float32)
t_stats.shape, t_stats.ndim

(torch.Size([27, 27]), 2)

In [14]:
# get_index maps 'e' -> 5 and 'm' -> 13, so this cell of the count table is the 'em' bigram.
t_stats[5, 13] # should equal stat['em'] = 769 (valid while t_stats still holds raw counts)

tensor(769.)

In [15]:
# keepdim=True keeps the summed axis with size 1, so the row totals come back as a (27, 1) column.
t_stats.sum(dim=1, keepdim=True).shape

torch.Size([27, 1])

In [16]:
# Turn each row of counts into a probability distribution by dividing by its row total.
# keepdim=True keeps the totals as a (27, 1) column so the division broadcasts per row.
# Author's spot check: a is 556/27245 found 2.0407e-02 OK
row_totals = t_stats.sum(dim=1, keepdim=True)
t_stats = t_stats / row_totals
t_stats[0]

tensor([0.0000, 0.1377, 0.0408, 0.0481, 0.0528, 0.0478, 0.0130, 0.0209, 0.0273,
        0.0184, 0.0756, 0.0925, 0.0491, 0.0792, 0.0358, 0.0123, 0.0161, 0.0029,
        0.0512, 0.0642, 0.0408, 0.0024, 0.0117, 0.0096, 0.0042, 0.0167, 0.0290])

In [17]:
# Sanity check: once the rows have been normalised, every row should sum to 1.
t_stats.sum(dim=1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])

In [18]:
# Sample one name by walking the bigram table, starting from the '.' boundary symbol.
MAX_NAME_LEN = 10  # hard cap on the number of generated characters
# Dedicated generator: the cell gives the same name on every run and leaves the
# global RNG state untouched.
g = torch.Generator().manual_seed(2147483647)
# sample[r, i] is an independent draw from row r's distribution; step i of the walk
# consumes column i, so only MAX_NAME_LEN columns are needed.
sample = torch.multinomial(t_stats, MAX_NAME_LEN, replacement=True, generator=g)
result = '.'
next_char = '.'
for i in range(MAX_NAME_LEN):
  # .item() converts the 0-dim tensor to a plain int before the character lookup
  next_char = get_char(sample[get_index(next_char), i].item())
  if next_char == '.':
    break  # end-of-name symbol drawn: stop instead of continuing the walk
  result = result + next_char
print(result)

.rran


In [19]:
# Row 0 of t_stats, i.e. the row for '.' (get_index('.') == 0).
t_stats[0]

tensor([0.0000, 0.1377, 0.0408, 0.0481, 0.0528, 0.0478, 0.0130, 0.0209, 0.0273,
        0.0184, 0.0756, 0.0925, 0.0491, 0.0792, 0.0358, 0.0123, 0.0161, 0.0029,
        0.0512, 0.0642, 0.0408, 0.0024, 0.0117, 0.0096, 0.0042, 0.0167, 0.0290])

In [20]:
# Scratch check: a one-element float tensor filled with zero.
torch.zeros(1)

tensor([0.])

In [21]:
# Average negative log-likelihood of the whole dataset under the bigram table.
ll = 0        # running sum of log-probabilities
n = 0         # number of bigrams scored so far
result = []   # per-bigram trace: [first char, second char, log-prob tensor, running sum]
for name in df_words.loc[:, 'name']:
  padded = '.' + name + '.'
  for ch1, ch2 in zip(padded, padded[1:]):
    newll = t_stats[get_index(ch1), get_index(ch2)].log()
    ll += newll.item()
    n += 1
    result.append([ch1, ch2, newll, ll])
# the running sum should be -38.7856 for the first 3 words
ll = ll / n
nll = -ll
nll

2.4540144946949742

# Part 2: Neural network

In [22]:
# Build the training set: for every bigram, the id of the first character is the
# input (xs) and the id of the second character is the target (ys).
xs, ys = [], []

for name in df_words.loc[:, 'name']:
  chs = '.' + name + '.'
  for prev_ch, next_ch in zip(chs, chs[1:]):
    xs.append(get_index(prev_ch))
    ys.append(get_index(next_ch))

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()   # number of (input, target) examples
print(num)

228146


In [23]:
import torch.nn.functional as F
import torch.nn as nn

# One-hot encode the input and target ids as float vectors of length 27.
xenc = F.one_hot(xs, num_classes=27).float()
yenc = F.one_hot(ys, num_classes=27).float()

import matplotlib.pyplot as plt
#plt.imshow(xenc)

In [24]:
# Random 27 x 27 weight matrix; requires_grad=True makes autograd track it.
W = torch.randn(27, 27, requires_grad=True)
# One forward pass through the single linear layer (no bias).
r = torch.matmul(xenc, W)
xenc.shape, W.shape, r.shape

(torch.Size([228146, 27]), torch.Size([27, 27]), torch.Size([228146, 27]))

In [25]:
# Full-batch gradient descent on the weight matrix W for 100 steps.
LEARNING_RATE = 50
REG_STRENGTH = 0.01  # weight of the mean-squared-weight penalty added to the loss
# Row selector derived from ys itself rather than from the global `num`, which the
# trigram cell further down rebinds to a different quantity.
rows = torch.arange(ys.nelement())
for a in range(100):
  # forward pass: softmax written out by hand
  logits = xenc @ W # log counts
  counts = logits.exp()
  probs = counts / counts.sum(dim=1, keepdim=True)
  # mean negative log-probability of the true next character, plus the weight penalty
  loss = -probs[rows, ys].log().mean() + REG_STRENGTH * (W**2).mean()
  print(loss.item())
  # backward pass: drop the old gradient, then backpropagate
  W.grad = None
  loss.backward()
  # update W in place without recording the update in the autograd graph
  with torch.no_grad():
    W -= LEARNING_RATE * W.grad

3.8227946758270264
3.4095218181610107
3.1734468936920166
3.032843589782715
2.9366872310638428
2.8647685050964355
2.8096301555633545
2.767125129699707
2.7341184616088867
2.7079622745513916
2.686631679534912
2.668767213821411
2.653508424758911
2.640298843383789
2.628753662109375
2.618584394454956
2.6095659732818604
2.601515293121338
2.594285249710083
2.587754726409912
2.5818264484405518
2.576420307159424
2.571469306945801
2.566920042037964
2.5627267360687256
2.558849573135376
2.5552568435668945
2.551919460296631
2.5488131046295166
2.5459158420562744
2.5432090759277344
2.540675401687622
2.5383007526397705
2.536071538925171
2.533975601196289
2.5320029258728027
2.5301427841186523
2.5283870697021484
2.5267281532287598
2.525158166885376
2.5236706733703613
2.5222601890563965
2.520920515060425
2.5196473598480225
2.5184357166290283
2.5172817707061768
2.516181230545044
2.5151307582855225
2.514127492904663
2.5131680965423584
2.5122499465942383
2.5113704204559326
2.5105276107788086
2.50971889495849

In [26]:
# Disabled walkthrough: per-example breakdown of the loss for the first 5 bigrams.
#nlls = torch.zeros(5)
#for i in range(5):
#  x = xs[i].item()
#  y = ys[i].item()
#  print(f'bigram example {i+1}: {get_char(x)}{get_char(y)} index {x}, {y}')
#  print(f'input to the neural net: {x}')
#  print(f'output probabilities: {probs[i]}')
#  print(f'label: {y}')
#  p = probs[i, y]
#  print(f'probability assigned to the correct character: {p.item()}')
#  logp = torch.log(p)
#  print(f'log likelihood: {logp.item()}')
#  nll = - logp
#  print(f'negative log likelihood: {nll}')
#  nlls[i] = nll
#print(f'average nll: {nlls.mean().item()}')


In [27]:
class Bigram_nn(nn.Module):
    """Single bias-free 27 -> 27 linear layer followed by a log-softmax over dim 1."""

    def __init__(self):
        super().__init__()
        # Stored under the attribute name `linear`, so parameter names stay `linear.0.*`.
        layers = [
            nn.Linear(27, 27, bias=False),
            nn.LogSoftmax(dim=1),
        ]
        self.linear = nn.Sequential(*layers)

    def forward(self, x):
        """Return the log-softmax output (taken over dim=1) for the batch `x`."""
        return self.linear(x)

model = Bigram_nn()
loss_fn = nn.NLLLoss()  # consumes the log-probabilities produced by the LogSoftmax layer
optimizer = torch.optim.SGD(model.parameters(), lr=50)
model

Bigram_nn(
  (linear): Sequential(
    (0): Linear(in_features=27, out_features=27, bias=False)
    (1): LogSoftmax(dim=1)
  )
)

In [28]:
# Full-batch gradient descent on the bigram network for 100 steps.
for e in range(100):
  loss = loss_fn(model(xenc), ys)  # forward pass and loss in one expression
  loss.backward()
  optimizer.step()
  optimizer.zero_grad()            # clear the gradients before the next step

# Reports only the loss from the last iteration (computed before that iteration's update).
print(f"step {e+1} {loss:.3f}")

step 100 2.471


## Exercises

In [31]:
# trigram model
# Build (two-character context -> next character) training pairs.
x_l = []
y_l = []
tri_index = []   # context id -> two-character context, in order of first appearance
tri_lookup = {}  # two-character context -> context id (reverse map of tri_index)

def stoi(bi):
  """Return the integer id of the two-character context `bi`, registering it if new.

  Ids are handed out in order of first appearance, so tri_index[id] == bi.
  The dict lookup replaces a linear scan of tri_index on every call.
  """
  ix = tri_lookup.get(bi)
  if ix is None:
    ix = len(tri_index)
    tri_lookup[bi] = ix
    tri_index.append(bi)
  return ix


for w in df_words.loc[:, 'name']:
  chs = '.' + w + '.'
  # NOTE: with a single '.' of padding the first target is the second letter of the
  # name, so the first letter of a name never appears as a prediction target.
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    ix1 = stoi(ch1 + ch2)
    ix3 = get_index(ch3)
    x_l.append(ix1)
    y_l.append(ix3)

x_l = torch.tensor(x_l)
y_l = torch.tensor(y_l)
# number of distinct two-character contexts; this rebinds the global `num`
num = len(tri_index)
print(num)

601


In [32]:
# One-hot encode each context id into a float vector with one slot per distinct context.
x_lenc = F.one_hot(x_l, num_classes=num).float()
x_lenc.shape

torch.Size([196113, 601])

In [33]:
class Trigram_nn(nn.Module):
    """Bias-free linear layer from `num` inputs to 27 outputs, followed by log-softmax."""

    def __init__(self):
        super().__init__()
        # `num` is read from the notebook's global scope when the model is constructed.
        layers = [
            nn.Linear(num, 27, bias=False),
            nn.LogSoftmax(dim=1),
        ]
        self.linear = nn.Sequential(*layers)

    def forward(self, x):
        """Return the log-softmax output (taken over dim=1) for the batch `x`."""
        return self.linear(x)

model = Trigram_nn()
loss_fn = nn.NLLLoss()  # consumes the log-probabilities produced by the LogSoftmax layer
optimizer = torch.optim.SGD(model.parameters(), lr=50)
model

Trigram_nn(
  (linear): Sequential(
    (0): Linear(in_features=601, out_features=27, bias=False)
    (1): LogSoftmax(dim=1)
  )
)

In [36]:
# Full-batch gradient descent on the trigram network for 200 steps.
for e in range(200):
  loss = loss_fn(model(x_lenc), y_l)  # forward pass and loss in one expression
  loss.backward()
  optimizer.step()
  optimizer.zero_grad()               # clear the gradients before the next step

# Reports only the loss from the last iteration (computed before that iteration's update).
print(f"step {e+1} {loss:.3f}")

step 200 2.211
