<h3><b>Toy Problem</b></h3>

In [594]:
# Toy Problem for educational purposes
import torch
import torch.nn.functional as F

# Vocabulary: every distinct character of the toy corpus, in sorted order
words = ["ab", "abc", "cab", "bac", "aa", "bb"]
alphabet = sorted(set("".join(words)))
# Real characters are numbered from 1; index 0 is reserved for the '.' boundary token
stoi = dict(zip(alphabet, range(1, len(alphabet) + 1)))  # {'a': 1, 'b': 2, 'c': 3}
stoi['.'] = 0
itos = {index: symbol for symbol, index in stoi.items()}  # inverse lookup: index -> character

# Bigram dataset: pair every character (input) with its successor (target).
# '.' is placed before and after each word, so zipping '.'+w against w+'.'
# walks over all consecutive character pairs of the padded word.
bigrams = [
    (stoi[prev], stoi[nxt])
    for w in words
    for prev, nxt in zip('.' + w, w + '.')
]
xs = torch.tensor([src for src, _ in bigrams])  # input character indices
ys = torch.tensor([dst for _, dst in bigrams])  # target (next character) indices
num = xs.numel()  # total number of (input, target) pairs

# Initialize weights
# A single linear layer: row i of W holds the logits (log-counts) of the
# character that follows character i.
g = torch.Generator().manual_seed(2147483647)  # fixed seed -> reproducible initial weights
vocab_size = len(stoi)  # 4
W = torch.randn((vocab_size, vocab_size), generator=g, requires_grad=True)

# Hyperparameters (named so they can be tuned in one place)
num_steps = 3  # number of gradient descent iterations
learning_rate = 20  # step size of each weight update
reg_strength = 0.01  # weight of the L2 penalty on W

# The inputs never change between iterations, so one-hot encode them once
# instead of rebuilding the same tensor on every pass through the loop.
xenc = F.one_hot(xs, num_classes=vocab_size).float()  # One-hot encoding

# Gradient descent loop
for k in range(num_steps):
    # Forward pass
    logits = xenc @ W  # Predict log-counts
    counts = logits.exp()  # Convert to unnormalized counts
    probs = counts / counts.sum(1, keepdims=True)  # Normalize to probabilities
    # Mean negative log-likelihood of the correct next character + L2 regularization
    loss = -probs[torch.arange(num), ys].log().mean() + reg_strength * (W**2).mean()
    print(f"Iteration {k+1}, Loss: {loss.item()}")

    # Backward pass
    print('Before backprop')
    W.grad = None  # Reset gradient so it does not accumulate across iterations
    print(f'W.data is: {W.detach()}')
    print(f'W.grad is: {W.grad}')
    loss.backward()  # Compute gradient

    print()

    # Update weights
    print('After backprop')
    print(f'W.grad is: {W.grad}')
    # Update the leaf tensor in place with autograd disabled; this is the
    # supported replacement for mutating `W.data` directly.
    with torch.no_grad():
        W -= learning_rate * W.grad  # Gradient descent update
    print(f'W.data is: {W.detach()}')
    print()

# Sample from the model
g = torch.Generator().manual_seed(2147483647)  # fixed seed -> reproducible samples
# Sampling is inference only: disable autograd so that multiplying by W
# (which has requires_grad=True) does not build a graph on every step.
with torch.no_grad():
    for i in range(10):
        out = []
        ix = 0  # start every word from the '.' boundary token
        while True:
            xenc = F.one_hot(torch.tensor([ix]), num_classes=vocab_size).float()
            logits = xenc @ W  # predict log-counts
            counts = logits.exp()  # counts, equivalent to N
            p = counts / counts.sum(1, keepdims=True)  # probabilities for next character

            # Draw the next character index from the predicted distribution
            ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
            out.append(itos[ix])
            if ix == 0:  # sampled the boundary token: the word is complete
                break
        print(''.join(out))

Iteration 1, Loss: 2.218292236328125
Before backprop
W.data is: tensor([[ 1.5674, -0.2373, -0.0274, -1.1008],
        [ 0.2859, -0.0296, -1.5471,  0.6049],
        [ 0.0791,  0.9046, -0.4713,  0.7868],
        [-0.3284, -0.4330,  1.3729,  2.9334]])
W.grad is: None

After backprop
W.grad is: tensor([[ 0.2008, -0.1104, -0.0549, -0.0352],
        [ 0.0402,  0.0162, -0.1308,  0.0735],
        [-0.0942,  0.0643, -0.0202,  0.0518],
        [-0.0914, -0.0443,  0.0251,  0.1150]])
W.data is: tensor([[-2.4487,  1.9715,  1.0709, -0.3968],
        [-0.5190, -0.3531,  1.0688, -0.8656],
        [ 1.9640, -0.3809, -0.0667, -0.2495],
        [ 1.4992,  0.4534,  0.8708,  0.6329]])

Iteration 2, Loss: 1.2635860443115234
Before backprop
W.data is: tensor([[-2.4487,  1.9715,  1.0709, -0.3968],
        [-0.5190, -0.3531,  1.0688, -0.8656],
        [ 1.9640, -0.3809, -0.0667, -0.2495],
        [ 1.4992,  0.4534,  0.8708,  0.6329]])
W.grad is: None

After backprop
W.grad is: tensor([[-0.0008,  0.0486, -0.017

<h3><b>Actual Problem</b></h3>

  

In [565]:
import torch
import torch.nn.functional as F

# Load the corpus: every line of names.txt becomes one training word.
# The context manager closes the file handle as soon as the contents are read
# (the bare open(...).read() left it open until garbage collection).
with open("names.txt", "r") as f:
  words = f.read().splitlines()

# Vocabulary and mappings
chars = sorted(set(''.join(words)))  # distinct characters of the corpus, sorted
stoi = {s:i+1 for i,s in enumerate(chars)}  # characters are numbered from 1
stoi['.'] = 0  # index 0 is reserved for the '.' boundary token
itos = {i:s for s,i in stoi.items()}  # inverse lookup: index -> character

# Build the bigram dataset: each character is an input, its successor the target.
xs, ys = [], []
for w in words:
  # Encode the word once, padded with the '.' boundary token on both sides
  encoded = [stoi[ch] for ch in '.' + w + '.']
  xs.extend(encoded[:-1])  # every position except the last is an input
  ys.extend(encoded[1:])   # the same sequence shifted by one gives the targets
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.numel()  # total number of (input, target) pairs
print('number of examples: ', num)

# initialize the 'network'
# A single linear layer: row i of W holds the logits (log-counts) of the
# character that follows character i.
g = torch.Generator().manual_seed(2147483647)  # fixed seed -> reproducible initial weights
vocab_size = len(stoi)  # derived from the vocabulary instead of a hard-coded 27
W = torch.randn((vocab_size, vocab_size), generator=g, requires_grad=True)

# hyperparameters (named so they can be tuned in one place)
num_steps = 20  # number of gradient descent iterations
learning_rate = 50  # step size of each weight update
reg_strength = 0.01  # weight of the L2 penalty on W

# The inputs never change between iterations, so one-hot encode them once
# instead of rebuilding the same large tensor on every pass through the loop.
xenc = F.one_hot(xs, num_classes=vocab_size).float() # input to the network: one-hot encoding

# gradient descent
for k in range(num_steps):
  # forward pass
  logits = xenc @ W # predict log-counts
  counts = logits.exp() # counts, equivalent to N
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
  # mean negative log-likelihood of the correct next character + L2 regularization
  loss = -probs[torch.arange(num), ys].log().mean() + reg_strength*(W**2).mean()
  print(loss.item())

  # backward pass
  W.grad = None # reset the gradient so it does not accumulate across iterations
  loss.backward()

  # update
  # Modify the leaf tensor in place with autograd disabled; this is the
  # supported replacement for mutating `W.data` directly.
  with torch.no_grad():
    W -= learning_rate * W.grad

# finally, sample from the 'neural net' model
g = torch.Generator().manual_seed(2147483647)  # fixed seed -> reproducible samples
num_classes = W.shape[0]  # one-hot width taken from the weight matrix instead of a hard-coded 27

# Sampling is inference only: disable autograd so that multiplying by W
# (which has requires_grad=True) does not build a graph on every step.
with torch.no_grad():
  for i in range(5):
    out = []
    ix = 0  # start every word from the '.' boundary token
    while True:
      xenc = F.one_hot(torch.tensor([ix]), num_classes=num_classes).float()
      logits = xenc @ W # predict log-counts
      counts = logits.exp() # counts, equivalent to N
      p = counts / counts.sum(1, keepdims=True) # probabilities for next character

      # draw the next character index from the predicted distribution
      ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
      out.append(itos[ix])
      if ix == 0:  # sampled the boundary token: the word is complete
        break
    print(''.join(out))

number of examples:  228146
3.768618583679199
3.3788065910339355
3.161090850830078
3.027186155319214
2.9344840049743652
2.867231607437134
2.8166542053222656
2.777146339416504
2.7452542781829834
2.7188305854797363
2.696505308151245
2.6773722171783447
2.6608052253723145
2.6463515758514404
2.633664846420288
2.622471570968628
2.6125476360321045
2.6037068367004395
2.595794916152954
2.5886809825897217
cexza.
mogllurailezityha.
konimittain.
llayn.
ka.
