In [99]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
# Load the corpus: one name per line. The context manager guarantees the file
# handle is closed as soon as the text has been read.
with open('../../names.txt', 'r') as f:
  words = f.read().splitlines()

# Character vocabulary. Every distinct character gets an index starting at 1;
# index 0 is reserved for '.', which build_dataset uses both as the initial
# context padding and as the end-of-word marker.
chars = sorted(set(''.join(words)))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()} # inverse mapping: index -> character
vocab_size = len(itos)

block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):
  """Turn a list of words into (context, next-character) training pairs.

  Each word is extended with a trailing '.' and scanned left to right. For
  every character, the `block_size` indices that precede it (starting from an
  all-zero window) form one input row and the character's own index is the
  target. Character indices come from the module-level `stoi` mapping.

  Prints the shapes of both tensors and returns them as `(X, Y)`.
  """
  contexts, targets = [], []

  for word in words:
    window = [0] * block_size
    for target in (stoi[ch] for ch in word + '.'):
      contexts.append(window)
      targets.append(target)
      # slide the window: drop the oldest index, append the one just seen
      window = [*window[1:], target]

  X = torch.tensor(contexts)
  Y = torch.tensor(targets)
  print(X.shape, Y.shape)
  return X, Y

import random

# Shuffle the words with a fixed seed, then cut the list into train / dev / test.
# NOTE: the shuffle mutates `words` in place, so re-running this cell without
# reloading `words` produces a different split.
random.seed(42)
random.shuffle(words)

n_words = len(words)
n1 = int(0.8 * n_words)  # end of the training slice
n2 = int(0.9 * n_words)  # end of the dev slice

Xtr, Ytr = build_dataset(words[:n1])      # first 80%: training
Xdev, Ydev = build_dataset(words[n1:n2])  # next 10%: dev / validation
Xte, Yte = build_dataset(words[n2:])      # remainder: test

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [90]:
# MLP revisited: embedding table -> tanh hidden layer -> output logits
n_embd = 10    # the dimensionality of the character embedding vectors
n_hidden = 200 # the number of neurons in the hidden layer of the MLP

g = torch.Generator().manual_seed(2147483647) # for reproducibility

# All parameters are drawn from a standard normal using the seeded generator.
# The draw order (C, W1, b1, W2, b2) determines the values each tensor receives.
C  = torch.randn(vocab_size, n_embd,            generator=g) # embedding table
W1 = torch.randn(n_embd * block_size, n_hidden, generator=g) # hidden-layer weights
b1 = torch.randn(n_hidden,                      generator=g) # hidden-layer bias
W2 = torch.randn(n_hidden, vocab_size,          generator=g) # output-layer weights
b2 = torch.randn(vocab_size,                    generator=g) # output-layer bias

# # BatchNorm parameters
# bngain = torch.ones((1, n_hidden))
# bnbias = torch.zeros((1, n_hidden))
# bnmean_running = torch.zeros((1, n_hidden))
# bnstd_running = torch.ones((1, n_hidden))

# Everything the training step updates; enable gradient tracking on each tensor.
parameters = [C, W1, W2, b2, b1]
for p in parameters:
  p.requires_grad = True

In [93]:
# Scratch check: 32 random row indices into the training set (unseeded draw).
torch.randint(low=0, high=Xtr.size(0), size=(32,))

tensor([159461, 112211,  12469,   1736, 110579, 158757, 139065,  22757,   3948,
        122680,  33196, 154905,  52446,  21921, 140175,  33069,  43168,  38159,
         18664,  11272, 164305,   6444,  29532, 118557, 123448,  88050,  51799,
         50274, 173020,  23536,   3445,  11694])

In [91]:
# One training step on a single random minibatch. The intermediates (emb, hprev,
# h, logits, probs) stay in the notebook namespace so later cells can inspect them.
for i in range(1):

  # minibatch construct: draw from the seeded generator `g` so the batch is reproducible
  ix = torch.randint(0, Xtr.shape[0], (32,), generator=g)

  # forward pass
  emb = C[Xtr[ix]] # (32, 3, 10)

  # Concatenate each example's context embeddings into one row. The row width is
  # inferred from emb, so this keeps working if n_embd or block_size change.
  hprev = emb.view(emb.shape[0], -1) @ W1 + b1 # hidden-layer pre-activation

  h = torch.tanh(hprev) # (32, 200)

  logits = h @ W2 + b2 # (32, 27)
  probs = F.softmax(logits, dim=1) # not used for the loss; kept for inspection below
  loss = F.cross_entropy(logits, Ytr[ix])

  # backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  # update: plain SGD with a step-decayed learning rate
  lr = 0.1 if i < 100000 else 0.01
  for p in parameters:
    p.data += -lr * p.grad

  # stop after the first step even if the range above is enlarged
  break

tensor([[[ 1.5674e+00, -2.3729e-01, -2.7385e-02, -1.1008e+00,  2.8588e-01,
          -2.9643e-02, -1.5471e+00,  6.0489e-01,  7.9136e-02,  9.0462e-01],
         [ 1.2815e+00, -6.3182e-01, -1.2464e+00,  6.8305e-01, -3.9455e-01,
           1.4388e-02,  5.7216e-01,  8.6726e-01,  6.3149e-01, -1.2230e+00],
         [-2.4780e-01,  8.8282e-01, -8.1004e-02, -9.5299e-01, -4.8838e-01,
          -7.3712e-01,  7.0609e-01, -1.9295e-01,  1.2348e+00,  3.3308e-01]],

        [[-2.1286e-01,  5.0950e-01,  3.2713e-01,  1.9661e+00, -2.4091e-01,
          -7.9515e-01,  2.7198e-01, -1.1100e+00, -4.5285e-01, -4.9578e-01],
         [-1.2801e+00,  9.2445e-02,  1.0526e-01, -3.9072e-01,  3.1723e-02,
          -5.4753e-01,  8.1827e-01, -8.1628e-01, -3.9243e-01, -7.4521e-01],
         [ 4.6827e-01, -6.5650e-01,  6.1662e-01, -6.2197e-01,  5.1007e-01,
           1.3563e+00,  2.3445e-01, -4.5585e-01, -1.3132e-03, -5.1161e-01]],

        [[ 1.1560e-01,  8.0319e-01,  5.4108e-01, -1.1646e+00,  1.4756e-01,
          -1.00

In [None]:
# Flattened view of the minibatch embeddings: one row per example. The row width
# is inferred from emb instead of being hard-coded to n_embd * block_size.
emb.view(emb.shape[0], -1)

In [None]:
# Shapes of the forward-pass intermediates: pre-activation, activation, logits, probabilities.
hprev.shape, h.shape, logits.shape, probs.shape

In [None]:
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F

# Histograms of the forward-pass intermediates (hprev, h, logits, probs) produced
# by the training-step cell above.

# Detach from autograd and flatten each tensor to a 1-D NumPy array for plotting
hprev_numpy = hprev.detach().numpy().flatten()
h_numpy = h.detach().numpy().flatten()
logits_numpy = logits.detach().numpy().flatten()
probs_numpy = probs.detach().numpy().flatten()

# 2x2 grid of axes, one panel per tensor
fig, axs = plt.subplots(2, 2, figsize=(20, 16))

# (axes, flattened values, panel title, bar colour) for each panel
panels = [
    (axs[0, 0], hprev_numpy,  'Histogram of hprev values',  'blue'),
    (axs[0, 1], h_numpy,      'Histogram of h values',      'green'),
    (axs[1, 0], logits_numpy, 'Histogram of logits values', 'red'),
    (axs[1, 1], probs_numpy,  'Histogram of probs values',  'purple'),
]

for ax, values, title, color in panels:
    ax.hist(values, bins=50, color=color, alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')

# Layout adjustment and display
plt.tight_layout()
plt.show()


In [None]:
# Per-example (max, min) of the hidden pre-activations, followed by the number of
# rows and the tensor size. The row-wise reductions run inside torch rather than
# through Python's builtin max/min, which compare one element at a time and give
# order-dependent results when a NaN is present.
list(zip(hprev.max(dim=1).values.tolist(), hprev.min(dim=1).values.tolist())), len(hprev), hprev.size()


In [None]:
# Per-example (max, min) of the tanh activations, followed by the number of rows
# and the tensor size. The row-wise reductions run inside torch rather than
# through Python's builtin max/min, which compare one element at a time and give
# order-dependent results when a NaN is present.
list(zip(h.max(dim=1).values.tolist(), h.min(dim=1).values.tolist())), len(h), h.size()
