In [94]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
# Read the corpus: one name per line. The `with` block guarantees the file
# handle is closed as soon as the text has been read.
with open('../../names.txt', 'r') as f:
  words = f.read().splitlines()

# Character vocabulary: every distinct character in the corpus, sorted, gets an
# index starting at 1; index 0 is reserved for the '.' boundary token.
chars = sorted(set(''.join(words)))
stoi = {s: i for i, s in enumerate(chars, start=1)}
stoi['.'] = 0
itos = {i: s for s, i in stoi.items()}  # inverse mapping: index -> character
vocab_size = len(itos)
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words, context_len=None, vocab=None):
  """Turn a list of words into (context, next-character) training pairs.

  Each word is terminated with '.', and for every character (including the
  terminator) one example is emitted: the indices of the `context_len`
  preceding characters as input, and the character's own index as target.
  Positions before the start of a word are padded with index 0.

  Args:
    words: iterable of strings.
    context_len: number of preceding characters per example. Defaults to the
      module-level `block_size`.
    vocab: mapping from character to integer index; must contain '.'.
      Defaults to the module-level `stoi`.

  Returns:
    (X, Y): X is an int64 tensor of shape (num_examples, context_len) and Y an
    int64 tensor of shape (num_examples,). For empty `words` the shapes are
    (0, context_len) and (0,).

  Raises:
    KeyError: if a character is missing from `vocab`.
  """
  if context_len is None:
    context_len = block_size
  if vocab is None:
    vocab = stoi

  X, Y = [], []

  for w in words:
    context = [0] * context_len  # left padding for the start of the word
    for ch in w + '.':
      ix = vocab[ch]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix] # slide the window: drop oldest, append newest

  # Explicit dtype and shape: without them an empty example list would become a
  # float tensor of shape (0,), which cannot be used to index the embedding.
  X = torch.tensor(X, dtype=torch.long).reshape(len(Y), context_len)
  Y = torch.tensor(Y, dtype=torch.long)
  print(X.shape, Y.shape)
  return X, Y

import random
# Shuffle the word list in place with a fixed seed so the split is repeatable.
random.seed(42)
random.shuffle(words)

# Cut points for an 80% / 10% / 10% train / dev / test split.
n1, n2 = (int(frac * len(words)) for frac in (0.8, 0.9))

# Build the three datasets in train, dev, test order (each call prints its shapes).
split_bounds = [(0, n1), (n1, n2), (n2, len(words))]
(Xtr, Ytr), (Xdev, Ydev), (Xte, Yte) = [
  build_dataset(words[start:stop]) for start, stop in split_bounds
]

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [95]:
n_embd = 10     # size of each character embedding vector
n_hidden = 200  # number of neurons in the MLP hidden layer

# Seeded generator for reproducible initialisation. The randn calls below draw
# from it in the order C, W1, W2, b2; reordering them changes every value.
g = torch.Generator().manual_seed(2147483647)

# Embedding table: one row per vocabulary entry.
C = torch.randn((vocab_size, n_embd), generator=g)

# Hidden layer weights, scaled by (5/3) / sqrt(fan_in).
# NOTE(review): 5/3 is presumably the tanh gain - confirm.
# No hidden bias b1 is created: it was commented out in favour of the
# BatchNorm bias defined below.
fan_in = n_embd * block_size
W1 = torch.randn((fan_in, n_hidden), generator=g) * (5/3) / fan_in**0.5

# Output layer: small weights; the bias is drawn and then multiplied by 0.
W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.01
b2 = torch.randn(vocab_size, generator=g) * 0

# BatchNorm parameters (trainable gain/bias) and running statistics (buffers,
# deliberately left out of `parameters`).
bn_shape = (1, n_hidden)
bngain = torch.ones(bn_shape)
bnbias = torch.zeros(bn_shape)
bnmean_running = torch.zeros(bn_shape)
bnstd_running = torch.ones(bn_shape)

parameters = [C, W1, W2, b2, bngain, bnbias]
n_params = sum(p.nelement() for p in parameters)
print(n_params) # number of parameters in total
for p in parameters:
  p.requires_grad = True

12097


In [96]:
max_steps = 200000
batch_size = 32
lossi = []  # log10 of the loss at every step

# NOTE(review): only 10 iterations are executed, although max_steps, the
# learning-rate schedule and the logging interval are all written for a
# 200000-step run. This looks like a short debugging run - confirm before
# treating the resulting weights as trained.
for i in range(10):

  # draw a random minibatch of training examples
  ix = torch.randint(0, len(Xtr), (batch_size,), generator=g)
  Xb = Xtr[ix]
  Yb = Ytr[ix]

  # forward pass: embed the context characters and flatten them per example
  emb = C[Xb]
  embcat = emb.view(emb.shape[0], -1)

  # linear layer (no bias; b1 is disabled) -> hidden pre-activations
  hpreact = embcat @ W1

  # BatchNorm: standardise each hidden unit over the batch, then scale and shift
  bnmeani = hpreact.mean(dim=0, keepdim=True)
  bnstdi = hpreact.std(dim=0, keepdim=True)
  hpreact1 = bngain * (hpreact - bnmeani) / bnstdi + bnbias

  # exponential moving average of the batch statistics, kept out of autograd
  with torch.no_grad():
    bnmean_running = 0.999 * bnmean_running + 0.001 * bnmeani
    bnstd_running = 0.999 * bnstd_running + 0.001 * bnstdi

  # non-linearity, output layer and loss
  h = torch.tanh(hpreact1)
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits, Yb)

  # backward pass: clear stale gradients, then backpropagate
  for p in parameters:
    p.grad = None
  loss.backward()

  # SGD update with a step decay of the learning rate
  if i < 100000:
    lr = 0.1
  else:
    lr = 0.01
  for p in parameters:
    p.data += -lr * p.grad

  # track stats: print every 10000 steps, record the loss every step
  if i % 10000 == 0:
    print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
  lossi.append(loss.log10().item())

      0/ 200000: 3.3239


In [99]:
# Display the current BatchNorm bias. It is listed in `parameters`, so the
# update step of the training loop modifies it.
bnbias

tensor([[-8.3022e-04,  2.8436e-03,  7.5748e-04, -1.2532e-03,  1.8060e-03,
          7.4958e-04,  3.8155e-04, -5.7934e-04,  1.9510e-03,  1.2351e-03,
         -1.1636e-03,  2.4252e-03, -2.7925e-05,  1.2173e-03, -6.4847e-04,
         -1.1563e-03,  2.2941e-03,  2.8483e-04, -1.9071e-03,  2.1225e-03,
          7.7311e-04,  1.5842e-03, -7.0111e-04, -1.6864e-03,  6.3860e-04,
          2.2649e-04,  3.5112e-04, -4.1968e-04,  3.0978e-04, -1.5282e-03,
          5.4467e-04, -1.0083e-03, -1.5767e-04, -1.5740e-03,  2.1849e-03,
          4.0400e-04,  3.8938e-03,  1.1417e-03,  7.2118e-05, -1.2634e-03,
         -1.7090e-03, -3.1570e-04,  7.2111e-04,  1.5205e-04, -2.3437e-04,
          3.1936e-04, -1.4578e-03,  6.6501e-04,  1.5635e-03, -2.7168e-03,
         -2.0321e-03, -2.2622e-04, -2.1008e-03, -1.4070e-03, -4.4478e-03,
          4.6786e-04,  1.0359e-03,  3.7095e-04, -4.2625e-03, -1.6165e-04,
          1.0836e-03, -1.7837e-03, -1.4272e-03,  1.3013e-03,  1.3420e-03,
          4.7858e-04,  3.1355e-04,  3.

In [None]:
# Range of the first example's hidden pre-activations before (hpreact) and
# after (hpreact1) the BatchNorm step. Tensor reductions replace the Python
# builtins, which iterate over the tensor element by element; .item() yields
# plain floats instead of 0-d tensors carrying autograd info.
(hpreact[0].max().item(), hpreact[0].min().item(),
 hpreact1[0].max().item(), hpreact1[0].min().item())

In [None]:
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F

# Histograms of the hidden-layer pre-activations left over from the last
# iteration of the training cell: `hpreact` (before BatchNorm) and `hpreact1`
# (after BatchNorm). That cell must have been run first.

# Detach from autograd, convert to NumPy and flatten for histogram plotting
hprev_numpy = hpreact.detach().numpy().flatten()
hprev1_numpy = hpreact1.detach().numpy().flatten()
h_mean = hpreact.mean(0, keepdim=True).detach().numpy().flatten()  # per-unit batch mean
h_mean_sub = (hpreact - hpreact.mean(0, keepdim=True)).detach().numpy().flatten()  # mean-centred

# Create a figure with 2x2 grid of axes
fig, axs = plt.subplots(2, 2, figsize=(20, 16))

# One (axes, data, title) entry per panel. The second title previously
# repeated the first one although it shows the post-BatchNorm values.
histogram_panels = [
  (axs[0, 0], hprev_numpy, 'Histogram of hprev values'),
  (axs[0, 1], hprev1_numpy, 'Histogram of hprev1 values'),
  (axs[1, 0], h_mean, 'Histogram of hmean values'),
  (axs[1, 1], h_mean_sub, 'Histogram of h_mean_sub values'),
]
for ax, values, title in histogram_panels:
  ax.hist(values, bins=50, color='blue', alpha=0.7)
  ax.set_title(title)
  ax.set_xlabel('Value')
  ax.set_ylabel('Frequency')

# Render the figure explicitly so the cell does not echo a Text object repr
plt.tight_layout()
plt.show()



In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Toy illustration of feature scaling: two synthetic features on very different
# scales, plotted before and after min-max normalisation.
# Ages are integers drawn from [20, 70) and incomes from [20000, 120000).
np.random.seed(0)  # fixed seed so the same points are drawn every run
ages = np.random.randint(20, 70, 100)
incomes = np.random.randint(20000, 120000, 100)

# Scratch notes kept from the original cell (meaning unclear - TODO confirm):
#   10- 20000
#   .1 - .3

# Stack the two features as columns and rescale them with MinMaxScaler
data = np.column_stack((ages, incomes))
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(data)

# (x values, y values, title, x label, y label) for the left and right panels
scatter_panels = [
  (ages, incomes, 'Non-normalized Data', 'Age', 'Income ($)'),
  (normalized_data[:, 0], normalized_data[:, 1],
   'Normalized Data', 'Age (normalized)', 'Income (normalized)'),
]

plt.figure(figsize=(12, 6))
for position, (xs, ys, title, xlabel, ylabel) in enumerate(scatter_panels, start=1):
  plt.subplot(1, 2, position)
  plt.scatter(xs, ys)
  plt.title(title)
  plt.xlabel(xlabel)
  plt.ylabel(ylabel)

# Display the plots
plt.tight_layout()
plt.show()
