<a href="https://colab.research.google.com/github/ysurs/nn_with_karpathy/blob/main/folding_batchnorm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [256]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures

%matplotlib inline

In [264]:
# read in all the words
words = open('/content/drive/MyDrive/building_makemore/names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [265]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


In [266]:
block_size=3

In [267]:
# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):  
  X, Y = [], []
  
  for w in words:
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix] # crop and append

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(X.shape, Y.shape)
  return X, Y

import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr,  Ytr  = build_dataset(words[:n1])     # 80%
Xdev, Ydev = build_dataset(words[n1:n2])   # 10%
Xte,  Yte  = build_dataset(words[n2:])     # 10%

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [268]:
Xtr.shape

torch.Size([182625, 3])

In [269]:
Ytr.shape

torch.Size([182625])

Pytorchfying things and building a 3 layered mlp

In [245]:
class Linear:

  def __init__(self,fanin,fanout,bias=True):
   self.weight=torch.randn((fanin,fanout))/(fanin)**0.5
   self.bias=torch.zeros(fanout) if bias==True else None   ## bias is usually intialised to zero in case to speed up training


  def __call__(self,x):
    self.out = x@self.weight
    if self.bias is not None:
      self.out+=self.bias
    return self.out

  
  def parameters(self):
    return [self.weight]+([] if self.bias is None else [self.bias])


In [246]:
class batchnorm1d:

  def __init__(self,dim,eps=1e-5,momentum=0.1):
    self.eps=eps
    self.momentum=momentum
    self.training=True
    self.gamma=torch.ones(dim)
    self.beta=torch.zeros(dim)

    self.running_mean=torch.zeros(dim)
    self.running_var=torch.ones(dim)

  def __call__(self,h):
    
    if self.training==True:
      hmean=h.mean(0,keepdims=True)
      hvar=h.var(0,keepdims=True)
    else:
      hmean=self.running_mean
      hvar=self.running_var
    
    hnorm=(h-hmean)/((hvar+self.eps)**0.5)
    self.out=self.gamma*hnorm+self.beta

    if self.training:
      with torch.no_grad():
       self.running_mean=(1-self.momentum)*self.running_mean +(self.momentum)*hmean
       self.running_var=(1-self.momentum)*self.running_var +(self.momentum)*hvar

    return self.out
  
  def parameters(self):
    return [self.gamma,self.beta]

  



In [247]:
class tanh:

  def __call__(self,x):
    self.out=torch.tanh(x)
    return self.out
  
  def parameters(self):
    return []

In [270]:
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 100 # the number of neurons in the hidden layer of the MLP
g = torch.Generator().manual_seed(2147483647) # for reproducibility

In [271]:
C = torch.randn((vocab_size, n_embd),            generator=g)

In [272]:
layers = [
  Linear(n_embd * block_size, n_hidden,bias=False), batchnorm1d(n_hidden), tanh(),
  Linear(           n_hidden, n_hidden, bias=False), batchnorm1d(n_hidden), tanh(),
  Linear(           n_hidden, n_hidden, bias=False), batchnorm1d(n_hidden), tanh(),
  Linear(           n_hidden, vocab_size, bias=False), batchnorm1d(vocab_size),
]

In [273]:
with torch.no_grad():
  
  layers[-1].gamma*=0.1 ## making the output layer less confident

  for layer in layers[:-1]:

    if isinstance(layer,Linear):
      layer.weight*=5/3          ## Kiaming initialisation

In [274]:
parameters=[C]+[p for layer in layers for p in layer.parameters()]

for p in parameters:
  p.requires_grad = True

In [275]:
## Training needs to be done now

# same optimization as last time
max_steps = 200000
batch_size = 32
lossi = []
ud = []

for i in range(max_steps):
  
  # minibatch construct
  ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
  Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y

  emb=C[Xb]
  x=emb.view(emb.shape[0],-1)

  for layer in layers:
    x=layer(x)
  
  loss=F.cross_entropy(x,Yb)

  # backward pass
  for layer in layers:
      layer.out.retain_grad() # AFTER_DEBUG: would take out retain_graph
    
      #print(layer)
  for p in parameters:
    p.grad=None
  #print(loss)
  #loss.requires_grad=True
  loss.backward()

  # update
  #with torch.no_grad():
  lr = 0.1 if i < 150000 else 0.01 # step learning rate decay
  for p in parameters:
    p.data += -lr * p.grad
  

  # track stats
  if i % 10000 == 0: # print every once in a while
    print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
  lossi.append(loss.log10().item())
  with torch.no_grad():
    ud.append([((lr*p.grad).std() / p.data.std()).log10().item() for p in parameters])




      0/ 200000: 3.2818
  10000/ 200000: 2.4163
  20000/ 200000: 1.9671
  30000/ 200000: 1.9336
  40000/ 200000: 2.3348
  50000/ 200000: 2.3135
  60000/ 200000: 1.7474
  70000/ 200000: 2.1601
  80000/ 200000: 2.0903
  90000/ 200000: 1.9366
 100000/ 200000: 1.6916
 110000/ 200000: 2.0548
 120000/ 200000: 1.8437
 130000/ 200000: 1.8865
 140000/ 200000: 2.0312
 150000/ 200000: 1.8649
 160000/ 200000: 1.9336
 170000/ 200000: 2.3135
 180000/ 200000: 2.0979
 190000/ 200000: 1.9938


In [254]:
@torch.no_grad() # this decorator disables gradient tracking
def split_loss(split):
  x,y = {
    'train': (Xtr, Ytr),
    'val': (Xdev, Ydev),
    'test': (Xte, Yte),
  }[split]
  emb = C[x] # (N, block_size, n_embd)
  x = emb.view(emb.shape[0], -1) # concat into (N, block_size * n_embd)
  for layer in layers:
    x = layer(x)
  loss = F.cross_entropy(x, y)
  print(split, loss.item())

# put layers into eval mode
for layer in layers:
  layer.training = False
split_loss('train')
split_loss('val')

train 2.026998996734619
val 2.0902256965637207


In [277]:
def fold(linear_batchnorm):
  with torch.no_grad():
    only_linear=[]
    for i in range(len(linear_batchnorm)):

      if isinstance(linear_batchnorm[i],Linear):
        linear_batchnorm[i].weight=((layers[i].parameters())[0]*layers[i+1].parameters()[0])/(layers[i+1].running_var+layers[i+1].eps)**0.5
        linear_batchnorm[i].bias=-(((layers[i+1].running_mean)*layers[i+1].parameters()[0])/(layers[i+1].running_var+layers[i+1].eps)**0.5)+layers[i+1].parameters()[1]
        only_linear.append(linear_batchnorm[i])
      if isinstance(linear_batchnorm[i],tanh):
        only_linear.append(linear_batchnorm[i])
  return only_linear

In [278]:
@torch.no_grad() # this decorator disables gradient tracking
def split_loss(split):
  x,y = {
    'train': (Xtr, Ytr),
    'val': (Xdev, Ydev),
    'test': (Xte, Yte),
  }[split]
  emb = C[x] # (N, block_size, n_embd)
  x = emb.view(emb.shape[0], -1) # concat into (N, block_size * n_embd)
  folded_layers=fold(layers)

  for layer in folded_layers:
    x = layer(x)
  loss = F.cross_entropy(x, y)
  print(split, loss.item())

split_loss('train')
split_loss('val')



train 2.0272374153137207
val 3.074354887008667


### Getting different values after normal batch norm and folding batch norm, needs some rechecking

*****

### Rough shape checking below

In [166]:
(layers[0].parameters())#[0].sha


[tensor([[ 0.4556, -0.5162,  0.9341,  ..., -0.1871, -0.3436, -0.4096],
         [-0.0550, -0.1066,  0.1744,  ...,  0.0784,  0.2280,  0.6458],
         [-0.3999, -0.4941,  0.3786,  ..., -0.2446,  0.3587, -0.0846],
         ...,
         [-0.6333, -0.7574, -0.0051,  ...,  0.0594,  0.6658,  0.1757],
         [ 0.3074,  0.1121,  0.2378,  ..., -0.3029, -0.7110,  0.6990],
         [-0.4368,  0.3691,  0.5114,  ...,  0.6087,  0.0854, -0.2348]],
        requires_grad=True)]

In [169]:
(layers[1].running_mean).shape

torch.Size([1, 100])

In [185]:
layers[0].weight

tensor([[ 0.4556, -0.5162,  0.9341,  ..., -0.1871, -0.3436, -0.4096],
        [-0.0550, -0.1066,  0.1744,  ...,  0.0784,  0.2280,  0.6458],
        [-0.3999, -0.4941,  0.3786,  ..., -0.2446,  0.3587, -0.0846],
        ...,
        [-0.6333, -0.7574, -0.0051,  ...,  0.0594,  0.6658,  0.1757],
        [ 0.3074,  0.1121,  0.2378,  ..., -0.3029, -0.7110,  0.6990],
        [-0.4368,  0.3691,  0.5114,  ...,  0.6087,  0.0854, -0.2348]],
       requires_grad=True)

In [184]:
((layers[0].parameters())[0]*layers[1].parameters()[0])/(layers[1].running_var+layers[1].eps)**0.5#.shape

tensor([[ 0.3376, -0.3569,  0.5789,  ..., -0.1201, -0.2349, -0.3138],
        [-0.0407, -0.0737,  0.1081,  ...,  0.0503,  0.1559,  0.4948],
        [-0.2963, -0.3416,  0.2346,  ..., -0.1570,  0.2453, -0.0648],
        ...,
        [-0.4692, -0.5237, -0.0031,  ...,  0.0381,  0.4553,  0.1346],
        [ 0.2277,  0.0775,  0.1474,  ..., -0.1944, -0.4862,  0.5356],
        [-0.3236,  0.2552,  0.3170,  ...,  0.3907,  0.0584, -0.1799]],
       grad_fn=<DivBackward0>)

In [196]:
((layers[1].running_mean)*layers[1].parameters()[0])/(layers[1].running_var+layers[1].eps)**0.5#.shape

tensor([[ 1.3109, -0.5712,  0.0744,  1.7327, -0.3876, -0.0088,  1.3397,  0.8394,
          0.7448,  1.5963, -0.5540, -1.4288, -1.5064,  1.6359,  1.3002,  0.5130,
          0.3724,  0.0935,  0.5197, -0.7148, -0.8239, -0.0983, -1.0260,  0.0917,
          0.6817,  0.4311, -1.2134,  1.9022,  0.2278,  0.4701,  0.4133,  0.0253,
         -0.1466, -0.4083,  0.2230,  0.4810,  0.2815,  0.2737,  1.8236, -0.4514,
         -0.3458, -0.7911,  0.8122,  0.3471, -0.7148, -0.0382,  0.2791,  0.0629,
         -0.8876,  1.2558, -1.0734,  1.1313, -0.0658, -1.5244,  0.5124,  0.3266,
          0.1267, -1.4845,  0.1201,  1.4262, -1.2011, -0.8255, -0.6270,  0.7721,
         -0.2698, -0.9140,  0.2299,  0.6089,  0.4187,  0.5998, -0.0999, -0.5497,
         -0.7161, -0.2868, -0.3372, -1.1187, -0.0772, -0.3294, -0.1829, -2.7382,
          0.5173, -0.2464,  0.6587, -1.2862, -0.3789,  0.2041,  0.5631, -1.2353,
         -0.5645, -0.0864, -0.1272,  0.6849,  2.3741, -0.5449, -0.3802,  0.9762,
          1.2386, -0.0829,  

In [191]:
(layers[1].parameters()[0]).shape


torch.Size([100])

In [193]:
torch.tensor([1,2,3])*torch.tensor([[1,2,3]])#.shape

tensor([[1, 4, 9]])

In [195]:
(layers[1].parameters()[0])*(layers[0].parameters()[0])

tensor([[ 0.7256, -0.8731,  1.6219,  ..., -0.2843, -0.5297, -0.7736],
        [-0.0875, -0.1803,  0.3027,  ...,  0.1191,  0.3515,  1.2199],
        [-0.6368, -0.8357,  0.6573,  ..., -0.3717,  0.5530, -0.1598],
        ...,
        [-1.0086, -1.2811, -0.0088,  ...,  0.0902,  1.0265,  0.3319],
        [ 0.4895,  0.1896,  0.4129,  ..., -0.4602, -1.0963,  1.3203],
        [-0.6956,  0.6243,  0.8881,  ...,  0.9247,  0.1316, -0.4435]],
       grad_fn=<MulBackward0>)