<a href="https://colab.research.google.com/github/varun29-git/deep-learning-foundations/blob/main/wavenet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

WaveNet Implementation
---

This notebook implements a neural network that generates names character by character. The architecture is based on **WaveNet**, which processes text in a hierarchical, tree-like structure rather than a simple sequence.

The primary goal of this project is to understand deep learning fundamentals by building the model components from scratch using **PyTorch**.

### Key Features
* **Manual Implementation:** All layers (Linear, BatchNorm1d, Tanh, Embedding) are written from first principles to demonstrate the underlying mathematics and gradient flow.
* **Hierarchical Merging:** The model uses a `FlattenConsecutive` layer to merge consecutive characters in groups, allowing it to learn patterns efficiently.
* **Training and Sampling:** Includes a custom training loop and a dedicated evaluation mode for generating new names.



In [9]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
import random

In [10]:
# Fetch the names corpus; -nc (no-clobber) skips the download when names.txt
# already exists, so re-running this cell does not create names.txt.1, etc.
!wget -q -nc https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

In [11]:
# Configuration shared by the rest of the notebook

# Dedicated random generator, passed explicitly wherever `generator=g` is used
g = torch.Generator()
g.manual_seed(2147483647)

block_size = 8   # number of previous characters used as context for a prediction
batch_size = 32  # number of examples drawn per training minibatch

In [12]:
# Read the whole corpus, then split it into one entry per line
with open('names.txt', 'r') as corpus:
    raw_text = corpus.read()
names = raw_text.splitlines()

# Should print roughly 32033
print(f"Dataset size: {len(names)}")

Dataset size: 32033


In [13]:
# Building Dataset

# Every distinct character that occurs in the names, in sorted order
chars = sorted(set("".join(names)))

# Character -> index. Real characters start at 1; index 0 is reserved for '.'
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0

# Index -> character (inverse mapping of stoi)
itos = {idx: ch for ch, idx in stoi.items()}


def build_dataset(words):
  """Turn a list of words into (context, next-character) training pairs.

  Every word is terminated with '.', and each of its characters (including
  the terminator) becomes one target. The matching input is the
  `block_size` character indices that precede it, left-padded with 0.

  Args:
    words: iterable of strings whose characters are all keys of `stoi`.

  Returns:
    (X, Y): long tensors holding the contexts and the target indices.
    Their shapes are also printed.
  """
  contexts, targets = [], []
  for word in words:
    # Padding followed by the encoded word and its '.' terminator
    encoded = [0] * block_size + [stoi[ch] for ch in word + '.']

    # Slide a window of block_size over the sequence; the element right
    # after each window is the character to predict.
    for start in range(len(word) + 1):
      stop = start + block_size
      contexts.append(encoded[start:stop])
      targets.append(encoded[stop])

  # Lists converted to tensors, explicitly as long (integer indices)
  X = torch.tensor(contexts, dtype=torch.long)
  Y = torch.tensor(targets, dtype=torch.long)

  print(X.shape, Y.shape)
  return X, Y

# Shuffle a copy of the names rather than `names` itself. Shuffling in place
# made this cell non-idempotent: re-running it reshuffled an already shuffled
# list, which silently changed which names land in each split.
random.seed(42)
shuffled_names = list(names)
random.shuffle(shuffled_names)
n1 = int(0.8*len(shuffled_names))
n2 = int(0.9*len(shuffled_names))

# Split Dataset into train (first 80%), validation (next 10%) and test (last 10%)
Xtr, Ytr = build_dataset(shuffled_names[:n1])
Xval, Yval = build_dataset(shuffled_names[n1:n2])
Xte, Yte = build_dataset(shuffled_names[n2:])

torch.Size([182625, 8]) torch.Size([182625])
torch.Size([22655, 8]) torch.Size([22655])
torch.Size([22866, 8]) torch.Size([22866])


In [14]:
class Linear:
  """Fully connected layer computing `x @ weight (+ bias)`.

  The weight has shape (fan_in, fan_out) and is drawn from a standard normal
  using the notebook-level generator `g`, then scaled by fan_in ** -0.5.
  The bias, when enabled, starts at zero.
  """

  def __init__(self, fan_in, fan_out, bias=True):
    init_scale = fan_in ** (-0.5)
    self.weight = torch.randn((fan_in, fan_out), generator=g) * init_scale
    self.weight.requires_grad = True

    self.bias = None
    if bias:
      self.bias = torch.zeros(fan_out)
      self.bias.requires_grad = True

  def __call__(self, x):
    """Apply the layer to `x`; the result is also stored in `self.out`."""
    result = torch.matmul(x, self.weight)
    if self.bias is not None:
      result += self.bias
    self.out = result
    return self.out

  def parameters(self):
    """Return the trainable tensors: the weight, plus the bias if present."""
    trainable = [self.weight]
    if self.bias is not None:
      trainable.append(self.bias)
    return trainable

#-------------------------------------------------------------------------------


class BatchNorm1d:
  """Batch normalisation over the last (feature) dimension.

  In training mode the input is normalised with the statistics of the
  current batch, and exponential moving averages of those statistics are
  kept in `running_mean` / `running_var`. When `training` is False the
  running statistics are used instead and are not updated.

  Args:
    dim: number of features (size of the last input dimension).
    eps: constant added to the variance before the square root.
    momentum: weight given to the current batch when updating the
      running statistics.
  """

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.momentum = momentum
    self.training = True
    # Trainable parameters
    self.gamma = torch.ones(dim, requires_grad=True)  # scale (gain) applied to the normalised input
    self.beta = torch.zeros(dim, requires_grad=True)  # shift (bias) added after scaling
    # Running statistics; updated under no_grad, so they are not trained
    self.running_mean = torch.zeros(dim)
    self.running_var = torch.ones(dim)

  def __call__(self, x):
    """Normalise `x` and return the result (also stored in `self.out`).

    Raises:
      ValueError: in training mode, if `x` is neither 2D nor 3D.
    """
    if self.training:
      # 2d = Batch, Features -> Reduce over Batch
      # 3d = Batch, Time, Features -> Reduce over Batch, Time
      if x.ndim == 2:
        dim = 0
      elif x.ndim == 3:
        dim = (0, 1)
      else:
        # Previously `dim` was simply left unassigned for other ranks,
        # which surfaced as an opaque UnboundLocalError.
        raise ValueError(
            f"BatchNorm1d expects a 2D or 3D input in training mode, got {x.ndim}D")

      # Calculate the batch mean and (biased) variance
      xmean = x.mean(dim, keepdim=True)
      xvar = x.var(dim, keepdim=True, unbiased=False)
    else:
      xmean = self.running_mean
      xvar = self.running_var

    # Batch Norm: normalise, then scale and shift
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
    self.out = self.gamma * xhat + self.beta

    if self.training:
      # Moving-average update; keepdim=True above means the running
      # statistics keep the leading singleton dimension(s).
      with torch.no_grad():
        self.running_mean = (1.0 - self.momentum) * self.running_mean + self.momentum * xmean
        self.running_var = (1.0 - self.momentum) * self.running_var + self.momentum * xvar
    return self.out

  def parameters(self):
    """Return the trainable tensors [gamma, beta]."""
    return [self.gamma, self.beta]

#-------------------------------------------------------------------------------


class Tanh:
  """Element-wise hyperbolic tangent activation; has no parameters."""

  def __init__(self):
    """Nothing to set up: the layer is stateless apart from `out`."""

  def __call__(self, x):
    """Return tanh(x); the result is also stored in `self.out`."""
    activated = x.tanh()
    self.out = activated
    return activated

  def parameters(self):
    """Return an empty list, since there is nothing to train."""
    return list()

#-------------------------------------------------------------------------------


class Embedding:
  """Lookup table mapping integer indices to rows of a trainable matrix.

  The table has shape (num_embeddings, embedding_dim) and is initialised
  from a standard normal using torch's global random state.
  """

  def __init__(self, num_embeddings, embedding_dim):
    table = torch.randn(num_embeddings, embedding_dim)
    table.requires_grad = True
    self.weight = table

  def __call__(self, ix):
    """Return the rows selected by `ix`; also stored in `self.out`."""
    looked_up = self.weight[ix]
    self.out = looked_up
    return looked_up

  def parameters(self):
    """Return the trainable tensors: just the embedding table."""
    return [self.weight]

#-------------------------------------------------------------------------------

class FlattenConsecutive:
  """Concatenate every `n` consecutive time steps into one feature vector.

  A (B, T, C) input is viewed as (B, T // n, C * n). When that leaves a
  single time step, the time dimension is dropped and the result is (B, C * n).
  """

  def __init__(self,n):
    self.n = n

  def __call__(self, x):
    """Return the regrouped view of `x`; also stored in `self.out`."""
    batch, steps, channels = x.shape
    grouped = x.view(batch, steps // self.n, channels * self.n)
    # Collapse a length-1 time axis so later layers see a plain 2D batch
    self.out = grouped.squeeze(1) if grouped.shape[1] == 1 else grouped
    return self.out

  def parameters(self):
    """Return an empty list, since there is nothing to train."""
    return []

#-------------------------------------------------------------------------------

class Sequential:
  """Container that applies a list of layers one after another."""

  def __init__(self, layers):
    self.layers = layers

  def __call__(self, x):
    """Feed `x` through every layer in order; the result is kept in `self.out`."""
    activation = x
    for stage in self.layers:
      activation = stage(activation)
    self.out = activation
    return self.out

  def parameters(self):
    """Return the parameters of all layers, flattened into one list."""
    collected = []
    for stage in self.layers:
      collected.extend(stage.parameters())
    return collected

  def train(self, mode=True):
    """Set `training = mode` on every layer that has a `training` attribute."""
    stateful = (stage for stage in self.layers if hasattr(stage, 'training'))
    for stage in stateful:
      stage.training = mode

#-------------------------------------------------------------------------------



In [15]:
# Hyperparameters
torch.manual_seed(42)
n_emb = 10               # size of each character embedding
n_hidden = 100           # width of the hidden layers
vocab_size = len(itos)   # number of distinct tokens, including '.'

# Model: each FlattenConsecutive(2) merges pairs of neighbouring positions.
# Layers are created in forward order.
model_layers = [Embedding(vocab_size, n_emb)]
model_layers += [
    FlattenConsecutive(2), Linear(n_emb * 2, n_hidden, bias=False),
    BatchNorm1d(n_hidden), Tanh(),
]
model_layers += [
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False),
    BatchNorm1d(n_hidden), Tanh(),
]
# NOTE(review): unlike the two blocks above, this one has bias=False but no
# BatchNorm1d after the Linear -- confirm whether that is intentional.
model_layers += [
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False),
    Tanh(),
]
model_layers += [Linear(n_hidden, vocab_size)]
model = Sequential(model_layers)

# Shrink the output layer's weights so the initial predictions are less
# confidently wrong
with torch.no_grad():
  output_layer = model.layers[-1]
  output_layer.weight.mul_(0.1)

# Total number of trainable scalars in the model
parameters = model.parameters()
print(sum(p.nelement() for p in parameters))


45397


In [16]:
# Optimisation
# NOTE: each "epoch" below is a single minibatch update, not a full pass
# over the training set.
epochs = 100000
log_interval = epochs/10    # report the loss ten times over the run
lr_switch_step = epochs/2   # last step that still uses the larger learning rate

# Get all model parameters for optimization
parameters = model.parameters()

for epoch in range(epochs):

  # Draw a random minibatch from the training split
  ix = torch.randint(0, Xtr.shape[0], (batch_size,))
  Xb = Xtr[ix]
  Yb = Ytr[ix]

  # Forward pass
  logits = model(Xb)
  loss = F.cross_entropy(logits, Yb.long())

  # Backward pass: drop the previous step's gradients, then backpropagate
  for p in parameters:
    p.grad = None
  loss.backward()

  # Plain SGD update; the learning rate drops after the first half of training
  if epoch <= lr_switch_step:
    learning_rate = 0.1
  else:
    learning_rate = 0.01
  with torch.no_grad():
    for p in parameters:
      p.sub_(learning_rate * p.grad)

  # Print the minibatch loss at regular intervals
  if epoch % log_interval == 0:
    print(f"epoch: {epoch}, loss: {loss.item()}")

# Loss of the last minibatch
print(f"epoch: {epochs}, loss: {loss.item()}")

epoch: 0, loss: 3.286693572998047
epoch: 10000, loss: 2.389369487762451
epoch: 20000, loss: 2.253600597381592
epoch: 30000, loss: 1.9474616050720215
epoch: 40000, loss: 1.7674556970596313
epoch: 50000, loss: 1.9966901540756226
epoch: 60000, loss: 2.0138232707977295
epoch: 70000, loss: 2.0366742610931396
epoch: 80000, loss: 2.5754318237304688
epoch: 90000, loss: 1.940301775932312
epoch: 100000, loss: 1.8007537126541138


In [17]:
@torch.no_grad()
def split_loss(split):
  """Print the cross-entropy loss of `model` on one whole dataset split.

  The model is put in evaluation mode for the forward pass and is always
  switched back to training mode afterwards, even if the pass fails.

  Args:
    split: one of 'train', 'val' or 'test'.

  Raises:
    KeyError: if `split` is not one of the names above.
  """
  # Resolve the split first, so an unknown name cannot leave the model
  # stuck in evaluation mode.
  x, y = {
      'train' : (Xtr, Ytr),
      'val' : (Xval, Yval),
      'test' : (Xte, Yte),
  }[split]

  # Evaluation Mode
  model.train(False)
  try:
    logits = model(x)
    loss = F.cross_entropy(logits, y)
  finally:
    # Set back to Training Mode, whatever happened above
    model.train(True)

  print(split, loss.item())

# Report the loss on each split in turn
for split_name in ('train', 'val', 'test'):
  split_loss(split_name)

train 1.9005718231201172
val 2.0178701877593994
test 2.011873245239258


In [18]:
# Sample from the model

@torch.no_grad()
def generate_name():
  """Sample one name from `model`, one character at a time.

  Starts from a context of `block_size` zeros, repeatedly samples the next
  character index from the model's softmax output using generator `g`, and
  stops once index 0 ('.') is drawn. The model is put in evaluation mode
  while sampling and is always switched back to training mode afterwards,
  even if sampling fails.

  Returns:
    The sampled characters joined into a string, including the final '.'.
  """
  out = []
  context = [0] * block_size

  # Evaluation Mode entered
  model.train(False)
  try:
    while True:
      # Forward pass: the Embedding layer is layers[0], so the indices are
      # passed directly as a batch of one context.
      x = torch.tensor([context], dtype=torch.long)
      logits = model(x)
      probs = F.softmax(logits, dim=1)

      # Sample the next character index
      ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()

      # Shift the context window
      context = context[1:] + [ix]
      out.append(ix)

      # If sample from '.' then break
      if ix == 0:
        break
  finally:
    # Restore training mode, whatever happened above
    model.train(True)

  return ''.join(itos[i] for i in out)

# Print ten names sampled from the trained model
n_samples = 10
for _ in range(n_samples):
  sampled_name = generate_name()
  print(sampled_name)


rader.
dionna.
keylannique.
ezai.
willa.
kinley.
habalis.
quelilyse.
streyath.
berken.
