In [52]:
import torch
import matplotlib.pyplot as plt
%matplotlib inline
import torch.nn.functional as F

In [53]:
def read_lines(path):
    """Return the lines of the text file at `path`, without line terminators."""
    # The context manager closes the file handle as soon as the content is
    # read, instead of leaving it open until garbage collection.
    with open(path) as fh:
        return fh.read().splitlines()


# One entry per line in each file; `words` is the training split.
test = read_lines('../MLP/data/test.txt')
dev = read_lines('../MLP/data/dev.txt')
words = read_lines('../MLP/data/Training.txt')
len(words)

30000

In [54]:
# Character <-> integer lookup tables.
# Index 0 is reserved for '.', which is used both as the left-padding symbol
# and as the end-of-word marker; every distinct character found in `words`
# gets an index starting at 1, in sorted order.

chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
itos = dict((idx, ch) for ch, idx in stoi.items())
vocab_size = len(itos)

In [55]:
block = 8  # context length: how many previous characters are used to predict the next one

def dataset(words):
    """Turn a list of words into (context, next-character) training pairs.

    Each word is terminated with '.', and its context window starts out as
    `block` zeros (the index of '.'). One example is emitted per character of
    the terminated word.

    Args:
        words: iterable of strings whose characters are all keys of `stoi`.

    Returns:
        (X, Y): `X` is a tensor of context windows, each of length `block`;
        `Y` is a tensor with the index of the character following each window.
        Both shapes are printed before returning.
    """
    contexts, targets = [], []

    for word in words:
        # Left-pad with `block` zeros so that every target has a full window.
        padded = [0] * block + [stoi[ch] for ch in word + '.']
        for start in range(len(padded) - block):
            contexts.append(padded[start:start + block])
            targets.append(padded[start + block])

    X = torch.tensor(contexts)
    Y = torch.tensor(targets)
    print(X.shape,Y.shape)
    return X,Y

In [56]:
# Build the (context, target) tensors for the train / dev / test word lists.
Xtr, Ytr = dataset(words)
Xdev, Ydev = dataset(dev)
Xte, Yte = dataset(test)

# Sanity check: decode the first 20 training examples back into characters.
for window, target in zip(Xtr[:20].tolist(), Ytr[:20].tolist()):
    decoded = ''.join(itos[idx] for idx in window)
    print(decoded, '-->', itos[target])

torch.Size([313130, 8]) torch.Size([313130])
torch.Size([104449, 8]) torch.Size([104449])
torch.Size([104449, 8]) torch.Size([104449])
........ --> u
.......u --> n
......un --> a
.....una --> r
....unar --> r
...unarr --> a
..unarra --> i
.unarrai --> g
unarraig --> n
narraign --> e
arraigne --> d
rraigned --> .
........ --> c
.......c --> i
......ci --> r
.....cir --> s
....cirs --> o
...cirso --> t
..cirsot --> o
.cirsoto --> m


In [102]:

# Linear (fully connected) layer of the model
class DenseLayer:
    """Affine map `inputs @ weights (+ biases)`.

    Args:
        input_dim: size of the last axis of the incoming tensor.
        output_dim: size of the last axis of the result.
        bias: when False no bias vector is created or applied.
    """

    def __init__(self, input_dim, output_dim, bias=True):
        # Gaussian weights scaled down by sqrt(input_dim)
        self.weights = torch.randn(input_dim, output_dim) / (input_dim ** 0.5)

        # Bias starts at zero, or is absent altogether
        if bias:
            self.biases = torch.zeros(output_dim)
        else:
            self.biases = None

    def __call__(self, inputs):
        """Apply the layer; the result is also kept in `self.output`."""
        result = inputs @ self.weights
        if self.biases is not None:
            result = result + self.biases
        self.output = result
        return self.output

    def parameters(self):
        """Return the trainable tensors: weights, followed by biases if present."""
        if self.biases is None:
            return [self.weights]
        return [self.weights, self.biases]


# Batch normalization normalizes layer inputs to have zero mean and unit variance
class BatchNorm:
    """Batch normalization over the last axis of a 2D or 3D input.

    In training mode the input is normalized with the statistics of the
    current batch (reduced over every axis except the last), and exponential
    moving averages of those statistics are updated. In evaluation mode
    (`self.training = False`) the stored running statistics are used instead.

    Args:
        dim: size of the last (feature) axis.
        eps: constant added to the variance before taking the square root.
        momentum: weight of the current batch in the running averages.
    """

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True

        # Trainable scale and shift
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

        # Running statistics (not trained by backprop); always kept at shape (dim,)
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        """Normalize `x`; the result is also kept in `self.out`.

        Raises:
            ValueError: in training mode, if `x` is neither 2D nor 3D.
        """
        if self.training:
            if x.ndim == 2:
                reduce_dims = 0
            elif x.ndim == 3:
                reduce_dims = (0, 1)
            else:
                # Previously this fell through with the reduction axes unbound.
                raise ValueError(
                    f"BatchNorm expects a 2D or 3D input, got {x.ndim}D"
                )
            xmean = x.mean(reduce_dims, keepdim=True)
            xvar = x.var(reduce_dims, keepdim=True)
        else:
            xmean = self.running_mean
            xvar = self.running_var

        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta

        if self.training:
            with torch.no_grad():
                # Flatten the keepdim statistics back to (dim,) so the buffers
                # keep a stable shape and broadcast correctly against both
                # 2D and 3D inputs in evaluation mode.
                self.running_mean = (
                    (1 - self.momentum) * self.running_mean
                    + self.momentum * xmean.reshape(-1)
                )
                self.running_var = (
                    (1 - self.momentum) * self.running_var
                    + self.momentum * xvar.reshape(-1)
                )
        return self.out

    def parameters(self):
        """Return the trainable tensors: [gamma, beta]."""
        return [self.gamma, self.beta]




class Tanh:
    """Element-wise tanh activation; has no trainable parameters."""

    def forward(self, input_tensor):
        """Apply tanh; the result is also kept in `self.output`."""
        self.output = torch.tanh(input_tensor)
        return self.output

    def parameters(self):
        # No parameters
        return []

    # Call like a function
    def __call__(self, input_tensor):
        return self.forward(input_tensor)


# The model definition instantiates `Tanh()`; the class used to be declared as
# `tanh`, which raised NameError on a fresh kernel. The old lowercase name is
# kept as an alias so existing references keep working.
tanh = Tanh


class Embedding:
    """Lookup table that maps integer indices to rows of a weight matrix.

    Args:
        vocab_size: number of rows in the table.
        embed_dim: length of each row.
    """

    def __init__(self, vocab_size, embed_dim):
        # Table of shape (vocab_size, embed_dim), drawn from a standard normal
        table_shape = (vocab_size, embed_dim)
        self.embeddings = torch.randn(table_shape)

    def __call__(self, indices):
        # Calling the layer is the same as calling forward()
        return self.forward(indices)

    def forward(self, indices):
        """Gather the rows selected by `indices`; also kept in `self.output`."""
        looked_up = self.embeddings[indices]
        self.output = looked_up
        return self.output

    def parameters(self):
        """Return the trainable tensors: just the embedding table."""
        return [self.embeddings]



class Flatten:
    """Merge every `group_size` consecutive steps of a (B, T, C) tensor.

    The result has shape (B, T // group_size, C * group_size); when the middle
    axis collapses to length 1 it is squeezed away, giving (B, C * group_size).
    """

    def __init__(self, group_size):
        self.group_size = group_size

    def __call__(self, tensor):
        return self.forward(tensor)

    def forward(self, tensor):
        """Regroup `tensor`; the result is also kept in `self.output`."""
        batch, steps, channels = tensor.shape
        group = self.group_size

        merged = tensor.view(batch, steps // group, channels * group)

        # Drop the step axis once everything has been merged into one group
        self.output = merged.squeeze(1) if merged.shape[1] == 1 else merged
        return self.output

    def parameters(self):
        # Nothing to train
        return []


class LayerStack:
    """Container that applies a list of layers one after another.

    Args:
        layers: list of callables; each must also provide `parameters()`
            for `LayerStack.parameters()` to work.
    """

    def __init__(self, layers):
        self.layers = layers

    def __call__(self, x):
        """Feed `x` through the layers in order; the result is kept in `self.out`.

        If a layer produces any NaN, a diagnostic naming that layer is printed
        and the remaining layers are skipped, so the NaN-containing tensor is
        what gets returned.
        """
        # enumerate() supplies the layer position for the diagnostic; the
        # message previously referenced an index that was never defined here.
        for idx, layer in enumerate(self.layers):
            x = layer(x)
            if torch.any(torch.isnan(x)):
                print(f"NaNs found after layer {idx}: {layer.__class__.__name__}")
                break

        self.out = x
        return self.out

    def parameters(self):
        """Return the parameters of all layers as one flat list, in layer order."""
        return [p for layer in self.layers for p in layer.parameters()]


In [103]:
n_embd = 24    # size of each character embedding vector
n_hidden = 128 # number of units in every hidden DenseLayer

# Assemble the network: an embedding, then three identical stages that each
# merge pairs of neighbouring positions (Flatten(2)) before a bias-free dense
# layer, batch norm and tanh, and finally the projection to vocabulary logits.
# Layers are constructed in the same order as they are applied.
layers = [Embedding(vocab_size, n_embd)]
for fan_in in (n_embd * 2, n_hidden * 2, n_hidden * 2):
    layers += [
        Flatten(2),
        DenseLayer(fan_in, n_hidden, bias=False),
        BatchNorm(n_hidden),
        Tanh(),
    ]
layers.append(DenseLayer(n_hidden, vocab_size))
model = LayerStack(layers)

# Shrink the output weights so the initial predictions are less confident.
with torch.no_grad():
    model.layers[-1].weights *= 0.1

parameters = model.parameters()
total_params = sum(p.nelement() for p in parameters)
print(total_params) # number of parameters in total

# Turn on gradient tracking for every trainable tensor.
for p in parameters:
    p.requires_grad = True

76579


In [107]:
max_steps = 200000
batch_size = 32

# Make sure every layer is in training mode. The evaluation cell sets
# `training = False` on all layers, so re-running this cell afterwards would
# otherwise train BatchNorm on its running statistics instead of batch statistics.
for layer in model.layers:
    layer.training = True

for i in range(max_steps):

    # Sample a random minibatch (with replacement) from the training set
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))
    Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y

    # Forward pass
    logits = model(Xb)
    loss = F.cross_entropy(logits, Yb)

    # Backward pass: clear old gradients, then backpropagate
    for p in parameters:
        p.grad = None
    loss.backward()

    # Plain SGD step with a single step-down of the learning rate.
    # The update runs under no_grad() rather than through `.data`, so
    # autograd still sees the in-place modification of the parameters.
    lr = 0.1 if i < 150000 else 0.01
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

    # track stats
    if i % 10000 == 0:
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')


      0/ 200000: 1.9539
  10000/ 200000: 2.2085
  20000/ 200000: 2.1416
  30000/ 200000: 2.0545
  40000/ 200000: 1.8726
  50000/ 200000: 1.5855
  60000/ 200000: 1.9837
  70000/ 200000: 1.8217
  80000/ 200000: 1.4675
  90000/ 200000: 1.7908
 100000/ 200000: 1.6322
 110000/ 200000: 1.6204
 120000/ 200000: 1.9774
 130000/ 200000: 1.7093
 140000/ 200000: 1.4864
 150000/ 200000: 1.8600
 160000/ 200000: 2.0416
 170000/ 200000: 1.7932
 180000/ 200000: 1.8300
 190000/ 200000: 1.8756


In [108]:
# Put every layer in inference mode (BatchNorm reads this flag to decide
# between batch statistics and its running statistics).
for layer in model.layers:
    layer.training = False

@torch.no_grad() # no gradient tracking is needed for evaluation
def split_loss(split):
    """Print the cross-entropy loss of the model on one named data split.

    Args:
        split: one of 'train', 'val' or 'test'.
    """
    datasets = {
        'train': (Xtr, Ytr),
        'val': (Xdev, Ydev),
        'test': (Xte, Yte),
    }
    inputs, targets = datasets[split]
    loss = F.cross_entropy(model(inputs), targets)
    print(split, loss.item())

for split_name in ('train', 'val'):
    split_loss(split_name)

train 1.790572166442871
val 1.790382742881775


In [109]:
# Sample 20 words from the model, one character at a time.
for _ in range(20):
    sampled = []
    window = [0] * block  # start from an all-'.' context

    while True:
        with torch.no_grad():
            raw_logits = model(torch.tensor([window]))
            # Logits are clamped to [-10, 10] before the softmax
            probs = F.softmax(torch.clamp(raw_logits, -10, 10), dim=1)

        # Refuse to sample from a distribution containing NaN or negative entries
        if torch.isnan(probs).any() or (probs < 0).any():
            print("Invalid probs:", probs)
            break

        ix = torch.multinomial(probs, num_samples=1).item()
        sampled.append(ix)
        window = window[1:] + [ix]  # slide the context window forward

        # Index 0 is '.', the end-of-word marker (it is kept in the output)
        if ix == 0:
            break

    print(''.join(itos[i] for i in sampled))


faulvering.
retroscopically.
dassto.
whittends.
tacaraeman.
figurates.
fundlindtomy.
proplight.
subduck.
submediator.
quiformations.
blinds.
spruncy.
naooteroscirs.
aurilia.
bargelike.
badzaz.
nonprofublever.
dippense.
rankering.
