<a href="https://colab.research.google.com/github/woodRock/grokking-deep-learning/blob/main/chapter_13_introducing_automatic_optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 13 | Introducing automatic optimization

In [170]:
import numpy as np

class Tensor:
    """Thin wrapper around a NumPy array that supports element-wise ``+``."""

    def __init__(self, data):
        # Any array-like input is converted to an ndarray.
        self.data = np.array(data)

    def __add__(self, other):
        # Add the underlying arrays and wrap the result in a new Tensor.
        summed = self.data + other.data
        return Tensor(summed)

    def __repr__(self):
        # Same text as the wrapped array's repr, e.g. "array([1, 2])".
        return str(repr(self.data))

    def __str__(self):
        # Same text as the wrapped array's str, e.g. "[1 2]".
        return str(self.data)

# Wrap a plain Python list and print it (uses Tensor.__str__).
x = Tensor([1,2,3,4,5])
print(x)

# Adding two tensors returns a new Tensor holding the element-wise sum.
y = x + x
print(y)

[1 2 3 4 5]
[ 2  4  6  8 10]


# Introduction to automatic gradient computation (autograd)

In [171]:
import numpy as np

class Tensor:
    """NumPy-backed tensor that remembers how it was produced.

    ``creators`` holds the input tensors and ``creation_op`` the name of the
    operation that produced this tensor; both are ``None`` for a tensor
    constructed directly. ``grad`` is ``None`` until ``backward`` is called.
    """

    def __init__(self, data, creators=None, creation_op=None):
        self.data = np.array(data)
        self.creation_op = creation_op
        self.creators = creators
        self.grad = None

    def backward(self, grad):
        """Store ``grad`` on this tensor and pass it on to the creators.

        Only the "add" operation is handled: both inputs receive the very
        same ``grad`` object. Any previously stored gradient is overwritten.
        """
        self.grad = grad
        if self.creation_op != "add":
            return
        first = self.creators[0]
        first.backward(grad)
        second = self.creators[1]
        second.backward(grad)

    def __add__(self, other):
        # Record both operands so backward() can reach them later.
        summed = self.data + other.data
        return Tensor(summed, creators=[self, other], creation_op="add")

    def __repr__(self):
        return str(repr(self.data))

    def __str__(self):
        return str(self.data)


# Two directly constructed tensors; z records both of them as its creators.
x = Tensor([1,2,3,4,5])
y = Tensor([2,2,2,2,2])
z = x + y
# Start backpropagation from z with a gradient of ones; for an "add" the
# same gradient object is handed to both creators.
z.backward(Tensor(np.array([1,1,1,1,1])))

In [172]:
# Inspect what backward() stored on the inputs and what z recorded about
# its own creation.
print(f"x.grad: {x.grad}")
print(f"y.grad: {y.grad}")
print(f"z.creators: {z.creators}")
print(f"z.creation_op: {z.creation_op}")

x.grad: [1 1 1 1 1]
y.grad: [1 1 1 1 1]
z.creators: [array([1, 2, 3, 4, 5]), array([2, 2, 2, 2, 2])]
z.creation_op: add


In [173]:
# Build a two-level graph of additions: g = (a + b) + (c + d).
a = Tensor([1,2,3,4,5])
b = Tensor([2,2,2,2,2])
c = Tensor([5,4,3,2,1])
d = Tensor([-1,-2,-3,-4,-5])
e = a + b
f = c + d
g = e + f
# Backpropagate from g; the gradient is passed down through e to a.
g.backward(Tensor(np.array([1,1,1,1,1])))
print(f"a.grad: {a.grad}")

a.grad: [1 1 1 1 1]


# Upgrading autograd to support multi-use tensors

In [174]:
import numpy as np

class Tensor (object):
    """NumPy-backed tensor with a small reverse-mode autograd engine.

    An operation on an ``autograd=True`` tensor returns a new tensor that
    records its inputs (``creators``) and the name of the operation
    (``creation_op``). ``backward`` walks those records and accumulates
    gradients in ``grad``.

    Attributes:
        data: the wrapped ``np.ndarray``.
        autograd: whether operations on this tensor are recorded.
        grad: accumulated gradient (a ``Tensor``), or ``None``.
        id: integer key under which this tensor is registered in the
            ``children`` dict of each of its creators.
        creators: list of input tensors, or ``None``.
        creation_op: name of the producing operation, or ``None``.
        children: maps a child's id to the number of gradients still
            expected from that child.
    """

    def __init__(self,data,
                 autograd=False,
                 creators=None,
                 creation_op=None,
                 id=None):
        """Wrap ``data`` and register this tensor as a child of ``creators``.

        Args:
            data: array-like converted with ``np.array``.
            autograd: record operations performed on this tensor.
            creators: tensors this one was computed from.
            creation_op: name of the operation that produced this tensor.
            id: explicit identifier; a random one is drawn when ``None``.
        """
        self.data = np.array(data)
        self.autograd = autograd
        self.grad = None
        if id is None:
            # The id comes from NumPy's *global* RNG, so every Tensor created
            # without an explicit id advances the seeded random stream.
            # Ids are not guaranteed unique: they are random draws from
            # [0, 100000).
            self.id = np.random.randint(0,100000)
        else:
            self.id = id

        self.creators = creators
        self.creation_op = creation_op
        self.children = {}

        # Tell every creator that it owes this tensor one gradient per use.
        if creators is not None:
            for c in creators:
                c.children[self.id] = c.children.get(self.id, 0) + 1

    def all_children_grads_accounted_for(self):
        """Return True when no child still owes this tensor a gradient."""
        return all(cnt == 0 for cnt in self.children.values())

    def backward(self,grad=None, grad_origin=None):
        """Accumulate ``grad`` and propagate it to the creators.

        Args:
            grad: incoming gradient ``Tensor``; defaults to ones shaped like
                ``data``. Must not itself have ``autograd`` enabled.
            grad_origin: the child tensor the gradient comes from. When
                given, that child's pending count is decremented and
                propagation waits until every child has reported. When
                ``None``, propagation happens immediately.

        Raises:
            Exception: if ``grad_origin`` already delivered all the
                gradients it owed.
        """
        if not self.autograd:
            return

        if grad is None:
            grad = Tensor(np.ones_like(self.data))

        if grad_origin is not None:
            if self.children[grad_origin.id] == 0:
                raise Exception("cannot backprop more than once")
            self.children[grad_origin.id] -= 1

        if self.grad is None:
            self.grad = grad
        else:
            # Tensor defines no __iadd__, so this rebinds self.grad to a new
            # Tensor produced by __add__.
            self.grad += grad

        # grads must not have grads of their own
        assert grad.autograd == False

        # Only continue if there is something to backprop into, and either
        # all children have reported or backward() was called on this tensor
        # directly (grad_origin is None).
        ready = self.all_children_grads_accounted_for() or grad_origin is None
        if self.creators is None or not ready:
            return

        op = self.creation_op

        if op == "add":
            self.creators[0].backward(self.grad, self)
            self.creators[1].backward(self.grad, self)

        elif op == "sub":
            self.creators[0].backward(Tensor(self.grad.data), self)
            self.creators[1].backward(Tensor(self.grad.__neg__().data), self)

        elif op == "mul":
            new = self.grad * self.creators[1]
            self.creators[0].backward(new , self)
            new = self.grad * self.creators[0]
            self.creators[1].backward(new, self)

        elif op == "mm":
            # These calls pass no grad_origin, so the creators propagate
            # immediately instead of waiting for their other children.
            c0 = self.creators[0]
            c1 = self.creators[1]
            new = self.grad.mm(c1.transpose())
            c0.backward(new)
            new = self.grad.transpose().mm(c0).transpose()
            c1.backward(new)

        elif op == "transpose":
            self.creators[0].backward(self.grad.transpose())

        elif op.startswith("sum_"):
            # The summed dimension is encoded in the op name ("sum_<dim>").
            dim = int(op.split("_")[1])
            self.creators[0].backward(self.grad.expand(dim,self.creators[0].data.shape[dim]))

        elif op.startswith("expand_"):
            # The expanded dimension is encoded in the op name.
            dim = int(op.split("_")[1])
            self.creators[0].backward(self.grad.sum(dim))

        elif op == "neg":
            self.creators[0].backward(self.grad.__neg__())

        elif op == "sigmoid":
            # self holds the sigmoid output s; the local derivative is s * (1 - s).
            ones = Tensor(np.ones_like(self.grad.data))
            self.creators[0].backward(self.grad * (self * (ones - self)))

        elif op == "tanh":
            # self holds the tanh output t; the local derivative is 1 - t * t.
            ones = Tensor(np.ones_like(self.grad.data))
            self.creators[0].backward(self.grad* (ones - (self * self)))

        elif op == "relu":
            # The output is positive exactly where the input was positive.
            mask = Tensor((self.data > 0) * 1)
            self.creators[0].backward(self.grad * mask)

        elif op == "index_select":
            # Scatter-add each selected row's gradient back to its source row.
            # Uses the accumulated self.grad, like every other operation.
            new_grad = np.zeros_like(self.creators[0].data)
            indices_ = self.index_select_indices.data.flatten()
            grad_ = self.grad.data.reshape(len(indices_), -1)
            for i in range(len(indices_)):
                new_grad[indices_[i]] += grad_[i]
            self.creators[0].backward(Tensor(new_grad))

        elif op == "cross_entropy":
            # softmax output minus one-hot targets; note that it is not
            # divided by the number of rows although the loss is a mean.
            dx = self.softmax_output - self.target_dist
            self.creators[0].backward(Tensor(dx))

    def __add__(self, other):
        """Element-wise sum; recorded only when both operands use autograd."""
        if (self.autograd and other.autograd):
            return Tensor(self.data + other.data,
                          autograd=True,
                          creators=[self,other],
                          creation_op="add")
        return Tensor(self.data + other.data)

    def __neg__(self):
        """Element-wise negation."""
        if (self.autograd):
            return Tensor(self.data * -1,
                          autograd=True,
                          creators=[self],
                          creation_op="neg")
        return Tensor(self.data * -1)

    def __sub__(self, other):
        """Element-wise difference; recorded only when both operands use autograd."""
        if (self.autograd and other.autograd):
            return Tensor(self.data - other.data,
                          autograd=True,
                          creators=[self,other],
                          creation_op="sub")
        return Tensor(self.data - other.data)

    def __mul__(self, other):
        """Element-wise product; recorded only when both operands use autograd."""
        if (self.autograd and other.autograd):
            return Tensor(self.data * other.data,
                          autograd=True,
                          creators=[self,other],
                          creation_op="mul")
        return Tensor(self.data * other.data)

    def sum(self, dim):
        """Sum over dimension ``dim`` (which is removed from the result)."""
        if (self.autograd):
            return Tensor(self.data.sum(dim),
                          autograd=True,
                          creators=[self],
                          creation_op="sum_"+str(dim))
        return Tensor(self.data.sum(dim))

    def expand(self, dim,copies):
        """Insert a new dimension at ``dim`` holding ``copies`` repeats of the data."""
        # Repeat every element `copies` times along a new trailing axis, then
        # move that axis to position `dim`.
        trans_cmd = list(range(0,len(self.data.shape)))
        trans_cmd.insert(dim,len(self.data.shape))
        new_data = self.data.repeat(copies).reshape(list(self.data.shape) + [copies]).transpose(trans_cmd)

        if (self.autograd):
            return Tensor(new_data,
                          autograd=True,
                          creators=[self],
                          creation_op="expand_"+str(dim))
        return Tensor(new_data)

    def transpose(self):
        """Return the transposed tensor."""
        if (self.autograd):
            return Tensor(self.data.transpose(),
                          autograd=True,
                          creators=[self],
                          creation_op="transpose")

        return Tensor(self.data.transpose())

    def mm(self, x):
        """Matrix product ``self.data.dot(x.data)``; recorded when ``self`` uses autograd."""
        if (self.autograd):
            return Tensor(self.data.dot(x.data),
                          autograd=True,
                          creators=[self,x],
                          creation_op="mm")
        return Tensor(self.data.dot(x.data))

    def __repr__(self):
        return str(self.data.__repr__())

    def __str__(self):
        return str(self.data.__str__())

    def sigmoid(self):
        """Element-wise logistic function ``1 / (1 + exp(-x))``."""
        if (self.autograd):
            return Tensor(1 / (1 + np.exp(-self.data)), autograd=True, creators=[self], creation_op = "sigmoid")
        return Tensor(1 / (1 + np.exp(-self.data)))

    def tanh(self):
        """Element-wise hyperbolic tangent."""
        if (self.autograd):
            return Tensor(np.tanh(self.data), autograd=True, creators=[self], creation_op = "tanh")
        return Tensor(np.tanh(self.data))

    def relu(self):
        """Element-wise rectified linear unit, ``max(x, 0)``."""
        rectified = np.maximum(self.data, 0)
        if (self.autograd):
            return Tensor(rectified, autograd=True, creators=[self], creation_op = "relu")
        return Tensor(rectified)

    def index_select(self, indices):
        """Select rows of ``data`` using the integer tensor ``indices``."""
        if (self.autograd):
            new = Tensor(self.data[indices.data], autograd=True, creators=[self], creation_op = "index_select")
            # Remembered so backward() knows where each row came from.
            new.index_select_indices = indices
            return new
        return Tensor(self.data[indices.data])

    def cross_entropy(self, target_indices):
        """Softmax over the last axis followed by mean cross-entropy.

        Args:
            target_indices: tensor of integer class indices; it is flattened,
                giving one target per row of the (reshaped) softmax output.

        Returns:
            A tensor holding the scalar loss.
        """
        temp = np.exp(self.data)
        softmax_output = temp / np.sum(temp, axis=len(self.data.shape)-1,keepdims=True)
        t = target_indices.data.flatten()
        p = softmax_output.reshape(len(t), -1)
        # One-hot rows picked from an identity matrix.
        target_dist = np.eye(p.shape[1])[t]
        loss = -(np.log(p) * (target_dist)).sum(1).mean()

        if (self.autograd):
            out = Tensor(loss, autograd=True, creators=[self], creation_op="cross_entropy")
            # Kept for the backward pass.
            out.softmax_output = softmax_output
            out.target_dist = target_dist
            return out

        return Tensor(loss)

# b is used twice (in d and in e), so it must receive two gradients.
a = Tensor([1,2,3,4,5], autograd=True)
b = Tensor([2,2,2,2,2], autograd=True)
c = Tensor([5,4,3,2,1], autograd=True)

d = a + b
e = b + c
f = d + e

# Backpropagate a gradient of ones from f.
f.backward(Tensor(np.array([1,1,1,1,1])))

# The two contributions to b are accumulated, giving 2 for every element.
print(b.grad.data == np.array([2,2,2,2,2]))

[ True  True  True  True  True]


In [175]:
import numpy as np

# Fix the global NumPy seed so the run is repeatable.
np.random.seed(0)

# Four two-element inputs and one target per row (as a column vector).
data = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
target = np.array([[0, 1, 0, 1]]).transpose()

# Two weight matrices: 2 inputs -> 3 hidden values -> 1 output.
weights_0_1 = np.random.rand(2, 3)
weights_1_2 = np.random.rand(3, 1)

# Hyperparameters
alpha, epochs = 0.1, 10

for i in range(epochs):
    # Forward pass through both linear layers (no non-linearity).
    layer_1 = np.dot(data, weights_0_1)
    layer_2 = np.dot(layer_1, weights_1_2)
    prediction = layer_2

    # Sum of squared errors over the four samples.
    diff = prediction - target
    squared_diff = np.square(diff)
    loss = squared_diff.sum(axis=0)

    # Backward pass written out by hand.
    layer_1_grad = np.dot(diff, weights_1_2.T)
    weight_1_2_update = np.dot(layer_1.T, diff)
    weight_0_1_update = np.dot(data.T, layer_1_grad)

    # Gradient-descent step, applied in place.
    weights_1_2 -= alpha * weight_1_2_update
    weights_0_1 -= alpha * weight_0_1_update

    print(loss[0])

5.066439994622396
0.4959907791902341
0.4180671892167177
0.35298133007809646
0.2972549636567376
0.24923260381633278
0.20785392075862477
0.17231260916265181
0.14193744536652994
0.11613979792168387


# Using autograd to train a neural network

In [176]:
import numpy as np

# Freeze the seed for reproducibility.
np.random.seed(0)

# Create the dataset.
# Note: Tensor.__init__ draws each tensor's id from the global NumPy RNG, so
# creating these two tensors advances the random stream before the weights
# below are sampled.
X_train = Tensor(np.array([[0,0],[0,1],[1,0],[1,1]]), autograd=True)
y_train = Tensor(np.array([[0,1,0,1]]).T, autograd=True)

# Hyperparameters
alpha = 0.1
input_dim = 2
hidden_dim = 3
output_dim = 1
epochs = 10

# Initialize the network: one weight matrix per layer.
w = list()
w.append(Tensor(np.random.rand(input_dim,hidden_dim), autograd=True))
w.append(Tensor(np.random.rand(hidden_dim,output_dim), autograd=True))

# Training loop
for i in range(epochs):
    # Forward pass: two matrix products, no non-linearity.
    pred = X_train.mm(w[0]).mm(w[1])

    # Sum of squared errors over the rows (no mean is taken).
    loss = ((pred - y_train)*(pred - y_train)).sum(0)

    # Backpropagation, seeded with a gradient of ones.
    loss.backward(Tensor(np.ones_like(loss.data)))

    # Update the weights in place, then zero the gradient for the next epoch.
    for w_ in w:
        w_.data -= w_.grad.data * alpha
        w_.grad.data *= 0

    print(loss)

[0.58128304]
[0.48988149]
[0.41375111]
[0.34489412]
[0.28210124]
[0.2254484]
[0.17538853]
[0.1324231]
[0.09682769]
[0.06849361]


# Adding automatic optimization

In [177]:
class SGD(object):
    """Plain stochastic gradient descent over a list of parameter tensors.

    Each parameter is expected to expose ``data`` (an array updated in place)
    and ``grad`` (``None`` or an object whose ``data`` is the gradient array).
    """

    def __init__(self, parameters, alpha=0.1):
        """Store the parameters to optimise and the learning rate ``alpha``."""
        self.parameters = parameters
        self.alpha = alpha

    def zero(self):
        """Reset every existing gradient to zero, in place."""
        for p in self.parameters:
            # A parameter that has not received a gradient yet has nothing
            # to reset.
            if p.grad is not None:
                p.grad.data *= 0

    def step(self, zero=True):
        """Apply one update ``data -= grad * alpha`` to every parameter.

        Args:
            zero: when true, zero each gradient right after it is applied.
        """
        for p in self.parameters:
            # Skip parameters that took no part in the last backward pass
            # instead of failing on ``None.data``.
            if p.grad is None:
                continue

            p.data -= p.grad.data * self.alpha

            if zero:
                p.grad.data *= 0

# Create the dataset
# Note: this cell does not call np.random.seed, so the weights below depend
# on the state the global NumPy RNG was left in by earlier cells.
X_train = Tensor(np.array([[0,0],[0,1],[1,0],[1,1]]), autograd=True)
y_train = Tensor(np.array([[0,1,0,1]]).T, autograd=True)

# Hyperparameters
alpha = 0.1
input_dim = 2
hidden_size = 3
output_dim = 1
epochs = 10

# Initialize the network: one weight matrix per layer.
w = list()
w.append(Tensor(np.random.rand(input_dim, hidden_size), autograd = True))
w.append(Tensor(np.random.rand(hidden_size, output_dim), autograd = True))

# The optimizer now owns the weight update and the gradient reset.
optim = SGD(parameters=w, alpha=alpha)

for i in range(epochs):
    # Forward pass
    pred = X_train.mm(w[0]).mm(w[1])
    # Sum of squared errors over the rows.
    loss = ((pred - y_train) * (pred - y_train)).sum(0)
    # Backpropagation, seeded with a gradient of ones.
    loss.backward(Tensor(np.ones_like(loss.data)))
    # Update the weights; step() zeroes the gradients by default.
    optim.step()
    print(loss)

[0.94479064]
[0.59825449]
[0.45744809]
[0.35735139]
[0.27835386]
[0.21376109]
[0.16081403]
[0.1180269]
[0.08428556]
[0.05849538]


# Linear layers

In [178]:
class Layer:
    """Base class for layers: owns a list of trainable parameters."""

    def __init__(self):
        # Subclasses append their parameter tensors to this list.
        self.parameters = []

    def get_parameters(self):
        """Return the layer's parameter list (the list itself, not a copy)."""
        return self.parameters

class Linear(Layer):
    """Fully connected layer computing ``input.mm(weight) + bias``."""

    def __init__(self, n_inputs, n_outputs):
        super().__init__()

        # Standard-normal weights scaled by sqrt(2 / n_inputs); bias starts at zero.
        scale = np.sqrt(2.0 / (n_inputs))
        W = np.random.randn(n_inputs, n_outputs) * scale
        self.weight = Tensor(W, autograd=True)
        self.bias = Tensor(np.zeros(n_outputs), autograd=True)

        # Register both tensors so an optimizer can find them.
        for param in (self.weight, self.bias):
            self.parameters.append(param)

    def forward(self, input):
        """Apply the layer to ``input``, one row of ``input.data`` at a time."""
        n_rows = len(input.data)
        projected = input.mm(self.weight)
        # Repeat the bias once per row so it can be added element-wise.
        tiled_bias = self.bias.expand(0, n_rows)
        return projected + tiled_bias

# Layers that contain layers

In [179]:
class Sequential(Layer):
    """Container that applies a list of layers one after another."""

    def __init__(self, layers=None):
        """Create the container.

        Args:
            layers: list of layers to run in order. The list is stored as
                given (not copied). When omitted, each instance gets its own
                new empty list; a mutable default would be shared by every
                instance created without an argument.
        """
        super().__init__()
        self.layers = layers if layers is not None else []

    def add(self, layer):
        """Append ``layer`` to the end of the pipeline."""
        self.layers.append(layer)

    def forward(self, input):
        """Feed ``input`` through every layer in order and return the result."""
        for layer in self.layers:
            input = layer.forward(input)
        return input

    def get_parameters(self):
        """Return a new list with the parameters of all contained layers."""
        params = list()
        for l in self.layers:
            params += l.get_parameters()
        return params

# Freeze the seed for reproducibility
np.random.seed(0)

# Create the dataset
X_train = Tensor(np.array([[0,0],[0,1],[1,0],[1,1]]), autograd=True)
y_train = Tensor(np.array([[0,1,0,1]]).T, autograd=True)

# Hyperparameters
alpha = 0.05
input_dim = 2
hidden_size = 3
output_dim = 1
epochs = 10

# Initialize the model: two linear layers applied back to back.
model = Sequential([Linear(input_dim,hidden_size), Linear(hidden_size,output_dim)])

# Initialize the optimizer over every parameter the model exposes.
optim = SGD(parameters=model.get_parameters(), alpha=alpha)

for i in range(epochs):
    # Forward pass
    pred = model.forward(X_train)
    # Sum of squared errors over the rows.
    loss = ((pred - y_train) * (pred - y_train)).sum(0)

    # Back propagation, seeded with a gradient of ones.
    loss.backward(Tensor(np.ones_like(loss.data)))

    # Update the weights; step() zeroes the gradients by default.
    optim.step()
    print(loss)

[2.33428272]
[0.06743796]
[0.0521849]
[0.04079507]
[0.03184365]
[0.02479336]
[0.01925443]
[0.01491699]
[0.01153118]
[0.00889602]


# Loss function layers

In [180]:
class MSELoss(Layer):
    """Squared-error criterion.

    Despite the name, no mean is taken: the squared differences are summed
    over dimension 0.
    """

    def forward(self, pred, target):
        """Return ``((pred - target) * (pred - target)).sum(0)``."""
        # The difference is computed twice so that the same tensors are
        # created, in the same order, as in the one-line expression.
        first_diff = pred - target
        second_diff = pred - target
        squared = first_diff * second_diff
        return squared.sum(0)

import numpy as np
# Freeze the seed for reproducibility
np.random.seed(0)

# Create the dataset
X_train = Tensor(np.array([[0,0],[0,1],[1,0],[1,1]]), autograd=True)
y_train = Tensor(np.array([[0,1,0,1]]).T, autograd=True)

# Hyperparameters
alpha = 0.05
input_dim = 2
hidden_size = 3
output_dim = 1
epochs = 10

# Initialize the model
model = Sequential([Linear(input_dim, hidden_size), Linear(hidden_size, output_dim)])

# Criterion: the loss is now computed by a layer object.
criterion = MSELoss()

# Optimizer
optim = SGD(parameters=model.get_parameters(), alpha=alpha)

for i in range(epochs):
    # Forward pass
    pred = model.forward(X_train)
    loss = criterion.forward(pred, y_train)
    # Backpropagation, seeded with a gradient of ones.
    loss.backward(Tensor(np.ones_like(loss.data)))
    # Update the weights; step() zeroes the gradients by default.
    optim.step()
    print(loss)

[2.33428272]
[0.06743796]
[0.0521849]
[0.04079507]
[0.03184365]
[0.02479336]
[0.01925443]
[0.01491699]
[0.01153118]
[0.00889602]


# Non-linearities

In [181]:
class Tanh(Layer):
    """Parameter-free layer that applies the input's ``tanh()``."""

    # No __init__ override: Layer.__init__ already creates the (empty)
    # parameter list.

    def forward(self, input):
        activated = input.tanh()
        return activated

class Sigmoid(Layer):
    """Parameter-free layer that applies the input's ``sigmoid()``."""

    # No __init__ override: Layer.__init__ already creates the (empty)
    # parameter list.

    def forward(self, input):
        activated = input.sigmoid()
        return activated

class Relu(Layer):
    """Parameter-free layer that applies the input's ``relu()``.

    The input object must provide a ``relu()`` method.
    """

    # No __init__ override: Layer.__init__ already creates the (empty)
    # parameter list.

    def forward(self, input):
        activated = input.relu()
        return activated

import numpy as np
# Freeze the seed for reproducibility
np.random.seed(0)

# Create the dataset
X_train = Tensor(np.array([[0,0],[0,1],[1,0],[1,1]]), autograd=True)
y_train = Tensor(np.array([[0,1,0,1]]).T, autograd=True)

# Hyperparameters
alpha = 0.05
input_dim = 2
hidden_size = 3
output_dim = 1
epochs = 10

# Initialize the model: linear -> tanh -> linear -> sigmoid.
model = Sequential([Linear(input_dim,hidden_size), Tanh(), Linear(hidden_size, output_dim), Sigmoid()])

# Loss function
criterion = MSELoss()

# Optimizer
# NOTE(review): the learning rate passed here is the literal 1, not the
# `alpha` variable (0.05) defined above - confirm which one is intended.
optim = SGD(parameters=model.get_parameters(), alpha=1)

for i in range(epochs):
    # Forward pass
    pred = model.forward(X_train)
    loss = criterion.forward(pred, y_train)
    # Backpropagation, seeded with a gradient of ones.
    loss.backward(Tensor(np.ones_like(loss.data)))
    # Update the weights; step() zeroes the gradients by default.
    optim.step()
    print(loss)

[1.06372865]
[0.75148144]
[0.57384259]
[0.39574294]
[0.2482279]
[0.15515294]
[0.10423398]
[0.07571169]
[0.05837623]
[0.04700013]


# The Embedding layer

In [182]:
class Embedding(Layer):
    """Lookup table with one trainable row of ``dim`` values per vocabulary entry."""

    def __init__(self, vocab_size, dim):
        super().__init__()
        self.vocab_size, self.dim = vocab_size, dim

        # Uniform samples shifted by -0.5 and divided by dim; this
        # initialization style is a convention taken from word2vec.
        initial_rows = (np.random.rand(vocab_size, dim) - 0.5) / dim
        self.weight = Tensor(initial_rows, autograd=True)

        # The table is the layer's only parameter.
        self.parameters.append(self.weight)

    def forward(self, input):
        """Return the rows of the table selected by the index tensor ``input``."""
        selected = self.weight.index_select(input)
        return selected

In [183]:
# Select rows 1,2,3 and 2,3,4 of a 5x5 identity matrix and backpropagate a
# gradient of ones (the default when backward() gets no argument).
x = Tensor(np.eye(5), autograd=True)
x.index_select(Tensor([[1,2,3],[2,3,4]])).backward()
# Each row's gradient equals the number of times that row was selected.
print(x.grad)

[[0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1.]
 [2. 2. 2. 2. 2.]
 [2. 2. 2. 2. 2.]
 [1. 1. 1. 1. 1.]]


In [184]:
import numpy
# Freeze the seed for reproducibility.
np.random.seed(0)

# Inputs are row indices into the embedding table; one target per input.
data = Tensor(np.array([1,2,1,2]), autograd=True)
target = Tensor(np.array([[0,1,0,1]]).T, autograd=True)

# Embedding table with 5 rows of 3 values, followed by a small classifier.
embed = Embedding(5,3)
model = Sequential([embed, Tanh(), Linear(3,1), Sigmoid()])
criterion = MSELoss()

optim = SGD(parameters=model.get_parameters(), alpha=0.5)

for i in range(10):

    # Predict
    pred = model.forward(data)

    # Compare
    loss = criterion.forward(pred, target)

    # Learn: backpropagate from the loss, then update the parameters.
    loss.backward(Tensor(np.ones_like(loss.data)))
    optim.step()
    print(loss)

[0.98874126]
[0.6658868]
[0.45639889]
[0.31608168]
[0.2260925]
[0.16877423]
[0.13120515]
[0.10555487]
[0.08731868]
[0.07387834]


In [185]:
class CrossEntropyLoss(object):
    """Criterion that delegates to the input's ``cross_entropy`` method."""

    # No __init__ override: there is no state to set up.

    def forward(self, input, target):
        """Return ``input.cross_entropy(target)``."""
        loss = input.cross_entropy(target)
        return loss

import numpy as np

# Freeze the seed for reproducibility.
np.random.seed(0)

# Load the dataset: inputs are embedding row indices, targets are class indices.
X_train = Tensor(np.array([1,2,1,2]), autograd=True)
y_train = Tensor(np.array([0,1,0,1]), autograd=True)

# Hyperparameters
# input_dim is used below as the number of rows in the embedding table;
# output_dim is the number of class scores produced per input (only classes
# 0 and 1 appear in y_train).
alpha = 0.05
input_dim = 3
hidden_size = 3
output_dim = 4
epochs = 100

# Initialize the model
model = Sequential([Embedding(input_dim,hidden_size), Tanh(), Linear(hidden_size,output_dim)])
criterion = CrossEntropyLoss()
optim = SGD(parameters=model.get_parameters(), alpha=alpha)

# Training loop
for i in range(epochs):
    # Forward pass
    pred = model.forward(X_train)
    loss = criterion.forward(pred, y_train)
    # Back propagation
    loss.backward(Tensor(np.ones_like(loss.data)))
    # Update the weights; step() zeroes the gradients by default.
    optim.step()
    print(loss)

1.3885032434928422
1.1589954572218848
0.9716749474445552
0.8210884856655897
0.7006078064272916
0.6039034135741919
0.5256911650712581
0.4618412956011161
0.4092094174112788
0.3654141538910436
0.32864434909977636
0.29751151228726846
0.27094154389571456
0.24809605233780735
0.22831497243195564
0.2110743693694357
0.19595511500698373
0.18261942038294754
0.17079310304498868
0.16025208120465795
0.15081201069040906
0.1423202773270627
0.13464976758172476
0.12769399096647832
0.12136323670917579
0.11558152676294825
0.11028418571360833
0.10541589142987418
0.10092910253494163
0.09678278292307543
0.0929413617343458
0.08937388097963198
0.08605329350287548
0.08295588200386533
0.08006077603098155
0.07734954863993607
0.07480587813709627
0.07241526323547084
0.0701647822365549
0.0680428886545614
0.0660392371293141
0.06414453461282305
0.06235041272552896
0.06064931791013976
0.05903441660150906
0.05749951310942289
0.05603897830027456
0.05464768748131345
0.053320966151530716
0.052054542497422016
0.050844505688

In [186]:
class RNNCell(Layer):
    """Single recurrent cell built from three linear layers.

    ``forward`` combines the current input with the previous hidden state,
    applies the activation to obtain the new hidden state, and maps that
    state to an output.
    """

    def __init__(self, n_inputs, n_hidden, n_output, activation='sigmoid'):
        """Create the cell.

        Args:
            n_inputs: size of each input vector.
            n_hidden: size of the hidden state.
            n_output: size of each output vector.
            activation: ``'sigmoid'`` or ``'tanh'``.

        Raises:
            Exception: if ``activation`` is neither of the supported names.
        """
        super().__init__()

        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_output = n_output

        if activation == 'sigmoid':
            self.activation = Sigmoid()
        elif activation == 'tanh':
            # Assign the layer (a comparison here would leave
            # self.activation unset and forward() would fail).
            self.activation = Tanh()
        else:
            raise Exception("Non-linearity not found")

        # input -> hidden, hidden -> hidden, hidden -> output
        self.w_ih = Linear(n_inputs, n_hidden)
        self.w_hh = Linear(n_hidden, n_hidden)
        self.w_ho = Linear(n_hidden, n_output)

        # Expose the parameters of all three linear layers.
        self.parameters += self.w_ih.get_parameters()
        self.parameters += self.w_hh.get_parameters()
        self.parameters += self.w_ho.get_parameters()

    def forward(self, input, hidden):
        """Advance the cell by one step.

        Args:
            input: tensor for the current step.
            hidden: hidden state from the previous step.

        Returns:
            ``(output, new_hidden)``.
        """
        from_prev_hidden = self.w_hh.forward(hidden)
        combined = self.w_ih.forward(input) + from_prev_hidden
        new_hidden = self.activation.forward(combined)
        output = self.w_ho.forward(new_hidden)
        return output, new_hidden

    def init_hidden(self, batch_size=1):
        """Return an all-zero hidden state of shape ``(batch_size, n_hidden)``."""
        return Tensor(np.zeros((batch_size,self.n_hidden)), autograd=True)

In [187]:
! wget https://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz
! tar -xvf tasks_1-20_v1-2.tar.gz

--2024-08-24 15:38:41--  https://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz
Resolving www.thespermwhale.com (www.thespermwhale.com)... 50.31.160.191
Connecting to www.thespermwhale.com (www.thespermwhale.com)|50.31.160.191|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15719851 (15M) [application/x-gzip]
Saving to: ‘tasks_1-20_v1-2.tar.gz.2’


2024-08-24 15:38:44 (6.27 MB/s) - ‘tasks_1-20_v1-2.tar.gz.2’ saved [15719851/15719851]

tasks_1-20_v1-2/
tasks_1-20_v1-2/hn/
tasks_1-20_v1-2/hn/qa16_basic-induction_train.txt
tasks_1-20_v1-2/hn/qa13_compound-coreference_train.txt
tasks_1-20_v1-2/hn/qa13_compound-coreference_test.txt
tasks_1-20_v1-2/hn/qa14_time-reasoning_test.txt
tasks_1-20_v1-2/hn/qa5_three-arg-relations_test.txt
tasks_1-20_v1-2/hn/qa17_positional-reasoning_train.txt
tasks_1-20_v1-2/hn/qa9_simple-negation_train.txt
tasks_1-20_v1-2/hn/qa12_conjunction_train.txt
tasks_1-20_v1-2/hn/qa6_yes-no-questions_train.txt
tasks_1-20_v1-2/hn/qa2_tw

In [188]:
import sys,random,math
from collections import Counter
import numpy as np

# Read the training file; the context manager closes it even if reading fails.
with open('tasks_1-20_v1-2/en/qa1_single-supporting-fact_train.txt','r') as f:
    raw = f.readlines()

# For the first 1000 lines: lower-case, drop the newline, split on single
# spaces and discard the first field.
tokens = [line.lower().replace("\n","").split(" ")[1:] for line in raw[0:1000]]

# Left-pad every sentence shorter than six tokens with '-'.
new_tokens = [['-'] * (6 - len(line)) + line for line in tokens]

tokens = new_tokens

# Collect the distinct words. They are sorted because the iteration order of
# a set of strings can change between interpreter sessions, which would make
# the word -> index mapping (and everything trained on it) irreproducible.
vocab = sorted({word for sent in tokens for word in sent})

# Position of each word in the sorted vocabulary.
word2index = {word: i for i, word in enumerate(vocab)}

def words2indices(sentence):
    """Return the ``word2index`` entry of every word in ``sentence``, in order.

    Raises ``KeyError`` for a word that is not in ``word2index``.
    """
    return [word2index[token] for token in sentence]

# Convert every padded sentence into its list of vocabulary indices, reusing
# words2indices instead of repeating its lookup loop here.
indices = [words2indices(line) for line in tokens]

# One row per sentence, one column per token position.
data = np.array(indices)

In [189]:
# Hyperparameters
alpha = 0.05
epochs = 1_000
batch_size = 100
input_dim = len(vocab)
output_dim = len(vocab)

# Word embeddings (16 values per vocabulary entry) feed a recurrent cell that
# produces one score per vocabulary entry.
embed = Embedding(vocab_size=input_dim,dim=16)
model = RNNCell(n_inputs=16, n_hidden=16, n_output=output_dim)

criterion = CrossEntropyLoss()
# Optimize the cell's parameters together with the embedding table.
optim = SGD(parameters=model.get_parameters() + embed.get_parameters(), alpha=alpha)

# Training loop
# NOTE(review): the names `iter` and `input` used below shadow Python builtins.
for iter in range(epochs):
    # Same value as the assignment before the loop.
    batch_size = 100
    total_loss = 0

    # Start every iteration from an all-zero hidden state.
    hidden = model.init_hidden(batch_size=batch_size)

    # Feed token columns 0..4 one step at a time. Every iteration uses the
    # same rows, data[0:batch_size].
    for t in range(5):
        input = Tensor(data[0:batch_size,t], autograd=True)
        rnn_input = embed.forward(input=input)
        output, hidden = model.forward(input=rnn_input, hidden=hidden)

    # `t` is still 4 here (left over from the loop above), so the target is
    # token column 5; only the output of the last step enters the loss.
    target = Tensor(data[0:batch_size,t+1], autograd=True)
    loss = criterion.forward(output, target)
    loss.backward()
    optim.step()
    total_loss += loss.data
    if(iter % 200 == 0):
        # Fraction (0..1) of rows whose highest-scoring word equals the target.
        p_correct = (target.data == np.argmax(output.data,axis=1)).mean()
        # NOTE(review): total_loss holds a single batch's loss (it is reset
        # every iteration) yet is divided by len(data)/batch_size - confirm
        # that this scaling is intended.
        print("Loss:",total_loss / (len(data)/batch_size),"% Correct:",p_correct)

Loss: 0.4364644457671538 % Correct: 0.02
Loss: 0.17284790681688394 % Correct: 0.25
Loss: 0.1582427394973346 % Correct: 0.31
Loss: 0.1411711909238484 % Correct: 0.37
Loss: 0.13707044352997122 % Correct: 0.37


In [190]:
# Run the trained cell on the first sentence only.
batch_size = 1
hidden = model.init_hidden(batch_size=batch_size)
# Feed token columns 0..4 one step at a time.
for t in range(5):
    input = Tensor(data[0:batch_size,t], autograd=True)
    rnn_input = embed.forward(input=input)
    output, hidden = model.forward(input=rnn_input, hidden=hidden)

# `t` is still 4 here, so the target is token column 5.
# NOTE(review): `loss` is computed but not used in this cell.
target = Tensor(data[0:batch_size, t+1], autograd=True)
loss = criterion.forward(output, target)

# Rebuild the context text from every token of the sentence except the last.
context = ""
for idx in data[0:batch_size][0][0:-1]:
    context += vocab[idx] + " "
# The predicted word is the vocabulary entry with the highest output score.
pred = vocab[output.data.argmax()]
print(f"Context: {context}\n pred: {pred}")

Context: - mary moved to the 
 pred: garden.
