In [1]:
import numpy as np
np.random.seed(0)

In [2]:
# Basic Tensor

In [3]:
class Tensor(object):
    def __init__(self, data):
        self.data = np.array(data)
    
    def __add__(self, other):
        return Tensor(self.data + other.data)

    def __repr__(self):
        return str(self.data.__repr__())
    
    def __str__(self):
        return str(self.data.__str__())

In [4]:
x = Tensor([1,2,3,4,5])
print(x)

[1 2 3 4 5]


In [5]:
y = x + x
print(y)

[ 2  4  6  8 10]


In [6]:
# Autograd

In [7]:
class Tensor(object):
    def __init__(self, data, creators=None, creation_op=None):
        self.data = np.array(data)
        self.creation_op = creation_op
        self.creators = creators
        self.grad = None
    
    def backward(self, grad):
        self.grad = grad
        
        if self.creation_op == "add":
            self.creators[0].backward(grad)
            self.creators[1].backward(grad)
    
    def __add__(self, other):
        return Tensor(self.data + other.data, creators=[self, other], creation_op="add")

    def __repr__(self):
        return str(self.data.__repr__())
    
    def __str__(self):
        return str(self.data.__str__())

In [8]:
x = Tensor([1,2,3,4,5])
y = Tensor([2,2,2,2,2])
z = x + y
z.backward(Tensor(np.array([1,1,1,1,1])))

print(x.grad)
print(y.grad)
print(z.creators)
print(z.creation_op)

[1 1 1 1 1]
[1 1 1 1 1]
[array([1, 2, 3, 4, 5]), array([2, 2, 2, 2, 2])]
add


In [9]:
a = Tensor([1,2,3,4,5])
b = Tensor([2,2,2,2,2])
c = Tensor([5,4,3,2,1])
d = Tensor([-1,-2,-3,-4,-5])

e = a + b
f = c + d
g = e + f

g.backward(Tensor(np.array([1,1,1,1,1])))
print(a.grad)

[1 1 1 1 1]


In [10]:
# Multiple-use tensors

In [11]:
a = Tensor([1,2,3,4,5])
b = Tensor([2,2,2,2,2])
c = Tensor([5,4,3,2,1])

d = a + b
e = b + c
f = d + e

f.backward(Tensor(np.array([1,1,1,1,1])))
print(b.grad.data == np.array([2,2,2,2,2]))

[False False False False False]


In [12]:
class Tensor(object):
    def __init__(self, data, autograd=False, creators=None, creation_op=None, id=None):
        self.data = np.array(data)
        self.creation_op = creation_op
        self.creators = creators
        self.grad = None
        self.autograd = autograd
        self.children = {}
        self.id = np.random.randint(0, 100000) if id is None else id
        
        if creators is not None:
            for c in creators:
                if (self.id not in c.children):
                    c.children[self.id] = 1
                else:
                    c.children[self.id] += 1
    
    def all_children_grads_accounted_for(self):
        for _, cnt in self.children.items():
            if cnt != 0:
                return False
        return True
    
    def backward(self, grad=None, grad_origin=None):
        if not self.autograd:
            return
        
        if grad_origin is not None:
            if self.children[grad_origin.id] == 0:
                raise Exception("cannot backprop more than once")

            self.children[grad_origin.id] -= 1
        
        if self.grad is None:
            self.grad = grad
        else:
            self.grad += grad
        
        if self.creators is not None and (self.all_children_grads_accounted_for() or grad_origin is None):
            if self.creation_op == "add":
                self.creators[0].backward(self.grad, self)
                self.creators[1].backward(self.grad, self)
    
    def __add__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data + other.data, autograd=True, creators=[self, other], creation_op="add")
        return Tensor(self.data + other.data)

    def __repr__(self):
        return str(self.data.__repr__())
    
    def __str__(self):
        return str(self.data.__str__())

In [13]:
a = Tensor([1,2,3,4,5], autograd=True)
b = Tensor([2,2,2,2,2], autograd=True)
c = Tensor([5,4,3,2,1], autograd=True)

d = a + b
e = b + c
f = d + e

f.backward(Tensor(np.array([1,1,1,1,1])))

print(b.grad.data == np.array([2,2,2,2,2]))

[ True  True  True  True  True]


In [14]:
# Negation

In [15]:
class Tensor(object):
    def __init__(self, data, autograd=False, creators=None, creation_op=None, id=None):
        self.data = np.array(data)
        self.creation_op = creation_op
        self.creators = creators
        self.grad = None
        self.autograd = autograd
        self.children = {}
        self.id = np.random.randint(0, 100000) if id is None else id
        
        if creators is not None:
            for c in creators:
                if (self.id not in c.children):
                    c.children[self.id] = 1
                else:
                    c.children[self.id] += 1
    
    def all_children_grads_accounted_for(self):
        for _, cnt in self.children.items():
            if cnt != 0:
                return False
        return True
    
    def backward(self, grad=None, grad_origin=None):
        if not self.autograd:
            return
        
        if grad_origin is not None:
            if self.children[grad_origin.id] == 0:
                raise Exception("cannot backprop more than once")

            self.children[grad_origin.id] -= 1
        
        if self.grad is None:
            self.grad = grad
        else:
            self.grad += grad
        
        if self.creators is not None and (self.all_children_grads_accounted_for() or grad_origin is None):
            if self.creation_op == "add":
                self.creators[0].backward(self.grad, self)
                self.creators[1].backward(self.grad, self)
            elif self.creation_op == "neg":
                self.creators[0].backward(self.grad.__neg__(), self)
    
    def __add__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data + other.data, autograd=True, creators=[self, other], creation_op="add")
        return Tensor(self.data + other.data)

    def __neg__(self):
        if self.autograd:
            return Tensor(self.data * -1, autograd=True, creators=[self], creation_op="neg")
        return Tensor(self.data * -1)

    def __repr__(self):
        return str(self.data.__repr__())
    
    def __str__(self):
        return str(self.data.__str__())

In [16]:
a = Tensor([1,2,3,4,5], autograd=True)
b = Tensor([2,2,2,2,2], autograd=True)
c = Tensor([5,4,3,2,1], autograd=True)

d = a + (-b)
e = (-b) + c
f = d + e

f.backward(Tensor(np.array([1,1,1,1,1])))

print(b.grad.data == np.array([-2,-2,-2,-2,-2]))

[ True  True  True  True  True]


In [17]:
# Additional functions

In [18]:
class Tensor(object):
    def __init__(self, data, autograd=False, creators=None, creation_op=None, id=None):
        self.data = np.array(data)
        self.creation_op = creation_op
        self.creators = creators
        self.grad = None
        self.autograd = autograd
        self.children = {}
        self.id = np.random.randint(0, 100000) if id is None else id
        
        if creators is not None:
            for c in creators:
                if (self.id not in c.children):
                    c.children[self.id] = 1
                else:
                    c.children[self.id] += 1
    
    def all_children_grads_accounted_for(self):
        for _, cnt in self.children.items():
            if cnt != 0:
                return False
        return True
    
    def backward(self, grad=None, grad_origin=None):
        if not self.autograd:
            return
        
        if grad is None:
            grad = Tensor(np.ones_like(self.data))
        
        if grad_origin is not None:
            if self.children[grad_origin.id] == 0:
                raise Exception("cannot backprop more than once")

            self.children[grad_origin.id] -= 1
        
        if self.grad is None:
            self.grad = grad
        else:
            self.grad += grad
        
        if self.creators is not None and (self.all_children_grads_accounted_for() or grad_origin is None):
            if self.creation_op == "neg":
                self.creators[0].backward(self.grad.__neg__(), self)
            elif self.creation_op == "add":
                self.creators[0].backward(self.grad, self)
                self.creators[1].backward(self.grad, self)
            elif self.creation_op == "sub":
                positive_grad = Tensor(self.grad.data)
                self.creators[0].backward(positive_grad, self)
                negative_grad = Tensor(self.grad.__neg__().data)
                self.creators[1].backward(negative_grad, self)
            elif self.creation_op == "mul":
                mul_grad_0 = self.grad * self.creators[0]
                self.creators[0].backward(mul_grad_0, self)
                mul_grad_1 = self.grad * self.creators[1]
                self.creators[1].backward(mul_grad_1, self)
            elif self.creation_op == "transpose":
                self.creators[0].backward(self.grad.transpose())
            elif self.creation_op == "mm":
                activations = self.creators[0]
                weights = self.creators[1]
                activations_grad = self.grad.mm(weights.transpose())
                activations.backward(activations_grad)
                weights_grad = self.grad.transpose().mm(activations).transpose()
                weights.backward(weights_grad)
            elif "sum" in self.creation_op:
                dim = int(self.creation_op.split("_")[1])
                ds = self.creators[0].data.shape[dim]
                self.creators[0].backward(self.grad.expand(dim, ds))
            elif "expand" in self.creation_op:
                dim = int(self.creation_op.split("_")[1])
                self.creators[0].backward(self.grad.sum(dim))
            

    def __neg__(self):
        if self.autograd:
            return Tensor(self.data * -1, autograd=True, creators=[self], creation_op="neg")
        return Tensor(self.data * -1)
    
    def __add__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data + other.data, autograd=True, creators=[self, other], creation_op="add")
        return Tensor(self.data + other.data)

    def __sub__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data - other.data, autograd=True, creators=[self, other], creation_op="sub")
        return Tensor(self.data - other.data)

    def __mul__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data * other.data, autograd=True, creators=[self, other], creation_op="mul")
        return Tensor(self.data * other.data)
    
    def sum(self, dimension):
        if self.autograd:
            return Tensor(self.data.sum(dimension), autograd=True, creators=[self], creation_op="sum_"+str(dimension))
        return Tensor(self.data.sum(dimension))

    def expand(self, dimension, copies):
        transpose_cmd = list(range(0, len(self.data.shape)))
        transpose_cmd.insert(dimension, len(self.data.shape))
        new_shape = list(self.data.shape) + [copies]
        new_data = self.data.repeat(copies).reshape(new_shape).transpose(transpose_cmd)
        
        if self.autograd:
            return Tensor(new_data, autograd=True, creators=[self], creation_op="expand_"+str(dimension))
        return Tensor(new_data)
    
    def transpose(self):
        if self.autograd:
            return Tensor(self.data.transpose(), autograd=True, creators=[self], creation_op="transpose")
        return Tensor(self.data.transpose())
    
    def mm(self, x):
        if self.autograd and x.autograd:
            return Tensor(self.data.dot(x.data), autograd=True, creators=[self, x], creation_op="mm")
        return Tensor(self.data.dot(x.data))

    def __repr__(self):
        return str(self.data.__repr__())
    
    def __str__(self):
        return str(self.data.__str__())

In [19]:
# Training a network

In [20]:
data = np.array([[0,0], [0,1], [1,0], [1,1]])
target = np.array([[0],[1],[0],[1]])

weights_0_1 = np.random.rand(2, 3)
weights_1_2 = np.random.rand(3, 1)

alpha = 0.1

In [21]:
for i in range(10):
    layer_1 = data.dot(weights_0_1)
    layer_2 = layer_1.dot(weights_1_2)
    
    diff = (layer_2 - target)
    sqdiff = diff ** 2
    loss = sqdiff.sum(0)
    
    layer_1_grad = diff.dot(weights_1_2.transpose())
    weight_1_2_update = layer_1.transpose().dot(diff)
    weight_0_1_update = data.transpose().dot(layer_1_grad)
    
    weights_1_2 -= weight_1_2_update * alpha
    weights_0_1 -= weight_0_1_update * alpha
    print(loss[0])

1.5577602923842329
0.5469507992621946
0.45037225801320546
0.380620419782252
0.3237968431042391
0.27629934815300594
0.23609460698260856
0.20177490347199836
0.17231082160127653
0.1469253877068483


In [22]:
data = Tensor(np.array([[0,0],[0,1],[1,0],[1,1]]), autograd=True)
target = Tensor(np.array([[0],[1],[0],[1]]), autograd=True)

weight_layers = list()
weight_layers.append(Tensor(np.random.rand(2,3), autograd=True))
weight_layers.append(Tensor(np.random.rand(3,1), autograd=True))

alpha = 0.1

In [23]:
for i in range(10):
    prediction = data.mm(weight_layers[0]).mm(weight_layers[1])
    loss = ((prediction - target) * (prediction - target)).sum(0)
    
    loss.backward(Tensor(np.ones_like(loss.data)))
    for weight_layer in weight_layers:
        weight_layer.data -= weight_layer.grad.data * alpha
        weight_layer.grad.data *= 0
    
    print(loss)

[1.76391958]
[0.96123086]
[0.33601059]
[0.27636296]
[0.22428661]
[0.17849161]
[0.13893825]
[0.10559741]
[0.07827501]
[0.05656575]


In [24]:
# Automatic optimization

In [25]:
class SGD(object):
    def __init__(self, parameters, alpha=0.1):
        self.parameters = parameters
        self.alpha = alpha
    
    def zero(self):
        for parameter in self.parameters:
            parameter.grad.data *= 0

    def step(self, zero=True):
        for parameter in self.parameters:
            parameter.data -= parameter.grad.data * self.alpha
            
            if zero:
                parameter.grad.data *= 0

In [26]:
data = Tensor(np.array([[0,0],[0,1],[1,0],[1,1]]), autograd=True)
target = Tensor(np.array([[0],[1],[0],[1]]), autograd=True)

weight_layers = list()
weight_layers.append(Tensor(np.random.rand(2,3), autograd=True))
weight_layers.append(Tensor(np.random.rand(3,1), autograd=True))

optimizer = SGD(parameters=weight_layers, alpha=0.1)

In [27]:
for i in range(10):
    prediction = data.mm(weight_layers[0]).mm(weight_layers[1])
    loss = ((prediction - target) * (prediction - target)).sum(0)
    loss.backward(Tensor(np.ones_like(loss.data)))
    optimizer.step()
    
    print(loss)

[0.58298382]
[0.42357386]
[0.31110421]
[0.22411593]
[0.15731142]
[0.1072131]
[0.07083649]
[0.04537987]
[0.02823462]
[0.01710821]


In [28]:
# Layer types

In [29]:
class Layer(object):
    def __init__(self):
        self.parameters = list()
    
    def get_parameters(self):
        return self.parameters

In [30]:
class Linear(Layer):
    def __init__(self, n_inputs, n_outputs):
        super().__init__()
        weights = np.random.randn(n_inputs, n_outputs) * np.sqrt(2.0/n_inputs)
        self.weights = Tensor(weights, autograd=True)
        self.biases = Tensor(np.zeros(n_outputs), autograd=True)
        
        self.parameters.append(self.weights)
        self.parameters.append(self.biases)
    
    def forward(self, input):
        return input.mm(self.weights) + self.biases.expand(0, len(input.data))

In [31]:
class Sequential(Layer):
    def __init__(self, layers=list()):
        super().__init__()
        self.layers = layers
    
    def get_parameters(self):
        parameters = list()
        for layer in self.layers:
            parameters += layer.get_parameters()
        return parameters
    
    def add(self, layer):
        self.layers.append(layer)
    
    def forward(self, input):
        for layer in self.layers:
            input = layer.forward(input)
        return input

In [32]:
data = Tensor(np.array([[0,0],[0,1],[1,0],[1,1]]), autograd=True)
target = Tensor(np.array([[0],[1],[0],[1]]), autograd=True)

model = Sequential([Linear(2, 3), Linear(3, 1)])
optimizer = SGD(parameters=model.get_parameters(), alpha=0.05)

In [33]:
for i in range(10):
    prediction = model.forward(data)
    
    loss = ((prediction - target) * (prediction - target)).sum(0)
    
    loss.backward(Tensor(np.ones_like(loss.data)))
    optimizer.step()
    
    print(loss)

[14.11211938]
[1.28564955]
[0.53306001]
[0.32024448]
[0.20546412]
[0.1325231]
[0.08501483]
[0.05434979]
[0.03475919]
[0.02230738]


In [34]:
# Loss-function layers

In [35]:
class MSELoss(Layer):
    def __init__(self):
        super().__init__()
    
    def forward(self, prediction, target):
        diff = prediction - target
        return (diff * diff).sum(0)

In [36]:
data = Tensor(np.array([[0,0],[0,1],[1,0],[1,1]]), autograd=True)
target = Tensor(np.array([[0],[1],[0],[1]]), autograd=True)

model = Sequential([Linear(2, 3), Linear(3, 1)])
criterion = MSELoss()

optimizer = SGD(parameters=model.get_parameters(), alpha=0.05)

In [37]:
for i in range(10):
    prediction = model.forward(data)
    
    loss = criterion.forward(prediction, target)
    print(loss)
    
    loss.backward(Tensor(np.ones_like(loss.data)))
    optimizer.step()

[4.86870492]
[9.88245204]
[10.61136536]
[0.53863137]
[0.42991429]
[0.36324564]
[0.30510217]
[0.25303351]
[0.20686486]
[0.16659943]


In [38]:
# Nonlinear layers

In [39]:
class Tensor(object):
    def __init__(self, data, autograd=False, creators=None, creation_op=None, id=None):
        self.data = np.array(data)
        self.creation_op = creation_op
        self.creators = creators
        self.grad = None
        self.autograd = autograd
        self.children = {}
        self.id = np.random.randint(0, 100000) if id is None else id
        
        if creators is not None:
            for c in creators:
                if (self.id not in c.children):
                    c.children[self.id] = 1
                else:
                    c.children[self.id] += 1
    
    def all_children_grads_accounted_for(self):
        for _, cnt in self.children.items():
            if cnt != 0:
                return False
        return True
    
    def backward(self, grad=None, grad_origin=None):
        if not self.autograd:
            return
        
        if grad is None:
            grad = Tensor(np.ones_like(self.data))
        
        if grad_origin is not None:
            if self.children[grad_origin.id] == 0:
                raise Exception("cannot backprop more than once")

            self.children[grad_origin.id] -= 1
        
        if self.grad is None:
            self.grad = grad
        else:
            self.grad += grad
        
        if self.creators is not None and (self.all_children_grads_accounted_for() or grad_origin is None):
            if self.creation_op == "neg":
                self.creators[0].backward(self.grad.__neg__(), self)
            elif self.creation_op == "add":
                self.creators[0].backward(self.grad, self)
                self.creators[1].backward(self.grad, self)
            elif self.creation_op == "sub":
                positive_grad = Tensor(self.grad.data)
                self.creators[0].backward(positive_grad, self)
                negative_grad = Tensor(self.grad.__neg__().data)
                self.creators[1].backward(negative_grad, self)
            elif self.creation_op == "mul":
                mul_grad_0 = self.grad * self.creators[0]
                self.creators[0].backward(mul_grad_0, self)
                mul_grad_1 = self.grad * self.creators[1]
                self.creators[1].backward(mul_grad_1, self)
            elif self.creation_op == "transpose":
                self.creators[0].backward(self.grad.transpose())
            elif self.creation_op == "mm":
                activations = self.creators[0]
                weights = self.creators[1]
                activations_grad = self.grad.mm(weights.transpose())
                activations.backward(activations_grad)
                weights_grad = self.grad.transpose().mm(activations).transpose()
                weights.backward(weights_grad)
            elif "sum" in self.creation_op:
                dim = int(self.creation_op.split("_")[1])
                ds = self.creators[0].data.shape[dim]
                self.creators[0].backward(self.grad.expand(dim, ds))
            elif "expand" in self.creation_op:
                dim = int(self.creation_op.split("_")[1])
                self.creators[0].backward(self.grad.sum(dim))
            elif self.creation_op == "sigmoid":
                ones = Tensor(np.ones_like(self.grad.data))
                self.creators[0].backward(self.grad * self * (ones - self))
            elif self.creation_op == "tanh":
                ones = Tensor(np.ones_like(self.grad.data))
                self.creators[0].backward(self.grad * (ones - (self * self)))
            

    def __neg__(self):
        if self.autograd:
            return Tensor(self.data * -1, autograd=True, creators=[self], creation_op="neg")
        return Tensor(self.data * -1)
    
    def __add__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data + other.data, autograd=True, creators=[self, other], creation_op="add")
        return Tensor(self.data + other.data)

    def __sub__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data - other.data, autograd=True, creators=[self, other], creation_op="sub")
        return Tensor(self.data - other.data)

    def __mul__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data * other.data, autograd=True, creators=[self, other], creation_op="mul")
        return Tensor(self.data * other.data)
    
    def sum(self, dimension):
        if self.autograd:
            return Tensor(self.data.sum(dimension), autograd=True, creators=[self], creation_op="sum_"+str(dimension))
        return Tensor(self.data.sum(dimension))

    def expand(self, dimension, copies):
        transpose_cmd = list(range(0, len(self.data.shape)))
        transpose_cmd.insert(dimension, len(self.data.shape))
        new_shape = list(self.data.shape) + [copies]
        new_data = self.data.repeat(copies).reshape(new_shape).transpose(transpose_cmd)
        
        if self.autograd:
            return Tensor(new_data, autograd=True, creators=[self], creation_op="expand_"+str(dimension))
        return Tensor(new_data)
    
    def transpose(self):
        if self.autograd:
            return Tensor(self.data.transpose(), autograd=True, creators=[self], creation_op="transpose")
        return Tensor(self.data.transpose())
    
    def mm(self, x):
        if self.autograd and x.autograd:
            return Tensor(self.data.dot(x.data), autograd=True, creators=[self, x], creation_op="mm")
        return Tensor(self.data.dot(x.data))
    
    def sigmoid(self):
        if self.autograd:
            return Tensor(1 / (1 + np.exp(-self.data)), autograd=True, creators=[self], creation_op="sigmoid")
        return Tensor(1 / (1 + np.exp(-self.data)))

    def tanh(self):
        if self.autograd:
            return Tensor(np.tanh(self.data), autograd=True, creators=[self], creation_op="tanh")
        return Tensor(np.tanh(self.data))

    def __repr__(self):
        return str(self.data.__repr__())
    
    def __str__(self):
        return str(self.data.__str__())

In [40]:
class Tanh(Layer):
    def __init__(self):
        super().__init__()
    
    def forward(self, input):
        return input.tanh()

In [41]:
class Sigmoid(Layer):
    def __init__(self):
        super().__init__()
    
    def forward(self, input):
        return input.sigmoid()

In [42]:
class Relu(Layer):
    def __init__(self):
        super().__init__()
    
    def forward(self, input):
        return input.relu()

In [43]:
data = Tensor(np.array([[0,0],[0,1],[1,0],[1,1]]), autograd=True)
target = Tensor(np.array([[0],[1],[0],[1]]), autograd=True)

model = Sequential([Linear(2, 3), Tanh(), Linear(3, 1), Sigmoid()])
criterion = MSELoss()

optimizer = SGD(parameters=model.get_parameters(), alpha=1)

In [44]:
for i in range(10):
    prediction = model.forward(data)
    
    loss = criterion.forward(prediction, target)
    print(loss)
    
    loss.backward(Tensor(np.ones_like(loss.data)))
    optimizer.step()

[1.09648744]
[0.84013166]
[0.66856069]
[0.49896911]
[0.36942846]
[0.29071279]
[0.18462137]
[0.11293919]
[0.07756522]
[0.06419609]


In [45]:
# Embedding layer

In [46]:
class Tensor(object):
    def __init__(self, data, autograd=False, creators=None, creation_op=None, id=None):
        self.data = np.array(data)
        self.creation_op = creation_op
        self.creators = creators
        self.grad = None
        self.autograd = autograd
        self.children = {}
        self.id = np.random.randint(0, 100000) if id is None else id
        
        if creators is not None:
            for c in creators:
                if (self.id not in c.children):
                    c.children[self.id] = 1
                else:
                    c.children[self.id] += 1
    
    def all_children_grads_accounted_for(self):
        for _, cnt in self.children.items():
            if cnt != 0:
                return False
        return True
    
    def backward(self, grad=None, grad_origin=None):
        if not self.autograd:
            return
        
        if grad is None:
            grad = Tensor(np.ones_like(self.data))
        
        if grad_origin is not None:
            if self.children[grad_origin.id] == 0:
                raise Exception("cannot backprop more than once")

            self.children[grad_origin.id] -= 1
        
        if self.grad is None:
            self.grad = grad
        else:
            self.grad += grad
        
        if self.creators is not None and (self.all_children_grads_accounted_for() or grad_origin is None):
            if self.creation_op == "neg":
                self.creators[0].backward(self.grad.__neg__(), self)
            elif self.creation_op == "add":
                self.creators[0].backward(self.grad, self)
                self.creators[1].backward(self.grad, self)
            elif self.creation_op == "sub":
                positive_grad = Tensor(self.grad.data)
                self.creators[0].backward(positive_grad, self)
                negative_grad = Tensor(self.grad.__neg__().data)
                self.creators[1].backward(negative_grad, self)
            elif self.creation_op == "mul":
                mul_grad_0 = self.grad * self.creators[0]
                self.creators[0].backward(mul_grad_0, self)
                mul_grad_1 = self.grad * self.creators[1]
                self.creators[1].backward(mul_grad_1, self)
            elif self.creation_op == "transpose":
                self.creators[0].backward(self.grad.transpose())
            elif self.creation_op == "mm":
                activations = self.creators[0]
                weights = self.creators[1]
                activations_grad = self.grad.mm(weights.transpose())
                activations.backward(activations_grad)
                weights_grad = self.grad.transpose().mm(activations).transpose()
                weights.backward(weights_grad)
            elif "sum" in self.creation_op:
                dim = int(self.creation_op.split("_")[1])
                ds = self.creators[0].data.shape[dim]
                self.creators[0].backward(self.grad.expand(dim, ds))
            elif "expand" in self.creation_op:
                dim = int(self.creation_op.split("_")[1])
                self.creators[0].backward(self.grad.sum(dim))
            elif self.creation_op == "sigmoid":
                ones = Tensor(np.ones_like(self.grad.data))
                self.creators[0].backward(self.grad * self * (ones - self))
            elif self.creation_op == "tanh":
                ones = Tensor(np.ones_like(self.grad.data))
                self.creators[0].backward(self.grad * (ones - (self * self)))
            elif self.creation_op == "index_select":
                new_grad = np.zeros_like(self.creators[0].data)
                indices = self.index_select_indices.data.flatten()
                grad_reshaped = grad.data.reshape(len(indices), -1)
                for i in range(len(indices)):
                    new_grad[indices[i]] += grad_reshaped[i]
                self.creators[0].backward(Tensor(new_grad))
            

    def __neg__(self):
        if self.autograd:
            return Tensor(self.data * -1, autograd=True, creators=[self], creation_op="neg")
        return Tensor(self.data * -1)
    
    def __add__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data + other.data, autograd=True, creators=[self, other], creation_op="add")
        return Tensor(self.data + other.data)

    def __sub__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data - other.data, autograd=True, creators=[self, other], creation_op="sub")
        return Tensor(self.data - other.data)

    def __mul__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data * other.data, autograd=True, creators=[self, other], creation_op="mul")
        return Tensor(self.data * other.data)
    
    def sum(self, dimension):
        if self.autograd:
            return Tensor(self.data.sum(dimension), autograd=True, creators=[self], creation_op="sum_"+str(dimension))
        return Tensor(self.data.sum(dimension))

    def expand(self, dimension, copies):
        transpose_cmd = list(range(0, len(self.data.shape)))
        transpose_cmd.insert(dimension, len(self.data.shape))
        new_shape = list(self.data.shape) + [copies]
        new_data = self.data.repeat(copies).reshape(new_shape).transpose(transpose_cmd)
        
        if self.autograd:
            return Tensor(new_data, autograd=True, creators=[self], creation_op="expand_"+str(dimension))
        return Tensor(new_data)
    
    def transpose(self):
        if self.autograd:
            return Tensor(self.data.transpose(), autograd=True, creators=[self], creation_op="transpose")
        return Tensor(self.data.transpose())
    
    def mm(self, x):
        if self.autograd and x.autograd:
            return Tensor(self.data.dot(x.data), autograd=True, creators=[self, x], creation_op="mm")
        return Tensor(self.data.dot(x.data))
    
    def sigmoid(self):
        if self.autograd:
            return Tensor(1 / (1 + np.exp(-self.data)), autograd=True, creators=[self], creation_op="sigmoid")
        return Tensor(1 / (1 + np.exp(-self.data)))

    def tanh(self):
        if self.autograd:
            return Tensor(np.tanh(self.data), autograd=True, creators=[self], creation_op="tanh")
        return Tensor(np.tanh(self.data))
    
    def index_select(self, indices):
        if self.autograd:
            tensor = Tensor(self.data[indices.data], autograd=True, creators=[self], creation_op="index_select")
            tensor.index_select_indices = indices
            return tensor
        return Tensor(self.data[indices.data])

    def __repr__(self):
        return str(self.data.__repr__())
    
    def __str__(self):
        return str(self.data.__str__())

In [47]:
x = Tensor(np.eye(5), autograd=True)
x.index_select(Tensor([[1,2,3],[2,3,4]])).backward()
print(x.grad)

[[0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1.]
 [2. 2. 2. 2. 2.]
 [2. 2. 2. 2. 2.]
 [1. 1. 1. 1. 1.]]


In [48]:
class Embedding(Layer):
    def __init__(self, vocab_size, dimensions):
        super().__init__()
        
        self.vocab_size = vocab_size
        self.dimensions = dimensions
        
        weights = (np.random.rand(vocab_size, dimensions) - 0.5) / dimensions
        self.weights = Tensor(weights, autograd=True)
        
        self.parameters.append(self.weights)
    
    def forward(self, input):
        return self.weights.index_select(input)

In [49]:
data = Tensor(np.array([1,2,1,2]), autograd=True)
target = Tensor(np.array([[0],[1],[0],[1]]), autograd=True)

embedding = Embedding(5, 3)
model = Sequential([embedding, Tanh(), Linear(3, 1), Sigmoid()])
criterion = MSELoss()

optimizer = SGD(parameters=model.get_parameters(), alpha=0.5)

In [50]:
for i in range(10):
    prediction = model.forward(data)
    
    loss = criterion.forward(prediction, target)
    print(loss)
    
    loss.backward(Tensor(np.ones_like(loss.data)))
    optimizer.step()

[1.00933088]
[0.92933578]
[0.84443706]
[0.7327378]
[0.59166591]
[0.44141286]
[0.31249914]
[0.22011396]
[0.15963874]
[0.12052437]


In [51]:
# Cross entropy

In [52]:
class Tensor(object):
    def __init__(self, data, autograd=False, creators=None, creation_op=None, id=None):
        self.data = np.array(data)
        self.creation_op = creation_op
        self.creators = creators
        self.grad = None
        self.autograd = autograd
        self.children = {}
        self.id = np.random.randint(0, 100000) if id is None else id
        
        if creators is not None:
            for c in creators:
                if (self.id not in c.children):
                    c.children[self.id] = 1
                else:
                    c.children[self.id] += 1
    
    def all_children_grads_accounted_for(self):
        for _, cnt in self.children.items():
            if cnt != 0:
                return False
        return True
    
    def backward(self, grad=None, grad_origin=None):
        if not self.autograd:
            return
        
        if grad is None:
            grad = Tensor(np.ones_like(self.data))
        
        if grad_origin is not None:
            if self.children[grad_origin.id] == 0:
                raise Exception("cannot backprop more than once")

            self.children[grad_origin.id] -= 1
        
        if self.grad is None:
            self.grad = grad
        else:
            self.grad += grad
        
        if self.creators is not None and (self.all_children_grads_accounted_for() or grad_origin is None):
            if self.creation_op == "neg":
                self.creators[0].backward(self.grad.__neg__(), self)
            elif self.creation_op == "add":
                self.creators[0].backward(self.grad, self)
                self.creators[1].backward(self.grad, self)
            elif self.creation_op == "sub":
                positive_grad = Tensor(self.grad.data)
                self.creators[0].backward(positive_grad, self)
                negative_grad = Tensor(self.grad.__neg__().data)
                self.creators[1].backward(negative_grad, self)
            elif self.creation_op == "mul":
                mul_grad_0 = self.grad * self.creators[0]
                self.creators[0].backward(mul_grad_0, self)
                mul_grad_1 = self.grad * self.creators[1]
                self.creators[1].backward(mul_grad_1, self)
            elif self.creation_op == "transpose":
                self.creators[0].backward(self.grad.transpose())
            elif self.creation_op == "mm":
                activations = self.creators[0]
                weights = self.creators[1]
                activations_grad = self.grad.mm(weights.transpose())
                activations.backward(activations_grad)
                weights_grad = self.grad.transpose().mm(activations).transpose()
                weights.backward(weights_grad)
            elif "sum" in self.creation_op:
                dim = int(self.creation_op.split("_")[1])
                ds = self.creators[0].data.shape[dim]
                self.creators[0].backward(self.grad.expand(dim, ds))
            elif "expand" in self.creation_op:
                dim = int(self.creation_op.split("_")[1])
                self.creators[0].backward(self.grad.sum(dim))
            elif self.creation_op == "sigmoid":
                ones = Tensor(np.ones_like(self.grad.data))
                self.creators[0].backward(self.grad * self * (ones - self))
            elif self.creation_op == "tanh":
                ones = Tensor(np.ones_like(self.grad.data))
                self.creators[0].backward(self.grad * (ones - (self * self)))
            elif self.creation_op == "index_select":
                new_grad = np.zeros_like(self.creators[0].data)
                indices = self.index_select_indices.data.flatten()
                grad_reshaped = grad.data.reshape(len(indices), -1)
                for i in range(len(indices)):
                    new_grad[indices[i]] += grad_reshaped[i]
                self.creators[0].backward(Tensor(new_grad))
            elif self.creation_op == "cross_entropy":
                dx = self.softmax_output - self.target_dist
                self.creators[0].backward(Tensor(dx))
            

    def __neg__(self):
        if self.autograd:
            return Tensor(self.data * -1, autograd=True, creators=[self], creation_op="neg")
        return Tensor(self.data * -1)
    
    def __add__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data + other.data, autograd=True, creators=[self, other], creation_op="add")
        return Tensor(self.data + other.data)

    def __sub__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data - other.data, autograd=True, creators=[self, other], creation_op="sub")
        return Tensor(self.data - other.data)

    def __mul__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data * other.data, autograd=True, creators=[self, other], creation_op="mul")
        return Tensor(self.data * other.data)
    
    def sum(self, dimension):
        if self.autograd:
            return Tensor(self.data.sum(dimension), autograd=True, creators=[self], creation_op="sum_"+str(dimension))
        return Tensor(self.data.sum(dimension))

    def expand(self, dimension, copies):
        transpose_cmd = list(range(0, len(self.data.shape)))
        transpose_cmd.insert(dimension, len(self.data.shape))
        new_shape = list(self.data.shape) + [copies]
        new_data = self.data.repeat(copies).reshape(new_shape).transpose(transpose_cmd)
        
        if self.autograd:
            return Tensor(new_data, autograd=True, creators=[self], creation_op="expand_"+str(dimension))
        return Tensor(new_data)
    
    def transpose(self):
        if self.autograd:
            return Tensor(self.data.transpose(), autograd=True, creators=[self], creation_op="transpose")
        return Tensor(self.data.transpose())
    
    def mm(self, x):
        if self.autograd and x.autograd:
            return Tensor(self.data.dot(x.data), autograd=True, creators=[self, x], creation_op="mm")
        return Tensor(self.data.dot(x.data))
    
    def sigmoid(self):
        if self.autograd:
            return Tensor(1 / (1 + np.exp(-self.data)), autograd=True, creators=[self], creation_op="sigmoid")
        return Tensor(1 / (1 + np.exp(-self.data)))

    def tanh(self):
        if self.autograd:
            return Tensor(np.tanh(self.data), autograd=True, creators=[self], creation_op="tanh")
        return Tensor(np.tanh(self.data))
    
    def index_select(self, indices):
        if self.autograd:
            tensor = Tensor(self.data[indices.data], autograd=True, creators=[self], creation_op="index_select")
            tensor.index_select_indices = indices
            return tensor
        return Tensor(self.data[indices.data])

    def cross_entropy(self, target_indices):
        temp = np.exp(self.data)
        softmax_output = temp / np.sum(temp, axis=len(self.data.shape) - 1, keepdims=True)
        t = target_indices.data.flatten()
        p = softmax_output.reshape(len(t), -1)
        target_dist = np.eye(p.shape[1])[t]
        loss = -(np.log(p) * target_dist).sum(1).mean()
        
        if self.autograd:
            tensor = Tensor(loss, autograd=True, creators=[self], creation_op="cross_entropy")
            tensor.softmax_output = softmax_output
            tensor.target_dist = target_dist
            return tensor
        return Tensor(loss)

    def __repr__(self):
        return str(self.data.__repr__())
    
    def __str__(self):
        return str(self.data.__str__())

In [53]:
class CrossEntropyLoss(object):
    def __init__(self):
        super().__init__()
    
    def forward(self, input, target):
        return input.cross_entropy(target)

In [54]:
data = Tensor(np.array([1,2,1,2]), autograd=True)
target = Tensor(np.array([0,1,0,1]), autograd=True)

model = Sequential([Embedding(3, 3), Tanh(), Linear(3, 4)])
criterion = CrossEntropyLoss()

optimizer = SGD(parameters=model.get_parameters(), alpha=0.1)

In [55]:
for i in range(10):
    prediction = model.forward(data)
    
    loss = criterion.forward(prediction, target)
    print(loss)
    
    loss.backward(Tensor(np.ones_like(loss.data)))
    optimizer.step()

1.3399271020544412
1.0619458660885153
0.8728699024400208
0.7334989706390589
0.6224934608997292
0.530626432363073
0.4537804142680153
0.38964996221011383
0.3364454060377313
0.2925048310786341


In [56]:
# Recurrent neural network layer

In [57]:
class RNNCell(Layer):
    def __init__(self, n_inputs, n_hidden, n_output, activation="sigmoid"):
        super().__init__()
        
        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_output = n_output
        
        if activation == "sigmoid":
            self.activation = Sigmoid()
        elif activation == "tanh":
            self.activation = Tanh()
        else:
            raise Exception("Non-linearity not found")
        
        self.weights_ih = Linear(n_inputs, n_hidden)
        self.weights_hh = Linear(n_hidden, n_hidden)
        self.weights_ho = Linear(n_hidden, n_output)
        
        self.parameters += self.weights_ih.get_parameters()
        self.parameters += self.weights_hh.get_parameters()
        self.parameters += self.weights_ho.get_parameters()
    
    def forward(self, input, hidden):
        from_previous_hidden = self.weights_hh.forward(hidden)
        combined = self.weights_ih.forward(input) + from_previous_hidden
        new_hidden = self.activation.forward(combined)
        output = self.weights_ho.forward(new_hidden)
        return output, new_hidden
    
    def init_hidden(self, batch_size=1):
        return Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)

In [58]:
import sys, random, math
from collections import Counter

In [59]:
f = open('data/babi/tasksv11/en/qa1_single-supporting-fact_train.txt')
raw = f.readlines()
f.close()

In [60]:
tokens = list()
for line in raw[0:1000]:
    tokens.append(line.lower().replace("\n","").split(" ")[1:])
    
new_tokens = list()
for line in tokens:
    new_tokens.append(['-'] * (6 - len(line)) + line)
tokens = new_tokens

vocab = set()
for sent in tokens:
    for word in sent:
        vocab.add(word)
vocab = list(vocab)

word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i

def words2indices(sentence):
    idx = list()
    for word in sentence:
        idx.append(word2index[word])
    return idx

indices = list()
for line in tokens:
    idx = list()
    for word in line:
        idx.append(word2index[word])
    indices.append(idx)

data = np.array(indices)

In [61]:
embedding = Embedding(vocab_size=len(vocab), dimensions=16)
model = RNNCell(n_inputs=16, n_hidden=16, n_output=len(vocab))
criterion = CrossEntropyLoss()

parameters = model.get_parameters() + embedding.get_parameters()
optimizer = SGD(parameters=parameters, alpha=0.05)

batch_size = 100

In [62]:
for iteration in range(1000):
    total_loss = 0
    hidden = model.init_hidden(batch_size=batch_size)
    
    for t in range(5):
        input = Tensor(data[0:batch_size, t], autograd=True)
        rnn_input = embedding.forward(input=input)
        output, hidden = model.forward(input=rnn_input, hidden=hidden)
    
    target = Tensor(data[0:batch_size, t+1], autograd=True)
    loss = criterion.forward(output, target)
    
    loss.backward()
    optimizer.step()
    
    total_loss += loss.data
    
    if iteration % 200 == 0:
        p_correct = (target.data == np.argmax(output.data, axis=1)).mean()
        print_loss = total_loss / (len(data) / batch_size)
        print("Loss: %f, Correct: %f" % (print_loss, p_correct))

Loss: 0.482629, Correct: 0.000000
Loss: 0.179454, Correct: 0.220000
Loss: 0.160982, Correct: 0.320000
Loss: 0.152251, Correct: 0.330000
Loss: 0.138002, Correct: 0.370000


In [63]:
batch_size = 1
hidden = model.init_hidden(batch_size=batch_size)

for t in range(5):
    input = Tensor(data[0:batch_size, t], autograd=True)
    rnn_input = embedding.forward(input=input)
    output, hidden = model.forward(input=rnn_input, hidden=hidden)

target = Tensor(data[0:batch_size, t+1], autograd=True)
loss = criterion.forward(output, target)

ctx = ""
for idx in data[0:batch_size][0][:-1]:
    ctx += vocab[idx] + " "

print("Context: %s" % ctx)
print("Prediction: %s" % vocab[output.data.argmax()])

Context: - mary moved to the 
Prediction: office.
