In [2]:
import math
import random
import sys
from collections import Counter
import numpy as np

In [3]:
np.random.seed(0)

In [14]:
# Use definitions from previous chapters

In [117]:
class Tensor(object):
    def __init__(self, data, autograd=False, creators=None, creation_op=None, id=None):
        self.data = np.array(data)
        self.creation_op = creation_op
        self.creators = creators
        self.grad = None
        self.autograd = autograd
        self.children = {}
        self.id = np.random.randint(0, 100000) if id is None else id
        
        if creators is not None:
            for c in creators:
                if (self.id not in c.children):
                    c.children[self.id] = 1
                else:
                    c.children[self.id] += 1
    
    def all_children_grads_accounted_for(self):
        for _, cnt in self.children.items():
            if cnt != 0:
                return False
        return True
    
    def backward(self, grad=None, grad_origin=None):
        if not self.autograd:
            return
        
        if grad is None:
            grad = Tensor(np.ones_like(self.data))
        
        if grad_origin is not None:
            if self.children[grad_origin.id] == 0:
                return
                raise Exception("cannot backprop more than once")

            self.children[grad_origin.id] -= 1
        
        if self.grad is None:
            self.grad = grad
        else:
            self.grad += grad
        
        assert grad.autograd == False
        
        if self.creators is not None and (self.all_children_grads_accounted_for() or grad_origin is None):
            if self.creation_op == "neg":
                self.creators[0].backward(self.grad.__neg__(), self)
            elif self.creation_op == "add":
                self.creators[0].backward(self.grad, self)
                self.creators[1].backward(self.grad, self)
            elif self.creation_op == "sub":
                positive_grad = Tensor(self.grad.data)
                self.creators[0].backward(positive_grad, self)
                negative_grad = Tensor(self.grad.__neg__().data)
                self.creators[1].backward(negative_grad, self)
            elif self.creation_op == "mul":
                mul_grad_0 = self.grad * self.creators[0]
                self.creators[0].backward(mul_grad_0, self)
                mul_grad_1 = self.grad * self.creators[1]
                self.creators[1].backward(mul_grad_1, self)
            elif self.creation_op == "transpose":
                self.creators[0].backward(self.grad.transpose())
            elif self.creation_op == "mm":
                activations = self.creators[0]
                weights = self.creators[1]
                activations_grad = self.grad.mm(weights.transpose())
                activations.backward(activations_grad)
                weights_grad = self.grad.transpose().mm(activations).transpose()
                weights.backward(weights_grad)
            elif "sum" in self.creation_op:
                dim = int(self.creation_op.split("_")[1])
                ds = self.creators[0].data.shape[dim]
                self.creators[0].backward(self.grad.expand(dim, ds))
            elif "expand" in self.creation_op:
                dim = int(self.creation_op.split("_")[1])
                self.creators[0].backward(self.grad.sum(dim))
            elif self.creation_op == "sigmoid":
                ones = Tensor(np.ones_like(self.grad.data))
                self.creators[0].backward(self.grad * self * (ones - self))
            elif self.creation_op == "tanh":
                ones = Tensor(np.ones_like(self.grad.data))
                self.creators[0].backward(self.grad * (ones - (self * self)))
            elif self.creation_op == "index_select":
                new_grad = np.zeros_like(self.creators[0].data)
                indices = self.index_select_indices.data.flatten()
                grad_reshaped = grad.data.reshape(len(indices), -1)
                for i in range(len(indices)):
                    new_grad[indices[i]] += grad_reshaped[i]
                self.creators[0].backward(Tensor(new_grad))
            elif self.creation_op == "cross_entropy":
                dx = self.softmax_output - self.target_dist
                self.creators[0].backward(Tensor(dx))
            

    def __neg__(self):
        if self.autograd:
            return Tensor(self.data * -1, autograd=True, creators=[self], creation_op="neg")
        return Tensor(self.data * -1)
    
    def __add__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data + other.data, autograd=True, creators=[self, other], creation_op="add")
        return Tensor(self.data + other.data)

    def __sub__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data - other.data, autograd=True, creators=[self, other], creation_op="sub")
        return Tensor(self.data - other.data)

    def __mul__(self, other):
        if self.autograd and other.autograd:
            return Tensor(self.data * other.data, autograd=True, creators=[self, other], creation_op="mul")
        return Tensor(self.data * other.data)
    
    def sum(self, dimension):
        if self.autograd:
            return Tensor(self.data.sum(dimension), autograd=True, creators=[self], creation_op="sum_"+str(dimension))
        return Tensor(self.data.sum(dimension))

    def expand(self, dimension, copies):
        transpose_cmd = list(range(0, len(self.data.shape)))
        transpose_cmd.insert(dimension, len(self.data.shape))
        new_shape = list(self.data.shape) + [copies]
        new_data = self.data.repeat(copies).reshape(new_shape).transpose(transpose_cmd)
        
        if self.autograd:
            return Tensor(new_data, autograd=True, creators=[self], creation_op="expand_"+str(dimension))
        return Tensor(new_data)
    
    def transpose(self):
        if self.autograd:
            return Tensor(self.data.transpose(), autograd=True, creators=[self], creation_op="transpose")
        return Tensor(self.data.transpose())
    
    def mm(self, x):
        if self.autograd and x.autograd:
            return Tensor(self.data.dot(x.data), autograd=True, creators=[self, x], creation_op="mm")
        return Tensor(self.data.dot(x.data))
    
    def sigmoid(self):
        if self.autograd:
            return Tensor(1 / (1 + np.exp(-self.data)), autograd=True, creators=[self], creation_op="sigmoid")
        return Tensor(1 / (1 + np.exp(-self.data)))

    def tanh(self):
        if self.autograd:
            return Tensor(np.tanh(self.data), autograd=True, creators=[self], creation_op="tanh")
        return Tensor(np.tanh(self.data))
    
    def index_select(self, indices):
        if self.autograd:
            tensor = Tensor(self.data[indices.data], autograd=True, creators=[self], creation_op="index_select")
            tensor.index_select_indices = indices
            return tensor
        return Tensor(self.data[indices.data])
    
    def softmax(self):
        temp = np.exp(self.data)
        softmax_output = temp / np.sum(temp, axis=len(self.data.shape) - 1, keepdims=True)
        return softmax_output

    def cross_entropy(self, target_indices):
        temp = np.exp(self.data)
        softmax_output = temp / np.sum(temp, axis=len(self.data.shape) - 1, keepdims=True)
        t = target_indices.data.flatten()
        p = softmax_output.reshape(len(t), -1)
        target_dist = np.eye(p.shape[1])[t]
        loss = -(np.log(p) * target_dist).sum(1).mean()
        
        if self.autograd:
            tensor = Tensor(loss, autograd=True, creators=[self], creation_op="cross_entropy")
            tensor.softmax_output = softmax_output
            tensor.target_dist = target_dist
            return tensor
        return Tensor(loss)

    def __repr__(self):
        return str(self.data.__repr__())
    
    def __str__(self):
        return str(self.data.__str__())

In [83]:
class Layer(object):
    def __init__(self):
        self.parameters = list()
    
    def get_parameters(self):
        return self.parameters

In [148]:
class Linear(Layer):
    def __init__(self, n_inputs, n_outputs, bias=True):
        super().__init__()
        
        self.use_bias = bias
        
        weights = np.random.randn(n_inputs, n_outputs) * np.sqrt(2.0/n_inputs)
        self.weights = Tensor(weights, autograd=True)
        
        if self.use_bias:
            self.biases = Tensor(np.zeros(n_outputs), autograd=True)
        
        self.parameters.append(self.weights)
        if self.use_bias:
            self.parameters.append(self.biases)
    
    def forward(self, input):
        if self.use_bias:
            return input.mm(self.weights) + self.biases.expand(0, len(input.data))
        return input.mm(self.weights)

In [149]:
class Sigmoid(Layer):
    def __init__(self):
        super().__init__()
    
    def forward(self, input):
        return input.sigmoid()

In [150]:
class Embedding(Layer):
    def __init__(self, vocab_size, dimensions):
        super().__init__()
        
        self.vocab_size = vocab_size
        self.dimensions = dimensions
        
        weights = (np.random.rand(vocab_size, dimensions) - 0.5) / dimensions
        self.weights = Tensor(weights, autograd=True)
        
        self.parameters.append(self.weights)
    
    def forward(self, input):
        return self.weights.index_select(input)

In [151]:
class RNNCell(Layer):
    def __init__(self, n_inputs, n_hidden, n_output, activation="sigmoid"):
        super().__init__()
        
        self.n_inputs = n_inputs
        self.n_hidden = n_hidden
        self.n_output = n_output
        
        if activation == "sigmoid":
            self.activation = Sigmoid()
        elif activation == "tanh":
            self.activation = Tanh()
        else:
            raise Exception("Non-linearity not found")
        
        self.weights_ih = Linear(n_inputs, n_hidden)
        self.weights_hh = Linear(n_hidden, n_hidden)
        self.weights_ho = Linear(n_hidden, n_output)
        
        self.parameters += self.weights_ih.get_parameters()
        self.parameters += self.weights_hh.get_parameters()
        self.parameters += self.weights_ho.get_parameters()
    
    def forward(self, input, hidden):
        from_previous_hidden = self.weights_hh.forward(hidden)
        combined = self.weights_ih.forward(input) + from_previous_hidden
        new_hidden = self.activation.forward(combined)
        output = self.weights_ho.forward(new_hidden)
        return output, new_hidden
    
    def init_hidden(self, batch_size=1):
        return Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)

In [152]:
class CrossEntropyLoss(object):
    def __init__(self):
        super().__init__()
    
    def forward(self, input, target):
        return input.cross_entropy(target)

In [153]:
class SGD(object):
    def __init__(self, parameters, alpha=0.1):
        self.parameters = parameters
        self.alpha = alpha
    
    def zero(self):
        for parameter in self.parameters:
            parameter.grad.data *= 0

    def step(self, zero=True):
        for parameter in self.parameters:
            parameter.data -= parameter.grad.data * self.alpha
            
            if zero:
                parameter.grad.data *= 0

In [154]:
# Load dataset

In [155]:
f = open('data/shakespeare/shakespear.txt', 'r')
raw = f.read()
f.close()

In [156]:
# Vocab consists of characters, not words!
vocab = list(set(raw))

In [157]:
word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i
indices = np.array(list(map(lambda x: word2index[x], raw)))

In [158]:
embed = Embedding(vocab_size=len(vocab), dimensions=512)
model = RNNCell(n_inputs=512, n_hidden=512, n_output=len(vocab))
criterion = CrossEntropyLoss()
optimizer = SGD(parameters=model.get_parameters() + embed.get_parameters(), alpha=0.05)

In [159]:
iterations=1
batch_size = 32
bptt = 16
n_batches = int(len(indices) / batch_size)

In [160]:
trimmed_indices = indices[:n_batches*batch_size]
batched_indices = trimmed_indices.reshape(batch_size, n_batches).transpose()

input_batched_indices = batched_indices[:-1]
target_batched_indices = batched_indices[1:]

n_bptt = int((n_batches - 1) / bptt)
input_batches = input_batched_indices[:n_bptt*bptt].reshape(n_bptt, bptt, batch_size)
target_batches = target_batched_indices[:n_bptt*bptt].reshape(n_bptt, bptt, batch_size)

In [161]:
def train(iterations=iterations):
    for iteration in range(iterations):
        total_loss = 0
        hidden = model.init_hidden(batch_size=batch_size)
        n_batches = len(input_batches)
        
        for batch_i in range(n_batches):
            hidden = Tensor(hidden.data, autograd=True)
            loss = None
            
            for t in range(bptt):
                embed_input = Tensor(input_batches[batch_i][t], autograd=True)
                rnn_input = embed.forward(input=embed_input)
                output, hidden = model.forward(input=rnn_input, hidden=hidden)
                target = Tensor(target_batches[batch_i][t], autograd=True)
                batch_loss = criterion.forward(output, target)
                loss = batch_loss if t == 0 else loss + batch_loss
        
            loss.backward()
            optimizer.step()
            total_loss += loss.data / bptt
            
            log = "\rIter:" + str(iteration)
            log += "\tBatch:" + str(batch_i + 1) + "/" + str(len(input_batches))
            log += "\tLoss:" + str(np.exp(total_loss / (batch_i + 1)))
            print(log)
        
        optimizer.alpha *= 0.99

train()

Iter:0	Batch:1/195	Loss:93.41064503197084
Iter:0	Batch:2/195	Loss:7.7272044771709e+75
Iter:0	Batch:3/195	Loss:nan




Iter:0	Batch:4/195	Loss:nan
Iter:0	Batch:5/195	Loss:nan
Iter:0	Batch:6/195	Loss:nan
Iter:0	Batch:7/195	Loss:nan
Iter:0	Batch:8/195	Loss:nan
Iter:0	Batch:9/195	Loss:nan
Iter:0	Batch:10/195	Loss:nan
Iter:0	Batch:11/195	Loss:nan
Iter:0	Batch:12/195	Loss:nan
Iter:0	Batch:13/195	Loss:nan
Iter:0	Batch:14/195	Loss:nan
Iter:0	Batch:15/195	Loss:nan
Iter:0	Batch:16/195	Loss:nan
Iter:0	Batch:17/195	Loss:nan
Iter:0	Batch:18/195	Loss:nan
Iter:0	Batch:19/195	Loss:nan
Iter:0	Batch:20/195	Loss:nan
Iter:0	Batch:21/195	Loss:nan
Iter:0	Batch:22/195	Loss:nan
Iter:0	Batch:23/195	Loss:nan
Iter:0	Batch:24/195	Loss:nan
Iter:0	Batch:25/195	Loss:nan
Iter:0	Batch:26/195	Loss:nan
Iter:0	Batch:27/195	Loss:nan
Iter:0	Batch:28/195	Loss:nan
Iter:0	Batch:29/195	Loss:nan
Iter:0	Batch:30/195	Loss:nan
Iter:0	Batch:31/195	Loss:nan
Iter:0	Batch:32/195	Loss:nan
Iter:0	Batch:33/195	Loss:nan
Iter:0	Batch:34/195	Loss:nan
Iter:0	Batch:35/195	Loss:nan
Iter:0	Batch:36/195	Loss:nan
Iter:0	Batch:37/195	Loss:nan
Iter:0	Batch:38/195	



Iter:0	Batch:48/195	Loss:nan
Iter:0	Batch:49/195	Loss:nan
Iter:0	Batch:50/195	Loss:nan
Iter:0	Batch:51/195	Loss:nan
Iter:0	Batch:52/195	Loss:nan
Iter:0	Batch:53/195	Loss:nan
Iter:0	Batch:54/195	Loss:nan
Iter:0	Batch:55/195	Loss:nan
Iter:0	Batch:56/195	Loss:nan
Iter:0	Batch:57/195	Loss:nan
Iter:0	Batch:58/195	Loss:nan
Iter:0	Batch:59/195	Loss:nan
Iter:0	Batch:60/195	Loss:nan
Iter:0	Batch:61/195	Loss:nan
Iter:0	Batch:62/195	Loss:nan
Iter:0	Batch:63/195	Loss:nan
Iter:0	Batch:64/195	Loss:nan
Iter:0	Batch:65/195	Loss:nan
Iter:0	Batch:66/195	Loss:nan
Iter:0	Batch:67/195	Loss:nan
Iter:0	Batch:68/195	Loss:nan
Iter:0	Batch:69/195	Loss:nan
Iter:0	Batch:70/195	Loss:nan
Iter:0	Batch:71/195	Loss:nan
Iter:0	Batch:72/195	Loss:nan
Iter:0	Batch:73/195	Loss:nan
Iter:0	Batch:74/195	Loss:nan
Iter:0	Batch:75/195	Loss:nan
Iter:0	Batch:76/195	Loss:nan
Iter:0	Batch:77/195	Loss:nan
Iter:0	Batch:78/195	Loss:nan
Iter:0	Batch:79/195	Loss:nan
Iter:0	Batch:80/195	Loss:nan
Iter:0	Batch:81/195	Loss:nan
Iter:0	Batch:8

In [162]:
def generate_sample(n=30, init_char=' '):
    s = ""
    hidden = model.init_hidden(batch_size=1)
    embed_input = Tensor(np.array([word2index[init_char]]))

    for i in range(n):
        rnn_input = embed.forward(embed_input)
        output, hidden = model.forward(input=rnn_input, hidden = hidden)
        output.data *= 10
        temp_dist = output.softmax()
        temp_dist /= temp_dist.sum()
        
        maximum = (temp_dist > np.random.rand()).argmax()
        char = vocab[maximum]
        s += char
        embed_input = Tensor(np.array([maximum]))
    
    return s

In [163]:
generate_sample()

  del sys.path[0]


'TTTTTTTTTTTTTTTTTTTTTTTTTTTTTT'

In [164]:
# Vanishing/exploding gradients

In [165]:
sigmoid = lambda x: 1 / (1 + np.exp(-x))
relu = lambda x: (x > 0).astype(float) * x

weights = np.array([[1,4], [4,1]])
activation = sigmoid(np.array([1,0.01]))

print("Sigmoid activations")
activations = list()
for iter in range(10):
    activation = sigmoid(activation.dot(weights))
    activations.append(activation)
    print(activation)
    
print("\nSigmoid gradients")
gradient = np.ones_like(activation)
for activation in reversed(activations):
    gradient = (activation * (1 - activation) * gradient).dot(weights.transpose())
    print(gradient)

print("\nRelu activations")
activations = list()
for iter in range(10):
    activation = relu(activation.dot(weights))
    activations.append(activation)
    print(activation)
    
print("\nRelu gradients")
gradient = np.ones_like(activation)
for activation in reversed(activations):
    gradient = ((activation > 0) * gradient).dot(weights.transpose())
    print(gradient)

Sigmoid activations
[0.93940638 0.96852968]
[0.9919462  0.99121735]
[0.99301385 0.99302901]
[0.9930713  0.99307098]
[0.99307285 0.99307285]
[0.99307291 0.99307291]
[0.99307291 0.99307291]
[0.99307291 0.99307291]
[0.99307291 0.99307291]
[0.99307291 0.99307291]

Sigmoid gradients
[0.03439552 0.03439552]
[0.00118305 0.00118305]
[4.06916726e-05 4.06916726e-05]
[1.39961115e-06 1.39961115e-06]
[4.81403643e-08 4.81403637e-08]
[1.65582672e-09 1.65582765e-09]
[5.69682675e-11 5.69667160e-11]
[1.97259346e-12 1.97517920e-12]
[8.45387597e-14 8.02306381e-14]
[1.45938177e-14 2.16938983e-14]

Relu activations
[4.8135251  4.72615519]
[23.71814585 23.98025559]
[119.63916823 118.852839  ]
[595.05052421 597.40951192]
[2984.68857188 2977.61160877]
[14895.13500696 14916.36589628]
[74560.59859209 74496.90592414]
[372548.22228863 372739.30029248]
[1863505.42345854 1862932.18944699]
[9315234.18124649 9316953.88328115]

Relu gradients
[5. 5.]
[25. 25.]
[125. 125.]
[625. 625.]
[3125. 3125.]
[15625. 15625.]
[7812

In [166]:
# LSTM cells

In [173]:
class LSTMCell(Layer):
    def __init__(self, n_inputs, n_hidden, n_output):
        super().__init__()
        
        self.n_inputs = n_inputs
        self.n_output = n_output
        self.n_hidden = n_hidden
        
        self.xf = Linear(n_inputs, n_hidden)
        self.xi = Linear(n_inputs, n_hidden)
        self.xo = Linear(n_inputs, n_hidden)
        self.xc = Linear(n_inputs, n_hidden)
        
        self.hf = Linear(n_inputs, n_hidden, bias=False)
        self.hi = Linear(n_inputs, n_hidden, bias=False)
        self.ho = Linear(n_inputs, n_hidden, bias=False)
        self.hc = Linear(n_inputs, n_hidden, bias=False)
        
        self.w_ho = Linear(n_hidden, n_output, bias=False)
        
        self.parameters += self.xf.get_parameters()
        self.parameters += self.xi.get_parameters()
        self.parameters += self.xo.get_parameters()
        self.parameters += self.xc.get_parameters()
        
        self.parameters += self.hf.get_parameters()
        self.parameters += self.hi.get_parameters()
        self.parameters += self.ho.get_parameters()
        self.parameters += self.hc.get_parameters()
        
        self.parameters += self.w_ho.get_parameters()
        
    def forward(self, input, hidden):
        prev_hidden = hidden[0]
        prev_cell = hidden[1]

        f = self.xf.forward(input) + self.hf.forward(prev_hidden)
        i = self.xi.forward(input) + self.hi.forward(prev_hidden)
        o = self.xo.forward(input) + self.ho.forward(prev_hidden)
        g = self.xc.forward(input) + self.hc.forward(prev_hidden)
        c = (f * prev_cell) + (i * g)
        h = o * c.tanh()

        output = self.w_ho.forward(h)
        return output, (h, c)

    def init_hidden(self, batch_size=1):
        init_hidden = Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)
        init_cell = Tensor(np.zeros((batch_size, self.n_hidden)), autograd=True)
        init_hidden.data[:,0] += 1
        init_cell.data[:,0] += 1
        return (init_hidden, init_cell)

In [174]:
embed = Embedding(vocab_size=len(vocab), dimensions=512)
model = LSTMCell(n_inputs=512, n_hidden=512, n_output=len(vocab))

criterion = CrossEntropyLoss()
optimizer = SGD(parameters=model.get_parameters() + embed.get_parameters(), alpha=0.05)

batch_size = 16
bptt = 25
n_batches = int(len(indices) / batch_size)

trimmed_indices = indices[:n_batches*batch_size]
batched_indices = trimmed_indices.reshape(batch_size, n_batches)
batched_indices = batched_indices.transpose()

input_batched_indices = batched_indices[0:-1]
target_batched_indices = batched_indices[1:]

n_bptt = int((n_batches-1) / bptt)
input_batches = input_batched_indices[:n_bptt*bptt]
input_batches = input_batches.reshape(n_bptt,bptt,batch_size)
target_batches = target_batched_indices[:n_bptt*bptt]
target_batches = target_batches.reshape(n_bptt, bptt, batch_size)