# Adaptive Moment Estimation

In [1]:
import csv
import re
from abc import ABC, abstractmethod
import numpy as np

In [2]:
class Tensor:

    def __init__(self, data):
        self.data = np.array(data)
        self.grad = None
        self.gradient_fn = lambda: None
        self.parents = set()

    def gradient(self):
        if self.gradient_fn:
            self.gradient_fn()

        for p in self.parents:
            p.gradient()

    def shape(self):
        return self.data.shape

    def size(self):
        return np.prod(self.data.shape[1:])

    def __add__(self, other):
        p = Tensor(self.data + other.data)

        def gradient_fn():
            self.grad = p.grad
            other.grad = p.grad

        p.gradient_fn = gradient_fn
        p.parents = {self, other}
        return p

    def __mul__(self, other):
        p = Tensor(self.data * other.data)

        def gradient_fn():
            self.grad = p.grad * other.data
            other.grad = p.grad * self.data

        p.gradient_fn = gradient_fn
        p.parents = {self, other}
        return p

    def concat(self, other, axis):
        p = Tensor(np.concatenate([self.data, other.data], axis=axis))

        def gradient_fn():
            self.grad, other.grad = np.split(p.grad, [self.data.shape[axis]], axis=axis)

        p.gradient_fn = gradient_fn
        p.parents = {self, other}
        return p

In [3]:
class Sequence:

    def __init__(self, tokens, vocabulary_size, batch_size):
        self.tokens = tokens
        self.vocabulary_size = vocabulary_size
        self.batch_size = batch_size

    def __len__(self):  # 3
        return len(self.tokens) - self.batch_size

    def __getitem__(self, index):  # 4
        return (Tensor([self.tokens[index:index + self.batch_size]]),
                Tensor([self.embedding(self.tokens[index + self.batch_size:
                                                   index + self.batch_size + 1])]))

    def embedding(self, index):
        ebd = np.zeros(self.vocabulary_size)
        ebd[index] = 1
        return ebd

In [4]:
class DataLoader:

    def __init__(self, sequence_batch_size):
        self.sequence_batch_size = sequence_batch_size

        self.reviews = []
        self.sentiments = []
        with open('reviews.csv', 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            next(reader)
            for _, row in enumerate(reader):
                self.reviews.append(row[0])
                self.sentiments.append(row[1])

        split_reviews = []
        for r in self.reviews:
            split_reviews.append(self.clean_text(r.lower()).split())

        self.vocabulary = set(w for r in split_reviews for w in r)
        self.word2index = {w: idx for idx, w in enumerate(self.vocabulary)}
        self.index2word = {idx: w for idx, w in enumerate(self.vocabulary)}
        self.tokens = [[self.word2index[w] for w in r if w in self.word2index] for r in split_reviews]

        self.train()

    @staticmethod
    def clean_text(text):
        txt = re.sub(r'<[^>]+>', '', text)
        txt = re.sub(r'[^a-zA-Z0-9\s]', '', txt)
        return txt

    def encode(self, text):
        words = self.clean_text(text.lower()).split()
        return [self.word2index[word] for word in words]

    def decode(self, tokens):
        return " ".join([self.index2word[index] for index in tokens])

    def train(self):
        self.sequences = self.tokens[:-10]

    def eval(self):
        self.sequences = self.tokens[-10:]

    def __len__(self):  # 3
        return len(self.sequences)

    def __getitem__(self, index):  # 4
        return Sequence(self.sequences[index], len(self.vocabulary), self.sequence_batch_size)

In [5]:
class Layer(ABC):

    def __init__(self):
        self.training = True

    def __call__(self, *args):
        return self.forward(*args)

    @abstractmethod
    def forward(self, *args):
        pass

    def parameters(self):
        return []

    def train(self):
        self.training = True

    def eval(self):
        self.training = False

In [6]:
np.random.seed(99)

class Linear(Layer):

    def __init__(self, in_size, out_size):
        super().__init__()
        self.in_size = in_size
        self.out_size = out_size

        self.weight = Tensor(np.random.rand(out_size, in_size) / in_size)
        self.bias = Tensor(np.zeros(out_size))

    def forward(self, x: Tensor):
        p = Tensor(x.data @ self.weight.data.T + self.bias.data)

        def gradient_fn():
            self.weight.grad = p.grad.T @ x.data
            self.bias.grad = np.sum(p.grad, axis=0)
            x.grad = p.grad @ self.weight.data

        p.gradient_fn = gradient_fn
        p.parents = {self.weight, self.bias, x}
        return p

    def parameters(self):
        return [self.weight, self.bias]

In [7]:
class Sequential(Layer):

    def __init__(self, layers):
        super().__init__()
        self.layers = layers

    def forward(self, x: Tensor):
        for l in self.layers:
            x = l(x)
        return x

    def parameters(self):
        return [p for l in self.layers for p in l.parameters()]

    def train(self):
        for l in self.layers:
            l.train()

    def eval(self):
        for l in self.layers:
            l.eval()

In [8]:
class Embedding(Layer):

    def __init__(self, vocabulary_size, embedding_size, axis=1):
        super().__init__()
        self.vocabulary_size = vocabulary_size
        self.embedding_size = embedding_size
        self.axis = axis

        self.weight = Tensor(np.random.rand(embedding_size, vocabulary_size) / vocabulary_size)

    def forward(self, x: Tensor):
        p = Tensor(np.sum(self.weight.data.T[x.data], axis=self.axis))

        def gradient_fn():
            if self.weight.grad is None:
                self.weight.grad = np.zeros_like(self.weight.data)
            self.weight.grad.T[x.data] += p.grad

        p.gradient_fn = gradient_fn
        p.parents = {self.weight}
        return p

    def parameters(self):
        return [self.weight]

In [9]:
class Tanh(Layer):

    def forward(self, x: Tensor):
        p = Tensor(np.tanh(x.data))

        def gradient_fn():
            x.grad = p.grad * (1 - p.data ** 2)

        p.gradient_fn = gradient_fn
        p.parents = {x}
        return p

In [10]:
class Sigmoid(Layer):

    def __init__(self, clip_range=(-100, 100)):
        super().__init__()
        self.clip_range = clip_range

    def forward(self, x: Tensor):
        z = np.clip(x.data, self.clip_range[0], self.clip_range[1])
        p = Tensor(1 / (1 + np.exp(-z)))

        def gradient_fn():
            x.grad = p.grad * p.data * (1 - p.data)

        p.gradient_fn = gradient_fn
        p.parents = {x}
        return p

In [11]:
class CELoss:

    def __call__(self, p: Tensor, y: Tensor):
        exp = np.exp(p.data - np.max(p.data, axis=-1, keepdims=True))
        softmax = exp / np.sum(exp, axis=-1, keepdims=True)

        log = np.log(softmax + 1e-10)
        ce = Tensor(0 - np.sum(y.data * log) / len(p.data))

        def gradient_fn():
            p.grad = (softmax - y.data) / len(p.data)

        ce.gradient_fn = gradient_fn
        ce.parents = {p}
        return ce

In [12]:
class Adam:

    def __init__(self, params, lr=0.01, betas=(0.9, 0.999), eps=1e-8):
        self.parameters = params
        self.lr = lr
        self.beta1, self.beta2 = betas
        self.eps = eps

        self.m = [None for _ in range(len(params))]
        self.v = [None for _ in range(len(params))]
        self.t = 0

    def backward(self):
        self.t += 1
        for idx, p in enumerate(self.parameters):
            if p is not None and p.grad is not None:
                grad = p.grad.reshape(p.data.shape)

                if self.m[idx] is None:
                    self.m[idx] = np.zeros_like(p.data)
                    self.v[idx] = np.zeros_like(p.data)

                self.m[idx] = self.beta1 * self.m[idx] + (1 - self.beta1) * grad
                self.v[idx] = self.beta2 * self.v[idx] + (1 - self.beta2) * (grad ** 2)
                m_hat = self.m[idx] / (1 - self.beta1 ** self.t)
                v_hat = self.v[idx] / (1 - self.beta2 ** self.t)
                p.data -= m_hat / (np.sqrt(v_hat) + self.eps) * self.lr

In [13]:
class LSTM(Layer):

    def __init__(self, vocabulary_size, embedding_size):
        super().__init__()
        self.vocabulary_size = vocabulary_size
        self.embedding_size = embedding_size

        self.embedding = Embedding(vocabulary_size, embedding_size)
        self.forget_gate = Linear(embedding_size * 2, embedding_size)
        self.input_gate = Linear(embedding_size * 2, embedding_size)
        self.output_gate = Linear(embedding_size * 2, embedding_size)
        self.cell_update = Linear(embedding_size * 2, embedding_size)
        self.output = Linear(embedding_size, vocabulary_size)
        self.sigmoid = Sigmoid()
        self.tanh = Tanh()

        self.layers = [self.embedding,
                       self.forget_gate,
                       self.input_gate,
                       self.output_gate,
                       self.cell_update,
                       self.output,
                       self.sigmoid,
                       self.tanh]

    def __call__(self, x: Tensor, c: Tensor, h: Tensor):
        return self.forward(x, c, h)

    def forward(self, x: Tensor, c: Tensor, h: Tensor):
        if not c:
            c = Tensor(np.zeros((1, self.embedding_size)))
        if not h:
            h = Tensor(np.zeros((1, self.embedding_size)))

        embedding_feature = self.embedding(x)
        concat_feature = embedding_feature.concat(h, axis=1)
        forget_hidden = self.sigmoid(self.forget_gate(concat_feature))
        input_hidden = self.sigmoid(self.input_gate(concat_feature))
        output_hidden = self.sigmoid(self.output_gate(concat_feature))
        cell_hidden = self.tanh(self.cell_update(concat_feature))
        cell_feature = forget_hidden * c + input_hidden * cell_hidden
        hidden_feature = output_hidden * self.tanh(cell_feature)

        return (self.output(hidden_feature),
                Tensor(cell_feature.data),
                Tensor(hidden_feature.data))

    def parameters(self):
        return [p for l in self.layers for p in l.parameters()]

In [14]:
LEARNING_RATE = 0.02
BATCHES = 2
EPOCHS = 10

In [15]:
dataset = DataLoader(BATCHES)

model = LSTM(len(dataset.vocabulary), 64)

loss = CELoss()
sgd = Adam(model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    for i in range(len(dataset)):
        sequence = dataset[i]

        cell = hidden = None
        for i in range(len(sequence)):
            feature, label = sequence[i]

            prediction, cell, hidden = model(feature, cell, hidden)
            error = loss(prediction, label)

            error.gradient()
            sgd.backward()

print(f'Prediction: {prediction.data}')
print(f'Error: {error.data}')

Prediction: [[-4.04900090e+00 -5.84674294e+00 -5.33842421e+00 -5.57115736e+00
  -8.18867773e+00 -1.34329802e+01 -1.16146400e+01 -3.14296912e+00
  -1.12861749e+01 -1.76461452e+01 -3.40336663e+00 -8.99788956e+00
  -3.89741699e+00 -5.65133165e+00 -1.54284206e+00 -1.28098516e+01
  -2.30106175e+00 -7.43985541e+00 -8.80805212e+00 -4.53431763e+00
  -1.11425452e+01 -7.82262266e+00 -1.70105253e+01 -1.57685972e+01
  -4.25978467e-01 -3.98345078e+00 -1.53478424e+01 -5.17304282e+00
  -6.75999380e+00 -7.77051522e+00 -4.52281018e-02 -5.62405349e-01
  -1.76404996e+01 -2.51374861e+00 -1.24502808e+01 -7.15392786e+00
  -8.43083031e+00 -1.10636635e+01 -1.82611784e+00 -1.55500135e+01
  -1.05460036e+01 -1.70282013e+01 -8.40966213e+00 -1.34897113e+00
  -5.59147875e+00 -1.20092543e+01 -4.89029561e-03  2.05789977e+00
  -7.29836026e+00 -1.54971561e+00 -1.84463199e+01 -9.22815772e+00
  -1.31208494e+01 -5.30092067e+00 -1.60852403e+01 -1.05135157e+01
  -5.92977257e+00 -9.08364434e+00 -1.01969223e+01 -2.02484418e+0

In [16]:
dataset.eval()

for i in range(len(dataset)):
    sequence = dataset[i]

    feature, label = sequence[0]
    original = sequence.tokens[:BATCHES]
    generated = original.copy()

    cell = hidden = None
    for j in range(len(sequence)):
        feature, label = sequence[j]

        prediction, cell, hidden = model(feature, cell, hidden)
        original.append(sequence.tokens[j + BATCHES])
        generated.append(prediction.data.argmax())

    print(f'original: {dataset.decode(original)}')
    print(f'generated: {dataset.decode(generated)}')

original: worst movie with awful music the actor did a boring job actress director character screenplay scene but or it at by
generated: worst movie with and and actor director and the the on the character character screenplay mine music effect if plot character
original: i recommend this film the actress was wonderful and the music was amazing actor director character scene plot by was
generated: i recommend this film the actor was enjoyed i screenplay story was actress director actress screenplay screenplay plot his story
original: poor movie the story and actor were terrible i disappointed the director actress character action screenplay scene by is
generated: poor movie and and and script actress actress i screenplay actress performance story plot plot music is plot is
original: excellent film i saw this time perfect performance by the actor and actress director screenplay scene character his her
generated: excellent film the saw this time fantastic the actor was was and the the di