# Adam优化器

In [53]:
import csv
import re
from abc import abstractmethod, ABC

import numpy as np

np.random.seed(99)

## 基础架构

### 张量

In [54]:
class Tensor:

    def __init__(self, data):
        self.data = np.array(data)
        self.grad = np.zeros_like(self.data)
        self.gradient_fn = lambda: None
        self.parents = set()

    def backward(self):
        if self.gradient_fn:
            self.gradient_fn()

        for p in self.parents:
            p.backward()

    def shape(self):
        return self.data.shape

    def size(self):
        return np.prod(self.data.shape[1:])

    def __str__(self):
        return f'Tensor({self.data})'

    def __add__(self, other):
        p = Tensor(self.data + other.data)

        def gradient_fn():
            self.grad += p.grad
            other.grad += p.grad

        p.gradient_fn = gradient_fn
        p.parents = {self, other}
        return p

    def __mul__(self, other):
        p = Tensor(self.data * other.data)

        def gradient_fn():
            self.grad += p.grad * other.data
            other.grad += p.grad * self.data

        p.gradient_fn = gradient_fn
        p.parents = {self, other}
        return p

    def concat(self, other, axis):
        p = Tensor(np.concatenate([self.data, other.data], axis=axis))

        def gradient_fn():
            grad = np.split(p.grad, [self.data.shape[axis]], axis=axis)
            self.grad += grad[0]
            other.grad += grad[1]

        p.gradient_fn = gradient_fn
        p.parents = {self, other}
        return p

### 基础数据集

In [55]:
class Dataset(ABC):

    def __init__(self, batch_size=1):
        self.batch_size = batch_size
        self.load()
        self.train()

    @abstractmethod
    def load(self):
        pass

    def train(self):
        self.features = self.train_features
        self.labels = self.train_labels

    def eval(self):
        self.features = self.test_features
        self.labels = self.test_labels

    def shape(self):
        return Tensor(self.features).size(), Tensor(self.labels).size()

    def items(self):
        return Tensor(self.features), Tensor(self.labels)

    def __len__(self):
        return len(self.features) // self.batch_size

    def __getitem__(self, index):
        start = index * self.batch_size
        end = start + self.batch_size

        feature = Tensor(self.features[start: end])
        label = Tensor(self.labels[start: end])
        return feature, label

### 基础层

In [56]:
class Layer(ABC):

    def __init__(self):
        self.training = True

    def __call__(self, *args):
        return self.forward(*args)

    def train(self):
        self.training = True

    def eval(self):
        self.training = False

    @abstractmethod
    def forward(self, *args):
        pass

    def parameters(self):
        return []

    def __str__(self):
        return ''

### 基础损失函数

In [57]:
class Loss(ABC):

    def __call__(self, p: Tensor, y: Tensor):
        return self.loss(p, y)

    @abstractmethod
    def loss(self, p: Tensor, y: Tensor):
        pass

### 基础优化器

In [58]:
class Optimizer(ABC):

    def __init__(self, parameters, lr):
        self.parameters = parameters
        self.lr = lr

    def reset(self):
        for p in self.parameters:
            p.grad = np.zeros_like(p.data)

    @abstractmethod
    def step(self):
        pass

### 基础模型

In [59]:
class Model(ABC):

    def __init__(self, layer, loss_fn, optimizer):
        self.layer = layer
        self.loss_fn = loss_fn
        self.optimizer = optimizer

    @abstractmethod
    def train(self, dataset, epochs):
        pass

    @abstractmethod
    def test(self, dataset):
        pass

## 数据

### IMDB数据集

In [60]:
class IMDBDataset(Dataset):

    def __init__(self, filename):
        self.filename = filename
        super().__init__()

    def load(self):
        self.reviews = []
        self.sentiments = []
        with open(self.filename, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            next(reader)
            for _, row in enumerate(reader):
                self.reviews.append(row[0])
                self.sentiments.append(row[1])

        split_reviews = []
        for line in self.reviews:
            split_reviews.append(self.clean_text(line.lower()).split())

        self.vocabulary = set(word for line in split_reviews for word in line)
        self.word2index = {word: index for index, word in enumerate(self.vocabulary)}
        self.index2word = {index: word for index, word in enumerate(self.vocabulary)}
        self.tokens = [[self.word2index[word] for word in line if word in self.word2index] for line in split_reviews]

    @staticmethod
    def clean_text(text):
        text = re.sub(r'<[^>]+>', '', text)
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        return text

    def train(self):
        self.features = []
        self.labels = []
        for line in self.tokens[:-10]:
            for index in range(len(line) - 4):
                self.features.append([line[index], line[index + 1], line[index + 3], line[index + 4]])
                self.labels.append(self.onehot(line[index + 2]))

    def eval(self):
        self.features = []
        self.labels = []
        for line in self.tokens[-10:]:
            for index in range(len(line) - 4):
                self.features.append([line[index], line[index + 1], line[index + 3], line[index + 4]])
                self.labels.append(self.onehot(line[index + 2]))

    def encode(self, text):
        words = self.clean_text(text.lower()).split()
        return [self.word2index[word] for word in words]

    def decode(self, tokens):
        return " ".join([self.index2word[index] for index in tokens])

    def onehot(self, token):
        ebd = np.zeros(len(self.vocabulary))
        ebd[token] = 1
        return ebd

    @staticmethod
    def argmax(vector):
        return [np.argmax(vector)]

    def estimate(self, predictions):
        count = 0
        for i in range(len(predictions)):
            if self.argmax(predictions[i].data[0]) == self.argmax(self.labels[i].data):
                count += 1
        return count / len(predictions)

## 模型

### 线性层

In [61]:
class Linear(Layer):

    def __init__(self, in_size, out_size):
        super().__init__()
        self.weight = Tensor(np.random.randn(out_size, in_size) * np.sqrt(2 / in_size))
        self.bias = Tensor(np.zeros(out_size))

    def forward(self, x: Tensor):
        p = Tensor(x.data @ self.weight.data.T + self.bias.data)

        def gradient_fn():
            self.weight.grad += p.grad.T @ x.data / len(x.data)
            self.bias.grad += np.sum(p.grad, axis=0) / len(x.data)
            x.grad += p.grad @ self.weight.data / len(x.data)

        p.gradient_fn = gradient_fn
        p.parents = {x}
        return p

    def parameters(self):
        return [self.weight, self.bias]

    def __str__(self):
        return f'Linear[weight{self.weight.data.shape}; bias{self.bias.data.shape}]'

### 顺序层

In [62]:
class Sequential(Layer):

    def __init__(self, layers):
        super().__init__()
        self.layers = layers

    def train(self):
        for l in self.layers:
            l.train()

    def eval(self):
        for l in self.layers:
            l.eval()

    def forward(self, x: Tensor):
        for l in self.layers:
            x = l(x)
        return x

    def parameters(self):
        return [p for l in self.layers for p in l.parameters()]

    def __str__(self):
        return '\n'.join(str(l) for l in self.layers if str(l))

### 嵌入层

In [63]:
class Embedding(Layer):

    def __init__(self, vocabulary_size, embedding_size, axis=1):
        super().__init__()
        self.vocabulary_size = vocabulary_size
        self.embedding_size = embedding_size
        self.axis = axis

        self.weight = Tensor(np.random.randn(embedding_size, vocabulary_size) * np.sqrt(2 / vocabulary_size))

    def forward(self, x: Tensor):
        p = Tensor(np.sum(self.weight.data.T[x.data], axis=self.axis))

        def gradient_fn():
            if type(self.weight.grad) is not np.ndarray:
                self.weight.grad = np.zeros_like(self.weight.data)
            self.weight.grad.T[x.data] += p.grad / len(x.data)

        p.gradient_fn = gradient_fn
        p.parents = {self.weight}
        return p

    def parameters(self):
        return [self.weight]

    def __str__(self):
        return f'Embedding[weight{self.weight.data.shape}; vocabulary={self.vocabulary_size}; embedding={self.embedding_size}]'

### ReLU激活函数

In [64]:
class ReLU(Layer):

    def forward(self, x: Tensor):
        p = Tensor(np.maximum(0, x.data))

        def gradient_fn():
            x.grad += p.grad * (p.data > 0)

        p.gradient_fn = gradient_fn
        p.parents = {x}
        return p

    def __str__(self):
        return f'ReLU[]'

### Tanh激活函数

In [65]:
class Tanh(Layer):

    def forward(self, x: Tensor):
        p = Tensor(np.tanh(x.data))

        def gradient_fn():
            x.grad += p.grad * (1 - p.data ** 2)

        p.gradient_fn = gradient_fn
        p.parents = {x}
        return p

    def __str__(self):
        return f'Tanh[]'

### Sigmoid激活函数

In [66]:
class Sigmoid(Layer):

    def __init__(self, clip_range=(-100, 100)):
        super().__init__()
        self.clip_range = clip_range

    def forward(self, x: Tensor):
        z = np.clip(x.data, self.clip_range[0], self.clip_range[1])
        p = Tensor(1 / (1 + np.exp(-z)))

        def gradient_fn():
            x.grad += p.grad * p.data * (1 - p.data)

        p.gradient_fn = gradient_fn
        p.parents = {x}
        return p

    def __str__(self):
        return f'Sigmoid[]'

### Softmax激活函数

In [67]:
class Softmax(Layer):

    def __init__(self, axis=-1):
        super().__init__()
        self.axis = axis

    def forward(self, x: Tensor):
        exp = np.exp(x.data - np.max(x.data, axis=self.axis, keepdims=True))
        p = Tensor(exp / np.sum(exp, axis=self.axis, keepdims=True))

        def gradient_fn():
            grad = np.sum(p.data * p.grad, axis=self.axis, keepdims=True)
            x.grad += p.data * (p.grad - grad)

        p.gradient_fn = gradient_fn
        p.parents = {x}
        return p

    def __str__(self):
        return f'Softmax[]'

## 长短期记忆层

In [68]:
class LSTM(Layer):

    def __init__(self, vocabulary_size, embedding_size):
        super().__init__()
        self.vocabulary_size = vocabulary_size
        self.embedding_size = embedding_size

        self.embedding = Embedding(vocabulary_size, embedding_size)
        self.forget_gate = Linear(embedding_size * 2, embedding_size)
        self.input_gate = Linear(embedding_size * 2, embedding_size)
        self.output_gate = Linear(embedding_size * 2, embedding_size)
        self.cell_update = Linear(embedding_size * 2, embedding_size)
        self.output = Linear(embedding_size, vocabulary_size)
        self.sigmoid = Sigmoid()
        self.tanh = Tanh()

        self.layers = [self.embedding,
                       self.forget_gate,
                       self.input_gate,
                       self.output_gate,
                       self.cell_update,
                       self.output,
                       self.sigmoid,
                       self.tanh]

    def __call__(self, x: Tensor, c: Tensor, h: Tensor):
        return self.forward(x, c, h)

    def forward(self, x: Tensor, c: Tensor, h: Tensor):
        if not c:
            c = Tensor(np.zeros((1, self.embedding_size)))
        if not h:
            h = Tensor(np.zeros((1, self.embedding_size)))

        embedding_feature = self.embedding(x)
        concat_feature = self.tanh(embedding_feature.concat(h, axis=1))
        forget_hidden = self.sigmoid(self.forget_gate(concat_feature))
        input_hidden = self.sigmoid(self.input_gate(concat_feature))
        output_hidden = self.sigmoid(self.output_gate(concat_feature))
        cell_hidden = self.tanh(self.cell_update(concat_feature))
        cell_feature = forget_hidden * c + input_hidden * cell_hidden
        hidden_feature = output_hidden * self.tanh(cell_feature)

        return (self.output(hidden_feature),
                Tensor(cell_feature.data),
                Tensor(hidden_feature.data))

    def parameters(self):
        return [p for l in self.layers for p in l.parameters()]

    def __str__(self):
        return '\n'.join(str(l) for l in self.layers if str(l))

### 损失函数（交叉熵）

In [69]:
class CELoss(Loss):

    def loss(self, p: Tensor, y: Tensor):
        exp = np.exp(p.data - np.max(p.data, axis=-1, keepdims=True))
        softmax = exp / np.sum(exp, axis=-1, keepdims=True)

        log = np.log(np.clip(softmax, 1e-10, 1))
        ce = Tensor(0 - np.sum(y.data * log) / len(p.data))

        def gradient_fn():
            p.grad += (softmax - y.data) / len(p.data)

        ce.gradient_fn = gradient_fn
        ce.parents = {p}
        return ce

### 损失函数（二元交叉熵）

In [70]:
class BCELoss(Loss):

    def loss(self, p: Tensor, y: Tensor):
        clipped = np.clip(p.data, 1e-7, 1 - 1e-7)
        bce = Tensor(-np.mean(y.data * np.log(clipped)
                              + (1 - y.data) * np.log(1 - clipped)))

        def gradient_fn():
            p.grad += (clipped - y.data) / (clipped * (1 - clipped) * len(p.data))

        bce.gradient_fn = gradient_fn
        bce.parents = {p}
        return bce

### 优化器（随机梯度下降）

In [71]:
class SGDOptimizer(Optimizer):

    def step(self):
        for p in self.parameters:
            p.data -= p.grad * self.lr

### 优化器（Adam）

In [72]:
class AdamOptimizer(Optimizer):

    def __init__(self, parameters, lr=0.01, betas=(0.9, 0.999), eps=1e-8):
        super().__init__(parameters, lr)
        self.beta1, self.beta2 = betas
        self.eps = eps

        self.m = [None for _ in range(len(parameters))]
        self.v = [None for _ in range(len(parameters))]
        self.t = 0

    def step(self):
        self.t += 1

        for idx, p in enumerate(self.parameters):
            if p is not None and p.grad is not None:
                grad = p.grad.reshape(p.data.shape)

                if self.m[idx] is None:
                    self.m[idx] = np.zeros_like(p.data)
                    self.v[idx] = np.zeros_like(p.data)

                self.m[idx] = self.beta1 * self.m[idx] + (1 - self.beta1) * grad
                self.v[idx] = self.beta2 * self.v[idx] + (1 - self.beta2) * (grad ** 2)
                m_hat = self.m[idx] / (1 - self.beta1 ** self.t)
                v_hat = self.v[idx] / (1 - self.beta2 ** self.t)
                p.data -= m_hat / (np.sqrt(v_hat) + self.eps) * self.lr

### 长短期记忆神经元网络模型

In [73]:
class LSTMModel(Model):

    def train(self, dataset, epochs):
        self.layer.train()
        dataset.train()

        for epoch in range(epochs):
            cell = hidden = None
            for i in range(len(dataset)):
                feature, label = dataset[i]

                prediction, cell, hidden = self.layer(feature, cell, hidden)
                loss = self.loss_fn(prediction, label)

                self.optimizer.reset()
                loss.backward()
                self.optimizer.step()

    def test(self, dataset):
        self.layer.eval()
        dataset.eval()

        predictions = []
        cell = hidden = None
        for i in range(len(dataset)):
            feature, label = dataset[i]
            prediction, cell, hidden = self.layer(feature, cell, hidden)
            predictions.append(prediction)
        return predictions

## 设置

### 学习率

In [74]:
LEARNING_RATE = 0.01

### 轮次

In [75]:
EPOCHS = 20

## 训练

### 迭代

In [76]:
dataset = IMDBDataset('tinyimdb.csv')
layer = LSTM(len(dataset.vocabulary), 64)
loss = CELoss()
optimizer = AdamOptimizer(layer.parameters(), lr=LEARNING_RATE)

model = LSTMModel(layer, loss, optimizer)
model.train(dataset, EPOCHS)
print(layer)

Embedding[weight(64, 86); vocabulary=86; embedding=64]
Linear[weight(64, 128); bias(64,)]
Linear[weight(64, 128); bias(64,)]
Linear[weight(64, 128); bias(64,)]
Linear[weight(64, 128); bias(64,)]
Linear[weight(86, 64); bias(86,)]
Sigmoid[]
Tanh[]


## 验证

### 测试

In [77]:
predictions = model.test(dataset)
print(f'Accuracy: {dataset.estimate(predictions)}')

Accuracy: 0.5732484076433121


### 对比

In [78]:
features, labels = dataset.items()
for i in range(len(predictions)):
    pos = np.argmax(predictions[i].data[0])
    print(f'Feature: {dataset.decode(features.data[i])} | '
          f'Label: {dataset.decode(dataset.argmax(labels.data[i]))} | '
          f'Prediction: {dataset.decode(dataset.argmax(predictions[i].data[0]))}')

Feature: worst movie awful music | Label: with | Prediction: the
Feature: movie with music the | Label: awful | Prediction: actor
Feature: with awful the actor | Label: music | Prediction: was
Feature: awful music actor did | Label: the | Prediction: the
Feature: music the did a | Label: actor | Prediction: actress
Feature: the actor a boring | Label: did | Prediction: did
Feature: actor did boring job | Label: a | Prediction: a
Feature: did a job actress | Label: boring | Prediction: wonderful
Feature: a boring actress director | Label: job | Prediction: job
Feature: boring job director character | Label: actress | Prediction: actress
Feature: job actress character screenplay | Label: director | Prediction: director
Feature: actress director screenplay scene | Label: character | Prediction: character
Feature: director character scene but | Label: screenplay | Prediction: screenplay
Feature: character screenplay but or | Label: scene | Prediction: actress
Feature: screenplay scene or i