# 文本生成

In [1]:
import re
from abc import abstractmethod, ABC
import numpy as np

np.random.seed(99)

## Foundation

### Tensor

In [2]:
class Tensor:

    def __init__(self, data):
        self.data = np.array(data)
        self.grad = np.zeros_like(self.data)
        self.gradient_fn = lambda: None
        self.parents = set()

    def backward(self):
        topo = []
        visited = set()

        def build_topo(t):
            if t not in visited:
                visited.add(t)
                for p in t.parents:
                    build_topo(p)
                topo.append(t)

        build_topo(self)

        self.grad = np.ones_like(self.data)
        for t in reversed(topo):
            t.gradient_fn()

    def shape(self):
        return self.data.shape

    def size(self):
        return np.prod(self.data.shape[1:])

    def __str__(self):
        return str(self.data)

    def __add__(self, other):
        p = Tensor(self.data + other.data)

        def gradient_fn():
            self.grad += p.grad
            other.grad += p.grad

        p.gradient_fn = gradient_fn
        p.parents = {self, other}
        return p

    def __sub__(self, other):
        p = Tensor(self.data - other.data)

        def gradient_fn():
            self.grad += p.grad
            other.grad += -p.grad

        p.gradient_fn = gradient_fn
        p.parents = {self, other}
        return p

    def __mul__(self, other):
        p = Tensor(self.data * other.data)

        def gradient_fn():
            self.grad += p.grad * other.data
            other.grad += p.grad * self.data

        p.gradient_fn = gradient_fn
        p.parents = {self, other}
        return p

    def __truediv__(self, other):
        p = Tensor(self.data / other.data)

        def gradient_fn():
            self.grad += p.grad / other.data
            other.grad += -p.grad * self.data / (other.data ** 2)

        p.gradient_fn = gradient_fn
        p.parents = {self, other}
        return p

    def __matmul__(self, other):
        p = Tensor(np.matmul(self.data, other.data))

        def gradient_fn():
            self.grad += np.matmul(p.grad, other.data.swapaxes(-1, -2))
            other.grad += np.matmul(self.data.swapaxes(-1, -2), p.grad)

        p.gradient_fn = gradient_fn
        p.parents = {self, other}
        return p

    def transpose(self, axes=None):
        p = Tensor(np.transpose(self.data, axes))

        def gradient_fn():
            if axes is None:
                self.grad += np.transpose(p.grad)
            else:
                idx = np.argsort(axes)
                self.grad += np.transpose(p.grad, idx)

        p.gradient_fn = gradient_fn
        p.parents = {self}
        return p

    @property
    def T(self):
        return self.transpose()

    def concat(self, other, axis):
        p = Tensor(np.concatenate([self.data, other.data], axis=axis))

        def gradient_fn():
            grad = np.split(p.grad, [self.data.shape[axis]], axis=axis)
            self.grad += grad[0]
            other.grad += grad[1]

        p.gradient_fn = gradient_fn
        p.parents = {self, other}
        return p

    def reshape(self, shape):
        p = Tensor(np.reshape(self.data, shape))

        def gradient_fn():
            self.grad += np.reshape(p.grad, self.data.shape)

        p.gradient_fn = gradient_fn
        p.parents = {self}
        return p

### Base Dataset

In [3]:
class Dataset(ABC):

    def __init__(self, batch_size=1):
        self.batch_size = batch_size
        self.load()
        self.train()

    @abstractmethod
    def load(self):
        pass

    def train(self):
        self.features = self.train_features
        self.labels = self.train_labels

    def eval(self):
        self.features = self.test_features
        self.labels = self.test_labels

    def shape(self):
        return Tensor(self.features).size(), Tensor(self.labels).size()

    def items(self):
        return Tensor(self.features), Tensor(self.labels)

    def __len__(self):
        return len(self.features) // self.batch_size

    def __getitem__(self, index):
        start = index * self.batch_size
        end = start + self.batch_size
        return Tensor(self.features[start: end]), Tensor(self.labels[start: end])

    @abstractmethod
    def estimate(self, predictions):
        pass

### Base Layer

In [4]:
class Layer(ABC):

    def __init__(self):
        self.training = True

    def __call__(self, *args):
        return self.forward(*args)

    def train(self):
        self.training = True

    def eval(self):
        self.training = False

    @abstractmethod
    def forward(self, *args):
        pass

    def parameters(self):
        return []

    def __str__(self):
        return ''

### Base Loss Function

In [5]:
class Loss(ABC):

    def __call__(self, p: Tensor, y: Tensor):
        return self.loss(p, y)

    @abstractmethod
    def loss(self, p: Tensor, y: Tensor):
        pass

### Base Optimizer

In [6]:
class Optimizer(ABC):

    def __init__(self, parameters, lr):
        self.parameters = parameters
        self.lr = lr

    def reset(self):
        for p in self.parameters:
            p.grad = np.zeros_like(p.data)

    @abstractmethod
    def step(self):
        pass

### Base Model

In [7]:
class Model(ABC):

    def __init__(self, layer, loss, optimizer):
        self.layer = layer
        self.loss = loss
        self.optimizer = optimizer

    @abstractmethod
    def train(self, dataset, epochs):
        pass

    @abstractmethod
    def test(self, dataset):
        pass

## Data

### LLM Dataset

In [8]:
class LLMDataset(Dataset):

    def __init__(self, filename, batch_size, stride=1):
        self.filename = filename
        self.stride = stride
        super().__init__(batch_size)

    def load(self):
        with open(self.filename, 'r', encoding='utf-8') as f:
            self.text = f.read().lower()

        self.vocabulary = sorted(set(self.split_text(self.text)))
        self.vocabulary.extend(['<|eos|>', '<|unk|>'])
        self.word2index = {word: index for index, word in enumerate(self.vocabulary)}
        self.index2word = {index: word for index, word in enumerate(self.vocabulary)}
        self.tokens = self.encode(self.text)
        self.batches = []
        for i in range(1, len(self.tokens) - self.batch_size, self.stride):
            self.batches.append(self.tokens[i: i + self.batch_size])

    @staticmethod
    def split_text(text):
        words = re.split(r'([,.:;?_!"()\']|\s)', text.lower())
        return [t.strip() for t in words if t.strip()]

    def train(self):
        self.features = []
        self.labels = []
        for i in range(1, len(self.batches) * 8 // 10):
            self.features.append(self.batches[i])
            self.labels.append(self.onehot(self.batches[i + 1]))

    def eval(self):
        self.features = []
        self.labels = []
        for i in range(len(self.batches) * 8 // 10 + 1, len(self.batches) - 1):
            self.features.append(self.batches[i])
            self.labels.append(self.onehot(self.batches[i + 1]))

    def __len__(self):
        return len(self.features)

    def __getitem__(self, index):
        return Tensor(self.features[index]), Tensor(self.labels[index])

    def encode(self, text):
        words = self.split_text(text)
        words = [word if word in self.word2index else '<|unk|>' for word in words]
        return [self.word2index[word] for word in words]

    def decode(self, tokens):
        text = " ".join([self.index2word[index] for index in tokens])
        text = re.sub(r'\s+([,.:;?!)\]}>])', r'\1', text)
        text = re.sub(r'([([<{])\s+', r'\1', text)
        text = re.sub(r'(")\s+(.*?)\s+(")', r'\1\2\3', text)
        text = re.sub(r"(')\s+(.*?)\s+(')", r'\1\2\3', text)
        return text.strip()

    def onehot(self, tokens):
        ebd = np.zeros((len(tokens), len(self.vocabulary)))
        ebd[np.arange(len(tokens)), tokens] = 1
        return ebd

    @staticmethod
    def argmax(vector):
        return [np.argmax(vector)]

    def estimate(self, predictions):
        count = 0
        for i in range(len(predictions)):
            if self.argmax(predictions[i].data[-1]) == self.argmax(np.array(self.labels[i].data)[-1]):
                count += 1
        return count / len(predictions)

## Model

### Linear Layer

In [9]:
class Linear(Layer):

    def __init__(self, in_size, out_size):
        super().__init__()
        self.weight = Tensor(np.random.randn(out_size, in_size) * np.sqrt(2 / in_size))
        self.bias = Tensor(np.zeros(out_size))

    def forward(self, x: Tensor):
        p = Tensor(x.data @ self.weight.data.T + self.bias.data)

        def gradient_fn():
            self.weight.grad += p.grad.T @ x.data
            self.bias.grad += np.sum(p.grad, axis=0)
            x.grad += p.grad @ self.weight.data

        p.gradient_fn = gradient_fn
        p.parents = {self.weight, self.bias, x}
        return p

    def parameters(self):
        return [self.weight, self.bias]

    def __str__(self):
        return f'weight: {self.weight}\nbias: {self.bias}'

### Sequential Layer

In [10]:
class Sequential(Layer):

    def __init__(self, layers):
        super().__init__()
        self.layers = layers

    def train(self):
        for l in self.layers:
            l.train()

    def eval(self):
        for l in self.layers:
            l.eval()

    def forward(self, x: Tensor):
        for l in self.layers:
            x = l(x)
        return x

    def parameters(self):
        return [p for l in self.layers for p in l.parameters()]

    def __str__(self):
        return '\n'.join(str(l) for l in self.layers if str(l))

### Embedding Layer

In [11]:
class Embedding(Layer):

    def __init__(self, vocabulary_size, embedding_size, axis=None):
        super().__init__()
        self.vocabulary_size = vocabulary_size
        self.embedding_size = embedding_size
        self.axis = axis

        self.weight = Tensor(np.random.randn(embedding_size, vocabulary_size) * np.sqrt(2 / vocabulary_size))

    def forward(self, x: Tensor):
        weights = self.weight.data.T[x.data]
        p = Tensor(np.sum(weights, axis=self.axis) if self.axis is not None else weights)

        def gradient_fn():
            if type(self.weight.grad) is not np.ndarray:
                self.weight.grad = np.zeros_like(self.weight.data)
            self.weight.grad.T[x.data] += p.grad / len(x.data)

        p.gradient_fn = gradient_fn
        p.parents = {self.weight}
        return p

    def parameters(self):
        return [self.weight]

### Triu Layer

In [12]:
class Triu(Layer):

    def __init__(self, value=-np.inf):
        super().__init__()
        self.value = value

    def forward(self, x: Tensor):
        axes = list(range(x.data.ndim))
        axes[-2], axes[-1] = axes[-1], axes[-2]
        mask = np.triu(np.ones(x.data.shape)).transpose(axes)
        p = Tensor(x.data)
        p.data[mask == 0] = self.value

        def gradient_fn():
            x.grad += p.grad * mask

        p.gradient_fn = gradient_fn
        p.parents = {x}
        return p

### Softmax Activation Function

In [13]:
class Softmax(Layer):

    def __init__(self, axis=-1):
        super().__init__()
        self.axis = axis

    def forward(self, x: Tensor):
        exp = np.exp(x.data - np.max(x.data, axis=self.axis, keepdims=True))
        p = Tensor(exp / np.sum(exp, axis=self.axis, keepdims=True))

        def gradient_fn():
            grad = np.sum(p.data * p.grad, axis=self.axis, keepdims=True)
            x.grad += p.data * (p.grad - grad)

        p.gradient_fn = gradient_fn
        p.parents = {x}
        return p

### Cross Entropy Loss Function

In [14]:
class CELoss(Loss):

    def loss(self, p: Tensor, y: Tensor):
        exp = np.exp(p.data - np.max(p.data, axis=-1, keepdims=True))
        softmax = exp / np.sum(exp, axis=-1, keepdims=True)

        log = np.log(np.clip(softmax, 1e-10, 1))
        ce = Tensor(0 - np.sum(y.data * log) / len(y.data))

        def gradient_fn():
            p.grad += (softmax - y.data) / len(y.data)

        ce.gradient_fn = gradient_fn
        ce.parents = {p}
        return ce

### Adam Optimizer

In [15]:
class AdamOptimizer(Optimizer):

    def __init__(self, parameters, lr=0.01, betas=(0.9, 0.999), eps=1e-8):
        super().__init__(parameters, lr)
        self.beta1, self.beta2 = betas
        self.eps = eps

        self.m = [None for _ in range(len(parameters))]
        self.v = [None for _ in range(len(parameters))]
        self.t = 0

    def step(self):
        self.t += 1

        for idx, p in enumerate(self.parameters):
            if p is not None and p.grad is not None:
                grad = p.grad.reshape(p.data.shape)

                if self.m[idx] is None:
                    self.m[idx] = np.zeros_like(p.data)
                    self.v[idx] = np.zeros_like(p.data)

                self.m[idx] = self.beta1 * self.m[idx] + (1 - self.beta1) * grad
                self.v[idx] = self.beta2 * self.v[idx] + (1 - self.beta2) * (grad ** 2)
                m_hat = self.m[idx] / (1 - self.beta1 ** self.t)
                v_hat = self.v[idx] / (1 - self.beta2 ** self.t)
                p.data -= m_hat / (np.sqrt(v_hat) + self.eps) * self.lr

### GPT Embedding Layer

In [16]:
class GPTEmbedding(Sequential):

    def __init__(self, vocabulary_size, context_size, embedding_size):
        self.vocabulary_size = vocabulary_size
        self.context_size = context_size
        self.embedding_size = embedding_size

        self.token_embedding = Embedding(self.vocabulary_size, self.embedding_size)
        self.positional_embedding = Embedding(self.context_size, self.embedding_size)

        layers = [self.token_embedding, self.positional_embedding]
        super().__init__(layers)

    def forward(self, x: Tensor):
        token = self.token_embedding(x)
        position = self.positional_embedding(Tensor(range(len(x.data))))
        return token + position

### GPT Attention Layer

In [17]:
class GPTAttention(Sequential):

    def __init__(self, context_size, embedding_size, heads=1):
        self.context_size = context_size
        self.embedding_size = embedding_size
        self.heads = heads

        self.attention_query = Linear(self.embedding_size, self.embedding_size * self.heads)
        self.attention_key = Linear(self.embedding_size, self.embedding_size * self.heads)
        self.attention_value = Linear(self.embedding_size, self.embedding_size * self.heads)
        self.triu = Triu()
        self.softmax = Softmax()
        self.merge = Linear(self.heads * self.embedding_size, self.embedding_size)

        layers = [self.attention_query, self.attention_key, self.attention_value, self.triu, self.softmax, self.merge]
        super().__init__(layers)

    def forward(self, x: Tensor):
        query = self.attention_query(x).reshape((-1, self.heads, self.embedding_size))
        key = self.attention_key(x).reshape((-1, self.heads, self.embedding_size))
        value = self.attention_value(x).reshape((-1, self.heads, self.embedding_size))

        scores = self.triu(query @ key.transpose((0, 2, 1)))
        weights = self.softmax(scores)
        vectors = self.merge((weights @ value).reshape((-1, self.heads * self.embedding_size)))
        return vectors

### GPT Output Layer

In [18]:
class GPTOutput(Sequential):

    def __init__(self, vocabulary_size, embedding_size):
        self.vocabulary_size = vocabulary_size
        self.embedding_size = embedding_size

        self.output = Linear(self.embedding_size, self.vocabulary_size)

        layers = [self.output]
        super().__init__(layers)

    def forward(self, x: Tensor):
        return self.output(x)

### GPT Layer

In [19]:
class GPT(Sequential):

    def __init__(self, vocabulary_size, context_size, embedding_size, heads=1):
        self.vocabulary_size = vocabulary_size
        self.context_size = context_size
        self.embedding_size = embedding_size
        self.heads = heads

        self.embedding = GPTEmbedding(self.vocabulary_size, self.context_size, self.embedding_size)
        self.attention = GPTAttention(self.context_size, self.embedding_size, self.heads)
        self.output = GPTOutput(self.vocabulary_size, self.embedding_size)

        layers = [self.embedding, self.attention, self.output]
        super().__init__(layers)

    def forward(self, x: Tensor):
        x = self.embedding(x)
        x = self.attention(x)
        return self.output(x)

### LLM Neural Network Model

In [20]:
class LLMModel(Model):

    def train(self, dataset, epochs):
        self.layer.train()
        dataset.train()

        for epoch in range(epochs):
            for i in range(len(dataset)):
                features, labels = dataset[i]

                predictions = self.layer(features)
                error = self.loss(predictions, labels)

                self.optimizer.reset()
                error.backward()
                self.optimizer.step()

    def test(self, dataset):
        self.layer.eval()
        dataset.eval()

        predictions = []
        for i in range(len(dataset)):
            feature, label = dataset[i]
            prediction = self.layer(feature)
            predictions.append(prediction)
        return predictions

## Configuration

### Context Size

In [21]:
CONTEXT_SIZE = 6

### Embedding Size

In [22]:
EMBEDDING_SIZE = 8

### Heads

In [23]:
HEADS = 2

### Learning Rate

In [24]:
LEARNING_RATE = 0.005

### Epochs

In [25]:
EPOCHS = 10

## Training

In [26]:
dataset = LLMDataset('../tinybook.txt', CONTEXT_SIZE)
layer = GPT(len(dataset.vocabulary), CONTEXT_SIZE, EMBEDDING_SIZE, HEADS)
loss = CELoss()
optimizer = AdamOptimizer(layer.parameters(), lr=LEARNING_RATE)

model = LLMModel(layer, loss, optimizer)
model.train(dataset, EPOCHS)

## Testing

### Estimating

In [27]:
predictions = model.test(dataset)

print(f'Accuracy: {dataset.estimate(predictions)}')

Accuracy: 0.17307692307692307


### Generating

In [28]:
def generate(layer, dataset, prompt, max_new_tokens=50, temperature=0.8, top_k=None):
    tokens = dataset.encode(prompt)

    for _ in range(max_new_tokens):
        feature = Tensor(tokens[-dataset.batch_size:])
        predictions = layer.forward(feature)

        prediction = predictions.data[-1, :] / temperature

        if top_k is not None:
            v = np.partition(prediction, -top_k)[-top_k]
            prediction[prediction < v] = -float('Inf')

        exp = np.exp(prediction - np.max(prediction))
        probs = exp / np.sum(exp)

        token = np.random.choice(len(probs), p=probs)
        tokens.append(token)

    return dataset.decode(tokens)

prompt = "The sun rises over the hills."
text = generate(layer, dataset, prompt=prompt)

print(f"Prompt: {prompt}")
print(f"Generated: {text}")

Prompt: The sun rises over the hills.
Generated: the sun rises over the hills. "stay on the mountain is not on the path." be found climb too high. "tom takes biggest adventure and a long," his mother says. "his village." stay on the mill. "his leaves." stay on the
