# This notebook contains an implementation for language modelling. 
At its core, a language model is a sequence classifier that uses all the tokens produced so far as input in order to produce a probability distribution over all possible next tokens (a token could be a word, a character, or something in between). We can then either use the "best possible guess" of the classifier (the most probable token) as the next token, or we can sample the next token at random according to that distribution. 

In fact, producing a probability distribution comes for free when we build a neural classifier that uses a softmax output activation. Therefore, nothing actually changes from "before", when we simply built classifiers.

Once we have trained the model, we repeatedly ask for next tokens, and add these to the context. This is called "autoregressive sequence generation".

In [1]:
import torch
import torch.nn as nn
import ipywidgets as widgets
import random
import matplotlib.pyplot as plt
from collections import defaultdict

In [2]:
# load the data

# Pseudo-characters marking the beginning and the end of a sentence.
START_SYMBOL = "<s>"
END_SYMBOL = "</s>"

# Read the corpus (should be a simple plain text file). The context manager
# closes the file handle again, and the explicit encoding keeps the non-ASCII
# characters intact regardless of the platform's default encoding.
with open('data/merkel-de.txt', 'r', encoding='utf-8') as corpus_file:
    data = corpus_file.read()

# Vocabulary: every distinct character of the corpus in sorted order, followed
# by the two sentence markers. The newline only separates the entries of
# `sentences`, so it is left out; using a set difference (instead of
# list.remove) also works for a corpus that contains no newline at all.
characters = sorted(set(data) - {'\n'})
characters.append(START_SYMBOL)
characters.append(END_SYMBOL)
NUM_CHARACTERS = len(characters)
sentences = data.splitlines()

# Lookup tables between vocabulary entries and their integer ids.
int2char = list(characters)
char2int = {c: i for i, c in enumerate(characters)}
print(characters)
print(sentences[0:4])

[' ', '!', '"', '#', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xad', '½', 'Ä', 'É', 'Ö', 'Ü', 'ß', 'á', 'ä', 'ç', 'è', 'é', 'ê', 'ï', 'ò', 'ó', 'ô', 'ö', 'ú', 'ü', 'ă', 'ć', 'ę', 'ğ', 'ł', 'ń', 'ō', 'ř', 'ś', 'ž', '̈', '‐', '–', '‘', '’', '‚', '“', '”', '„', '…', '<s>', '</s>']
['Liebe Mitbürgerinnen und Mitbürger, jetzt geht es los. Der Anstoß zur Fußball-Weltmeisterschaft steht unmittelbar bevor. Millionen haben auf diesen Augenblick gewartet - nicht nur in Deutschland, sondern in der ganzen Welt.', 'Vor dem Eröffnungsspiel gegen Costa Rica bin ich noch einmal mit Jürgen Klinsmann und unserer Nationalmannschaft zusammengetroffen. Jeder

In [11]:
# Model hyper-parameters (read by the LM class below).
INPUT_SIZE = NUM_CHARACTERS  # number of rows in the embedding table: one per vocabulary entry
EMBED_SIZE = 8  # width of each embedding vector
HIDDEN_SIZE = 64  # size of the GRU hidden state
LAYERS = 2  # number of stacked GRU layers
MAX_GENERATION_LENGTH = 80  # upper bound on the number of tokens LM.generate produces
# okay, what's a recurrent neural network anyway? see https://calvinfeng.gitbook.io/machine-learning-notebook/supervised-learning/recurrent-neural-network/recurrent_neural_networks
# At every time step, we have an input (here: the embedding of one character)
# and the hidden state from the previous step; the network turns them into an
# output and an updated hidden state.

# The classifier predicts the next character, so there is one class per vocabulary entry.
NUM_CLASSES = NUM_CHARACTERS
class LM(nn.Module):
    """Character-level language model: embedding -> multi-layer GRU -> linear classifier.

    All methods operate on a single, unbatched sequence: a 1-D tensor of token
    ids goes in and one row of NUM_CLASSES unnormalised scores (logits) per
    position comes out. Layer sizes are read from the module-level constants
    INPUT_SIZE, EMBED_SIZE, HIDDEN_SIZE, LAYERS and NUM_CLASSES.
    """

    def __init__(self):
        super().__init__()
        self.embed = torch.nn.Embedding(INPUT_SIZE, EMBED_SIZE)
        torch.nn.init.xavier_uniform_(self.embed.weight)
        self.rnn = nn.GRU(EMBED_SIZE, HIDDEN_SIZE, LAYERS)
        self.final_layer = nn.Linear(HIDDEN_SIZE, NUM_CLASSES)

    def _initial_state(self, like: torch.Tensor) -> torch.Tensor:
        """Return an all-zero GRU state with the dtype and device of `like`."""
        return torch.zeros(LAYERS, HIDDEN_SIZE, dtype=like.dtype, device=like.device)

    def forward(self, xs : torch.Tensor):
        """Score the next token after every position of `xs`.

        Args:
            xs: 1-D tensor of token ids.

        Returns:
            Tensor of shape (len(xs), NUM_CLASSES) holding logits; row i is the
            prediction for the token that follows xs[i].
        """
        xs = self.embed(xs)
        rnn_outputs, _ = self.rnn(xs)
        results = self.final_layer(rnn_outputs)
        return results

    def forwardx(self, xs : torch.Tensor):
        """Same computation as `forward`, but stepping through time explicitly.

        The GRU is fed one embedded token at a time and the hidden state is
        passed from step to step by hand, which makes the recurrence visible.
        """
        xs = self.embed(xs)
        h_n = self._initial_state(xs)
        rnn_outputs = []
        for x in xs:
            x = x[None,:]  # a sequence of length one
            rnn_output, h_n = self.rnn(x, h_n)
            rnn_outputs.append(rnn_output)
        rnn_outputs = torch.cat(rnn_outputs)
        results = self.final_layer(rnn_outputs)
        return results

    def generate(self, xs=None) -> torch.Tensor:
        """Greedily continue the prompt `xs`, one token at a time.

        Args:
            xs: 1-D tensor of token ids used as the prompt. When omitted, the
                prompt is just the start symbol (looked up at call time, so
                the class no longer needs `char2int` while it is defined).

        Returns:
            1-D tensor with the generated token ids, without the prompt.
            Generation stops after the end symbol or after
            MAX_GENERATION_LENGTH tokens; a trailing end symbol is removed.
            An empty long tensor is returned when nothing remains.
        """
        if xs is None:
            xs = torch.tensor([char2int[START_SYMBOL]], device=self.embed.weight.device)
        end_id = char2int[END_SYMBOL]
        output = []
        # Only argmax indices leave this method, so no autograd graph is needed.
        with torch.no_grad():
            inputs = self.embed(xs)
            h_n = self._initial_state(inputs)
            while len(output) < MAX_GENERATION_LENGTH:
                rnn_outputs, h_n = self.rnn(inputs, h_n)
                # The prediction at the last position is the next token.
                classification = torch.argmax(self.final_layer(rnn_outputs[-1]), dim=0)
                output.append(classification)
                if classification.item() == end_id:
                    break
                # The state carries the history; feed only the new token next.
                inputs = self.embed(classification)[None,:]
        # Strip the end symbol only if it was actually produced; a sequence
        # that was cut off by the length limit keeps its last token.
        if output and output[-1].item() == end_id:
            output.pop()
        if not output:
            return torch.empty(0, dtype=torch.long, device=xs.device)
        return torch.stack(output)

In [13]:
# Training corpus. The commented-out assignments are alternative corpora.
#training_data = ["hello"] * 50
#training_data = ["abcdefghijklmnopqrstuvwxyz"] * 30
# Active choice: a single sentence repeated 100 times.
training_data = ["Möglicherweise haben Sie bei einem Fußballspiel schon einmal etwas von einer Bananenflanke gehört."] * 100
#training_data = sentences * 5
MAX_EPOCHS = 20  # number of passes over training_data performed by training()

def to_vector(sentence : str, noend=False) -> torch.Tensor:
    """Encode a sentence as a 1-D tensor of vocabulary ids.

    The start symbol is always prepended; the end symbol is appended unless
    `noend` is true (useful for prompts that the model should continue).

    Args:
        sentence: text to encode, one vocabulary entry per character.
        noend: when true, do not append the end symbol.

    Returns:
        Long tensor of ids as given by `char2int`.

    Raises:
        KeyError: if the sentence contains a character missing from `char2int`.
    """
    symbols = [START_SYMBOL, *sentence]
    if not noend:
        symbols.append(END_SYMBOL)
    # Explicit dtype: the ids are used as indices into the embedding table.
    return torch.tensor([char2int[c] for c in symbols], dtype=torch.long)

# Seed PyTorch's random number generator before the model is created, so the
# weight initialisation (and with it the training run) starts from the same
# point after every kernel restart.
torch.manual_seed(0)
lm = LM()
optimizer = torch.optim.Adam(lm.parameters())

def training(training_data, validation_data=None):
    """Train the module-level `lm` with the module-level `optimizer`.

    Each sentence is one training step: the model sees the sentence without
    its last token and has to predict the sentence without its first token
    (i.e. the next token at every position). After every epoch two samples
    are printed: "forced" shows the per-position predictions for a fixed
    prompt, "freeee" shows a free-running generation from the start symbol.

    Args:
        training_data: iterable of sentences (strings).
        validation_data: optional iterable of sentences; when given, the mean
            per-sentence loss on it is printed after every epoch.

    Returns:
        The trained model (the same object as the global `lm`).
    """
    training_data = [to_vector(s) for s in training_data]
    validation_data = [to_vector(s) for s in (validation_data or [])]
    # The probe sentence never changes, so encode it once up front.
    forced_input = to_vector("Möglicherweise haben Sie bei einem Fußballspiel schon", True)
    for epoch in range(MAX_EPOCHS):
        print("Epoch {} starting".format(epoch))
        #random.shuffle(training_data)
        for s in training_data:
            optimizer.zero_grad()
            all_input = s[:-1]
            all_predictions = s[1:]
            outputs = lm(all_input)
            # cross_entropy already averages over the positions and returns a scalar.
            loss = nn.functional.cross_entropy(outputs, all_predictions)
            loss.backward()
            optimizer.step()
        # The reporting below only inspects the model; no gradients required.
        with torch.no_grad():
            if validation_data:
                validation_loss = sum(
                    nn.functional.cross_entropy(lm(v[:-1]), v[1:]).item()
                    for v in validation_data
                ) / len(validation_data)
                print("validation loss: {:.4f}".format(validation_loss))
            print("forced: " + "".join([int2char[x] for x in torch.argmax(lm(forced_input), dim=1)]))
            print("freeee: " + "".join([int2char[x] for x in lm.generate()]))
    return lm


# Run the training loop, then ask the trained model for its prediction after
# seeing nothing but the start symbol and print the most likely character.
lm = training(training_data)
result = lm(to_vector("", noend=True))
#result = torch.topk(result, 3, dim=1)
print("".join(int2char[token_id] for token_id in result.argmax(dim=1)))
#result

Epoch 0 starting
forced: eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
freeee: eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
Epoch 1 starting
forced: ööggcceeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee
freeee: ööggcceeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee          nnnnnnnnnnnnnnnnnn
Epoch 2 starting
forced: Möglccceeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee               
freeee: Möglccceeeeeeeeeeeeeeeeeeeeeeeeeeeee                                        nnn
Epoch 3 starting
forced: Möglicheeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeellllllllll     
freeee: Möglicheeeeeeeeeeeeeeeeeeeeeeeeee e e         llll                           nn
Epoch 4 starting
forced: Möglicheeeeeeeee    eeeeee  eeeee eeeeellllleelllll   
freeee: Möglicheeeeeeeeeeee eeeeeeeeee ee e e e e e e e elllll l e e e e e e e e e e e 
Epoch 5 starting
forced: Möglicheeweeee e    eeee e   e  e  eeeellllleellll    
freeee: Möglicheeeeeeee ee e e e e e e e e e e e e e e e ellllll el el e el e e 

In [119]:
# Let the model continue a sentence prefix (encoded without the end symbol)
# and print the continuation as text.
#print("".join([int2char[x] for x in lm.generate()]))
print("".join(
    int2char[token_id]
    for token_id in lm.generate(to_vector("Möglicherweise haben Sie bei einem Fuß", noend=True))
))

allspiel eier einerwas einerwas einerwas einerwas einerwas einerwas einerwas ei


In [76]:
# Ad-hoc lookup: which vocabulary entry does class index 47 stand for?
# NOTE(review): the recorded output for this cell looks stale. In the
# vocabulary printed by the data-loading cell, index 47 is 'T', not 'o'.
# Re-run the notebook top to bottom to confirm.
int2char[47]

'o'