# Model the properties of text
This model learns to predict the next element of a sequence from the elements that precede it. Such a model is called a language model.

Most language models use words as units of the sequence. We will use characters. There are two reasons for this:

- a word-based model ignores the whitespace between words (may be  relevant   for    poetry)
- there are far fewer distinct characters than distinct words, and every line of poetry contains more characters than words, so each symbol is seen more often during training, which simplifies our task given little training data

Language models can be used for _generation_, i.e., we can sample texts from the model that are similar to the training data.

In [None]:
# import statements for modules that we may need below.
import dynet as dy
import sys
from random import shuffle

# Load the training lines and derive the "input vocabulary", which in this
# notebook is simply the set of all characters occurring in the data.
# Only one style of lines is used -- sound-poetry can be hard to distinguish
# from random results...
data = []
with open('parlando.lines') as f:
    # strip() removes the trailing newline (and any surrounding whitespace)
    data.extend(map(str.strip, f))

# Every distinct character of every line, plus a special end-of-sentence tag.
characters = set().union(*data)
characters.add("<EOS>")

# Two mutually consistent lookup tables: index -> symbol and symbol -> index.
int2char = list(characters)
char2int = {c: i for i, c in enumerate(int2char)}

VOCAB_SIZE = len(int2char)

In [None]:
# compute and return the loss of the RNN for one sentence
def do_one_sentence(rnn, params, line):
    """Build the computation graph for one line and return its total loss.

    The line is wrapped in "<EOS>" markers on both sides; at every position
    the model is fed the current character and scored on the next one.

    Args:
        rnn: RNN builder providing ``initial_state()``.
        params: dict holding the "lookup", "R" and "bias" parameters.
        line: iterable of characters; each one must be a key of ``char2int``.

    Returns:
        The DyNet expression summing the per-character negative
        log-softmax losses.
    """
    # start a fresh computation graph for this sentence
    dy.renew_cg()
    state = rnn.initial_state()
    out_weights = dy.parameter(params["R"])
    out_bias = dy.parameter(params["bias"])
    embeddings = params["lookup"]

    # <EOS> c1 c2 ... cn <EOS>, as vocabulary indices
    eos = char2int["<EOS>"]
    ids = [eos] + [char2int[c] for c in line] + [eos]

    losses = []
    for current, target in zip(ids[:-1], ids[1:]):
        state = state.add_input(embeddings[current])
        # unnormalised scores over the vocabulary; the softmax is applied
        # inside pickneglogsoftmax
        scores = out_weights * state.output() + out_bias
        losses.append(dy.pickneglogsoftmax(scores, target))
    return dy.esum(losses)

# generate from model:
def generate(rnn, params, max_len=None):
    """Sample one line of text, character by character, from the model.

    Generation starts from the "<EOS>" marker and stops as soon as "<EOS>"
    is sampled again.

    Args:
        rnn: RNN builder providing ``initial_state()``.
        params: dict holding the "lookup", "R" and "bias" parameters.
        max_len: optional upper bound on the number of generated characters.
            ``None`` (the default) keeps sampling until "<EOS>" is drawn.

    Returns:
        The generated string, without the end-of-sentence marker.
    """
    # Imported locally: the file only does `from random import shuffle`, so
    # the bare module name `random` would otherwise raise a NameError here.
    import random

    def sample(probs):
        """Draw an index from the discrete distribution given by *probs*."""
        rnd = random.random()
        for i, p in enumerate(probs):
            rnd -= p
            if rnd <= 0:
                break
        # if rounding leaves rnd slightly above 0, the last index is returned
        return i

    # setup the sentence
    dy.renew_cg()
    s0 = rnn.initial_state()

    R = dy.parameter(params["R"])
    bias = dy.parameter(params["bias"])
    lookup = params["lookup"]

    s = s0.add_input(lookup[char2int["<EOS>"]])
    out = []
    while max_len is None or len(out) < max_len:
        probs = dy.softmax(R*s.output() + bias).vec_value()
        next_char = sample(probs)
        char = int2char[next_char]
        if char == "<EOS>":
            # end of sentence: stop without emitting the marker
            break
        out.append(char)
        s = s.add_input(lookup[next_char])
    return "".join(out)

# train, and print the loss plus a generated sample every 5th iteration
def train(rnn, params, lines):
    """Train the model on *lines* for ITERATIONS passes over the data.

    The parameters are updated after every single line. On every 5th pass
    (starting with the first) the loss of the last line processed in that
    pass is printed, followed by a tab and one text sampled from the model.

    Args:
        rnn: RNN builder passed on to do_one_sentence() and generate().
        params: dict holding the "lookup", "R" and "bias" parameters.
        lines: the training lines, iterated once per pass.
    """
    optimizer = trainer_type(pc)
    for epoch in range(ITERATIONS):
        for text in lines:
            sentence_loss = do_one_sentence(rnn, params, text)
            # run the forward pass before back-propagating
            last_loss = sentence_loss.value()
            sentence_loss.backward()
            optimizer.update()
        if epoch % 5 != 0:
            continue
        print("%.10f" % last_loss, end="\t")
        print(generate(rnn, params))

In [None]:
# number of passes over the training lines performed by train()
ITERATIONS = 200

# model dimensions
LAYERS = 1
INPUT_DIM = 40    # size of each character vector in the lookup table
HIDDEN_DIM = 50   # size of the RNN output fed to the output layer

# recurrent cell and optimiser classes; use the commented-out line instead
# to build an LSTM rather than a simple RNN
builder_type = dy.SimpleRNNBuilder
#builder_type = dy.LSTMBuilder
trainer_type = dy.SimpleSGDTrainer

# all trainable parameters live in one collection
pc = dy.ParameterCollection()
rnn = builder_type(LAYERS, INPUT_DIM, HIDDEN_DIM, pc)

# parameters for the input lookup and the hidden->output part,
# used for both lstm and srnn
params = {
    "lookup": pc.add_lookup_parameters((VOCAB_SIZE, INPUT_DIM)),
    "R": pc.add_parameters((VOCAB_SIZE, HIDDEN_DIM)),
    "bias": pc.add_parameters((VOCAB_SIZE)),
}

In [None]:
# Quick sanity run: train on one short, highly repetitive sentence before
# using the full data set. Every character of the sentence must be a key of
# char2int (i.e. occur in the training file); otherwise the lookup in
# do_one_sentence() raises a KeyError.
sentence = "blaukraut bleibt blaukraut und brautkleid bleibt brautkleid"
train(rnn, params, [sentence])

In [None]:
# Train on all lines read from the data file. rnn and params are the same
# objects passed to any earlier train() call, so if such a call has already
# run, training continues from the current parameter values rather than
# from a fresh initialisation.
train(rnn, params, data)