# Train language model

In [57]:
import itertools, numpy

import keras
from keras.models import Model
from keras.layers import Embedding, TimeDistributed, Dense, Input, LSTM, Dropout

## Prepare training data

In [8]:
# Load the corpus: one sample per non-blank line, stripped of surrounding
# whitespace and decoded from UTF-8 bytes.
dataset = []
with open('./data/joint.txt', 'r') as f:
    for line in f:
        stripped = line.strip()
        if not stripped:
            # Blank (whitespace-only) lines are not samples.
            continue
        dataset.append(stripped.decode('utf8'))

In [9]:
# Number of non-blank lines (samples) loaded into `dataset`.
len(dataset)

14881

In [67]:
# Reserved character codes: 0 marks the start of a sample, 1 marks its end
# (and is also used as padding). Real characters are numbered from 2 upwards.
START_CHAR = 0
END_CHAR = 1
REAL_CHAR_OFFSET = 2
# Default length (in character codes) of one training window.
DEFAULT_MAXLEN = 100

def make_training_data(samples, maxlen = DEFAULT_MAXLEN, stride = 50):
    """Turn text samples into next-character prediction training arrays.

    Every sample is encoded as ``[START_CHAR] + codes + [END_CHAR]`` and cut
    into windows of ``maxlen`` codes taken every ``stride`` positions. For
    each window the target is the same sequence shifted one position ahead,
    one-hot encoded. Windows shorter than ``maxlen`` are padded with
    ``END_CHAR`` on both the input and the target side.

    A window only starts where more than ``stride // 2`` codes remain, so
    short tails -- and whole samples whose encoded length does not exceed
    ``stride // 2`` -- produce no window.

    Args:
        samples: sequence of text strings.
        maxlen: window length in character codes.
        stride: distance between the starts of consecutive windows.

    Returns:
        A tuple ``(inputs, outputs, chars_number, decoder)``:
        ``inputs`` is a uint16 array of shape ``(n_windows, maxlen)`` holding
        character codes; ``outputs`` is a uint16 one-hot array of shape
        ``(n_windows, maxlen, chars_number)``; ``chars_number`` is the number
        of distinct codes including the two reserved ones; ``decoder`` maps a
        sequence of codes back to a string. When no window is produced the
        arrays are empty but keep their trailing dimensions.
    """
    char2code = { c : i + REAL_CHAR_OFFSET
                 for i, c
                 in enumerate(sorted(set(''.join(samples)))) }
    chars_number = len(char2code) + REAL_CHAR_OFFSET

    inputs = []
    outputs = []
    for sample in samples:
        # A list comprehension (rather than map) so that the concatenation
        # works on Python 3 as well, where map returns an iterator.
        converted = [START_CHAR] + [char2code[c] for c in sample] + [END_CHAR]
        # Floor division keeps the range bound an integer on Python 3.
        for start in range(0, len(converted) - stride // 2, stride):
            in_chunk = converted[start : start + maxlen]
            # Explicit integer dtype so that an empty chunk is still a valid index.
            out_chunk = numpy.asarray(converted[start + 1 : start + maxlen + 1],
                                      dtype = 'intp')

            in_res = numpy.ones(maxlen, dtype = 'uint16') * END_CHAR
            in_res[:len(in_chunk)] = in_chunk

            out_res = numpy.zeros((maxlen, chars_number), dtype = 'uint16')
            # Pair row t with column out_chunk[t]. Combining a slice with the
            # code list would instead set every listed column in every row,
            # producing multi-hot rather than one-hot targets.
            out_res[numpy.arange(len(out_chunk)), out_chunk] = 1
            out_res[len(out_chunk):, END_CHAR] = 1

            inputs.append(in_res)
            outputs.append(out_res)

    code2char = { i : c for c, i in char2code.items() }
    code2char[START_CHAR] = '_START_'
    code2char[END_CHAR] = '_END_'
    def _decoder(line):
        """Map a sequence of character codes back to text."""
        return ''.join(code2char[int(code)] for code in line)

    if inputs:
        x = numpy.vstack(inputs)
        y = numpy.stack(outputs)
    else:
        # vstack/stack raise on an empty list; return empty arrays whose
        # trailing dimensions match the non-empty case.
        x = numpy.zeros((0, maxlen), dtype = 'uint16')
        y = numpy.zeros((0, maxlen, chars_number), dtype = 'uint16')

    return x, y, len(code2char), _decoder

In [68]:
# Build training arrays from the first 10 samples only, using the default
# window length and stride.
x, y, chars_number, decoder = make_training_data(dataset[:10])

In [69]:
# Inspect the input/target array shapes and the number of character codes.
(x.shape, y.shape, chars_number)

((56, 100), (56, 100, 75), 75)

## Basic Char-RNN

In [None]:
# Character-level RNN: embedding -> 2 x (LSTM + dropout) -> a softmax over
# character codes at every timestep.
crnn_input = Input((DEFAULT_MAXLEN,), dtype = 'uint16')
# The first Embedding argument is the vocabulary size (number of distinct
# character codes), not the sequence length.
crnn_embeddings = Embedding(chars_number, 32)(crnn_input)
crnn_lstm1 = LSTM(256, return_sequences = True)(crnn_embeddings)
crnn_do1 = Dropout(0.2)(crnn_lstm1)
crnn_lstm2 = LSTM(256, return_sequences = True)(crnn_do1)
crnn_do2 = Dropout(0.2)(crnn_lstm2)
# Layers must be called on the previous tensor to become part of the graph.
crnn_dense = TimeDistributed(Dense(chars_number))(crnn_do2)
# Activation is not among the names imported from keras.layers, so it is
# reached through the already imported `keras` package.
crnn_output = keras.layers.Activation('softmax')(crnn_dense)

# Positional arguments avoid depending on the input/output keyword spelling,
# which differs between Keras versions.
crnn = Model([crnn_input], [crnn_output])
crnn.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [58]:
# IPython introspection (`??`): show detailed help for the imported LSTM layer.
LSTM??