## Recurrent neural network with an LSTM unit

In [135]:
import numpy as np
import pandas as pd
import gensim
import sklearn
from keras.models import Sequential
from keras.layers import LSTM, Dense, Activation, Embedding, Input, TimeDistributed, Dropout, Masking
from keras.optimizers import RMSprop

In [189]:
# Hyperparameters shared by the data generator, the network definition
# and the demonstration code further down.

B = 50   # batch size: number of windows per generated batch
R = 300  # number of units in the LSTM layer
S = 4    # maximum sequence length: tokens per input window
E = 300  # size of one word-embedding vector

#### reading and preparing data for training and evaluation

In [190]:
# Load the pretrained fastText word vectors from a word2vec-format file.
# `limit` caps how many vectors are read from the file.
# Alternative kept for reference (binary fastText model):
# vec_model = gensim.models.fasttext.FastText.load_fasttext_format('fasttext/cc.fi.300.bin')
vec_model = gensim.models.KeyedVectors.load_word2vec_format(
    'fasttext/crawl-300d-2M.vec',
    limit=250000,
)

In [191]:
def _empty_batch():
    """Return zeroed input/target arrays and empty token/lemma slots for one batch."""
    return (np.zeros((B, S, E)), np.zeros((B, S, E)),
            [None] * B, [None] * B)


def generate(path, line_limit=13000, mode='train'):
    """Yield batches of embedded word/lemma windows read from a treebank file.

    The file is read line by line. On every line that is longer than two
    characters and does not start with '#', the second whitespace-separated
    field is taken as the word form and the third as its lemma. Lines whose
    first field contains '-' are skipped (presumably multiword-token ranges
    of the CoNLL-U format -- confirm against the data). Any other line
    (comment or blank) empties the current window, so a window never spans
    such a line.

    Args:
        path: path of the file to read (opened as UTF-8).
        line_limit: reading stops once this many lines have been consumed.
        mode: 'train' slides the window one token at a time, producing
            overlapping windows; any other value starts a fresh window after
            each completed one, producing non-overlapping windows.

    Yields:
        Tuples ``(x, y, word_seqs, lemma_seqs)`` where ``x`` and ``y`` are
        arrays of shape (B, S, E) holding word and lemma vectors, and
        ``word_seqs`` / ``lemma_seqs`` are lists of B lists with the
        corresponding S strings.

    Notes:
        * If either the word or the lemma is missing from ``vec_model``,
          both vectors of that token are set to zeros.
        * Only complete batches are yielded; a partially filled batch at the
          end of the input is dropped. A window completed by the very last
          line read is not stored either, because windows are stored at the
          start of the following iteration.
    """
    with open(path, 'r', encoding='utf8') as src:
        x, y, word_seqs, lemma_seqs = _empty_batch()
        x_seq, y_seq, word_seq, lemma_seq = [], [], [], []
        i = 0  # next free row of the batch being filled
        for line_number, line in enumerate(src, start=1):
            if line_number > line_limit:
                return
            # A window completed by the previous line is stored before the
            # current line is inspected. The four buffers always grow and
            # shrink together, so checking one length is sufficient.
            if len(x_seq) == S:
                x[i] = np.array(x_seq)
                y[i] = np.array(y_seq)
                word_seqs[i] = word_seq[:]
                lemma_seqs[i] = lemma_seq[:]
                if mode == 'train':
                    # slide the window forward by one token
                    for buffer in (x_seq, y_seq, word_seq, lemma_seq):
                        buffer.pop(0)
                else:
                    # start a fresh, non-overlapping window
                    x_seq, y_seq, word_seq, lemma_seq = [], [], [], []
                i += 1
                if i >= B:
                    yield x, y, word_seqs, lemma_seqs
                    # allocate new containers so yielded batches are never
                    # overwritten by later ones
                    x, y, word_seqs, lemma_seqs = _empty_batch()
                    i = 0
            if len(line) > 2 and line[0] != '#':
                values = line.split()
                if '-' not in values[0]:
                    try:
                        word = vec_model[values[1]]
                        lemma_vec = vec_model[values[2]]
                    except KeyError:
                        # out-of-vocabulary word or lemma: zero out both
                        word = np.zeros(E)
                        lemma_vec = np.zeros(E)
                    x_seq.append(word)
                    y_seq.append(lemma_vec)
                    word_seq.append(values[1])
                    lemma_seq.append(values[2])
            else:
                # comment or blank line: discard the unfinished window
                x_seq, y_seq, word_seq, lemma_seq = [], [], [], []

In [209]:
# Training batches keep only the (input, target) arrays of each generated
# batch; 12.2k lines amount to 10k tokens.
train_set = [
    batch[:2]
    for batch in generate('UD_English-EWT/en_ewt-ud-train.conllu', line_limit=12200)
]

# Dev and test batches also keep the word and lemma strings;
# 2.5k lines amount to 2k tokens.
dev_batches = list(generate('UD_English-EWT/en_ewt-ud-dev.conllu', line_limit=2500, mode='dev'))
test_batches = list(generate('UD_English-EWT/en_ewt-ud-test.conllu', line_limit=2500, mode='dev'))

#### defining and training the network

In [210]:
# Sequence-to-sequence model: for every one of the S input positions it
# outputs an E-dimensional vector (the predicted lemma embedding).
M = Sequential([
    # mask timesteps whose input vector is all zeros
    Masking(mask_value=0.0, input_shape=(S, E)),
    # return_sequences=True keeps one output per timestep
    LSTM(R, return_sequences=True),
    Dropout(0.2),
    # project every timestep's LSTM output to the embedding size
    TimeDistributed(Dense(E, activation='linear')),
])
M.compile(optimizer='rmsprop', loss='cosine_proximity')

In [211]:
# Train for 100 epochs; after each epoch measure the loss on the dev batches
# and reshuffle the order of the training batches.
for epoch in range(100):

    # one pass over all training batches, summing the per-batch loss
    train_loss = 0
    train_batch_c = 0
    for train_batch_c, (X, Y) in enumerate(train_set, start=1):
        train_loss += M.train_on_batch(X, Y)

    # evaluate (no weight update) on every dev batch
    dev_loss = 0
    dev_batch_c = 0
    for dev_batch_c, (X, Y, _, _) in enumerate(dev_batches, start=1):
        dev_loss += M.test_on_batch(X, Y)

    # report mean losses after the first epoch and after every tenth one
    if epoch == 0 or epoch % 10 == 9:
        print('epoch:', epoch + 1,
              '\ttrain loss: {0:.4f}'.format(train_loss / train_batch_c),
              '\tdev loss: {0:.4f}'.format(dev_loss / dev_batch_c))

    np.random.shuffle(train_set)

epoch: 1 	train loss: -0.7171 	dev loss: -0.8671
epoch: 10 	train loss: -0.9192 	dev loss: -0.9474
epoch: 20 	train loss: -0.9262 	dev loss: -0.9533
epoch: 30 	train loss: -0.9305 	dev loss: -0.9555
epoch: 40 	train loss: -0.9333 	dev loss: -0.9572
epoch: 50 	train loss: -0.9354 	dev loss: -0.9583
epoch: 60 	train loss: -0.9369 	dev loss: -0.9591
epoch: 70 	train loss: -0.9380 	dev loss: -0.9592
epoch: 80 	train loss: -0.9390 	dev loss: -0.9591
epoch: 90 	train loss: -0.9397 	dev loss: -0.9594
epoch: 100 	train loss: -0.9400 	dev loss: -0.9593


#### evaluate on test set

In [212]:
# Token-level accuracy on the test batches: a prediction counts as correct
# when the vocabulary entry nearest to the predicted vector equals the gold
# lemma string.
correct = 0
count = 0
for X, Y, W, L in test_batches:
    pred = M.predict_on_batch(X)
    for i, seq in enumerate(pred):
        for j, pred_y in enumerate(seq):
            # An all-zero input vector marks an out-of-vocabulary token.
            # `.any()` tests that directly; comparing the sum of the
            # components with 0 could misfire when they cancel out.
            if X[i][j].any():
                nearest = vec_model.most_similar(positive=[pred_y], topn=1)[0][0]
            else:
                nearest = W[i][j]  # identity backoff for oov tokens
            if nearest == L[i][j]:
                correct += 1
            count += 1
#             print('w', W[i][j], '\tl', L[i][j], '\tpred', nearest, nearest == L[i][j])

In [213]:
# Summarise the evaluation: accuracy as a percentage plus the raw counts.
accuracy_pct = 100 * correct / count
print('final test accuracy: {0:.2f}%'.format(accuracy_pct))
print('correctly lemmatized tokens:', correct)
print('all tokens:', count)

final test accuracy: 93.00%
correctly lemmatized tokens: 1860
all tokens: 2000


#### demonstration

In [214]:
def lemmatize(tokens):
    """Predict a lemma for every token of a tokenized text.

    The tokens are processed in consecutive chunks of at most S tokens.
    Each chunk is embedded into a zero-padded (1, S, E) array, passed
    through the network, and every predicted vector is mapped to the
    nearest entry of ``vec_model``.

    Args:
        tokens: list of token strings.

    Returns:
        A list with one predicted lemma per input token, in input order.
        Tokens that are missing from ``vec_model`` are returned unchanged
        (identity backoff).
    """
    lemmas = []
    for start in range(0, len(tokens), S):
        chunk = tokens[start:start + S]
        x = np.zeros((1, S, E))
        oov = set()  # positions inside the chunk without an embedding
        for j, token in enumerate(chunk):
            try:
                x[0][j] = vec_model[token]
            except KeyError:
                # leave the zero vector in place and remember the position
                oov.add(j)
        y = M.predict([x], batch_size=1)
        for j, token in enumerate(chunk):
            if j in oov:
                lemmas.append(token)
            else:
                lemmas.append(vec_model.most_similar(positive=[y[0][j]], topn=1)[0][0])
    return lemmas

In [225]:
# Run the lemmatizer on an example sentence tokenized by single spaces.
example_tokens = "I knew him because he had attended my school .".split(' ')
lemmatize(example_tokens)

['I', 'know', 'he', 'because', 'he', 'have', 'attend', 'my', 'school', '.']