## Linear regression models

In [1]:
import numpy as np
import pandas as pd
import nltk
import gensim
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding
from keras.optimizers import RMSprop
from nltk.translate import bleu_score

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


#### Reading and preparing data

In [2]:
# Load fastText word embeddings that are stored in word2vec text format.
# `limit=250000` is handed to the loader to cap how many vectors are read.
# Alternative loader for a native fastText binary, kept for reference:
# vec_model = gensim.models.fasttext.FastText.load_fasttext_format('fasttext/cc.fi.300.bin')
embeddings_path = 'fasttext/crawl-300d-2M.vec'
vec_model = gensim.models.KeyedVectors.load_word2vec_format(embeddings_path, limit=250000)

In [3]:
def read_table(conll_path):
    """Read a tab-separated CoNLL file into a DataFrame of complete rows.

    Parameters
    ----------
    conll_path : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        Columns "#", "word", "lemma", "cp", "p", "f", "h", "d", "m1", "m2".
        Lines the parser rejects are skipped, and every row that contains a
        missing value is removed by ``dropna()``.
    """
    columns = ["#", "word", "lemma", "cp", "p", "f", "h", "d", "m1", "m2"]
    options = dict(names=columns, delimiter='\t', encoding='utf-8')
    # NOTE(review): pandas' default quoting treats '"' as a quote character, so
    # rows whose token is a bare double quote may be mis-parsed and then dropped;
    # consider quoting=csv.QUOTE_NONE after confirming against the data.
    try:
        # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
        # 'warn' keeps the old behaviour of skipping bad lines with a warning.
        table = pd.read_table(conll_path, on_bad_lines='warn', **options)
    except TypeError:
        # pandas < 1.3 does not know `on_bad_lines`; use the original keyword.
        table = pd.read_table(conll_path, error_bad_lines=False, **options)
    return table.dropna()

In [4]:
# Parse the train, dev and test files of the UD English-EWT treebank, in that order.
train_tsv, dev_tsv, test_tsv = map(read_table, (
    'UD_English-EWT/en_ewt-ud-train.conllu',
    'UD_English-EWT/en_ewt-ud-dev.conllu',
    'UD_English-EWT/en_ewt-ud-test.conllu',
))

In [5]:
# Cap the amount of data: keep the first 10000 rows of the training split and
# the first 2000 rows of the dev and test splits, as parallel word/lemma arrays.
train_words, train_lemmas = (train_tsv[col].values[:10000] for col in ("word", "lemma"))

dev_words, dev_lemmas = (dev_tsv[col].values[:2000] for col in ("word", "lemma"))

test_words, test_lemmas = (test_tsv[col].values[:2000] for col in ("word", "lemma"))

In [6]:
# get data for training and evaluation

# Words and lemmas that had no embedding in `vec_model`. Filled as a side effect
# of get_vec_form and read again by the identity-backoff scoring cells.
oov = set()


def get_vec_form(words, lemmas):
    """Collect embeddings for the word/lemma pairs that are fully in vocabulary.

    Parameters
    ----------
    words, lemmas : sequences of str
        Parallel sequences; they are paired with ``zip``.

    Returns
    -------
    (x, y) : tuple of dict
        ``x`` describes the words and ``y`` the lemmas. Each dict has a
        ``'form'`` list of strings and a ``'vec'`` NumPy array holding the
        matching embeddings, in input order.

    Side effects
    ------------
    A pair is skipped when its word or lemma is missing from ``vec_model``;
    the first missing item of the pair is added to the global ``oov`` set
    (the lemma is not looked up when the word is already missing).
    """
    x = {'form': [], 'vec': []}
    y = {'form': [], 'vec': []}
    for w, l in zip(words, lemmas):
        # Only a failed vocabulary lookup (KeyError) means "out of vocabulary";
        # anything else, including KeyboardInterrupt, must propagate.
        try:
            new_x = vec_model[w]
        except KeyError:
            oov.add(w)
            continue
        try:
            new_y = vec_model[l]
        except KeyError:
            oov.add(l)
            continue
        x['vec'].append(new_x)
        x['form'].append(w)
        y['vec'].append(new_y)
        y['form'].append(l)
    x['vec'] = np.array(x['vec'])
    y['vec'] = np.array(y['vec'])
    return x, y


# Turn every split into parallel word (x) and lemma (y) embedding sets,
# processing train, dev and test in that order.
(train_x, train_y), (dev_x, dev_y), (test_x, test_y) = (
    get_vec_form(split_words, split_lemmas)
    for split_words, split_lemmas in (
        (train_words, train_lemmas),
        (dev_words, dev_lemmas),
        (test_words, test_lemmas),
    )
)

In [7]:
# Number of pairs that survived the vocabulary lookup in train, dev and test.
tuple(len(split['vec']) for split in (train_x, dev_x, test_x))

(9783, 1964, 1960)

In [8]:
def evaluate(predicted_vecs, lemmas):
    """Score predicted vectors against gold lemmas by nearest-neighbour lookup.

    Each predicted vector is mapped to its single most similar entry in
    ``vec_model``; the prediction is correct when that entry equals the gold
    lemma at the same position.

    Parameters
    ----------
    predicted_vecs : iterable of vectors
        Model outputs, one per gold lemma.
    lemmas : sequence of str
        Gold lemma strings, indexable by position.

    Returns
    -------
    (accuracy, correct, total) : tuple
        ``total`` is ``len(lemmas)``. For empty input the accuracy is 0.0
        instead of a ZeroDivisionError.
    """
    correct = 0
    for i, pred in enumerate(predicted_vecs):
        nearest = vec_model.most_similar(positive=[pred], topn=1)
        if nearest[0][0] == lemmas[i]:
            correct += 1
    total = len(lemmas)
    # Guard the division so an empty evaluation set yields a zero score.
    accuracy = correct / total if total else 0.0
    return accuracy, correct, total

### Linear regression with MSE loss

In [57]:
# One dense layer with a linear activation mapping 300 inputs to 300 outputs,
# optimised with RMSprop on a mean-squared-error loss.
model = Sequential([Dense(300, activation='linear', input_shape=(300,))])
model.compile(optimizer=RMSprop(), loss='mean_squared_error')

In [58]:
# Fit silently for 50 epochs in batches of 128, validating on the dev split.
# The History object returned by fit() is this cell's output.
model.fit(
    train_x['vec'],
    train_y['vec'],
    batch_size=128,
    epochs=50,
    verbose=False,
    validation_data=(dev_x['vec'], dev_y['vec']),
)

<keras.callbacks.History at 0x7fe605c04240>

In [59]:
# Predict lemma vectors for the in-vocabulary test words and score them
# against the gold lemma strings.
test_pred = model.predict(test_x['vec'])
score, correct, count = evaluate(predicted_vecs=test_pred, lemmas=test_y['form'])

In [60]:
# Test pairs whose word or lemma is in `oov` are scored with an identity
# backoff: the word itself is taken as the predicted lemma.
# NOTE: `correct` and `count` are updated in place, so running this cell again
# without first re-running the evaluation cell counts these pairs twice.
for w, l in zip(test_words, test_lemmas):
    if not (w in oov or l in oov):
        continue
    count += 1
    correct += int(w == l)  # identity backoff

score = correct / count

In [61]:
# Report accuracy, number of correct lemmas and number of scored tokens.
report = (
    '{:.2f}%\t- accuracy on test set'.format(100 * score),
    '{}\t- correctly lemmatized tokens'.format(correct),
    '{}\t- total'.format(count),
)
print('\n'.join(report))

85.90%	- accuracy on test set
1718	- correctly lemmatized tokens
2000	- total


### Linear regression with mean absolute error (MAE) loss

In [30]:
# Same single linear dense layer (300 inputs to 300 outputs), this time
# optimised with RMSprop on a mean-absolute-error loss.
model2 = Sequential([Dense(300, activation='linear', input_shape=(300,))])
model2.compile(optimizer=RMSprop(), loss='mean_absolute_error')

In [31]:
# Fit silently for 150 epochs in batches of 128, validating on the dev split.
# The History object returned by fit() is this cell's output.
model2.fit(
    train_x['vec'],
    train_y['vec'],
    batch_size=128,
    epochs=150,
    verbose=False,
    validation_data=(dev_x['vec'], dev_y['vec']),
)

<keras.callbacks.History at 0x7fe603daaa58>

In [32]:
# Predict lemma vectors for the in-vocabulary test words with the MAE model
# and score them against the gold lemma strings.
test_pred2 = model2.predict(test_x['vec'])
score2, correct2, count2 = evaluate(predicted_vecs=test_pred2, lemmas=test_y['form'])

In [35]:
# Test pairs whose word or lemma is in `oov` are scored with an identity
# backoff: the word itself is taken as the predicted lemma.
# NOTE: `correct2` and `count2` are updated in place, so running this cell again
# without first re-running the evaluation cell counts these pairs twice.
for w, l in zip(test_words, test_lemmas):
    if not (w in oov or l in oov):
        continue
    count2 += 1
    correct2 += int(w == l)  # identity backoff

score2 = correct2 / count2

In [36]:
# Report accuracy, number of correct lemmas and number of scored tokens.
report2 = (
    '{:.2f}%\t- accuracy on test set'.format(100 * score2),
    '{}\t- correctly lemmatized tokens'.format(correct2),
    '{}\t- total'.format(count2),
)
print('\n'.join(report2))

81.20%	- accuracy on test set
1624	- correctly lemmatized tokens
2000	- total


### Linear regression with cosine proximity loss

In [62]:
# Same single linear dense layer (300 inputs to 300 outputs), this time
# optimised with RMSprop on the cosine-proximity loss.
model3 = Sequential([Dense(300, activation='linear', input_shape=(300,))])
model3.compile(optimizer=RMSprop(), loss='cosine_proximity')

In [63]:
# Fit silently for 50 epochs in batches of 128, validating on the dev split.
# The History object returned by fit() is this cell's output.
model3.fit(
    train_x['vec'],
    train_y['vec'],
    batch_size=128,
    epochs=50,
    verbose=False,
    validation_data=(dev_x['vec'], dev_y['vec']),
)

<keras.callbacks.History at 0x7fe5b8661208>

In [64]:
# Predict lemma vectors for the in-vocabulary test words with the
# cosine-proximity model and score them against the gold lemma strings.
test_pred3 = model3.predict(test_x['vec'])
score3, correct3, count3 = evaluate(predicted_vecs=test_pred3, lemmas=test_y['form'])

In [65]:
# Test pairs whose word or lemma is in `oov` are scored with an identity
# backoff: the word itself is taken as the predicted lemma.
# NOTE: `correct3` and `count3` are updated in place, so running this cell again
# without first re-running the evaluation cell counts these pairs twice.
for w, l in zip(test_words, test_lemmas):
    if not (w in oov or l in oov):
        continue
    count3 += 1
    correct3 += int(w == l)  # identity backoff

score3 = correct3 / count3

In [66]:
# Report accuracy, number of correct lemmas and number of scored tokens.
report3 = (
    '{:.2f}%\t- accuracy on test set'.format(100 * score3),
    '{}\t- correctly lemmatized tokens'.format(correct3),
    '{}\t- total'.format(count3),
)
print('\n'.join(report3))

87.55%	- accuracy on test set
1751	- correctly lemmatized tokens
2000	- total


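In these runs the cosine proximity loss gave the best test accuracy (87.55%), ahead of MSE (85.90%) and MAE (81.20%). Note that the MAE model was trained for 150 epochs while the other two were trained for 50, and that the differences have not been tested for statistical significance.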

#### Demonstration

In [84]:
def lemmatize(tokens):
    """Predict a lemma for each token with the cosine-proximity model.

    Parameters
    ----------
    tokens : list of str
        Tokens to lemmatize.

    Returns
    -------
    list of str
        One entry per input token, in input order: the ``vec_model`` entry
        most similar to the vector predicted by ``model3``, or the token
        itself when the token has no embedding.

    Notes
    -----
    Only a failed vocabulary lookup triggers the identity fallback (the
    error and the token are printed). Any other failure, for example inside
    the model, is raised instead of being reported as a lemma.
    """
    lemmas = []
    for token in tokens:
        try:
            vec = vec_model[token]
        except KeyError as e:
            # Out-of-vocabulary token: report it and fall back to the identity.
            print(e)
            print(token)
            lemmas.append(token)
            continue
        # The model is fed a batch of one; -1 lets the row width follow the
        # embedding size rather than hard-coding it.
        pred = model3.predict(vec.reshape((1, -1)))[0]
        lemmas.append(vec_model.most_similar(positive=[pred], topn=1)[0][0])
    return lemmas

In [86]:
# Lemmatize a pre-tokenized example sentence (tokens are separated by single spaces).
sentence = "I knew him because he had attended my school ."
lemmatize(sentence.split(' '))

['I', 'know', 'he', 'because', 'he', 'have', 'attend', 'my', 'school', '.']