## Linear regression models

In [2]:
import numpy as np
import pandas as pd
import nltk
import gensim
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding
from keras.optimizers import RMSprop
from nltk.translate import bleu_score

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


#### reading and preparing data

In [3]:
# reading fastText word embeddings
vec_model = gensim.models.fasttext.FastText.load_fasttext_format('fasttext/cc.fi.300.bin')

In [4]:
# reads conll dataset
def read_table(conll_path):
    table = pd.read_table(conll_path, names=["#", "word", "lemma", "cp", "p", "f", "h", "d", "m1", "m2"],
                          delimiter='\t', encoding='utf-8', error_bad_lines=False)
    return table.dropna()

In [5]:
train_tsv = read_table('UD_Finnish-TDT-master/fi_tdt-ud-train.conllu')
dev_tsv = read_table('UD_Finnish-TDT-master/fi_tdt-ud-dev.conllu')
test_tsv = read_table('UD_Finnish-TDT-master/fi_tdt-ud-test.conllu')

In [6]:
# limit the number of training examples
train_words = train_tsv["word"].values[:10000]
train_lemmas = train_tsv["lemma"].values[:10000]

dev_words = dev_tsv["word"].values[:2000]
dev_lemmas = dev_tsv["lemma"].values[:2000]

test_words = test_tsv["word"].values[:2000]
test_lemmas = test_tsv["lemma"].values[:2000]

In [29]:
# get data for training and evaluation
def get_vec_form(words, lemmas):
    x = {'form': [], 'vec': []}
    y = {'form': [], 'vec': []}
    for w, l in zip(words, lemmas):
        try:
            new_x = vec_model.wv[w]
            new_y = vec_model.wv[l]
        except:
            continue
        x['vec'].append(new_x)
        x['form'].append(w)
        y['vec'].append(new_y)
        y['form'].append(l)
    x['vec'] = np.array(x['vec'])
    y['vec'] = np.array(y['vec'])
    return x, y


train_x, train_y = get_vec_form(train_words, train_lemmas)
dev_x, dev_y = get_vec_form(dev_words, dev_lemmas)
test_x, test_y = get_vec_form(test_words, test_lemmas)

In [30]:
len(train_x['vec']), len(dev_x['vec']), len(test_x['vec'])

(9904, 1974, 1996)

In [37]:
def evaluate(predicted_vecs, lemmas):
    correct = 0
    for i, pred in enumerate(predicted_vecs):
        nearest = vec_model.wv.most_similar(positive=[pred], topn=1)
        if nearest[0][0] == lemmas[i]:
            correct += 1
    return correct / len(lemmas)

### Linear regression with MSE loss

In [31]:
model = Sequential()
model.add(Dense(300, activation='linear', input_shape=(300,)))
model.compile(loss='mean_squared_error', optimizer=RMSprop())

In [32]:
model.fit(train_x['vec'], train_y['vec'], validation_data=(dev_x['vec'], dev_y['vec']), 
          epochs=40, batch_size=128, verbose=True)

Train on 9904 samples, validate on 1974 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7f84b0d0fb70>

In [33]:
test_pred = model.predict(test_x['vec'])

In [38]:
evaluate(test_pred, test_y['form'])

0.6743486973947895

### Linear regression with cosine proximity loss

In [41]:
model2 = Sequential()
model2.add(Dense(300, activation='linear', input_shape=(300,)))
model2.compile(loss='cosine_proximity', optimizer=RMSprop())

In [42]:
model2.fit(train_x['vec'], train_y['vec'], validation_data=(dev_x['vec'], dev_y['vec']), 
           epochs=40, batch_size=128, verbose=True)

Train on 9904 samples, validate on 1974 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7f84b0d36278>

In [45]:
test_pred2 = model2.predict(test_x['vec'])
evaluate(test_pred2, test_x['form'])

0.7314629258517034