## Evaluation of baseline models

In [1]:
import pandas as pd
import gensim
from nltk.translate import bleu_score

In [2]:
def read_table(conll_path):
    table = pd.read_table(conll_path, names=["#", "word", "lemma", "cp", "p", "f", "h", "d", "m1", "m2"],
                          delimiter='\t', encoding='utf-8', error_bad_lines=False)
    return table.dropna()

In [3]:
train_tsv = read_table('UD_English-EWT/en_ewt-ud-train.conllu')
dev_tsv = read_table('UD_English-EWT/en_ewt-ud-dev.conllu')
test_tsv = read_table('UD_English-EWT/en_ewt-ud-test.conllu')

In [4]:
train_words = train_tsv["word"].values[:10000]
train_lemmas = train_tsv["lemma"].values[:10000]

dev_words = dev_tsv["word"].values[:2000]
dev_lemmas = dev_tsv["lemma"].values[:2000]

test_words = test_tsv["word"].values[:2000]
test_lemmas = test_tsv["lemma"].values[:2000]

In [5]:
len(train_words), len(dev_words), len(test_words)

(10000, 2000, 2000)

In [6]:
def read_sents(path):
    with open(path, 'r', encoding='utf8') as src:
        sents = []
        lem_seqs = []
        sent = []
        seq = []
        for line in src:
            if len(line) > 2 and not line.startswith('#'):
                values = line.split('\t')
                sent.append(values[1])
                seq.append(values[2])
            elif len(sent) > 0:
                sents.append(sent)
                lem_seqs.append(seq)
                sent = []
                seq = []
    return sents, lem_seqs

In [7]:
test_sents, test_references = read_sents('UD_English-EWT/en_ewt-ud-test.conllu')

In [8]:
test_sents[0], test_references[0]

(['What', 'if', 'Google', 'Morphed', 'Into', 'GoogleOS', '?'],
 ['what', 'if', 'Google', 'morph', 'into', 'GoogleOS', '?'])

In [9]:
corpus_references = [[reference] for reference in test_references]
corpus_references[0]

[['what', 'if', 'Google', 'morph', 'into', 'GoogleOS', '?']]

In [24]:
def evaluate(predictions, lemmas):
    score = 0
    for prediction, lemma in zip(predictions, lemmas):
        if prediction == lemma:
            score += 1
    example_count = len(predictions)
    percentage = 100 * score / example_count
    print('{}\t- total examples count'.format(example_count))
    print('{}\t- correctly lemmatized'.format(score))
    print('{:.2f}%\t- accuracy on test set'.format(percentage))
    return percentage


### Baseline 1:  identity function

In [25]:
evaluate(dev_words, dev_lemmas)

2000	- total examples count
1577	- correctly lemmatized
78.85%	- accuracy on test set


78.85

In [26]:
evaluate(test_words, test_lemmas)

2000	- total examples count
1563	- correctly lemmatized
78.15%	- accuracy on test set


78.15

In [27]:
bleu_score.corpus_bleu(corpus_references, test_sents)

0.5799287236130222

### Baseline 2: most frequent lemma among seen in the corpus, with identity backoff for unknown words

In [28]:
w2l_count = {}
for word, lemma in zip(train_words, train_lemmas):
    if word in w2l_count:
        w2l_count[word][lemma] = w2l_count[word].get(lemma, 0) + 1
    else:
        w2l_count[word] = {lemma: 1}
        
w2l = {}
for word in w2l_count:
    w2l[word] = sorted(w2l_count[word], key=w2l_count[word].get)[-1]

In [29]:
dev_predictions = [w2l.get(word, word) for word in dev_words]
test_predictions = [w2l.get(word, word) for word in test_words]

In [30]:
dev_percentage = evaluate(dev_predictions, dev_lemmas)

2000	- total examples count
1853	- correctly lemmatized
92.65%	- accuracy on test set


In [31]:
test_percentage = evaluate(test_predictions, test_lemmas)

2000	- total examples count
1828	- correctly lemmatized
91.40%	- accuracy on test set


In [32]:
predicted_seq = [[w2l.get(w, w) for w in sent] for sent in test_sents]
predicted_seq[0]

['what', 'if', 'Google', 'Morphed', 'Into', 'GoogleOS', '?']

In [22]:
bleu_score.corpus_bleu(corpus_references, predicted_seq)

0.7727198014704395