## Evaluation of baseline models

In [54]:
import pandas as pd
import gensim
from nltk.translate import bleu_score

In [3]:
def read_table(conll_path):
    table = pd.read_table(conll_path, names=["#", "word", "lemma", "cp", "p", "f", "h", "d", "m1", "m2"],
                          delimiter='\t', encoding='utf-8', error_bad_lines=False)
    return table.dropna()

In [4]:
train_tsv = read_table('UD_Finnish-TDT-master/fi_tdt-ud-train.conllu')
dev_tsv = read_table('UD_Finnish-TDT-master/fi_tdt-ud-dev.conllu')
test_tsv = read_table('UD_Finnish-TDT-master/fi_tdt-ud-test.conllu')

In [8]:
train_words = train_tsv["word"].values[:10000]
train_lemmas = train_tsv["lemma"].values[:10000]

dev_words = dev_tsv["word"].values[:2000]
dev_lemmas = dev_tsv["lemma"].values[:2000]

test_words = test_tsv["word"].values[:2000]
test_lemmas = test_tsv["lemma"].values[:2000]

In [36]:
len(train_words), len(dev_words), len(test_words)

(10000, 2000, 2000)

In [16]:
def read_sents(path):
    with open(path, 'r', encoding='utf8') as src:
        sents = []
        lem_seqs = []
        sent = []
        seq = []
        for line in src:
            if len(line) > 2 and not line.startswith('#'):
                values = line.split('\t')
                sent.append(values[1])
                seq.append(values[2])
            elif len(sent) > 0:
                sents.append(sent)
                lem_seqs.append(seq)
                sent = []
                seq = []
    return sents, lem_seqs

In [17]:
test_sents, test_references = read_sents('UD_Finnish-TDT-master/fi_tdt-ud-test.conllu')

In [18]:
test_sents[0], test_references[0]

(['Taas', 'teatteriin'], ['taas', 'teatteri'])

In [19]:
corpus_references = [[reference] for reference in test_references]
corpus_references[0]

[['taas', 'teatteri']]

In [33]:
def evaluate(predictions, lemmas):
    score = 0
    for prediction, lemma in zip(predictions, lemmas):
        if prediction == lemma:
            score += 1
    example_count = len(predictions)
    percentage = 100 * score / example_count
    print('example count:', example_count)
    print('correct count:', score)
    print('correct percentage:', percentage)
    return percentage


### Baseline 1:  identity function

In [34]:
evaluate(dev_words, dev_lemmas)

example count: 2000
correct count: 905
correct percentage: 45.25


45.25

In [35]:
evaluate(test_words, test_lemmas)

example count: 2000
correct count: 947
correct percentage: 47.35


47.35

In [42]:
bleu_score.corpus_bleu(corpus_references, test_sents)

0.12764899924747228

### Baseline 2: most frequent lemma among seen in the corpus, with identity backoff for unknown words

In [31]:
w2l_count = {}
for word, lemma in zip(train_words, train_lemmas):
    if word in w2l_count:
        w2l_count[word][lemma] = w2l_count[word].get(lemma, 0) + 1
    else:
        w2l_count[word] = {lemma: 1}
        
w2l = {}
for word in w2l_count:
    w2l[word] = sorted(w2l_count[word], key=w2l_count[word].get)[-1]

In [37]:
dev_predictions = [w2l.get(word, word) for word in dev_words]
test_predictions = [w2l.get(word, word) for word in test_words]

In [51]:
dev_percentage = evaluate(dev_predictions, dev_lemmas)

example count: 2000
correct count: 1259
correct percentage: 62.95


In [52]:
test_percentage = eval_baseline(test_predictions, test_lemmas)

example count: 2000
correct count: 1330
correct percentage: 66.5


In [49]:
predicted_seq = [[w2l.get(w, w) for w in sent] for sent in test_sents]
predicted_seq[0]

['Taas', 'teatteriin']

In [50]:
bleu_score.corpus_bleu(corpus_references, predicted_seq)

0.29064103544959363