In [18]:
#!/usr/bin/env python3
import argparse
import lzma
import os
import pickle
import sys
from typing import Optional
import urllib.request

import numpy as np
import sklearn.datasets
import sklearn.linear_model
import sklearn.model_selection
import sklearn.pipeline
from sklearn.feature_extraction.text import CountVectorizer

parser = argparse.ArgumentParser()
# These arguments will be set appropriately by ReCodEx, even if you change them.
parser.add_argument("--predict", default=None, type=str, help="Path to the dataset to predict")
parser.add_argument("--recodex", default=False, action="store_true", help="Running in ReCodEx")
parser.add_argument("--seed", default=42, type=int, help="Random seed")
# For these and any other arguments you add, ReCodEx will keep your default value.
parser.add_argument("--model_path", default="diacritization.model", type=str, help="Model path")
args = parser.parse_args([] if "__file__" not in globals() else None)   

In [3]:
class Dataset:
    """Text corpus paired with a copy of itself that has the diacritics removed.

    Attributes:
        target: The text exactly as read from the dataset file.
        data: `target` with every letter of `LETTERS_DIA` (lower or upper case)
            replaced by the letter at the same index of `LETTERS_NODIA`.
    """

    LETTERS_NODIA = "acdeeinorstuuyz"
    LETTERS_DIA = "áčďéěíňóřšťúůýž"

    # Table for `str.translate` mapping each letter with diacritics (in both cases)
    # to the corresponding plain letter.
    DIA_TO_NODIA = str.maketrans(LETTERS_DIA + LETTERS_DIA.upper(), LETTERS_NODIA + LETTERS_NODIA.upper())

    def __init__(self,
                 name="fiction-train.txt",
                 url="https://ufal.mff.cuni.cz/~courses/npfl129/2526/datasets/"):
        """Load the dataset file `name`, downloading it from `url` first if it is missing."""
        if not os.path.exists(name):
            self._download(name, url)

        # The file content is the target; the input is derived from it by stripping diacritics.
        with open(name, "r", encoding="utf-8-sig") as dataset_file:
            text = dataset_file.read()
        self.target = text
        self.data = text.translate(self.DIA_TO_NODIA)

    @staticmethod
    def _download(name, url):
        """Fetch the licence file and then the dataset `name` from the `url` prefix."""
        print("Downloading dataset {}...".format(name), file=sys.stderr)
        licence_name = name.replace(".txt", ".LICENSE")
        partial_name = "{}.tmp".format(name)
        urllib.request.urlretrieve(url + licence_name, filename=licence_name)
        # Download under a temporary name and rename afterwards, so the final
        # file name only appears once the download has finished.
        urllib.request.urlretrieve(url + name, filename=partial_name)
        os.rename(partial_name, name)


In [20]:
# Train a character-level diacritization model, store it, and use it to
# diacritize a dataset.
np.random.seed(args.seed)
train = Dataset()
window = 2  # Number of context characters taken on each side of the letter.


def extract_contexts(text: str, window: int):
    """Locate the letters that may carry diacritics and build their context windows.

    The text is padded with `window` spaces on both sides, so every position --
    including the first and last characters -- gets a context of exactly
    `2 * window + 1` characters (a plain slice with a negative start would
    silently return an empty or wrong context at the beginning of the text).

    Args:
        text: Text without diacritics.
        window: Number of characters of context on each side.

    Returns:
        A pair `(positions, contexts)`: indices into `text` of the characters whose
        lower-cased form is in `Dataset.LETTERS_NODIA`, and the matching windows.
    """
    padded = " " * window + text + " " * window
    positions = [i for i, char in enumerate(text) if char.lower() in Dataset.LETTERS_NODIA]
    # Character `i` of `text` is at index `i + window` of `padded`, hence its
    # window starts at index `i` of `padded`.
    contexts = [padded[i:i + 2 * window + 1] for i in positions]
    return positions, contexts


# Build the training set.
train_positions, train_contexts = extract_contexts(train.data, window)
vectorizer = CountVectorizer(analyzer="char", ngram_range=(1, 5))
X = vectorizer.fit_transform(train_contexts)
# `CountVectorizer` lower-cases its input by default, so the features carry no
# information about letter case. Train on lower-cased targets and restore the
# case from the input when predicting.
y = np.array([train.target[i].lower() for i in train_positions])

# `multi_class` is omitted: it is deprecated in recent scikit-learn releases and a
# multinomial model is what the default already selects for the `saga` solver.
model = sklearn.linear_model.LogisticRegression(
    solver="saga", C=1.0, max_iter=500, n_jobs=-1, random_state=args.seed,
)
model.fit(X, y)

# Serialize the model. The fitted vectorizer is needed to turn text into
# features, so it is stored together with the classifier as one pipeline whose
# `predict` accepts context strings. Code loading the model must build the
# contexts the same way as `extract_contexts` and restore the letter case itself.
with lzma.open(args.model_path, "wb") as model_file:
    pickle.dump(
        sklearn.pipeline.Pipeline([("vectorizer", vectorizer), ("classifier", model)]),
        model_file,
    )

# Diacritize the dataset given by `--predict`; without it, fall back to the
# training data (a sanity check only -- it overestimates the real accuracy).
test = Dataset(args.predict) if args.predict is not None else train
test_positions, test_contexts = extract_contexts(test.data, window)

# Spaces, punctuation and all other characters are copied unchanged.
output_chars = list(test.data)
if test_contexts:
    # One batched transform/predict call instead of one call per character.
    predicted_letters = model.predict(vectorizer.transform(test_contexts))
    for position, letter in zip(test_positions, predicted_letters):
        source = test.data[position]
        letter = str(letter)
        # Never replace a letter by a different base letter; keep the input instead.
        if letter.translate(Dataset.DIA_TO_NODIA) != source.lower():
            continue
        output_chars[position] = letter.upper() if source.isupper() else letter
predictions = "".join(output_chars)

In [None]:
def accuracy(gold: str, system: str) -> float:
    """Return the fraction of whitespace-separated words of `system` equal to those of `gold`.

    Words are compared position by position, so both texts must contain the
    same number of words.

    Args:
        gold: The reference text.
        system: The text to evaluate.

    Returns:
        The number of matching words divided by the total number of words.

    Raises:
        AssertionError: If an argument is not a `str` or the word counts differ.
        ZeroDivisionError: If the texts contain no words at all.
    """
    assert isinstance(gold, str) and isinstance(system, str), "The gold and system outputs must be strings"

    gold_words = gold.split()
    system_words = system.split()
    assert len(gold_words) == len(system_words), \
        "The test data has {} words, but got {} instead, aborting".format(len(gold_words), len(system_words))

    # Booleans sum as 0/1, giving the count of exactly matching word pairs.
    matching = sum(expected == produced for expected, produced in zip(gold_words, system_words))
    return matching / len(gold_words)