In [None]:
#!/usr/bin/env python3
# 53907afe-531b-11ea-a595-00505601122b
# b7ea974c-d389-11e8-a4be-00505601122b

# 1. Setup

## 1.1 FS/OS requirements

In [None]:
# Copy the helper modules and the dataset archives from the Kaggle input
# directory into the working directory. The archives are stored with a
# ".zipnot" suffix in the input and are renamed to ".zip" while copying.
!cp /kaggle/input/tagger-competition/morpho_analyzer.py /kaggle/working/morpho_analyzer.py
!cp /kaggle/input/tagger-competition/morpho_dataset.py /kaggle/working/morpho_dataset.py
!cp /kaggle/input/tagger-competition/czech_pdt.zipnot /kaggle/working/czech_pdt.zip
!cp /kaggle/input/tagger-competition/czech_pdt_analyses.zipnot /kaggle/working/czech_pdt_analyses.zip

In [None]:
# Optional pinned install, currently disabled (left commented out on purpose).
#!pip install -U tensorflow-gpu==2.8 tensorflow-addons==0.16.1 tensorflow-probability==0.16.0 tensorflow-hub==0.12.0 scipy
# Show which TensorFlow-related packages (and versions) are installed.
!pip freeze | grep tensorflow

## 1.2 Python imports

In [None]:
import argparse
import datetime
import os
import re

os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")  # Report only TF errors by default

import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import fasttext
import fasttext.util

from morpho_analyzer import MorphoAnalyzer
from morpho_dataset import MorphoDataset

## 1.3 Fasttext model

In [None]:
# Fetch the pre-trained fastText model for language code 'cs' (Czech, matching
# the Czech PDT data used below); `if_exists='ignore'` skips the download when
# the file is already present. The loaded model is kept in the global `ft`.
fasttext.util.download_model('cs', if_exists='ignore')  # Czech ('cs'), not English
ft = fasttext.load_model('cc.cs.300.bin')

## 1.4 Args

In [None]:
# Hyper-parameters are declared argparse-style so the same code can run both as
# a script (real command line) and as a notebook (fixed overrides below).
parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", type=int, default=None, help="Batch size.")
parser.add_argument("--epochs", type=int, default=None, help="Number of epochs.")
parser.add_argument("--seed", type=int, default=42, help="Random seed.")
parser.add_argument("--threads", type=int, default=1, help="Maximum number of threads to use.")
parser.add_argument("--checkpoints_period", type=int, default=None, help="Checkpoint callback period.")
parser.add_argument("--stopping_patience", type=int, default=None, help="Early stopping epochs patience.")
parser.add_argument("--cle_dim", type=int, default=32, help="CLE embedding dimension.")
parser.add_argument("--max_sentences", type=int, default=None, help="Maximum number of sentences to load.")
parser.add_argument("--word_masking", type=float, default=0.0, help="Mask words with the given probability.")
parser.add_argument("--label_smoothing", type=float, default=None, help="")
parser.add_argument("--learning_rate", type=float, default=0.01, help="Initial model learning rate.")

# When `__file__` is defined (script run) argparse reads the process arguments;
# otherwise (notebook) the explicit list of overrides is parsed instead.
args = parser.parse_args(
    None
    if "__file__" in globals()
    else [
        '--batch_size=256',
        '--epochs=10',
        '--checkpoints_period=3',
        '--stopping_patience=3',
        '--learning_rate=0.01'
    ]
)

# Log directory name: logs/<script or "notebook">-<timestamp>-<k=v,k=v,...>,
# where each option name is abbreviated to the first character of every
# underscore-separated part (e.g. "batch_size" -> "bs").
args.logdir = os.path.join(
    "logs",
    "-".join([
        os.path.basename(globals().get("__file__", "notebook")),
        datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"),
        ",".join([
            re.sub("(.)[^_]*_?", r"\1", key) + "=" + str(value)
            for key, value in sorted(vars(args).items())
        ]),
    ]),
)

# Seed TensorFlow and cap its thread pools.
# Author's note: run on TF 2.6 because of GPU issues with TF 2.8.
tf.random.set_seed(args.seed)
tf.config.threading.set_inter_op_parallelism_threads(args.threads)
tf.config.threading.set_intra_op_parallelism_threads(args.threads)

# Display the parsed namespace as the cell output.
args

In [None]:
# NOTE: `args.decay_steps` depends on the training-set size, so it is computed
# in the data-loading cell below, right after `morpho` has been created.
# Computing it here raised NameError on a fresh kernel (Restart & Run All)
# because `morpho` does not exist yet at this point of the notebook.

# 2. Data

In [None]:
# Load the "czech_pdt" tagging corpus; its train/dev/test splits are used below.
morpho = MorphoDataset("czech_pdt")
# The morphological analyzer is currently not used (kept disabled).
# analyses = MorphoAnalyzer("czech_pdt_analyses")

# Size of the training tag vocabulary; used as the one-hot depth for targets.
TAGS_NUM = morpho.train.tags.word_mapping.vocabulary_size()
# Length of the cosine learning-rate schedule: epochs * train size / batch size.
# NOTE(review): presumably `morpho.train.size` counts sentences, which would make
# this (approximately) the total number of optimizer steps — confirm.
args.decay_steps = int(args.epochs * morpho.train.size / args.batch_size)

In [None]:
def extract_tagging_data(example):
    """Turn one dataset example into an `(inputs, targets)` pair.

    The inputs are the example's "forms" entry, passed through unchanged; the
    targets are its "tags" entry converted by the training-set tag mapping
    (`morpho.train.tags.word_mapping`).
    """
    forms = example["forms"]
    tag_ids = morpho.train.tags.word_mapping(example["tags"])
    return forms, tag_ids

def to_categorical(form, tag):
    """Replace tag ids by one-hot vectors of depth `TAGS_NUM`; `form` passes through."""
    one_hot_tag = tf.one_hot(tag, depth=TAGS_NUM)
    return form, one_hot_tag
    
def create_dataset(name):
    """Build the batched `tf.data` pipeline for the split called `name`.

    Steps, in order:
      1. take `morpho.<name>.dataset` and map it to `(forms, tag ids)` pairs;
      2. when `args.label_smoothing` is set, convert the targets to one-hot;
      3. for the "train" split only, shuffle with a buffer covering the whole
         dataset, seeded by `args.seed`;
      4. group into ragged batches of `args.batch_size` examples.
    """
    pipeline = getattr(morpho, name).dataset.map(extract_tagging_data)

    if args.label_smoothing:
        pipeline = pipeline.map(to_categorical)

    if name == "train":
        pipeline = pipeline.shuffle(len(pipeline), seed=args.seed)

    batcher = tf.data.experimental.dense_to_ragged_batch(args.batch_size)
    return pipeline.apply(batcher)

# Pipelines for the training and the development split.
train = create_dataset("train")
dev = create_dataset("dev")

In [None]:
# Ids of every tag occurrence in the training data, flattened over sentences.
tags_stats = morpho.train.tags.word_mapping(
    np.array([tag for sentence in morpho.train.tags.strings for tag in sentence])
)
# Keep only occurrences whose id is below the number of distinct ids present.
# Per the original author's note this drops the last (most frequent) class;
# that ordering cannot be verified from this cell.
filtered_tags_stats = tags_stats[tags_stats < len(np.unique(tags_stats))]
# Cell output: number of occurrences before and after filtering.
(len(tags_stats), len(filtered_tags_stats))

In [None]:
# Histogram of tag ids over all training occurrences (every class included).
plt.figure(figsize=(20, 8))
sns.histplot(
    data=tags_stats,
    stat="count",
    discrete=True,
    kde=True,
).set_title("Tags histogram (including last class)")

In [None]:
# Same histogram, restricted to the filtered occurrences (last class removed).
plt.figure(figsize=(20, 8))
sns.histplot(
    data=filtered_tags_stats,
    stat="count",
    discrete=True,
    kde=True,
).set_title("Tags histogram (excluding last class)")

# 3. Model

In [None]:
# idea taken from https://piazza.com/class/kzmwighamh26wd?cid=284
def fasttext_eager(inputs):
    """Look up fastText vectors for a flat sequence of words.

    Intended to be called through `tf.numpy_function`, which hands string
    tensors over as byte strings; those are decoded as UTF-8 before the lookup.
    Plain `str` items are accepted as well.

    Args:
        inputs: iterable of words (`bytes` or `str`).

    Returns:
        A float32 NumPy array of shape `(len(inputs), ft.get_dimension())`.
        The second dimension is kept even when `inputs` is empty.
    """
    # The fastText model is the module-level `ft`; there is no `self` in a
    # free function, so the former `self.ft` lookup failed with NameError.
    vectors = [
        ft.get_word_vector(w.decode("utf-8") if isinstance(w, bytes) else w)
        for w in inputs
    ]
    # Force the dtype declared as `Tout` by the caller and a stable 2-D shape.
    return np.asarray(vectors, dtype=np.float32).reshape(-1, ft.get_dimension())

class Model(tf.keras.Model):
    """Word tagger built with the Keras functional API.

    Pipeline: fastText word vectors (looked up via `tf.numpy_function`) ->
    bidirectional layer-normalised LSTM -> dense block over the concatenation
    of embeddings and RNN outputs -> per-token softmax over the tag vocabulary.
    The constructor also compiles the model and prepares `self.callbacks`.
    """

    def __init__(self, args: argparse.Namespace, train: MorphoDataset.Dataset) -> None:
        """Build, compile and equip the model with training callbacks.

        Args:
            args: namespace providing `label_smoothing`, `learning_rate`,
                `decay_steps`, `logdir`, `checkpoints_period` and
                `stopping_patience`.
            train: training split; only `train.tags.word_mapping.vocabulary_size()`
                is read by the active code, to size the output layer.
        """
        # Ragged batch of string tokens (variable number of words per sentence).
        words = tf.keras.layers.Input(shape=[None], dtype=tf.string, ragged=True)
        # Disabled experiment: character-level word embeddings
        # (unique words -> characters -> embedding -> BiGRU -> gather back).
        #unique_words, unique_words_idx = tf.unique(words.values)
        #letter_seqs = tf.strings.unicode_split(unique_words, "UTF-8")
        #letter_ids = train.forms.char_mapping(letter_seqs)
        #char_embedding = tf.keras.layers.Embedding(train.forms.char_mapping.vocabulary_size(), 32)(letter_ids)
        #cle_gru = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32), merge_mode="concat")(char_embedding)
        #hey = tf.gather(cle_gru, unique_words_idx)
        #cle_embedding = words.with_values(hey)
        
        # Word vectors come from the Python function `fasttext_eager`, applied to
        # the flat values of the ragged input. `ensure_shape` declares the result
        # as (None, ft.get_dimension()) before it is put back into the ragged
        # structure of `words`.
        embedding = tf.numpy_function(func=fasttext_eager, inp=[words.values], Tout=tf.float32)
        embedding = words.with_values(tf.ensure_shape(embedding, (None, ft.get_dimension())))

        # Bidirectional RNN over each sentence; the two directions are summed
        # (merge_mode="sum") and one output is produced per token.
        sequences = tf.keras.layers.Bidirectional(
            tf.keras.layers.RNN(tfa.rnn.LayerNormLSTMCell(64, recurrent_dropout=0.3), return_sequences=True, return_state=False), merge_mode="sum"
        )(embedding)
        
        # Skip connection: the raw embeddings are concatenated with the RNN
        # outputs, then Dense(64) -> BatchNormalization -> swish -> Dropout(0.3).
        hidden = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(64, activation=None))(tf.keras.layers.Concatenate()([embedding, sequences]))
        hidden = tf.keras.layers.TimeDistributed(tf.keras.layers.BatchNormalization())(hidden)
        hidden = tf.keras.layers.TimeDistributed(tf.keras.layers.Activation("swish"))(hidden)
        hidden = tf.keras.layers.Dropout(.3)(hidden)
        
        # Per-token softmax with one unit per entry of the tag vocabulary.
        predictions = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(
            train.tags.word_mapping.vocabulary_size(), activation="softmax"
        ))(hidden)

        # The graph is built first and only then handed to keras.Model.
        super().__init__(inputs=words, outputs=predictions)
        
        # Both losses are evaluated on the flat `.values` of the ragged targets
        # and predictions. With label smoothing the categorical loss is used
        # (`create_dataset` one-hot encodes the targets in that case); otherwise
        # the sparse variant is used.
        if args.label_smoothing:
            loss = lambda yt, yp: tf.losses.CategoricalCrossentropy(label_smoothing=args.label_smoothing)(yt.values, yp.values)
            metrics = [tf.metrics.CategoricalAccuracy(name="accuracy")]
        else:
            loss = lambda yt, yp: tf.losses.SparseCategoricalCrossentropy()(yt.values, yp.values)
            metrics = [tf.metrics.SparseCategoricalAccuracy(name="accuracy")]
            
        # Adam with a cosine-decayed learning rate starting at `args.learning_rate`
        # and decaying over `args.decay_steps` steps.
        self.compile(
            optimizer=tf.optimizers.Adam(learning_rate=tf.keras.optimizers.schedules.CosineDecay(args.learning_rate, args.decay_steps)),
            loss=loss,
            metrics=metrics,
        )

        # Callbacks collected here are meant to be passed to `fit` by the caller.
        self.callbacks = list()
        # TensorBoard logging into `args.logdir` is always enabled.
        self.tb_callback = tf.keras.callbacks.TensorBoard(args.logdir)
        self.callbacks.append(self.tb_callback)
        # Optional weights-only checkpoints named by the epoch number.
        # NOTE(review): `period` is a deprecated ModelCheckpoint argument —
        # confirm it is still honoured by the installed TensorFlow version.
        if args.checkpoints_period:
            self.checkpoints = tf.keras.callbacks.ModelCheckpoint('weights{epoch:08d}.h5', save_weights_only=True, period=args.checkpoints_period) 
            self.callbacks.append(self.checkpoints)
        # Optional early stopping; note that it monitors the training 'loss',
        # not a validation metric.
        if args.stopping_patience:
            self.early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=args.stopping_patience)
            self.callbacks.append(self.early_stopping)

In [None]:
# Instantiate the tagger (the constructor also compiles it) and print a
# layer-by-layer summary.
model = Model(args, morpho.train)
model.summary()

In [None]:
# Train on the training pipeline and validate on the dev pipeline each epoch.
# - `callbacks` receives a flat list of callbacks; the previous nested list
#   (`[model.callbacks]`) only worked because Keras flattens the structure.
# - `shuffle=True` was dropped: Keras ignores it for `tf.data.Dataset` inputs,
#   and the training pipeline is already shuffled inside `create_dataset`.
logs = model.fit(
    train,
    epochs=args.epochs,
    validation_data=dev,
    callbacks=list(model.callbacks),
)

In [None]:
# Pipeline for the test split (built like dev: batched, not shuffled).
test = create_dataset("test")

In [None]:
# Write the test-set predictions to <logdir>/tagger_competition.txt:
# one predicted tag per line, with an empty line after every sentence.
os.makedirs(args.logdir, exist_ok=True)
with open(
    os.path.join(args.logdir, "tagger_competition.txt"), "w", encoding="utf-8"
) as predictions_file:
    # Per-token tag distributions for every test sentence.
    predictions = model.predict(test)
    # Position i of the vocabulary holds the tag string for class id i.
    tag_strings = morpho.test.tags.word_mapping.get_vocabulary()
    for sentence in predictions:
        for word in sentence:
            # The most probable class id is translated back to its tag string.
            predictions_file.write(f"{tag_strings[np.argmax(word)]}\n")
        predictions_file.write("\n")