# Ethics for NLP: Spring 2022
## Homework 3: Low-Resource Languages

### Imports and configuration

In [4]:
# All import statements defined here
# ----------------
# !pip install torch
# !pip install torchvision
# !pip install torchtext==0.9.0
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy import data
from torchtext.legacy import datasets

import numpy as np

import time
import random
import os

# Reproducibility: every random number generator used in this notebook is
# seeded with the same constant, and cuDNN is restricted to deterministic kernels.
SEED = 1234

torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
torch.backends.cudnn.deterministic = True

# Hyperparameters read by run() when it builds the tagger and the batch iterators.
params = dict(
    embedding_dim=100,
    hidden_dim=128,
    n_layers=2,
    bidirectional=True,
    dropout=0.25,
    batch_size=128,
)

### BiLSTM model

In [5]:
# Note: do not change anything in this code,
# it can lead to incorrect results in the final accuracy calculation
# ----------------

class BiLSTMPOSTagger(nn.Module):
    """Sequence tagger: embedding layer -> LSTM -> linear layer over every position.

    The same dropout module is applied to the embeddings before the LSTM and to
    the LSTM outputs before the linear layer. The attribute names (`embedding`,
    `lstm`, `fc`, `dropout`) become the keys of the saved state dict, so renaming
    them would make previously saved checkpoints unloadable.
    """

    def __init__(
        self,
        input_dim,
        embedding_dim,
        hidden_dim,
        output_dim,
        n_layers,
        bidirectional,
        dropout,
        pad_idx,
    ):
        """Build the embedding, LSTM, output and dropout layers.

        Args:
            input_dim: number of rows in the embedding table (vocabulary size).
            embedding_dim: size of each embedding vector and of the LSTM input.
            hidden_dim: LSTM hidden size per direction.
            output_dim: number of output features of the final linear layer
                (one score per tag).
            n_layers: number of stacked LSTM layers.
            bidirectional: whether the LSTM runs in both directions; when true
                the linear layer receives `hidden_dim * 2` features.
            dropout: dropout probability used by the shared dropout module and,
                when `n_layers > 1`, between LSTM layers.
            pad_idx: index passed to the embedding as `padding_idx`.
        """

        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)

        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            # inter-layer dropout is only enabled when there is more than one layer
            dropout=dropout if n_layers > 1 else 0,
        )

        # a bidirectional LSTM concatenates both directions, doubling the feature size
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return per-position tag scores for a batch of token indices.

        Args:
            text: integer tensor of token indices. The LSTM is built without
                `batch_first`, so this presumably has shape (seq_len, batch) --
                TODO confirm against the iterator that feeds it.

        Returns:
            Tensor of unnormalised scores whose last dimension has `output_dim`
            entries; the leading dimensions follow the LSTM output.
        """

        # pass text through embedding layer
        embedded = self.dropout(self.embedding(text))

        # pass embeddings into LSTM
        outputs, (hidden, cell) = self.lstm(embedded)

        # outputs holds the backward and forward hidden states in the final layer
        # hidden and cell are the backward and forward hidden and cell states at the final time-step
        # (hidden and cell are not used further; only `outputs` feeds the classifier)

        # we use our outputs to make a prediction of what the tag should be
        predictions = self.fc(self.dropout(outputs))

        return predictions

### Training and evaluation functions

In [6]:
# function used to train or evaluate model
# depending on the input parameters

def run(mode, lang, model_name):
    """Train and/or evaluate the BiLSTM POS tagger for one language.

    The corpus is read from ``data/<lang>/<lang>-ud-{train,dev,test}.conll`` and
    the vocabularies are always built from the training split. The checkpoint
    lives at ``saved_models/<model_name>.pt``.

    Args:
        mode: ``"train"`` trains for a fixed number of epochs, saving the weights
            whenever the validation loss improves, and then evaluates. Any other
            value skips training and only evaluates the saved checkpoint.
        lang: language identifier used for the data directory and file prefix.
        model_name: checkpoint file name without the ``.pt`` extension.

    Returns:
        None. Statistics and results are printed. If the checkpoint cannot be
        loaded, a message is printed and the function returns early.
    """
    print("Running model in {} mode with lang: {}".format(mode, lang))
    TEXT = data.Field(lower=True)
    UD_TAGS = data.Field()

    fields = (("text", TEXT), ("udtags", UD_TAGS))

    train_data, valid_data, test_data = datasets.UDPOS.splits(
        fields=fields,
        path=os.path.join("data", lang),
        train="{}-ud-train.conll".format(lang),
        validation="{}-ud-dev.conll".format(lang),
        test="{}-ud-test.conll".format(lang),
    )
    MIN_FREQ = 2
    print(os.path.join("data", lang))

    # Vocabularies come from the training split only; tokens seen fewer than
    # MIN_FREQ times are not given their own entry.
    TEXT.build_vocab(train_data, min_freq=MIN_FREQ)
    UD_TAGS.build_vocab(train_data)

    if mode == "train":
        print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
        print(f"Unique tokens in UD_TAG vocabulary: {len(UD_TAGS.vocab)}")
        print()
        print(f"Number of training examples: {len(train_data)}")
        print(f"Number of validation examples: {len(valid_data)}")

        print(f"Number of tokens in the training set: {sum(TEXT.vocab.freqs.values())}")

    print(f"Number of testing examples: {len(test_data)}")

    if mode == "train":
        print("Tag\t\tCount\t\tPercentage\n")
        for tag, count, percent in tag_percentage(UD_TAGS.vocab.freqs.most_common()):
            print(f"{tag}\t\t{count}\t\t{percent*100:4.1f}%")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=params["batch_size"],
        device=device,
    )

    PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
    model = BiLSTMPOSTagger(
        input_dim=len(TEXT.vocab),
        embedding_dim=params["embedding_dim"],
        hidden_dim=params["hidden_dim"],
        output_dim=len(UD_TAGS.vocab),
        n_layers=params["n_layers"],
        bidirectional=params["bidirectional"],
        dropout=params["dropout"],
        pad_idx=PAD_IDX,
    )

    if mode == "train":

        def init_weights(m):
            for param in m.parameters():
                nn.init.normal_(param.data, mean=0, std=0.1)

        def count_parameters(model):
            return sum(p.numel() for p in model.parameters() if p.requires_grad)

        # model.apply visits every submodule and then the model itself, so the
        # weights are re-drawn more than once. This is kept as is so the random
        # number stream, and therefore the trained weights, stay unchanged.
        model.apply(init_weights)
        print(f"The model has {count_parameters(model):,} trainable parameters")
        # The random init above overwrote the padding row; reset it to zeros.
        model.embedding.weight.data[PAD_IDX] = torch.zeros(params["embedding_dim"])
        optimizer = optim.Adam(model.parameters())

    TAG_PAD_IDX = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]
    TAG_UNK_IDX = UD_TAGS.vocab.unk_index
    # Padding positions contribute nothing to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=TAG_PAD_IDX)

    model = model.to(device)
    criterion = criterion.to(device)

    model_path = "saved_models/{}.pt".format(model_name)

    if mode == "train":
        # Create the checkpoint directory up front so the first torch.save
        # cannot fail on a fresh checkout.
        os.makedirs(os.path.dirname(model_path), exist_ok=True)

        N_EPOCHS = 10
        best_valid_loss = float("inf")
        for epoch in range(N_EPOCHS):
            start_time = time.time()
            train_loss, train_acc = train(
                model,
                train_iterator,
                optimizer,
                criterion,
                TAG_PAD_IDX,
                TAG_UNK_IDX,
            )
            valid_loss, valid_acc = evaluate(
                model, valid_iterator, criterion, TAG_PAD_IDX, TAG_UNK_IDX
            )
            end_time = time.time()

            epoch_mins, epoch_secs = epoch_time(start_time, end_time)
            # Keep only the weights with the lowest validation loss so far.
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model.state_dict(), model_path)

            print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
            print(f"\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%")
            print(f"\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%")

    try:
        # map_location lets a checkpoint written on a GPU machine load on a
        # CPU-only machine (and vice versa).
        model.load_state_dict(torch.load(model_path, map_location=device))
    except FileNotFoundError:
        print(
            "Model file `{}` doesn't exist. You need to train the model by running this code in train mode.".format(
                model_path
            )
        )
        return
    except Exception as e:
        # The file exists but could not be used (for example, it was trained
        # with a different vocabulary); report the real reason instead of
        # claiming the file is missing.
        print("Model file `{}` could not be loaded: {}".format(model_path, e))
        return

    test_loss, test_acc = evaluate(
        model, test_iterator, criterion, TAG_PAD_IDX, TAG_UNK_IDX
    )
    print(f"Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%")


def tag_percentage(tag_counts):
    """Attach each tag's share of the total count to its (tag, count) pair.

    Args:
        tag_counts: re-iterable sequence of ``(tag, count)`` pairs.

    Returns:
        List of ``(tag, count, fraction)`` tuples in the input order, where
        ``fraction`` is ``count`` divided by the sum of all counts.
    """
    grand_total = 0
    for _, occurrences in tag_counts:
        grand_total += occurrences

    shares = []
    for label, occurrences in tag_counts:
        shares.append((label, occurrences, occurrences / grand_total))
    return shares


def categorical_accuracy(preds, y, tag_pad_idx, tag_unk_idx):
    """Count correct tag predictions, skipping padding and unknown-tag labels.

    Args:
        preds: 2-D tensor of scores, one row per position and one column per tag.
        y: 1-D tensor of gold tag indices, aligned with the rows of ``preds``
            (callers in this file flatten both before calling).
        tag_pad_idx: label index marking padding positions.
        tag_unk_idx: label index marking unknown tags.

    Returns:
        Tuple ``(correct, n_labels)``: a scalar float tensor with the number of
        correct predictions among the scored positions, and the number of
        scored positions as a Python int.
    """
    predicted_tags = preds.argmax(dim=1)
    # A position is scored only when its gold label is neither padding nor unknown.
    scored = ~((y == tag_pad_idx) | (y == tag_unk_idx))
    gold_scored = y[scored]
    hits = predicted_tags[scored] == gold_scored
    return hits.float().sum(), gold_scored.shape[0]


def train(model, iterator, optimizer, criterion, tag_pad_idx, tag_unk_idx):
    """Run one training pass over ``iterator`` and update the model weights.

    Args:
        model: module called on ``batch.text`` to produce per-position tag scores.
        iterator: iterable of batches exposing ``text`` and ``udtags``; must
            support ``len()``.
        optimizer: optimizer stepped once per batch.
        criterion: loss taking flattened scores and flattened gold tags.
        tag_pad_idx: label index excluded from the accuracy count.
        tag_unk_idx: label index excluded from the accuracy count.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the batch losses averaged over the
        number of batches, and correct predictions divided by scored labels
        across the whole pass.
    """
    model.train()

    running_loss, running_correct, running_labels = 0, 0, 0

    for batch in iterator:
        optimizer.zero_grad()

        scores = model(batch.text)

        # Collapse every leading dimension so each row is one token position.
        flat_scores = scores.view(-1, scores.shape[-1])
        flat_tags = batch.udtags.view(-1)

        batch_loss = criterion(flat_scores, flat_tags)
        n_correct, n_scored = categorical_accuracy(
            flat_scores, flat_tags, tag_pad_idx, tag_unk_idx
        )

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_correct += n_correct.item()
        running_labels += n_scored

    mean_loss = running_loss / len(iterator)
    accuracy = running_correct / running_labels
    return mean_loss, accuracy


def evaluate(model, iterator, criterion, tag_pad_idx, tag_unk_idx):
    """Score the model on ``iterator`` without updating any weights.

    Args:
        model: module called on ``batch.text`` to produce per-position tag scores.
        iterator: iterable of batches exposing ``text`` and ``udtags``; must
            support ``len()``.
        criterion: loss taking flattened scores and flattened gold tags.
        tag_pad_idx: label index excluded from the accuracy count.
        tag_unk_idx: label index excluded from the accuracy count.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the batch losses averaged over the
        number of batches, and correct predictions divided by scored labels
        across the whole pass.
    """
    model.eval()

    running_loss, running_correct, running_labels = 0, 0, 0

    # Gradients are not needed for scoring.
    with torch.no_grad():
        for batch in iterator:
            scores = model(batch.text)

            # Collapse every leading dimension so each row is one token position.
            flat_scores = scores.view(-1, scores.shape[-1])
            flat_tags = batch.udtags.view(-1)

            batch_loss = criterion(flat_scores, flat_tags)
            n_correct, n_scored = categorical_accuracy(
                flat_scores, flat_tags, tag_pad_idx, tag_unk_idx
            )

            running_loss += batch_loss.item()
            running_correct += n_correct.item()
            running_labels += n_scored

    mean_loss = running_loss / len(iterator)
    accuracy = running_correct / running_labels
    return mean_loss, accuracy


def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into whole minutes and seconds.

    Args:
        start_time: start timestamp in seconds.
        end_time: end timestamp in seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints; both parts are truncated.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return (whole_minutes, leftover_seconds)


### Task 1 (10 points)

#### Evaluate the model on the english data (2 points)

In [7]:
# english

# Evaluation only: any mode other than "train" makes run() skip training,
# load the saved weights and print the test loss and accuracy.
# NOTE(review): assumes the corpus is in data/en/ and the checkpoint is
# saved_models/en.pt -- adjust both names to match the files you were given.
run("eval", "en", "en")

#### For the rest of this task, train and then evaluate the model on the trained data for the languages in each code cell (8 points)

In [8]:
# czech

In [9]:
# spanish

In [10]:
# arabic

In [11]:
# afrikaans

In [12]:
# lithuanian

In [13]:
# armenian

In [14]:
# tamil

### Task 2 - Discussion (10 points)
In this task we will discuss the results you obtained from your evaluation. 
Each question has an additional markdown cell below it for the answer. Please put your answer there.

##### Question 1: How does the performance change across language families and available dataset sizes? Draw a conclusion about how the model's predictions depend on the available data. (2 points)


Q1 answer

##### Question 2: What role does the training set size play for the model? Which problem regarding training sets occurs when you deal with low-resource languages? (2 points)

Q2 answer

##### Question 3: What do the parameters "n_layers", "bidirectional" and "dropout" (variable "params" in the first code cell) of the LSTM model mean? According to your research results, please answer the following questions regarding low-resource languages: (4 points)
#### - What happens when you increase the variable n_layers and why? 
#### - What changes when the model is unidirectional and why? 
#### - What happens when you increase the dropout and why?

Q3 answer

##### Question 4: Define the term "label noise". Then explain what happens if the labels in the training data are noisy. (2 points)

Q4 answer