# Ethics for NLP: Spring 2022
## Homework 3: Low Ressource Languages

### Imports and configuration

In [4]:
# All import statements defined here
# ----------------
# !pip install torch
# !pip install torchvision
# !pip install torchtext==0.9.0
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy import data
from torchtext.legacy import datasets

import numpy as np

import time
import random
import os

# set a fixed seed for reproducibility
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# hyperparameters
params = {   
    "embedding_dim": 100,
    "hidden_dim":128,
    "n_layers":2,
    "bidirectional": True,
    "dropout":0.25,
    "batch_size": 128
}

### BiLSTM model

In [5]:
# Note: do not change anything in this code,
# it can lead to incorrect results in the final accuracy calculation
# ----------------

class BiLSTMPOSTagger(nn.Module):
    def __init__(
        self,
        input_dim,
        embedding_dim,
        hidden_dim,
        output_dim,
        n_layers,
        bidirectional,
        dropout,
        pad_idx,
    ):

        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)

        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=dropout if n_layers > 1 else 0,
        )

        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):

        # pass text through embedding layer
        embedded = self.dropout(self.embedding(text))

        # pass embeddings into LSTM
        outputs, (hidden, cell) = self.lstm(embedded)

        # outputs holds the backward and forward hidden states in the final layer
        # hidden and cell are the backward and forward hidden and cell states at the final time-step

        # we use our outputs to make a prediction of what the tag should be
        predictions = self.fc(self.dropout(outputs))

        return predictions

### Training and evaluation functions

In [6]:
# function used to train or evaluate model
# depending on the input parameters

def run(mode, lang, model_name):
    print("Running model in {} mode with lang: {}".format(mode, lang))
    TEXT = data.Field(lower=True)
    UD_TAGS = data.Field()

    fields = (("text", TEXT), ("udtags", UD_TAGS))

    train_data, valid_data, test_data = datasets.UDPOS.splits(
        fields=fields,
        path=os.path.join("data", lang),
        train="{}-ud-train.conll".format(lang),
        validation="{}-ud-dev.conll".format(lang),
        test="{}-ud-test.conll".format(lang),
    )
    MIN_FREQ = 2
    print(os.path.join("data", lang))

    TEXT.build_vocab(train_data, min_freq=MIN_FREQ)
    UD_TAGS.build_vocab(train_data)

    if mode == "train":
        print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
        print(f"Unique tokens in UD_TAG vocabulary: {len(UD_TAGS.vocab)}")
        print()
        print(f"Number of training examples: {len(train_data)}")
        print(f"Number of validation examples: {len(valid_data)}")

        print(f"Number of tokens in the training set: {sum(TEXT.vocab.freqs.values())}")

    print(f"Number of testing examples: {len(test_data)}")

    if mode == "train":
        print("Tag\t\tCount\t\tPercentage\n")
        for tag, count, percent in tag_percentage(UD_TAGS.vocab.freqs.most_common()):
            print(f"{tag}\t\t{count}\t\t{percent*100:4.1f}%")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=params["batch_size"],
        device=device,
    )

    PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
    model = BiLSTMPOSTagger(
        input_dim=len(TEXT.vocab),
        embedding_dim=params["embedding_dim"],
        hidden_dim=params["hidden_dim"],
        output_dim=len(UD_TAGS.vocab),
        n_layers=params["n_layers"],
        bidirectional=params["bidirectional"],
        dropout=params["dropout"],
        pad_idx=PAD_IDX,
    )

    if mode == "train":

        def init_weights(m):
            for name, param in m.named_parameters():
                nn.init.normal_(param.data, mean=0, std=0.1)

        def count_parameters(model):
            return sum(p.numel() for p in model.parameters() if p.requires_grad)

        model.apply(init_weights)
        print(f"The model has {count_parameters(model):,} trainable parameters")
        model.embedding.weight.data[PAD_IDX] = torch.zeros(params["embedding_dim"])
        optimizer = optim.Adam(model.parameters())

    TAG_PAD_IDX = UD_TAGS.vocab.stoi[UD_TAGS.pad_token]
    TAG_UNK_IDX = UD_TAGS.vocab.unk_index
    criterion = nn.CrossEntropyLoss(ignore_index=TAG_PAD_IDX)

    model = model.to(device)
    criterion = criterion.to(device)

    if mode == "train":
        N_EPOCHS = 10
        best_valid_loss = float("inf")
        for epoch in range(N_EPOCHS):
            start_time = time.time()
            train_loss, train_acc = train(
                model,
                train_iterator,
                optimizer,
                criterion,
                TAG_PAD_IDX,
                TAG_UNK_IDX,
            )
            valid_loss, valid_acc = evaluate(
                model, valid_iterator, criterion, TAG_PAD_IDX, TAG_UNK_IDX
            )
            end_time = time.time()

            epoch_mins, epoch_secs = epoch_time(start_time, end_time)
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(
                    model.state_dict(), "saved_models/{}.pt".format(model_name)
                )

            print(f"Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s")
            print(f"\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%")
            print(f"\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%")

    try:
        model.load_state_dict(torch.load("saved_models/{}.pt".format(model_name)))
    except Exception as e:
        print(
            "Model file `{}` doesn't exist. You need to train the model by running this code in train mode.".format(
                "saved_models/{}.pt".format(model_name)
            )
        )
        return

    test_loss, test_acc = evaluate(
        model, test_iterator, criterion, TAG_PAD_IDX, TAG_UNK_IDX
    )
    print(f"Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%")


def tag_percentage(tag_counts):
    total_count = sum([count for tag, count in tag_counts])
    tag_counts_percentages = [
        (tag, count, count / total_count) for tag, count in tag_counts
    ]
    return tag_counts_percentages


def categorical_accuracy(preds, y, tag_pad_idx, tag_unk_idx):
    max_preds = preds.argmax(
        dim=1, keepdim=True
    )
    non_pad_elements = torch.nonzero((y != tag_pad_idx) & (y != tag_unk_idx))
    correct = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements])
    return correct.float().sum(), y[non_pad_elements].shape[0]


def train(model, iterator, optimizer, criterion, tag_pad_idx, tag_unk_idx):

    epoch_loss = 0
    epoch_correct = 0
    epoch_n_label = 0

    model.train()

    for batch in iterator:

        text = batch.text
        tags = batch.udtags

        optimizer.zero_grad()

        predictions = model(text)

        predictions = predictions.view(-1, predictions.shape[-1])
        tags = tags.view(-1)

        loss = criterion(predictions, tags)

        correct, n_labels = categorical_accuracy(
            predictions, tags, tag_pad_idx, tag_unk_idx
        )

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_correct += correct.item()
        epoch_n_label += n_labels

    return epoch_loss / len(iterator), epoch_correct / epoch_n_label


def evaluate(model, iterator, criterion, tag_pad_idx, tag_unk_idx):

    epoch_loss = 0
    epoch_correct = 0
    epoch_n_label = 0

    model.eval()

    with torch.no_grad():

        for batch in iterator:

            text = batch.text
            tags = batch.udtags

            predictions = model(text)

            predictions = predictions.view(-1, predictions.shape[-1])
            tags = tags.view(-1)

            loss = criterion(predictions, tags)

            correct, n_labels = categorical_accuracy(
                predictions, tags, tag_pad_idx, tag_unk_idx
            )

            epoch_loss += loss.item()
            epoch_correct += correct.item()
            epoch_n_label += n_labels

    return epoch_loss / len(iterator), epoch_correct / epoch_n_label


def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


### Task 1 (10 points)

#### Evaluate the model on the english data (2 points)

In [18]:
# english
run('train','en','POSTagger_en')

Running model in train mode with lang: en
data\en
Unique tokens in TEXT vocabulary: 8854
Unique tokens in UD_TAG vocabulary: 19

Number of training examples: 12543
Number of validation examples: 2002
Number of tokens in the training set: 204586
Number of testing examples: 2077
Tag		Count		Percentage

NOUN		35315		17.3%
VERB		27508		13.4%
PUNCT		23680		11.6%
ADP		17640		 8.6%
PRON		17183		 8.4%
DET		17148		 8.4%
PROPN		12945		 6.3%
ADJ		12475		 6.1%
ADV		10551		 5.2%
AUX		7895		 3.9%
CONJ		6707		 3.3%
PART		5564		 2.7%
NUM		3999		 2.0%
SCONJ		3842		 1.9%
X		848		 0.4%
INTJ		688		 0.3%
SYM		598		 0.3%
The model has 1,521,067 trainable parameters
Epoch: 01 | Epoch Time: 1m 1s
	Train Loss: 1.728 | Train Acc: 45.41%
	 Val. Loss: 0.902 |  Val. Acc: 76.52%
Epoch: 02 | Epoch Time: 1m 2s
	Train Loss: 0.506 | Train Acc: 84.06%
	 Val. Loss: 0.546 |  Val. Acc: 87.76%
Epoch: 03 | Epoch Time: 1m 2s
	Train Loss: 0.305 | Train Acc: 90.56%
	 Val. Loss: 0.466 |  Val. Acc: 89.70%
Epoch: 04 | Epoch Time: 

#### For the rest of this task, train and then evaluate the model on the trained data for the languages in each code cell (8 points)

In [19]:
# czech
run('train','cs','POSTagger_cs')

Running model in train mode with lang: cs
data\cs
Unique tokens in TEXT vocabulary: 38789
Unique tokens in UD_TAG vocabulary: 19

Number of training examples: 41559
Number of validation examples: 9270
Number of tokens in the training set: 719317
Number of testing examples: 10148
Tag		Count		Percentage

NOUN		175971		24.5%
PUNCT		101318		14.1%
ADJ		86855		12.1%
VERB		79779		11.1%
ADP		71491		 9.9%
PROPN		46100		 6.4%
ADV		37704		 5.2%
PRON		34941		 4.9%
CONJ		26714		 3.7%
NUM		18004		 2.5%
SCONJ		13186		 1.8%
DET		12997		 1.8%
AUX		9953		 1.4%
PART		3825		 0.5%
SYM		415		 0.1%
INTJ		63		 0.0%
X		1		 0.0%
The model has 4,514,567 trainable parameters
Epoch: 01 | Epoch Time: 3m 2s
	Train Loss: 0.848 | Train Acc: 72.14%
	 Val. Loss: 0.270 |  Val. Acc: 91.87%
Epoch: 02 | Epoch Time: 2m 59s
	Train Loss: 0.179 | Train Acc: 94.18%
	 Val. Loss: 0.182 |  Val. Acc: 94.22%
Epoch: 03 | Epoch Time: 3m 3s
	Train Loss: 0.113 | Train Acc: 96.30%
	 Val. Loss: 0.163 |  Val. Acc: 94.77%
Epoch: 04 | Epoch T

In [20]:
# spanish
run('train','es','POSTagger_spanish')

Running model in train mode with lang: es
data\es
Unique tokens in TEXT vocabulary: 17588
Unique tokens in UD_TAG vocabulary: 18

Number of training examples: 14187
Number of validation examples: 1552
Number of tokens in the training set: 382436
Number of testing examples: 274
Tag		Count		Percentage

NOUN		68694		18.0%
ADP		62995		16.5%
DET		53937		14.1%
PUNCT		42218		11.0%
VERB		36185		 9.5%
PROPN		35112		 9.2%
ADJ		22096		 5.8%
PRON		12402		 3.2%
CONJ		12262		 3.2%
ADV		11031		 2.9%
NUM		9812		 2.6%
SCONJ		7095		 1.9%
AUX		5335		 1.4%
X		1778		 0.5%
SYM		1452		 0.4%
PART		32		 0.0%
The model has 2,394,210 trainable parameters
Epoch: 01 | Epoch Time: 1m 35s
	Train Loss: 1.375 | Train Acc: 56.23%
	 Val. Loss: 0.528 |  Val. Acc: 83.40%
Epoch: 02 | Epoch Time: 1m 35s
	Train Loss: 0.403 | Train Acc: 87.47%
	 Val. Loss: 0.264 |  Val. Acc: 91.74%
Epoch: 03 | Epoch Time: 1m 34s
	Train Loss: 0.252 | Train Acc: 92.17%
	 Val. Loss: 0.216 |  Val. Acc: 93.09%
Epoch: 04 | Epoch Time: 1m 37s
	Train

In [21]:
# arabic
run('train','ar','POSTagger_arabic')

Running model in train mode with lang: ar
data\ar
Unique tokens in TEXT vocabulary: 15889
Unique tokens in UD_TAG vocabulary: 18

Number of training examples: 6174
Number of validation examples: 786
Number of tokens in the training set: 225853
Number of testing examples: 704
Tag		Count		Percentage

NOUN		74156		32.8%
ADP		33548		14.9%
ADJ		23424		10.4%
CONJ		19182		 8.5%
PUNCT		17777		 7.9%
X		17626		 7.8%
VERB		17175		 7.6%
PRON		10904		 4.8%
NUM		6191		 2.7%
PART		2996		 1.3%
DET		1537		 0.7%
ADV		827		 0.4%
SYM		316		 0.1%
PROPN		156		 0.1%
AUX		31		 0.0%
INTJ		7		 0.0%
The model has 2,224,310 trainable parameters
Epoch: 01 | Epoch Time: 1m 23s
	Train Loss: 2.016 | Train Acc: 35.50%
	 Val. Loss: 1.463 |  Val. Acc: 55.48%
Epoch: 02 | Epoch Time: 1m 21s
	Train Loss: 0.868 | Train Acc: 72.48%
	 Val. Loss: 0.483 |  Val. Acc: 85.51%
Epoch: 03 | Epoch Time: 1m 21s
	Train Loss: 0.350 | Train Acc: 89.20%
	 Val. Loss: 0.281 |  Val. Acc: 91.40%
Epoch: 04 | Epoch Time: 1m 20s
	Train Loss: 0.20

In [22]:
# afrikaans
run('train','af','POSTagger_afrikaans')

Running model in train mode with lang: af
data\af
Unique tokens in TEXT vocabulary: 2235
Unique tokens in UD_TAG vocabulary: 18

Number of training examples: 1315
Number of validation examples: 194
Number of tokens in the training set: 33894
Number of testing examples: 425
Tag		Count		Percentage

NOUN		7335		21.6%
ADP		4365		12.9%
DET		3769		11.1%
PUNCT		3129		 9.2%
VERB		2957		 8.7%
PRON		2495		 7.4%
AUX		2276		 6.7%
ADJ		2168		 6.4%
CCONJ		1327		 3.9%
ADV		1295		 3.8%
PART		926		 2.7%
SCONJ		716		 2.1%
PROPN		359		 1.1%
SYM		323		 1.0%
X		291		 0.9%
NUM		163		 0.5%
The model has 858,910 trainable parameters
Epoch: 01 | Epoch Time: 0m 8s
	Train Loss: 2.582 | Train Acc: 18.24%
	 Val. Loss: 2.415 |  Val. Acc: 21.27%
Epoch: 02 | Epoch Time: 0m 7s
	Train Loss: 2.356 | Train Acc: 22.84%
	 Val. Loss: 2.279 |  Val. Acc: 25.92%
Epoch: 03 | Epoch Time: 0m 8s
	Train Loss: 2.148 | Train Acc: 31.57%
	 Val. Loss: 1.987 |  Val. Acc: 38.76%
Epoch: 04 | Epoch Time: 0m 8s
	Train Loss: 1.767 | Train Ac

In [23]:
# lithuanian
run('train','lt','POSTagger_lithuanian')

Running model in train mode with lang: lt
data\lt
Unique tokens in TEXT vocabulary: 4392
Unique tokens in UD_TAG vocabulary: 19

Number of training examples: 2341
Number of validation examples: 617
Number of tokens in the training set: 47605
Number of testing examples: 684
Tag		Count		Percentage

NOUN		14933		31.4%
PUNCT		8756		18.4%
VERB		6604		13.9%
ADJ		3274		 6.9%
CCONJ		2136		 4.5%
ADV		1826		 3.8%
PRON		1688		 3.5%
ADP		1490		 3.1%
NUM		1312		 2.8%
DET		1181		 2.5%
X		1066		 2.2%
PROPN		983		 2.1%
PART		951		 2.0%
SCONJ		917		 1.9%
AUX		453		 1.0%
SYM		26		 0.1%
INTJ		9		 0.0%
The model has 1,074,867 trainable parameters
Epoch: 01 | Epoch Time: 0m 17s
	Train Loss: 2.322 | Train Acc: 29.66%
	 Val. Loss: 2.152 |  Val. Acc: 30.69%
Epoch: 02 | Epoch Time: 0m 17s
	Train Loss: 2.067 | Train Acc: 38.39%
	 Val. Loss: 1.923 |  Val. Acc: 43.26%
Epoch: 03 | Epoch Time: 0m 17s
	Train Loss: 1.685 | Train Acc: 48.82%
	 Val. Loss: 1.487 |  Val. Acc: 53.54%
Epoch: 04 | Epoch Time: 0m 16s
	Train 

In [24]:
# armenian
run('train','hy','POSTagger_armenian')

Running model in train mode with lang: hy
data\hy
Unique tokens in TEXT vocabulary: 3800
Unique tokens in UD_TAG vocabulary: 19

Number of training examples: 1975
Number of validation examples: 249
Number of tokens in the training set: 42105
Number of testing examples: 278
Tag		Count		Percentage

NOUN		10524		25.0%
PUNCT		8124		19.3%
VERB		5355		12.7%
ADJ		3317		 7.9%
AUX		2963		 7.0%
CCONJ		1905		 4.5%
ADV		1843		 4.4%
PRON		1636		 3.9%
DET		1609		 3.8%
PROPN		1499		 3.6%
ADP		1301		 3.1%
SCONJ		745		 1.8%
NUM		540		 1.3%
PART		458		 1.1%
X		153		 0.4%
INTJ		105		 0.2%
SYM		28		 0.1%
The model has 1,015,667 trainable parameters
Epoch: 01 | Epoch Time: 0m 14s
	Train Loss: 2.432 | Train Acc: 23.22%
	 Val. Loss: 2.213 |  Val. Acc: 31.20%
Epoch: 02 | Epoch Time: 0m 14s
	Train Loss: 2.204 | Train Acc: 32.36%
	 Val. Loss: 2.012 |  Val. Acc: 39.86%
Epoch: 03 | Epoch Time: 0m 13s
	Train Loss: 1.904 | Train Acc: 41.05%
	 Val. Loss: 1.544 |  Val. Acc: 51.58%
Epoch: 04 | Epoch Time: 0m 14s
	Trai

In [25]:
# tamil
run('train','ta','POSTagger_tamil')

Running model in train mode with lang: ta
data\ta
Unique tokens in TEXT vocabulary: 926
Unique tokens in UD_TAG vocabulary: 15

Number of training examples: 400
Number of validation examples: 80
Number of tokens in the training set: 6329
Number of testing examples: 120
Tag		Count		Percentage

NOUN		1860		29.4%
PROPN		936		14.8%
VERB		747		11.8%
PUNCT		665		10.5%
ADJ		466		 7.4%
AUX		423		 6.7%
PART		383		 6.1%
ADV		251		 4.0%
ADP		184		 2.9%
NUM		156		 2.5%
PRON		147		 2.3%
DET		80		 1.3%
CCONJ		31		 0.5%
The model has 727,239 trainable parameters
Epoch: 01 | Epoch Time: 0m 1s
	Train Loss: 2.505 | Train Acc: 23.26%
	 Val. Loss: 2.242 |  Val. Acc: 29.06%
Epoch: 02 | Epoch Time: 0m 1s
	Train Loss: 2.224 | Train Acc: 29.42%
	 Val. Loss: 2.158 |  Val. Acc: 29.06%
Epoch: 03 | Epoch Time: 0m 1s
	Train Loss: 2.179 | Train Acc: 28.44%
	 Val. Loss: 2.087 |  Val. Acc: 29.30%
Epoch: 04 | Epoch Time: 0m 1s
	Train Loss: 2.114 | Train Acc: 30.56%
	 Val. Loss: 2.050 |  Val. Acc: 29.14%
Epoch: 05 | Ep

In [32]:
for index in range(5):
    with open('test.csv','a',encoding='utf-8') as f:
        f.write(f'{index}\t Hello world\n')
        f.close()

with open('test.csv','r') as f:
    print(f.read())

import pandas as pd
print(pd.read_csv('test.csv',delimiter='\t',encoding='utf-8'))

import os
if os.path.exists("test.csv"):
  os.remove("test.csv")
else:
  print("The file does not exist")

0	 Hello world
1	 Hello world
2	 Hello world
3	 Hello world
4	 Hello world

   0   Hello world
0  1   Hello world
1  2   Hello world
2  3   Hello world
3  4   Hello world


### Task 2 - Discussion (10 points)
In this task we will discuss the received results from your evaluation. 
Each question has an additional markdown cell below for the answer. Please use it and put in you answer there.

##### Question 1: How the performance changes accross language families and available dataset size? Make a conclusion of how the model's prediction depends on the available data. (2 point)


Q1 answer

##### Question 2: What role does the training set size plays for the model? Which problem regarding training sets occurs when you deal with the low-resource languages? (2 points)

Q2 answer

##### Question 3: What do the parameters "n_layers", "bidirectional" and "dropout" (variable "params" in the first code cell) of the LSTM model mean? According to your research results please answer the following questions regarding the low-resource languages: (4 points)
#### - What happens when you increase the variable n_layers and why? 
#### - What changes when the model is unidirectional and why? 
#### - What happens when you increase the dropout and why?

![sdf](C:\Repository\Informatik_Local\Ethics_In_NLP\HW03\img\n_layers.png)

##### Question 4: Define the term "label noise". After that, please answer what happens if the labels in the training data are noisy? (2 points)

Q4 answer