# Practical machine learning and deep learning. Lab 4

# Many-to-many NLP task.

# [Competition](https://www.kaggle.com/t/afa89356762e438cad5f04bf0e23f3ce)

## Goal

Your goal is to implement a neural network that tags each entity (token) with its part of speech.

## Submission

The submission format is described on the competition page.

> Remember, you can use any structure for your solution. The template classes/functions in this file are just a hint for you.

In [154]:
import pandas as pd
import torch
import warnings

# Silence every warning for the rest of the notebook.
# NOTE(review): a blanket 'ignore' filter also hides deprecation and
# chained-assignment warnings from pandas/torch, which makes data-preparation
# mistakes harder to spot - consider narrowing it.
warnings.filterwarnings('ignore')

## Data reading and preprocessing

In [155]:
# Load the competition tables from the Kaggle input directory.
# Per the previews below: train.csv carries a `tag` column for every entity,
# test.csv carries an `id` column and no tags.
train = pd.read_csv('/kaggle/input/pmldl-week4-many-to-many-nlp-task/train.csv')
test = pd.read_csv('/kaggle/input/pmldl-week4-many-to-many-nlp-task/test.csv')

In [156]:
# Preview the first rows of the training table.
train.head()

Unnamed: 0,sentence_id,entity_id,entity,tag
0,0,0,It,PRON
1,0,1,is,VERB
2,0,2,true,ADJ
3,0,3,that,ADP
4,0,4,his,DET


In [157]:
# Preview the first rows of the test table.
test.head()

Unnamed: 0,id,sentence_id,entity_id,entity
0,0,0,0,In
1,1,0,1,another
2,2,0,2,setback
3,3,0,3,yesterday
4,4,0,4,","


First, let's divide the dataset into training and validation parts. We randomly split the sentence ids rather than individual rows, so that every sentence stays whole.

In [158]:
from sklearn.model_selection import train_test_split
# Fraction of sentences held out for validation.
VALIDATION_RATIO = 0.2

# Split on the sentence ids that actually occur in the data.
# (`range(max_id)` would stop one short and silently drop the sentence with
# the highest id from both splits.)
sentence_ids = train['sentence_id'].unique()
train_split, val_split = train_test_split(
    sentence_ids, test_size=VALIDATION_RATIO, random_state=420
)

Then split the original dataframe using the sentence ids we have just split.

In [159]:
# Select the rows of each split by sentence id.
# `.copy()` makes each split an independent frame: the dataset classes assign
# to columns of the frame they receive, which on a filtered slice of `train`
# is a chained assignment with undefined write-through behaviour.
train_dataframe = train[train['sentence_id'].isin(train_split)].copy()
val_dataframe = train[train['sentence_id'].isin(val_split)].copy()

In [160]:
# Part-of-speech tags used as classification targets; a tag's position in the
# list is its class index.
pos_tags = ['ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRT', 'PRON', 'VERB', '.', 'X']
# tag -> class index and class index -> tag lookups.
cat2idx = dict(zip(pos_tags, range(len(pos_tags))))
idx2cat = dict(enumerate(pos_tags))

# Reserved vocabulary entries; the index constants below follow the same order.
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = range(4)

To work with the data more efficiently, let's create separate dataset classes.



In [161]:
import torch
import numpy as np
torch.manual_seed(420)
from torchtext.vocab import build_vocab_from_iterator


class PosTaggingDataset(torch.utils.data.Dataset):
    """Sentence-level dataset for POS tagging built from a token-level table.

    The input frame has one row per entity with the columns `sentence_id`,
    `entity_id`, `entity` and `tag`. Rows are grouped into sentences; item
    `i` is a pair `(token_ids, tag_ids)` of equal length.

    Args:
        dataframe: token-level table; it is copied, never modified.
        vocab: callable mapping a list of tokens to a list of ids. When
            omitted, a vocabulary is built from this dataset's sentences.
        max_size: currently unused; kept for interface compatibility.

    Attributes:
        sentences: list of sentences, each a list of lower-cased tokens.
        tags: list of tag-name lists, aligned with `sentences`.
        vocab: the token -> id mapping in use.
    """

    def __init__(self, dataframe: pd.DataFrame, vocab = None, max_size=100):
        # Private copy: preprocessing assigns to columns and must not leak
        # into the caller's frame (or write into a slice of another frame).
        self.dataframe = dataframe.copy()
        self._preprocess()
        # Explicit None check: an empty-but-valid vocab object must not
        # silently trigger a rebuild.
        self.vocab = vocab if vocab is not None else self._create_vocab()

    def _preprocess(self):
        """Clean the table and group it into per-sentence token/tag lists."""
        frame = self.dataframe

        # Fill missing entities with an empty token, then normalise case.
        frame['entity'] = frame['entity'].fillna("").astype(str).str.lower()

        # Fill missing tags with `other` - `X`. A direct column assignment is
        # required; assigning through a boolean-mask selection only writes to
        # a temporary copy.
        frame['tag'] = frame['tag'].fillna('X')

        # sort_values returns a new frame, so the result must be kept for the
        # ordering to take effect. Tokens end up in reading order per sentence.
        frame = frame.sort_values(by=['sentence_id', 'entity_id'], ascending=[True, True])
        self.dataframe = frame

        # Keep sentences as token lists. Joining with spaces and splitting
        # again would drop empty tokens and break the 1:1 alignment with tags.
        by_sentence = frame.groupby('sentence_id')
        self.sentences = by_sentence['entity'].agg(list).tolist()
        self.tags = by_sentence['tag'].agg(list).tolist()

    def _create_vocab(self):
        """Build the token vocabulary from this dataset's sentences."""
        vocab = build_vocab_from_iterator(
            self.sentences,
            min_freq=1,
            specials=special_symbols,
            special_first=True
        )
        # Tokens missing from the vocabulary map to <unk>.
        vocab.set_default_index(UNK_IDX)
        return vocab

    def _get_sentence(self, index: int) -> list:
        """Return the token ids of sentence `index`."""
        return self.vocab(self.sentences[index])

    def _get_labels(self, index: int) -> list:
        """Return the class indices of the tags of sentence `index`."""
        return [cat2idx[tag] for tag in self.tags[index]]

    def __getitem__(self, index) -> tuple[list, list]:
        return self._get_sentence(index), self._get_labels(index)

    def __len__(self) -> int:
        return len(self.sentences)

In [162]:
# Create the train dataset; with no vocab given it builds one from the
# training sentences.
train_dataset = PosTaggingDataset(train_dataframe)
# The validation set must reuse the training vocabulary. A vocabulary built
# from the validation sentences would assign different ids to the same words,
# so the ids would not match the embedding rows the model learns.
val_dataset = PosTaggingDataset(val_dataframe, vocab=train_dataset.vocab)

Now that we have torch datasets, creating the dataloaders is straightforward.

In [163]:
import torch
from torch.utils.data import DataLoader

# Samples per batch, and the fixed sequence length every sentence is
# truncated / padded to by the collate function.
batch_size, max_size = 128, 30

# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

def collate_batch(batch: list):
    """Collate (token_ids, tag_ids) samples into two fixed-length tensors.

    Every sequence is cut to its first `max_size` items and left-padded up to
    `max_size`: tokens with `PAD_IDX`, tags with the class index of 'X'.
    NOTE: padded positions therefore carry the real class 'X' as their target.

    Args:
        batch: list of pairs from the dataset,
            [([ent1, ent2, ...], [tag1, tag2, ...]), ...].

    Returns:
        (sentences, tags): two tensors on `device`, each of shape
        (max_size, batch_size), so that index 0 selects the first time step
        of every sequence, as a recurrent layer expects.
    """
    def _fit(seq, filler):
        # Truncate first, so the pad count below can never be negative.
        clipped = seq[:max_size]
        return [filler] * (max_size - len(clipped)) + clipped

    token_rows = [_fit(tokens, PAD_IDX) for tokens, _ in batch]
    tag_rows = [_fit(tags, cat2idx['X']) for _, tags in batch]

    # Rows are built as (batch_size, max_size); transpose to time-major.
    sentences_batch = torch.tensor(token_rows).T.to(device)
    postags_batch = torch.tensor(tag_rows).T.to(device)
    return sentences_batch, postags_batch

# Use the `batch_size` constant instead of repeating the literal, so the
# setting is changed in a single place.
# Only the training data is shuffled; validation keeps a fixed order.
train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch
)
val_dataloader = DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch
)

In [164]:
# Sanity check: inspect the first training batch only.
# Both tensors should be (max_size, batch_size); column 0 is the first sample.
for inp, out in train_dataloader:
    for tensor in (inp, out):
        print(tensor.shape)
    for tensor in (inp, out):
        print(tensor[:, 0])
    break

torch.Size([30, 128])
torch.Size([30, 128])
tensor([    1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1, 22908,    14,     4, 43730,     6],
       device='cuda:0')
tensor([11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
        11, 11, 11, 11, 11, 11, 11,  5,  9,  4,  5, 10], device='cuda:0')


## Creating the network

For many-to-many (seq2seq) networks, we want recurrent units in the network. They let the network learn hidden features and pass knowledge from one token to the next.

### Embeddings

For embeddings you can use `nn.Embedding` for creating your own features or use pretrained embedding (like GloVe or FastText or Bert).

### Recurrent

For processing sequences you can use recurrent units like `LSTM`.

### Linear

Add simple nn.Linear. ~~This is basic stuff what do you want~~

### Regularization

Remember to set up Dropout and Batch Normalization for regularization purposes.

In [165]:
import torch.nn as nn

class POSTagger(nn.Module):
    """Bidirectional LSTM tagger: embedding -> BiLSTM -> two-layer classifier.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output tags.
        dropout_prob: dropout probability used in the classifier head.
    """

    def __init__(self,  vocab_size, embedding_dim, hidden_dim, num_layers, num_classes, dropout_prob):
        super().__init__()
        # Both directions are concatenated, so the LSTM emits 2 * hidden_dim features.
        lstm_out_dim = hidden_dim * 2

        self.emb = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, bidirectional=True)
        # NOTE: `batch_norm` and `dropout` are registered (and therefore part
        # of the state dict) but are not applied in `forward`.
        self.batch_norm = nn.BatchNorm1d(lstm_out_dim)
        self.linear = nn.Sequential(
            nn.Linear(lstm_out_dim, hidden_dim),
            nn.Dropout(p=dropout_prob),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes),
        )
        self.dropout = nn.Dropout(p=dropout_prob)

    def forward(self, text):
        """Return per-token class scores.

        Args:
            text: token ids, shape [sent len, batch size].

        Returns:
            Scores of shape [sent len, batch size, num_classes].
        """
        embedded = self.emb(text)
        # Keep only the per-step outputs; the final (h, c) state is not needed.
        hidden_states, _ = self.lstm(embedded)
        return self.linear(hidden_states)

In [166]:
# Smoke test: push random token ids of shape (120, 128) through a small,
# untrained model and print the predicted class for every step of sample 0.
# Use the detected `device` rather than a hard-coded 'cuda', so the cell also
# runs on CPU-only machines.
m = POSTagger(len(train_dataset.vocab), 32, 10, 5, 12, 0.2).to(device)
i = torch.tensor(np.random.randint(0, 12, size=(120, 128))).to(device)
o = m(i)
print(torch.max(o, axis=2)[1][:, 0])

tensor([10,  2,  2,  2,  2,  2,  2,  2,  2, 10,  2,  2,  2,  2,  2,  2,  2,  2,
         2,  2,  2,  2,  2,  2,  2,  2,  2, 10, 10, 10,  2,  2,  2, 10, 10,  2,
         2, 10,  2,  2,  2, 10, 10,  2, 10,  2,  2, 10, 10,  2,  2,  2, 10, 10,
         2, 10,  2,  2,  2, 10, 10,  2,  2, 10,  2,  2,  2,  2,  2, 10,  2,  2,
         2, 10, 10, 10, 10, 10,  2,  2,  2,  2,  2,  2,  2,  2, 10,  2, 10,  2,
         2, 10,  2,  2,  2,  2,  2,  2, 10,  2, 10,  2, 10, 10,  2, 10,  2,  2,
         2,  2,  2, 10, 10,  2,  2, 10,  2, 10,  2,  2], device='cuda:0')


## Training

During training, take into account the shape of your output and the shape of the labels. Perform the required transformations and use a loss function that fits your task.

> Do not forget about tqdm and logging, you want normal training not some unreadable ~~sht~~ logs. 

In [167]:
from tqdm.autonotebook import tqdm

def train_one_epoch(
    model,
    loader,
    optimizer,
    loss_fn,
    epoch_num=-1
):
    """Train `model` for one pass over `loader`.

    Args:
        model: maps a (seq_len, batch) tensor of token ids to scores of shape
            (seq_len, batch, classes).
        loader: iterable of (texts, labels) batches; `len(loader)` is used for
            the progress bar.
        optimizer: optimizer over the model parameters.
        loss_fn: loss taking class scores on dimension 1. The displayed value
            is the mean of the per-batch losses, which is the per-token mean
            when `loss_fn` itself averages over the batch.
        epoch_num: epoch index, shown in the progress bar only.

    Returns:
        Mean of the per-batch losses, or 0.0 if the loader was empty.
    """
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        # Use the argument, not a global `epoch` that happens to exist.
        desc=f"Epoch {epoch_num}: train",
        leave=True,
    )
    model.train()
    train_loss = 0.0
    num_batches = 0
    for num_batches, (texts, labels) in loop:
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward pass: (max_size, batch_size, classes)
        outputs = model(texts)

        # Move the class dimension to position 1, as the loss expects
        # (N, C, d1) scores against (N, d1) targets.
        loss = loss_fn(outputs.permute(0, 2, 1), labels)

        # backward pass and parameter update
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        # `num_batches` counts from 1, so this is the running mean batch loss.
        loop.set_postfix({"loss": train_loss / num_batches})

    return train_loss / num_batches if num_batches else 0.0


def val_one_epoch(
    model,
    loader,
    loss_fn,
    epoch_num=-1,
    best_so_far=0.0,
    ckpt_path='best.pt'
):
    """Evaluate `model` on `loader` and checkpoint it when accuracy improves.

    Args:
        model: maps a (seq_len, batch) tensor of token ids to scores of shape
            (seq_len, batch, classes).
        loader: iterable of (texts, labels) batches.
        loss_fn: loss taking class scores on dimension 1.
        epoch_num: epoch index, shown in the progress bar only.
        best_so_far: best accuracy seen in earlier epochs.
        ckpt_path: where the state dict is saved when accuracy improves.

    Returns:
        The new best accuracy: this epoch's accuracy if it is strictly greater
        than `best_so_far`, otherwise `best_so_far` unchanged.
    """
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        # Use the argument, not a global `epoch` that happens to exist.
        desc=f"Epoch {epoch_num}: val",
        leave=True,
    )
    val_loss = 0.0
    correct = 0
    total = 0
    model.eval()  # evaluation mode
    with torch.no_grad():
        for num_batches, (texts, labels) in loop:
            # forward pass: (max_size, batch_size, classes)
            outputs = model(texts)
            loss = loss_fn(outputs.permute(0, 2, 1), labels)

            predicted = outputs.argmax(dim=2)

            # NOTE: every position in `labels` is scored, including any
            # positions the collate function filled in as padding.
            total += labels.numel()
            correct += (predicted == labels).sum().item()
            val_loss += loss.item()

            loop.set_postfix({
                # running mean of the per-batch losses
                "loss": val_loss / num_batches,
                "acc": correct / max(total, 1),
            })

    # Nothing was evaluated: there is no accuracy to compare or save.
    if total == 0:
        return best_so_far

    accuracy = correct / total
    if accuracy > best_so_far:
        torch.save(model.state_dict(), ckpt_path)
        print('model saved')
        return accuracy

    return best_so_far

In [168]:
# The embedding table has one row per vocabulary entry; the classifier emits
# one score per POS tag.
INPUT_DIM, OUTPUT_DIM = len(train_dataset.vocab), len(pos_tags)

model = POSTagger(
    vocab_size=INPUT_DIM,
    embedding_dim=128,
    hidden_dim=128,
    num_layers=2,
    num_classes=OUTPUT_DIM,
    dropout_prob=0.25,
)
model = model.to(device)

# Adam with its default hyper-parameters; cross-entropy with default settings.
optimizer = torch.optim.Adam(model.parameters())
loss_fn = torch.nn.CrossEntropyLoss()

In [169]:
# Start below any reachable accuracy so the first epoch always checkpoints.
best_so_far = float('-inf')
num_epochs = 3
for epoch in range(num_epochs):
    train_one_epoch(model, train_dataloader, optimizer, loss_fn, epoch_num=epoch)
    # val_one_epoch returns the updated best accuracy (and saves on improvement).
    best_so_far = val_one_epoch(
        model,
        val_dataloader,
        loss_fn,
        epoch_num=epoch,
        best_so_far=best_so_far,
    )

Epoch 0: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 0: val:   0%|          | 0/91 [00:00<?, ?it/s]

model saved


Epoch 1: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 1: val:   0%|          | 0/91 [00:00<?, ?it/s]

Epoch 2: train:   0%|          | 0/361 [00:00<?, ?it/s]

Epoch 2: val:   0%|          | 0/91 [00:00<?, ?it/s]

# Predictions

Write prediction. That's it. No more instructions, you already made it 3 times.

In [170]:
import torch
import numpy as np
torch.manual_seed(420)
from torchtext.vocab import build_vocab_from_iterator


class TestDataset(torch.utils.data.Dataset):
    """Sentence-level dataset of token ids for unlabeled data.

    The input frame has one row per entity with the columns `sentence_id`,
    `entity_id` and `entity`. Item `i` is the list of token ids of the i-th
    sentence (sentences ordered by `sentence_id`, tokens by `entity_id`).

    Args:
        dataframe: token-level table; it is copied, never modified.
        vocab: callable mapping a list of tokens to a list of ids; required
            before items are accessed.
        max_size: currently unused; kept for interface compatibility.
    """

    def __init__(self, dataframe: pd.DataFrame, vocab = None, max_size=100):
        # Private copy: preprocessing assigns to columns and must not leak
        # into the caller's frame.
        self.dataframe = dataframe.copy()
        self._preprocess()
        self.vocab = vocab

    def _preprocess(self):
        """Clean the table and group it into per-sentence token lists."""
        frame = self.dataframe

        # fill missing values in entities, then normalise case
        frame['entity'] = frame['entity'].fillna("other").str.lower()

        # sort_values returns a new frame, so the result must be kept for the
        # ordering to take effect. Predictions are emitted in this order.
        frame = frame.sort_values(by=['sentence_id', 'entity_id'], ascending=[True, True])
        self.dataframe = frame

        self.sentences = frame.groupby('sentence_id')['entity'].agg(list).tolist()
        # Total number of tokens, printed as a sanity check.
        # Built-in sum: np.sum over a generator is deprecated.
        print(sum(len(s) for s in self.sentences))

    def _get_sentence(self, index: int) -> list:
        """Return the token ids of sentence `index`."""
        return self.vocab(self.sentences[index])

    def __getitem__(self, index) -> list:
        return self._get_sentence(index)

    def __len__(self) -> int:
        return len(self.sentences)

In [171]:
# Build the test dataset with the vocabulary learned on the training split,
# so token ids refer to the same embedding rows the model was trained with.
test_dataset = TestDataset(test, vocab=train_dataset.vocab)
# Number of sentences in the test set.
len(test_dataset)

303025


14441

In [172]:
# Batch size and padded sequence length for inference.
# NOTE: `max_size` is a module-level global that the training collate function
# defined earlier also reads at call time; rebinding it here changes the
# padding length if the training/validation loaders are iterated again.
batch_size = 128
max_size = 128

# Remember: for training we can pad freely, but for testing we need the exact
# length of every sentence so that one prediction per token is written to the
# submission.
def collate_batch(batch: list):
    """Collate lists of token ids into a left-padded tensor plus true lengths.

    Sentences are never truncated: the batch is padded to `max_size`, or to the
    longest sentence in the batch if that is longer. Truncating while reporting
    the untruncated length would silently lose predictions for the cut tokens.

    Args:
        batch: list of token-id lists, one per sentence.

    Returns:
        (sentences, lengths): a tensor of shape (padded_len, batch_size) and a
        1-D tensor with the number of real tokens per sentence, both on
        `device`.
    """
    sentences_lengths = [len(_sent) for _sent in batch]
    padded_len = max([max_size, *sentences_lengths])

    # Left-pad so the real tokens occupy the last `length` positions.
    sentences_batch = [
        [PAD_IDX] * (padded_len - len(_sent)) + list(_sent) for _sent in batch
    ]

    sentences_batch = torch.tensor(sentences_batch).T
    sentences_lengths = torch.tensor(sentences_lengths)
    return sentences_batch.to(device), sentences_lengths.to(device)

# Keep the original sentence order: predictions are written out positionally.
test_dataloader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_batch,
)

In [173]:
# Sanity check: shapes of the first test batch, then the number of batches.
for sent_batch, len_batch in test_dataloader:
    print(sent_batch.size(), len_batch.size())
    break
print(len(test_dataloader))

torch.Size([128, 128]) torch.Size([128])
113


In [174]:
def predict(
    model,
    loader,
):
    """Predict one class index per real (non-padding) token.

    Args:
        model: maps a (seq_len, batch) tensor of token ids to scores of shape
            (seq_len, batch, classes).
        loader: iterable of (texts, lens) batches, where `texts` is left-padded
            and `lens` holds the number of real tokens of each sentence.

    Returns:
        Flat list of predicted class indices, sentence after sentence in
        loader order. The total number of tokens reported by the loader is
        printed as a sanity check against `len()` of the result.
    """
    loop = tqdm(
        enumerate(loader, 1),
        total=len(loader),
        desc=f"Predictions",
        leave=True,
    )
    predictions = []
    total_tokens = 0
    model.eval()  # evaluation mode
    with torch.no_grad():
        for _, (texts, lens) in loop:
            # One device-to-host transfer per batch instead of one per sentence.
            lengths = torch.as_tensor(lens).tolist()
            total_tokens += sum(lengths)

            # forward pass: (max_size, batch, classes)
            outputs = model(texts)

            # (max_size, batch) -> one row of predictions per sentence
            predicted = outputs.argmax(dim=2).T.cpu().tolist()

            for row, length in zip(predicted, lengths):
                # Padding is on the left, so the real tokens are the last
                # `length` entries. Slicing from an explicit start index keeps
                # a zero length empty (`row[-0:]` would return the whole row).
                predictions += row[max(len(row) - length, 0):]

    print(total_tokens)
    return predictions

In [175]:
# Restore the best checkpoint written during validation.
# `map_location` lets a checkpoint saved from a GPU run be loaded on whatever
# device is available now (e.g. a CPU-only session).
ckpt = torch.load("best.pt", map_location=device)
model.load_state_dict(ckpt)

# One predicted class index per test token, in dataset order.
predictions = predict(model, test_dataloader)
predictions[:10]

Predictions:   0%|          | 0/113 [00:00<?, ?it/s]

tensor(303025, device='cuda:0')


[1, 4, 5, 5, 10, 9, 7, 0, 5, 9]

In [176]:
# Translate class indices back to tag names; the positional index of each
# prediction is written out as the `id` column.
tag_names = [idx2cat[class_idx] for class_idx in predictions]
results = pd.Series(tag_names)
results.to_csv('submission.csv', index_label='id')
results

0          ADP
1          DET
2         NOUN
3         NOUN
4            .
          ... 
303020    NOUN
303021     PRT
303022    VERB
303023    NOUN
303024       .
Length: 303025, dtype: object

In [177]:
# Number of predictions; should equal the token count printed by predict().
len(predictions)

303025

In [178]:
# How many tokens were predicted as the catch-all tag 'X'.
len(results[results == 'X'])

1158

[1, 4, 5, 5, 10, 5, 7, 5, 5, 9]

0          ADP
1          DET
2         NOUN
3         NOUN
4            .
          ... 
303020    NOUN
303021     PRT
303022    VERB
303023    NOUN
303024       .
Length: 303025, dtype: object