# 05. Training Embedding Vector from Scratch
In this module, rather than using a pre-trained embedding vector, we allow the embedding vectors to be updated in the same way as the other parameters of the model. In other words, the embedding vectors are trained along with the rest of the parameters.

## Initialized Data Set
As usual, we start by importing our dataset.

In [5]:
# Load the cleaned training tweets; the file is pipe-delimited ("|").
# Later cells read the "clean_text" and "target" columns of this frame.
import pandas as pd
df = pd.read_csv("data/cleaned-train-tweets.csv", sep="|")

# create PyTorch data set
import torch
from torch.utils.data import Dataset
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

class DisasterTweetsDataset(Dataset):
    """
    Map-style dataset over the cleaned disaster tweets.

    Each item is a ``(label, text)`` pair taken from the ``target`` and
    ``clean_text`` columns of the dataframe.

    Args:
        - df (pandas.DataFrame): frame with ``clean_text`` and ``target``
            columns
        - max_vocab_size (int): upper bound on the vocabulary size, passed to
            ``build_vocab_from_iterator`` as ``max_tokens``
        - vocabulary: an existing vocabulary to reuse; when ``None`` a new
            one is built from ``df["clean_text"]``
    """
    def __init__(self, df, max_vocab_size, vocabulary = None):

        # load dataframe
        self.x = df["clean_text"]
        self.y = df["target"]

        # create vocabulary
        # Test against None explicitly: a vocabulary object that defines
        # __len__ is falsy when empty and must not be silently replaced.
        if vocabulary is None:
            self.vocab = build_vocab_from_iterator(
                # one token list per tweet; str() guards non-string cells
                (str(text).split() for text in self.x),
                specials=['<unk>'],
                max_tokens = max_vocab_size)
            # unknown tokens map to the '<unk>' index
            self.vocab.set_default_index(self.vocab['<unk>'])
        else:
            self.vocab = vocabulary

    @property
    def vocab_size(self):
        # add one more for out of vocab words
        # (when the vocabulary is built here, '<unk>' is already part of
        # len(self.vocab), so this leaves one spare index)
        return len(self.vocab) + 1

    def get_vocab(self):
        """Return the vocabulary used by this dataset."""
        return self.vocab

    def __len__(self):
        return len(self.x)

    def __getitem__(self, i):
        # Positional access: works whatever index the dataframe carries
        # (e.g. after filtering) and raises IndexError past the end, which
        # is what plain ``for item in dataset`` iteration relies on to stop.
        x = self.x.iloc[i]
        y = self.y.iloc[i]
        return (y, x)

# Build the dataset (vocabulary capped at 10000 tokens) and keep a handle on
# the vocabulary it holds.
dataset = DisasterTweetsDataset(df, 10000)
vocab = dataset.get_vocab()

# Preview the first three (label, tweet) pairs.
for i, (label, tweet) in enumerate(dataset):
    print(
        f'The sentence for Example {i} ---',
        tweet,
        f'The label for Example {i} ---',
        label,
        '',
        sep='\n',
    )
    if i == 2:
        break

The sentence for Example 0 ---
deed reason earthquake may allah forgive u
The label for Example 0 ---
1

The sentence for Example 1 ---
forest fire near la ronge sask canada
The label for Example 1 ---
1

The sentence for Example 2 ---
resident asked place notified officer evacuation shelter place order expected
The label for Example 2 ---
1



## Initial Embedding Vector and the Collation Function
Now we have to define how each token is encoded as a number. First, we define the initial embedding matrix, whose number of embeddings equals the vocabulary size and whose embedding dimension is 300 (a value we chose to resemble other pre-trained embedding matrices). For simplicity, we start by collating each tweet into a bag of words. Hence, each tweet is transformed into a vector whose dimension equals the vocabulary size (10,000) and whose value for each feature is the frequency of the corresponding token in that tweet.

Note that this embedding matrix is only the initial one. In the training pipeline, we will also train this embedding matrix along with the other parameters.

In [17]:
def collate_into_bow(batch, vocabulary=None):
    """
    collate the dataset into bag of words representation

    input:
        - batch (list(int, str)): a list of (label, text) pairs
        - vocabulary (optional): object that maps a list of tokens to a list
            of indices when called and supports len(); defaults to the
            module-level `vocab`
    return:
        - (tensor): an int64 tensor of labels, one per example
        - (tensor): a float32 tensor of bag of words, one row per example
            and one column per vocabulary entry
    """
    if vocabulary is None:
        # default to the notebook-wide vocabulary, as before
        vocabulary = vocab
    num_features = len(vocabulary)

    def text_pipeline(text):
        """
        create bow vector for each text

        input:
            - text (str): a document, a text
        return:
            - (list): a bow vector of relative token frequencies
        """
        indices = vocabulary(str(text).split())
        bows = [0.0] * num_features
        if indices:
            # each occurrence contributes 1 / (number of tokens), so a
            # non-empty text sums to 1; an empty text stays all zeros
            weight = 1 / len(indices)
            for index in indices:
                bows[index] += weight
        return bows

    labels, texts = [], []
    for label, text in batch:
        labels.append(int(label))
        texts.append(text_pipeline(text))

    # Explicit dtype and shape: without them a batch made only of empty
    # texts would be inferred as int64, and an empty batch would lose its
    # feature dimension.
    bow_tensor = torch.tensor(texts, dtype=torch.float32)
    bow_tensor = bow_tensor.reshape(len(labels), num_features)
    return torch.tensor(labels, dtype=torch.int64), bow_tensor

## Training Pipeline
Now, we will train our model using a bidirectional long short-term memory (BiLSTM) RNN.

In [None]:
# Initiate a dataloader and split the data into train and validation dataset
from torch.utils.data.dataset import random_split
from torch.utils.data import DataLoader

def new_dataloader(dataset, collate_fn, batch_size=64, split_train_ratio=0.7):
    """
    Randomly split ``dataset`` and wrap both parts in dataloaders.

    input:
        - dataset: a sized, indexable dataset
        - collate_fn (callable): turns a list of examples into a batch
        - batch_size (int): number of examples per batch
        - split_train_ratio (float): fraction of examples used for training;
            the remainder is used for validation
    return:
        - (DataLoader, DataLoader): the training loader (shuffled every
            epoch) and the validation loader (fixed order)
    """
    # Size the split from the dataset that was passed in, not from a
    # module-level dataframe, so the function works for any dataset.
    total = len(dataset)
    num_train = int(total * split_train_ratio)
    num_valid = total - num_train
    train_data, valid_data = random_split(
        dataset,
        [num_train, num_valid]
    )
    train_dataloader = DataLoader(
        train_data,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn
    )
    valid_dataloader = DataLoader(
        valid_data,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_fn)
    return (train_dataloader, valid_dataloader)

# helper function: repackage hidden
def repackage_hidden(h):
    """
    Return a copy of the hidden state that is cut off from autograd history.

    ``None`` is passed through, a tensor is detached, and any other iterable
    is rebuilt as a tuple with every element processed recursively.
    """
    if h is None:
        return None
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(map(repackage_hidden, h))


# function for training an epoch
def train_an_epoch(dataloader, model, hidden, loss_function, optimizer, 
                   clip_grad, max_norm):
    """
    Run one optimisation pass over ``dataloader``.

    input:
        - dataloader: iterable of (label, text) batches
        - model: called as ``model(text, hidden)``, returns (output, hidden)
        - hidden: initial hidden state, carried from batch to batch
        - loss_function: called with flattened outputs and flattened labels
        - optimizer: stepped once per batch
        - clip_grad (bool): clip the gradient norm before each step
        - max_norm (float): gradient-norm ceiling used when clipping
    """
    log_interval = 500
    model.train()  # switch the module to training mode

    for step, batch in enumerate(dataloader):
        targets, inputs = batch

        # keep the state's values but drop its gradient history
        hidden = repackage_hidden(hidden)
        model.zero_grad()

        scores, hidden = model(inputs, hidden)
        flat_scores = scores.view(-1, scores.size(-1))
        loss = loss_function(flat_scores, targets.view(-1))
        loss.backward()

        if clip_grad:
            torch.nn.utils.clip_grad_norm_(
                parameters=model.parameters(),
                max_norm=max_norm,
            )
        optimizer.step()

        # report every `log_interval` batches, skipping the very first one
        if step > 0 and step % log_interval == 0:
            print(f'At iteration {step} the loss is {loss:.3f}.')

# function for calculate the accuracy for a given dataloader
def get_accuracy(dataloader, model):
    """
    Compute the classification accuracy of ``model`` over ``dataloader``.

    input:
        - dataloader: iterable of (label, text) batches
        - model: called as ``model(text, hidden)``, returns
            (log_probs, hidden); the class axis of log_probs is dim 1
    return:
        - (float): fraction of correctly predicted labels, or 0.0 when the
            dataloader yields no examples
    """
    model.eval()  # evaluation mode (e.g. disables dropout)
    correct, seen = 0, 0
    hidden = None
    with torch.no_grad():
        for label, text in dataloader:
            log_probs, hidden = model(text, hidden)
            predicted_label = torch.argmax(log_probs, dim=1)
            correct += (predicted_label == label).sum().item()
            seen += label.size(0)

    # An empty loader (e.g. a split that leaves no validation data) would
    # otherwise end in a division by zero.
    if seen == 0:
        return 0.0
    return correct / seen

# putting all together, create function for training
import matplotlib.pyplot as plt
import time

def training(dataset, collate_fn, batch_size, split_train_ratio, 
             model, epochs, loss_function, optimizer, clip_grad, max_norm):
    """
    Train ``model`` for ``epochs`` epochs and plot the validation accuracy.

    input:
        - dataset, collate_fn, batch_size, split_train_ratio: forwarded to
            ``new_dataloader`` to build the train / validation loaders
        - model: the network to train (updated in place)
        - epochs (int): number of passes over the training loader
        - loss_function, optimizer, clip_grad, max_norm: forwarded to
            ``train_an_epoch``
    return:
        - a copy of the model taken at the epoch with the highest validation
          accuracy, or ``None`` when no epoch was run
    """
    import copy

    # create dataloader from dataset
    train_dataloader, valid_dataloader = new_dataloader(
        dataset, collate_fn, batch_size, split_train_ratio)

    # training
    accuracies = []
    best_model = None
    max_val_acc = -float("inf")
    for epoch in range(1, epochs + 1):
        # every epoch starts from a fresh hidden state
        hidden = None
        train_an_epoch(train_dataloader, model, hidden, loss_function, optimizer,
                       clip_grad, max_norm)
        accuracy = get_accuracy(valid_dataloader, model)
        accuracies.append(accuracy)
        if epoch % 10 == 0:
            print(f'After epoch {epoch} the validation accuracy is {accuracy:.3f}.')
        # persist the best model
        if accuracy > max_val_acc:
            print("the best model has validation accuracy at {}".format(accuracy))
            # Snapshot by deep copy: this does not depend on the model's
            # constructor signature and keeps the weights of this epoch
            # even though training continues to modify `model`.
            best_model = copy.deepcopy(model)
            max_val_acc = accuracy

    plt.plot(range(1, epochs + 1), accuracies)
    return best_model

In [None]:
# Initiate an RNN classifier
from torch import nn
import torch.nn.functional as F

class BiLSTMClassifier(nn.Module):
    """
    Bidirectional LSTM classifier with a trainable embedding layer.

    Token indices are embedded, passed through a bidirectional LSTM, and the
    LSTM output at every position is projected to label log-probabilities.

    Args:
        - input_size (int): size of embedding vector (number of features)
            for each word; also the LSTM input size
        - hidden_size (int): the number of features in the hidden state
        - num_labels (int): number of output classes
        - num_layers (int): number of stacked LSTM layers
        - dropout (float): dropout probability used on the embeddings, on
            the LSTM output and between LSTM layers
        - vocab_size (int, keyword-only): number of rows of the embedding
            matrix; when omitted, a module-level ``vocab_size`` is used if
            one is defined
        - rnn_type (str, keyword-only): descriptive tag stored on the
            instance; the recurrent layer is always a bidirectional LSTM
    """
    def __init__(self, input_size, hidden_size, num_labels, 
                 num_layers, dropout=0.5, *, vocab_size=None,
                 rnn_type="BiLSTM"):
        super(BiLSTMClassifier, self).__init__()

        if vocab_size is None:
            # Backward compatibility: earlier code read a module-level
            # `vocab_size`; honour it when the notebook defines one.
            vocab_size = globals().get("vocab_size")
        if vocab_size is None:
            raise ValueError(
                "vocab_size is required to size the embedding layer")

        self.rnn_type = rnn_type
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_labels = num_labels
        self.num_layers = num_layers
        self.dropout = dropout
        self.vocab_size = vocab_size

        self.drop = nn.Dropout(dropout)

        # The embedding feeds the LSTM directly, so its dimension has to be
        # the LSTM input size.
        self.encoder = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=input_size)

        self.rnn = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            bidirectional=True
        )

        # forward and backward states are concatenated -> 2 * hidden_size
        self.decoder = nn.Linear(
            in_features=hidden_size * 2,
            out_features=num_labels
        )

    def forward(self, input, hidden0):
        """
        input:
            - input (tensor): integer token indices
            - hidden0: initial LSTM state, or None for a zero state
        return:
            - (tensor): log-probabilities over labels on the last dimension
            - the final LSTM state as returned by ``nn.LSTM``
        """
        embedding = self.drop(self.encoder(input))
        output, hidden = self.rnn(embedding, hidden0)
        output = self.drop(output)
        # Normalise over the label axis (the last one); the decoder is run
        # once instead of twice.
        decoded_output = F.log_softmax(self.decoder(output), dim=-1)
        return decoded_output, hidden

## Preprocessing 
To allow the embedding vectors to be trained, we first use only the "text" data and ignore the labels for now. For each tweet, we pair each center word with its neighbors within a window of size one ($W$ = 1). For example, from the tweet `forest fire near la ronge sask canada`, we generate new examples such as `[fire, near], [fire, forest], [canada, sask], [canada, </s>]`. Note that we must add the special dummy words `<s>` and `</s>` to handle the start and the end of each tweet.

The code below processes the data and creates the new dataset for this task.


In [None]:
# Getting only text data
W = 1
WINDOW_SIZE = (2 * W + 1)

# boundary markers added around every sequence
SENT_START_WORD = '<s>'
SENT_END_WORD = '</s>'
SENT_START_TAG = '<STAG>'
SENT_END_TAG = '<ETAG>'


def add_sent_start_end(data_iter, w):
    """
    Pad every example with ``w`` start markers and ``w`` end markers.

    input:
        - data_iter: iterable of (words, ud_tags, ptb_tags) triples, each
            element a list
        - w (int): number of markers added on each side
    yield:
        - (list, list, list): the padded words, UD tags and PTB tags
    """
    # The loop variables and the yielded names now match what the body
    # uses; previously the body referred to names that were never bound.
    for (words, ud_tags, ptb_tags) in data_iter:
        new_words = [SENT_START_WORD] * w + words + [SENT_END_WORD] * w
        new_ud_tags = [SENT_START_TAG] * w + ud_tags + [SENT_END_TAG] * w
        new_ptb_tags = [SENT_START_TAG] * w + ptb_tags + [SENT_END_TAG] * w
        yield (new_words, new_ud_tags, new_ptb_tags)

def create_windows(data_iter, w):
    """
    Slide a window of ``2 * w + 1`` positions over every example.

    input:
        - data_iter: iterable of (words, ud_tags, ptb_tags) triples of
            sliceable sequences
        - w (int): number of context positions on each side of the centre
    yield:
        - (tuple, tuple, tuple): aligned windows over the words, the UD tags
            and the PTB tags
    """
    span = 2 * w + 1

    def slide(seq):
        # zip over `span` progressively shifted views of `seq`; the most
        # shifted (shortest) view ends the iteration
        return zip(*(seq[shift:] for shift in range(span)))

    for words, ud_tags, ptb_tags in data_iter:
        yield from zip(slide(words), slide(ud_tags), slide(ptb_tags))

In [None]:
# Cell-local imports: `Counter` and the `torchtext` package name are used
# below but are not bound by the imports of the earlier cells shown here.
from collections import Counter

import torchtext

# UDPOS training split, loaded the same way the preprocessing test does.
# NOTE(review): this cell relies on `preprocess_data_seq`, so it has to be
# run after the cell that defines it.
train_iter_0 = torchtext.datasets.UDPOS(split='train')
train_iter_vocab = preprocess_data_seq(train_iter_0, 1)

# Count every word / tag seen in the generated windows.
counter_words = Counter()
counter_ud = Counter()
counter_ptb = Counter()
for (text, pos_ud, pos_ptb) in train_iter_vocab:
    counter_words.update(text)
    counter_ud.update(pos_ud)
    counter_ptb.update(pos_ptb)


# '<unk>' is placed first, so index 0 is the fallback for unknown words.
vocab_words = torchtext.vocab.vocab(counter_words,  specials = ['<unk>'], 
                    special_first = True)    
vocab_words.set_default_index(0)
vocab_ud = torchtext.vocab.vocab(counter_ud)
vocab_ptb = torchtext.vocab.vocab(counter_ptb)

print(f"{len(vocab_words)} words, {len(vocab_ud)} ud pos classes, {len(vocab_ptb)} ptb pos classes")

In [None]:
W = 1
WINDOW_SIZE = 2 * W + 1

# boundary markers for words and for tags
SENT_START_WORD, SENT_END_WORD = '<s>', '</s>'
SENT_START_TAG, SENT_END_TAG = '<STAG>', '<ETAG>'


def add_sent_start_end(data_iter, w):
    """
    Surround every example with ``w`` start markers and ``w`` end markers.

    input:
        - data_iter: iterable of (words, ud_tags, ptb_tags) triples of lists
        - w (int): number of markers added on each side
    yield:
        - (list, list, list): padded words, UD tags and PTB tags
    """
    for words, ud_tags, ptb_tags in data_iter:
        head_words, tail_words = [SENT_START_WORD] * w, [SENT_END_WORD] * w
        head_tags, tail_tags = [SENT_START_TAG] * w, [SENT_END_TAG] * w
        # both tag sequences share the same boundary tags
        padded = (
            head_words + words + tail_words,
            head_tags + ud_tags + tail_tags,
            head_tags + ptb_tags + tail_tags,
        )
        yield padded
        
def create_windows(data_iter, w):
    """
    Cut every example into aligned windows of ``2 * w + 1`` positions.

    input:
        - data_iter: iterable of (words, ud_tags, ptb_tags) triples of
            sliceable sequences
        - w (int): number of context positions on each side of the centre
    yield:
        - (tuple, tuple, tuple): one window over the words, the UD tags and
            the PTB tags, all covering the same positions
    """
    width = 2 * w + 1
    offsets = range(width)
    for triple in data_iter:
        words, ud_tags, ptb_tags = triple
        # one window iterator per stream, built from shifted slices
        per_stream = [
            zip(*[stream[k:] for k in offsets])
            for stream in (words, ud_tags, ptb_tags)
        ]
        for window in zip(*per_stream):
            yield window
            
def preprocess_data_seq(data_iter, w):
    """
    Pad every example with boundary markers, then cut it into windows.

    input:
        - data_iter: iterable of (words, ud_tags, ptb_tags) triples
        - w (int): padding width and context size on each side
    return:
        - generator of (word window, UD-tag window, PTB-tag window) triples
    """
    padded = add_sent_start_end(data_iter, w)
    return create_windows(padded, w)


def test_preprocess_data_seq():
    """
    Check the first three windows produced from the UDPOS training split.

    Loads the dataset through torchtext, runs ``preprocess_data_seq`` with a
    context of one word and compares the first three windows with the
    expected values. Prints a confirmation when all of them match.
    """
    # WARNING: The following test assumes a particular default
    # sequence of examples in the PyTorch UDPOS dataset. If you
    # suspect the sequence is different for your dataset, please
    # adapt the test.

    # The cells shown in this notebook only import names *from* torchtext
    # submodules, which does not bind the package name used below.
    import torchtext

    train_iter_0 = torchtext.datasets.UDPOS(split = 'train')
    train_iter_demo = preprocess_data_seq(train_iter_0, 1)
    expected_windows = [
        (('<s>', 'Al', '-'),
         ('<STAG>', 'PROPN', 'PUNCT'),
         ('<STAG>', 'NNP', 'HYPH')),
        (('Al', '-', 'Zaman'),
         ('PROPN', 'PUNCT', 'PROPN'),
         ('NNP', 'HYPH', 'NNP')),
        (('-', 'Zaman', ':'),
         ('PUNCT', 'PROPN', 'PUNCT'),
         ('HYPH', 'NNP', ':')),
    ]
    for position, expected in enumerate(expected_windows):
        actual = next(train_iter_demo)
        # report which window differs instead of a bare AssertionError
        assert expected == actual, (
            f'window {position}: expected {expected}, got {actual}')
    print('Test passed.')


# Run the sanity check as soon as this cell is executed.
test_preprocess_data_seq()

In [2]:
# Bare expression: the notebook displays the vocabulary object's repr.
vocab

Vocab()

## Training a Word Embedding Vector from Scratch

### Initialized LSTM Model