# Arjuna: Generating Indonesian Poetry with a Python-Based NLP Model

In [None]:
import numpy as np
from collections import Counter
import string
import torch
from torch.utils.data import Dataset
import pandas as pd
import torch.nn as nn
from torch.utils.data import DataLoader
from argparse import Namespace
import torch.optim as optim
from tqdm.notebook import tqdm
import torch.nn.functional as F
import re
import json

In [None]:
# Mount Google Drive under /content/drive; the dataset and model paths used
# below all live inside that mount.
# NOTE(review): google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Building the Dataset


In [None]:
# Load the raw poem sheet (one row per poem) from the mounted Drive folder.
poet_df = pd.read_csv("/content/drive/MyDrive/Pycon2021/Dataset Puisi - Manual - Sheet1.csv")

In [None]:
# Preview the first rows of the raw table.
poet_df.head()

Unnamed: 0,author,content,poem name,age,type
0,Ramadhan K H,"Seruling berkawan pantun,\nTangiskan derita or...",Priangan Si Jelita,1956.0,"keindahan alam, gadis-gadis"
1,Chairil Anwar,Kalau sampai waktuku\nAku mau tak seorang kan ...,Aku,1943.0,"individualistic, nature, and vitality"
2,Chairil Anwar,Di masa pembangunan ini\r\nTuan hidup kembali\...,Diponegoro,1943.0,perjuangan
3,Chairil Anwar,Kepada pemeluk teguh\r\n\r\nTuhanku\r\nDalam t...,Doa,1943.0,agama
4,Chairil Anwar,Ini muka penuh luka\r\nSiapa punya?\r\n\r\nKu ...,Aku Berkaca,,


In [None]:
# Number of rows in the raw table.
poet_df.shape[0]

162

In [None]:
# Split the first poem on blank lines to inspect its stanzas.
poet_df['content'][0].split("\n\n")

['Seruling berkawan pantun,\nTangiskan derita orang priangan,\nSelendang merah, merah darah\nMenurun di Cikapundung.',
 'Bandung, dasar di danau\nLari bertumpuk di bukit-bukit.',
 'Seruling menyendiri di tepi-tepi\nTangiskan keris hilang di sumur\nMelati putih, putih hati,\nHilang kekasih dikata gugur.',
 'Bandung, dasar di danau\nDerita memantul di kulit-kulit.']

In [None]:
# Raw text of the first poem, including its newline characters.
poet_df['content'][0]

'Seruling berkawan pantun,\nTangiskan derita orang priangan,\nSelendang merah, merah darah\nMenurun di Cikapundung.\n\nBandung, dasar di danau\nLari bertumpuk di bukit-bukit.\n\nSeruling menyendiri di tepi-tepi\nTangiskan keris hilang di sumur\nMelati putih, putih hati,\nHilang kekasih dikata gugur.\n\nBandung, dasar di danau\nDerita memantul di kulit-kulit.'

### Building consecutive stanza pairs

In [None]:
# Build (first, second) training pairs from every two adjacent stanzas of a poem.
WINDOW_SIZE = 2  # each example spans this many consecutive stanzas

# Stanzas are separated by at least one blank line.  Part of the sheet uses
# Windows line endings ("\r\n"), which a plain split("\n\n") never matches,
# so the separator accepts both "\n" and "\r\n".
STANZA_SEPARATOR = re.compile(r"(?:\r?\n){2,}")

poet_dataset = {'first': [], 'second': []}

num_poets = poet_df.shape[0]

for i in range(num_poets):
    content = poet_df['content'][i]
    if not isinstance(content, str):
        # an empty cell is read by pandas as NaN (a float): nothing to pair up
        continue

    sentences = [stanza for stanza in STANZA_SEPARATOR.split(content.strip())
                 if stanza.strip()]
    sentences_len = len(sentences)

    # k stanzas contain k - WINDOW_SIZE + 1 adjacent pairs; the "+ 1" keeps the
    # final (second-to-last, last) pair, and poems with a single stanza yield none.
    for index in range(sentences_len - WINDOW_SIZE + 1):
        prev_sentence = sentences[index]
        next_sentence = sentences[index + 1]

        poet_dataset['first'].append(prev_sentence)
        poet_dataset['second'].append(next_sentence)

poet_dataset = pd.DataFrame.from_dict(poet_dataset)

In [None]:
# Preview the first stanza pairs.
poet_dataset.head()

Unnamed: 0,first,second
0,"Seruling berkawan pantun,\nTangiskan derita or...","Bandung, dasar di danau\nLari bertumpuk di buk..."
1,"Bandung, dasar di danau\nLari bertumpuk di buk...",Seruling menyendiri di tepi-tepi\nTangiskan ke...
2,Kalau sampai waktuku\nAku mau tak seorang kan ...,Aku ini binatang jalang\nDari kumpulannya terb...
3,Aku ini binatang jalang\nDari kumpulannya terb...,Biar peluru menembus kulitku\nAku tetap merada...
4,Biar peluru menembus kulitku\nAku tetap merada...,Luka dan bisa kubawa berlari\nBerlari\nHingga ...


### Document Cleansing and Formatting the Training Data

Each training example is a pair of consecutive stanzas:

prev_sentence | next_sentence

In [None]:
import re
import string

def preprocess_text(text):
    """Normalise one stanza into space-separated lowercase tokens.

    Steps:
      1. lowercase the text;
      2. surround ``. , ! ?`` with spaces so each becomes its own token;
      3. replace every run of other non-letter characters (digits, hyphens,
         newlines, repeated spaces, ...) with a single space;
      4. strip leading/trailing spaces so that a later ``split(" ")`` does
         not produce empty-string tokens.

    Args:
        text (str): raw stanza text.
    Returns:
        str: the cleaned text, tokens separated by exactly one space.
    """
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text.strip()

def preprocess_row(row):
    """Clean both stanza columns of a single dataset row.

    Args:
        row: a mapping-like row with 'first' and 'second' text entries;
            it is modified in place.
    Returns:
        the same row object, with both entries passed through preprocess_text.
    """
    for column in ('first', 'second'):
        row[column] = preprocess_text(row[column])
    return row

In [None]:
# Clean the 'first' and 'second' text of every stanza pair, row by row.
poet_dataset = poet_dataset.apply(lambda row: preprocess_row(row), axis=1)

In [None]:
# Preview the cleaned stanza pairs.
poet_dataset.head()

Unnamed: 0,first,second
0,"seruling berkawan pantun , tangiskan derita or...","bandung , dasar di danau lari bertumpuk di buk..."
1,"bandung , dasar di danau lari bertumpuk di buk...",seruling menyendiri di tepi tepi tangiskan ker...
2,kalau sampai waktuku aku mau tak seorang kan m...,aku ini binatang jalang dari kumpulannya terbuang
3,aku ini binatang jalang dari kumpulannya terbuang,biar peluru menembus kulitku aku tetap meradan...
4,biar peluru menembus kulitku aku tetap meradan...,luka dan bisa kubawa berlari berlari hingga hi...


In [None]:
# Randomly assign every stanza pair to the train, val or test split.
SPLIT_SEED = 1337  # fixed so that re-running the cell reproduces the same split

poet_dataset['split'] = 'train'


def assign_label(row):
    """Draw a split label: roughly 70% 'train', the rest divided between 'val' and 'test'.

    Args:
        row: ignored; accepted only so the function can be used with ``apply``.
    Returns:
        str: 'train', 'val' or 'test'.
    """
    # randint(0, 10) is uniform over 0..9, so the values 7, 8 and 9 (30%) are held out
    if np.random.randint(0, 10) > 6:
        # a fair coin chooses between validation and test
        return 'val' if np.random.randint(0, 2) == 0 else 'test'
    return 'train'


# Seed right before drawing: the assignment no longer depends on how many
# random numbers earlier cells happened to consume.
np.random.seed(SPLIT_SEED)
poet_dataset['split'] = poet_dataset.apply(lambda row: assign_label(row['split']), axis=1)

In [None]:
# Display the dataset together with its new 'split' column.
poet_dataset

Unnamed: 0,first,second,split
0,"seruling berkawan pantun , tangiskan derita or...","bandung , dasar di danau lari bertumpuk di buk...",val
1,"bandung , dasar di danau lari bertumpuk di buk...",seruling menyendiri di tepi tepi tangiskan ker...,train
2,kalau sampai waktuku aku mau tak seorang kan m...,aku ini binatang jalang dari kumpulannya terbuang,train
3,aku ini binatang jalang dari kumpulannya terbuang,biar peluru menembus kulitku aku tetap meradan...,train
4,biar peluru menembus kulitku aku tetap meradan...,luka dan bisa kubawa berlari berlari hingga hi...,train
...,...,...,...
128,batapa indahnya alam kita ini ombak bergemuruh...,kita berdiri dengan beralaskan tanah kita berd...,train
129,belum nampak mendung menutupi langit seberkas ...,bulan tak ingin membawakan tawa manja kala wak...,train
130,kau adalah tempat yang terindah jauh dari rama...,tempatmu yang penuh dengan pepohonan menjadika...,test
131,saat malam tiba dengan langit yang gemerlap sa...,namun hatiku kian murung saat awan hitam mulai...,train


In [None]:
# Distinct split labels present in the dataset.
poet_dataset['split'].unique()

array(['val', 'train', 'test'], dtype=object)

In [None]:
# Number of stanza pairs per split.
poet_dataset['split'].value_counts()

train    97
test     21
val      15
Name: split, dtype: int64

In [None]:
# Persist the cleaned, split-labelled pairs; this is the file args.text_csv points to below.
poet_dataset.to_csv("/content/drive/MyDrive/Pycon2021/poet_dataframe_sentences.csv", index=False)

## Sequence Vocab

In [None]:
class SequenceVocabulary(object):
    """Bidirectional token <-> integer-index mapping with sequence markers.

    On construction the special tokens are registered in this order: mask
    (padding), unknown (optional), begin-of-sequence, end-of-sequence.
    Registering the mask token first gives it index 0 on a fresh vocabulary,
    which is the value the encoder/decoder embeddings use as ``padding_idx``
    and the default ``mask_index`` of ``NMTVectorizer._vectorize``.

    When an existing ``token_to_idx`` mapping is supplied (deserialisation),
    tokens that are already present keep their stored indices.

    NOTE: model checkpoints trained with a vocabulary whose special tokens
    were numbered differently are not index-compatible with a freshly built
    vocabulary.
    """

    def __init__(self, token_to_idx=None, mask_token="<MASK>", add_unk=True, unk_token="<UNK>"):
        """
        Args:
            token_to_idx (dict): optional pre-existing token -> index mapping;
                it is stored by reference and extended in place.
            mask_token (str): token used for padding positions.
            add_unk (bool): whether to register an unknown-token entry.
            unk_token (str): token returned for out-of-vocabulary lookups.
        """
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx

        self._idx_to_token = {
            idx: token for token, idx in self._token_to_idx.items()
        }

        self._add_unk = add_unk
        self._unk_token = unk_token
        self._mask_token = mask_token

        self._begin_of_seq_token = "<BEGIN-OF-SEQUENCE>"
        self._end_of_seq_token = "<END-OF-SEQUENCE>"

        # The mask must be the first registration so that it lands on index 0.
        self.mask_index = self.add_token(mask_token)

        # -1 means "no unknown token": lookup_token then raises on a miss.
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)

        self.begin_seq_index = self.add_token(self._begin_of_seq_token)
        self.end_seq_index = self.add_token(self._end_of_seq_token)

    def to_serializeable(self):
        """Return a dictionary that ``from_serializeable`` can rebuild the vocabulary from."""
        return {
            'token_to_idx': self._token_to_idx,
            'mask_token': self._mask_token,
            'add_unk': self._add_unk,
            'unk_token': self._unk_token
        }

    @classmethod
    def from_serializeable(cls, contents):
        """Create a vocabulary from the dictionary produced by ``to_serializeable``."""
        return cls(**contents)

    def add_token(self, token):
        """Register ``token`` if it is new and return its index.

        Args:
            token (str): the token to add.
        Returns:
            int: the existing index, or the next free index for a new token.
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """Return the index of ``token``.

        Unknown tokens map to ``unk_index`` when an unknown token was
        registered; otherwise a KeyError is raised.
        """
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token stored at ``index``.

        Raises:
            KeyError: if ``index`` is not in the vocabulary.
        """
        if index not in self._idx_to_token:
            raise KeyError("the index %d is not in the vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

## Vectorizer

In [None]:
class NMTVectorizer(object):
    """Converts a (source text, target text) pair into padded index vectors."""

    def __init__(self, source_vocab, target_vocab,
                max_source_length, max_target_length):
        """
        Args:
            source_vocab: vocabulary used for the 'first' (source) text.
            target_vocab: vocabulary used for the 'second' (target) text.
            max_source_length (int): longest source token count seen when
                the vectorizer was built.
            max_target_length (int): longest target token count seen when
                the vectorizer was built.
        """
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length

    @staticmethod
    def _tokenize(text):
        """Split on any whitespace run.

        Unlike ``text.split(" ")`` this never yields empty-string tokens for
        leading, trailing or doubled spaces.
        """
        return text.split()

    @classmethod
    def from_dataframe(cls, lang_df):
        """Build both vocabularies and the maximum lengths from a dataframe.

        Args:
            lang_df: dataframe with 'first' and 'second' text columns.
        Returns:
            NMTVectorizer
        """
        source_vocab = SequenceVocabulary()
        target_vocab = SequenceVocabulary()

        max_source_length, max_target_length = 0, 0

        for _, rows in lang_df.iterrows():
            # source
            source_token = cls._tokenize(rows["first"])
            max_source_length = max(max_source_length, len(source_token))
            for token in source_token:
                source_vocab.add_token(token)

            # target
            target_token = cls._tokenize(rows["second"])
            max_target_length = max(max_target_length, len(target_token))
            for token in target_token:
                target_vocab.add_token(token)

        return cls(source_vocab, target_vocab,
                  max_source_length, max_target_length)

    def _vectorize(self, indices, vector_length=-1, mask_index=0):
        """Copy ``indices`` into an int64 vector of ``vector_length`` entries.

        Args:
            indices (list[int]): token indices.
            vector_length (int): output length; a negative value means
                "exactly as long as ``indices``".
            mask_index (int): value written to the unused tail.
        Returns:
            numpy.ndarray: shorter inputs are right-padded with
            ``mask_index``; longer inputs are truncated instead of raising.
        """
        if vector_length < 0:
            vector_length = len(indices)

        # The maximum lengths may come from a subset of the data, so another
        # row can be longer than the vector; keep only what fits.
        indices = list(indices)[:vector_length]

        vector = np.full(vector_length, mask_index, dtype=np.int64)
        vector[:len(indices)] = indices
        return vector

    def _get_source_indices(self, source_text):
        """Return the source token indices wrapped in begin/end-of-sequence markers."""
        indices = [self.source_vocab.begin_seq_index]
        indices.extend(self.source_vocab.lookup_token(token)
                       for token in self._tokenize(source_text))
        indices.append(self.source_vocab.end_seq_index)

        return indices

    def _get_target_indices(self, target_text):
        """Return the decoder input and decoder label index lists.

        Returns:
            tuple: (x_indices, y_indices) where x_indices starts with the
            begin-of-sequence index and y_indices ends with the
            end-of-sequence index, i.e. y is x shifted by one step.
        """
        indices = [self.target_vocab.lookup_token(token)
                   for token in self._tokenize(target_text)]

        x_indices = [self.target_vocab.begin_seq_index] + indices
        y_indices = indices + [self.target_vocab.end_seq_index]

        return x_indices, y_indices

    def vectorize(self, source_text, target_text, use_dataset_max_length=True):
        """Vectorize one source/target pair.

        Args:
            source_text (str): preprocessed source text.
            target_text (str): preprocessed target text.
            use_dataset_max_length (bool): pad to the stored maximum lengths
                (+2 for the source markers, +1 for the single target marker)
                instead of using each text's own length.
        Returns:
            dict with keys 'source_vector', 'target_x_vector',
            'target_y_vector' and 'source_length' (number of non-padding
            positions in the source vector).
        """
        source_length = -1
        target_length = -1

        if use_dataset_max_length:
            source_length = self.max_source_length + 2
            target_length = self.max_target_length + 1

        source_indices = self._get_source_indices(source_text)
        source_vector = self._vectorize(source_indices,
                                        source_length,
                                        mask_index=self.source_vocab.mask_index)

        target_x_indices, target_y_indices = self._get_target_indices(target_text)

        target_x_vector = self._vectorize(target_x_indices,
                                          target_length,
                                          self.target_vocab.mask_index)
        target_y_vector = self._vectorize(target_y_indices,
                                          target_length,
                                          self.target_vocab.mask_index)
        return {"source_vector": source_vector,
                "target_x_vector": target_x_vector,
                "target_y_vector": target_y_vector,
                # clipped so it never exceeds the (possibly truncated) vector
                "source_length": min(len(source_indices), len(source_vector))}

## Dataset

In [None]:
class NMTDataset(Dataset):
    """PyTorch dataset over stanza pairs with switchable train/val/test views."""

    def __init__(self, text_df, vectorizer):
        """
        Args:
            text_df: dataframe with 'first', 'second' and 'split' columns.
            vectorizer: object whose ``vectorize(first, second)`` returns the
                index vectors for one row.
        """
        self.text_df = text_df
        self._vectorizer = vectorizer

        # one sub-frame per split label ...
        self.train_df = self.text_df[self.text_df.split == 'train']
        self.val_df = self.text_df[self.text_df.split == 'val']
        self.test_df = self.text_df[self.text_df.split == 'test']

        # ... and its row count
        self.train_size = len(self.train_df)
        self.val_size = len(self.val_df)
        self.test_size = len(self.test_df)

        self._lookup_dict = dict(
            train=(self.train_df, self.train_size),
            val=(self.val_df, self.val_size),
            test=(self.test_df, self.test_size),
        )

        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, text_csv):
        """Read ``text_csv`` and build the dataset plus a fresh vectorizer.

        The vectorizer is fitted on the rows labelled 'train' only.
        """
        full_df = pd.read_csv(text_csv)
        fitted_vectorizer = NMTVectorizer.from_dataframe(full_df[full_df.split == 'train'])
        return cls(full_df, fitted_vectorizer)

    def get_vectorizer(self):
        """Return the vectorizer used by this dataset."""
        return self._vectorizer

    def set_split(self, split='train'):
        """Select which split ('train', 'val' or 'test') indexing and len() refer to."""
        self._target_split = split
        selected_df, selected_size = self._lookup_dict[split]
        self._target_df = selected_df
        self._target_size = selected_size

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        """Vectorize the row at position ``index`` of the active split.

        Args:
            index (int): positional index of the data point
        Returns:
            dict with the encoder input ('x_source'), the decoder input
            ('x_target'), the decoder labels ('y_target') and the source
            length ('x_source_length')
        """
        record = self._target_df.iloc[index]
        vectors = self._vectorizer.vectorize(record['first'], record['second'])

        return dict(
            x_source=vectors["source_vector"],
            x_target=vectors["target_x_vector"],
            y_target=vectors["target_y_vector"],
            x_source_length=vectors["source_length"],
        )

    def get_num_batches(self, batch_size):
        """Return how many full batches of ``batch_size`` fit in the active split."""
        return len(self) // batch_size


def generate_nmt_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):
    """Yield batches whose rows are ordered by decreasing source length.

    Args:
        dataset: a dataset whose items are dicts containing 'x_source_length'.
        batch_size (int): number of items per batch.
        shuffle (bool): forwarded to the DataLoader.
        drop_last (bool): forwarded to the DataLoader.
        device: device every tensor of the batch is moved to.
    Yields:
        dict: same keys as the dataset items, each tensor reordered
        longest-source-first and moved to ``device``.
    """
    loader = DataLoader(dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)

    for batch in loader:
        # ascending argsort reversed -> longest sequence first
        source_lengths = batch['x_source_length'].numpy()
        descending_order = source_lengths.argsort()[::-1].tolist()

        yield {
            key: values[descending_order].to(device)
            for key, values in batch.items()
        }

## Encoder

In [None]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class NMTEncoder(nn.Module):
    """Embeds the source tokens and encodes them with a bidirectional GRU."""

    def __init__(self, num_embeddings, embedding_size, rnn_hidden_size):
        """
        Args:
            num_embeddings (int): number of rows in the embedding table
                (the source vocabulary size)
            embedding_size (int): dimension of each embedding vector
            rnn_hidden_size (int): hidden size of each GRU direction
        """
        super().__init__()

        # index 0 is treated as padding by the embedding layer
        self.source_embedding = nn.Embedding(num_embeddings=num_embeddings,
                                             embedding_dim=embedding_size,
                                             padding_idx=0)
        self.birnn = nn.GRU(input_size=embedding_size,
                            hidden_size=rnn_hidden_size,
                            batch_first=True,
                            bidirectional=True)

    def forward(self, x_source, x_lengths):
        """Encode a batch of source sequences.

        Args:
            x_source (torch.Tensor): token indices, shape (batch, seq_size)
            x_lengths (torch.Tensor): true length of every sequence in the
                batch; the packing call uses its default ordering
                requirement (lengths sorted in decreasing order)
        Returns:
            tuple (encoder_outputs, final_hidden):
                encoder_outputs.shape = (batch, longest length in batch, rnn_hidden_size * 2)
                final_hidden.shape = (batch, rnn_hidden_size * 2)
        """
        lengths = x_lengths.detach().cpu().numpy()

        # pack so the GRU skips padded time steps
        packed_input = pack_padded_sequence(self.source_embedding(x_source),
                                            lengths,
                                            batch_first=True)

        # final_hidden: (num directions, batch, rnn_hidden_size)
        packed_output, final_hidden = self.birnn(packed_input)

        # bring the batch axis to the front, then concatenate both directions
        batch_major_hidden = final_hidden.permute(1, 0, 2)
        flat_hidden = batch_major_hidden.contiguous().view(batch_major_hidden.size(0), -1)

        encoder_outputs, _ = pad_packed_sequence(packed_output, batch_first=True)
        return encoder_outputs, flat_hidden

## Decoder

In [None]:
def verbose_attention(encoder_state_vectors, query_vector):
    """Dot-product attention written out step by step.

    Args:
        encoder_state_vectors (torch.Tensor): encoder outputs,
            shape (batch, num_vectors, vector_size)
        query_vector (torch.Tensor): decoder hidden state, viewable as
            (batch, 1, vector_size)
    Returns:
        tuple (context_vectors, vector_probabilities, vector_scores):
            context_vectors: (batch, vector_size) weighted sum of encoder states
            vector_probabilities: (batch, num_vectors) softmax of the scores
            vector_scores: (batch, num_vectors) raw dot products
    """
    batch_size, num_vectors, vector_size = encoder_state_vectors.size()

    # dot product of the query with every encoder state
    query = query_vector.view(batch_size, 1, vector_size)
    vector_scores = (encoder_state_vectors * query).sum(dim=2)

    # normalise over the encoder positions
    vector_probabilities = torch.softmax(vector_scores, dim=1)

    # probability-weighted sum of the encoder states
    position_weights = vector_probabilities.view(batch_size, num_vectors, 1)
    context_vectors = (encoder_state_vectors * position_weights).sum(dim=1)

    return context_vectors, vector_probabilities, vector_scores

def terse_attention(encoder_state_vectors, query_vector):
    """Dot-product attention expressed with batched matrix products.

    Args:
        encoder_state_vectors (torch.Tensor): encoder outputs,
            shape (batch, num_vectors, vector_size)
        query_vector (torch.Tensor): decoder hidden state,
            shape (batch, vector_size)
    Returns:
        tuple (context_vectors, vector_probabilities):
            context_vectors: (batch, vector_size)
            vector_probabilities: (batch, num_vectors)
    """
    # (batch, num_vectors, vector_size) @ (batch, vector_size, 1) -> (batch, num_vectors, 1)
    # squeeze only the trailing axis so that a batch of size 1 keeps its batch axis
    vector_scores = torch.matmul(encoder_state_vectors,
                                 query_vector.unsqueeze(dim=2)).squeeze(dim=2)

    # the attention weights are a softmax over the scores, not over the encoder states
    vector_probabilities = torch.softmax(vector_scores, dim=-1)

    # (batch, vector_size, num_vectors) @ (batch, num_vectors, 1) -> (batch, vector_size, 1)
    context_vectors = torch.matmul(encoder_state_vectors.transpose(-2, -1),
                                   vector_probabilities.unsqueeze(dim=2)).squeeze(dim=2)

    return context_vectors, vector_probabilities

class NMTDecoder(nn.Module):
    """GRU-cell decoder with dot-product attention over the encoder states.

    Every step is fed the corresponding token of ``target_sequence``
    (teacher forcing); the model's own predictions are never fed back.
    """

    def __init__(self, num_embeddings, embedding_size, rnn_hidden_size, bos_index):
        """
        Args:
            num_embeddings (int): target vocabulary size; also the number of
                output classes
            embedding_size (int): dimension of the target embeddings
            rnn_hidden_size (int): hidden size of the GRU cell; the attention
                dot product requires it to equal the encoder state size
            bos_index (int): index of the begin-of-sequence token
        """
        super(NMTDecoder, self).__init__()

        self._rnn_hidden_size = rnn_hidden_size
        self.target_embedding = nn.Embedding(num_embeddings=num_embeddings,
                                            embedding_dim=embedding_size,
                                            padding_idx=0)

        # the cell input is the token embedding concatenated with the previous context
        self.gru_cell = nn.GRUCell(embedding_size + rnn_hidden_size, rnn_hidden_size)
        self.hidden_map = nn.Linear(rnn_hidden_size, rnn_hidden_size)

        # the classifier sees [context ; hidden], hence twice the hidden size
        self.classifier = nn.Linear(rnn_hidden_size*2, num_embeddings)
        self.bos_index = bos_index

    def _init_indices(self, batch_size):
        """Return a (batch_size,) int64 vector filled with the BOS index."""
        return torch.ones(batch_size, dtype=torch.int64) * self.bos_index

    def _init_context_vectors(self, batch_size, device=None):
        """Return a (batch_size, rnn_hidden_size) zero tensor used as the first context.

        Args:
            batch_size (int): number of rows.
            device: optional device to allocate on (default: PyTorch's default device).
        """
        return torch.zeros(batch_size, self._rnn_hidden_size, device=device)

    def forward(self, encoder_state, initial_hidden_state, target_sequence):
        """The forward pass of the model

        Args:
            encoder_state (torch.Tensor): the per-step output of the NMTEncoder,
                shape (batch, source steps, rnn_hidden_size)
            initial_hidden_state (torch.Tensor): the final hidden state of the
                NMTEncoder, shape (batch, rnn_hidden_size)
            target_sequence (torch.Tensor): decoder input token indices,
                shape (batch, target steps)
        Returns:
            output_vectors (torch.Tensor): unnormalised class scores,
                shape (batch, target steps, num_embeddings)

        Side effects:
            ``_cached_p_attn``, ``_cached_ht`` and ``_cached_decoder_state``
            are overwritten with numpy copies for later inspection.
        """
        device = encoder_state.device

        # iterate over time steps: (batch, steps) -> (steps, batch)
        target_sequence = target_sequence.permute(1, 0)

        h_t = self.hidden_map(initial_hidden_state).to(device)

        # the first step has no attention result yet, so it starts from zeros,
        # allocated directly on the encoder's device
        batch_size = encoder_state.size(0)
        context_vectors = self._init_context_vectors(batch_size, device=device)

        output_vectors = []
        self._cached_p_attn = []
        self._cached_ht = []
        self._cached_decoder_state = encoder_state.cpu().detach().numpy()

        output_sequence_size = target_sequence.size(0)
        for i in range(output_sequence_size):
            y_t_index = target_sequence[i]

            # 1. embed the input token and concatenate the previous context
            y_input_vector = self.target_embedding(y_t_index)
            rnn_input = torch.cat([y_input_vector, context_vectors], dim=1)

            # 2. make a GRU step, getting a new hidden vector
            h_t = self.gru_cell(rnn_input, h_t)
            self._cached_ht.append(h_t.cpu().detach().numpy())

            # 3. use the new hidden vector to attend to the encoder state
            context_vectors, p_attn, _ = verbose_attention(encoder_state_vectors=encoder_state,
                                                           query_vector=h_t)

            # cache the attention probabilities for visualization
            self._cached_p_attn.append(p_attn.cpu().detach().numpy())

            # 4. score the next word from the context and hidden vectors
            prediction_vector = torch.cat((context_vectors, h_t), dim=1)
            score_for_y_t_index = self.classifier(prediction_vector)

            output_vectors.append(score_for_y_t_index)

        # (steps, batch, classes) -> (batch, steps, classes)
        output_vectors = torch.stack(output_vectors).permute(1, 0, 2)

        return output_vectors

## NMT Model

In [None]:
class NMTModel(nn.Module):
    """Sequence-to-sequence model: an NMTEncoder feeding an NMTDecoder."""

    def __init__(self, source_vocab_size, source_embedding_size, target_vocab_size,
                target_embedding_size, encoding_size, target_bos_index):
        """
        Args:
            source_vocab_size (int): number of entries in the source vocabulary
            source_embedding_size (int): size of the source embedding vectors
            target_vocab_size (int): number of entries in the target vocabulary
            target_embedding_size (int): size of the target embedding vectors
            encoding_size (int): hidden size of each direction of the encoder RNN
            target_bos_index (int): begin-of-sequence index of the target vocabulary
        """
        super().__init__()

        self.encoder = NMTEncoder(
            num_embeddings=source_vocab_size,
            embedding_size=source_embedding_size,
            rnn_hidden_size=encoding_size,
        )

        # the decoder hidden size is twice the encoder's per-direction size
        self.decoder = NMTDecoder(
            num_embeddings=target_vocab_size,
            embedding_size=target_embedding_size,
            rnn_hidden_size=encoding_size * 2,
            bos_index=target_bos_index,
        )

    def forward(self, x_source, x_source_lengths, target_sequence):
        """Run the encoder, then the decoder.

        Args:
            x_source (torch.Tensor): the source text data tensor,
                shape (batch, source steps)
            x_source_lengths (torch.Tensor): the length of the sequences in x_source
            target_sequence (torch.Tensor): the target text data tensor
        Returns:
            torch.Tensor: prediction vectors at each output step
        """
        source_states, source_summary = self.encoder(x_source, x_source_lengths)
        return self.decoder(source_states, source_summary, target_sequence)

## Utility functions

In [None]:
def compute_accuracy(y_pred, y_true, mask_index):
    """Percentage of non-masked target positions predicted correctly.

    Args:
        y_pred (torch.Tensor): class scores; a 3-dimensional tensor is
            flattened to (positions, classes)
        y_true (torch.Tensor): target indices; a matrix is flattened to a vector
        mask_index (int): target value marking padding; such positions are
            excluded from both the numerator and the denominator
    Returns:
        float: accuracy in the range 0..100; 0.0 when every position is masked
    """
    y_pred, y_true = normalize_sizes(y_pred, y_true)

    # index of the highest-scoring class at each position
    _, y_pred_indices = y_pred.max(dim=1)

    correct_indices = torch.eq(y_pred_indices, y_true).float()
    valid_indices = torch.ne(y_true, mask_index).float()

    n_correct = (correct_indices * valid_indices).sum().item()
    n_valid = valid_indices.sum().item()

    if n_valid == 0:
        # nothing to score: avoid dividing by zero
        return 0.0

    return n_correct / n_valid * 100

def normalize_sizes(y_pred, y_true):
    """Flatten predictions and targets to the shapes the loss expects.

    Args:
        y_pred (torch.Tensor): the output of the model; a 3-dimensional
            tensor is reshaped to (rows, last dimension), anything else is
            returned unchanged
        y_true (torch.Tensor): the target indices; a matrix is reshaped to a
            vector, anything else is returned unchanged
    Returns:
        tuple: (y_pred, y_true) after the reshaping described above
    """
    if y_pred.dim() == 3:
        num_classes = y_pred.size(2)
        y_pred = y_pred.contiguous().view(-1, num_classes)
    if y_true.dim() == 2:
        y_true = y_true.contiguous().view(-1)
    return y_pred, y_true

def sequence_loss(y_pred, y_true, mask_index):
    """Cross-entropy over all positions, skipping targets equal to ``mask_index``.

    Args:
        y_pred (torch.Tensor): class scores from the model
        y_true (torch.Tensor): target indices
        mask_index (int): target value to ignore in the loss
    Returns:
        torch.Tensor: the scalar loss
    """
    flat_pred, flat_true = normalize_sizes(y_pred, y_true)
    loss = F.cross_entropy(flat_pred, flat_true, ignore_index=mask_index)
    return loss


def update_train_state(args, model, train_state):
    """Checkpoint the model when validation improves and track early stopping.

    Args:
        args: namespace providing ``early_stopping_criteria`` (number of
            consecutive non-improving epochs tolerated)
        model (torch.nn.Module): the model whose state_dict is saved
        train_state (dict): training bookkeeping; reads 'epoch_index',
            'val_loss' and 'model_filename', updates 'early_stopping_step',
            'early_stopping_best_val' and 'stop_early'
    Returns:
        dict: the updated ``train_state``
    """
    if train_state['epoch_index'] == 0:
        # always keep a checkpoint of the first epoch ...
        torch.save(model.state_dict(), train_state['model_filename'])
        # ... and remember its loss, so that a later epoch which is worse than
        # this one cannot overwrite the checkpoint
        if train_state['val_loss']:
            train_state['early_stopping_best_val'] = train_state['val_loss'][-1]

    elif train_state['epoch_index'] >= 1:
        # compare the latest validation loss with the one from the epoch before
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        if loss_t >= loss_tm1:
            # loss did not improve: one more strike
            train_state['early_stopping_step'] += 1
        else:
            # loss decreased: save only if it is the best seen so far
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])
                train_state['early_stopping_best_val'] = loss_t

            # reset the strike counter
            train_state['early_stopping_step'] = 0

        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

## Training Routine

In [None]:
# Central configuration for data locations, model sizes and training settings.
# NOTE(review): frequency_cutoff, momentum and dropout are not referenced
# anywhere else in the visible part of the notebook — confirm before relying on them.
args = Namespace(
    # Data information
    frequency_cutoff = 25,
    # CSV written by the dataset-building section above
    text_csv = '/content/drive/MyDrive/Pycon2021/poet_dataframe_sentences.csv',
    # where update_train_state stores the model checkpoint
    model_filename = '/content/drive/MyDrive/Pycon2021/poet_GRU_model_state.pth',
    # Model HyperParameters
    source_embedding_size=100,
    target_embedding_size=100,
    # hidden size of each encoder GRU direction (the decoder uses twice this)
    encoding_size=64,
    # Training HyperParameters
    batch_size = 10,
    # consecutive non-improving epochs before 'stop_early' is raised
    early_stopping_criteria=5,
    learning_rate=0.001,
    momentum=0.1,
    num_epochs=100,
    seed=1337,
    # request the GPU; downgraded to CPU below when CUDA is unavailable
    cuda=True,
    dropout=0.1
)

def make_train_state(args):
    """Create the bookkeeping dictionary used throughout training.

    Args:
        args: namespace providing ``learning_rate`` and ``model_filename``.
    Returns:
        dict: early-stopping counters, per-epoch loss/accuracy histories
        (empty lists), test placeholders (-1) and the checkpoint path.
    """
    return dict(
        stop_early=False,
        early_stopping_step=0,
        early_stopping_best_val=1e8,
        learning_rate=args.learning_rate,
        epoch_index=0,
        train_loss=[],
        train_acc=[],
        val_loss=[],
        val_acc=[],
        test_loss=-1,
        test_acc=-1,
        model_filename=args.model_filename,
    )

In [None]:
train_state = make_train_state(args)

# Use the GPU only when it was requested AND is actually available.
args.cuda = bool(args.cuda and torch.cuda.is_available())
args.device = torch.device("cuda" if args.cuda else "cpu")
print("Device available ", args.device)

# Seed every random number generator before the model is created, so that
# weight initialisation and batch shuffling are repeatable across runs.
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed_all(args.seed)

# dataset object (the vectorizer is fitted on the 'train' rows of the CSV)
dataset = NMTDataset.load_dataset_and_make_vectorizer(args.text_csv)

# vectorizer
vectorizer = dataset.get_vectorizer()

# model: vocabulary sizes come from the fitted vectorizer
model = NMTModel(source_vocab_size=len(vectorizer.source_vocab),
                 target_vocab_size=len(vectorizer.target_vocab),
                 source_embedding_size=args.source_embedding_size,
                 target_embedding_size=args.target_embedding_size,
                 encoding_size=args.encoding_size,
                 target_bos_index=vectorizer.target_vocab.begin_seq_index)
model.to(args.device)

# optimizer
optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

Device available  cuda


In [None]:
# Padding positions of the target are excluded from the loss and the accuracy.
mask_index = vectorizer.target_vocab.mask_index
epoch_bar = tqdm(desc='training routine',
                 total=args.num_epochs,
                 position=0)

dataset.set_split('train')
train_bar = tqdm(desc='split=train',
                 total=dataset.get_num_batches(args.batch_size),
                 position=0,
                 leave=True)
dataset.set_split('val')
val_bar = tqdm(desc='split=val',
               total=dataset.get_num_batches(args.batch_size),
               position=0,
               leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # ---------------- training pass ----------------
        # setup: batch generator, set loss and acc to 0, set train mode on
        dataset.set_split('train')
        batch_generator = generate_nmt_batches(dataset,
                                               batch_size=args.batch_size,
                                               device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        model.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = model(batch_dict['x_source'],
                           batch_dict['x_source_length'],
                           batch_dict['x_target'])

            # step 3. compute the loss
            loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index)

            # step 4. use loss to produce gradients
            loss.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()

            # incremental mean of the loss and accuracy over the batches seen so far
            running_loss += (loss.item() - running_loss) / (batch_index + 1)
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss,
                                  acc=running_acc,
                                  epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # ---------------- validation pass ----------------
        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_split('val')
        batch_generator = generate_nmt_batches(dataset,
                                               batch_size=args.batch_size,
                                               device=args.device)
        running_loss = 0.
        running_acc = 0.
        model.eval()

        # no parameter update happens here, so skip building the autograd graph
        with torch.no_grad():
            for batch_index, batch_dict in enumerate(batch_generator):
                # compute the output
                y_pred = model(batch_dict['x_source'],
                               batch_dict['x_source_length'],
                               batch_dict['x_target'])

                # compute the loss
                loss = sequence_loss(y_pred, batch_dict['y_target'], mask_index)

                # incremental mean of the loss and accuracy
                running_loss += (loss.item() - running_loss) / (batch_index + 1)
                acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
                running_acc += (acc_t - running_acc) / (batch_index + 1)

                # update bar
                val_bar.set_postfix(loss=running_loss, acc=running_acc,
                                    epoch=epoch_index)
                val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=model,
                                         train_state=train_state)

        # rewind the per-epoch bars and advance the epoch bar
        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()

        # honour the early-stopping decision made by update_train_state
        if train_state['stop_early']:
            break

except KeyboardInterrupt:
    print("Exiting loop")

training routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/9 [00:00<?, ?it/s]

split=val:   0%|          | 0/1 [00:00<?, ?it/s]

## Evaluate

In [None]:
def get_source_sentence(vectorizer, batch_dict, index):
    """Decode the source-side sentence of one batch item.

    Args:
        vectorizer: object exposing a `source_vocab` attribute.
        batch_dict: mapping with an 'x_source' tensor of token indices.
        index: position of the item inside the batch.

    Returns:
        The string produced by `sentence_from_indices` for that item.
    """
    source_row = batch_dict['x_source'][index]
    return sentence_from_indices(source_row.cpu().data.numpy(),
                                 vectorizer.source_vocab)

def get_true_sentence(vectorizer, batch_dict, index):
    """Decode the reference (ground-truth) target sentence of one batch item.

    Args:
        vectorizer: object exposing a `target_vocab` attribute.
        batch_dict: mapping with a 'y_target' tensor of token indices.
        index: position of the item inside the batch.

    Returns:
        The string produced by `sentence_from_indices` for that item.
    """
    # Move the whole target tensor to host memory first, then pick the row.
    target_row = batch_dict['y_target'].cpu().data.numpy()[index]
    target_vocab = vectorizer.target_vocab
    return sentence_from_indices(target_row, target_vocab)
    
def get_sampled_sentence(vectorizer, batch_dict, index):
    """Decode the model's highest-scoring prediction for one batch item.

    Runs the notebook-level `model` on the whole batch, takes the arg-max
    along dimension 2 of its output and decodes the row at `index` with the
    target vocabulary.

    Args:
        vectorizer: object exposing a `target_vocab` attribute.
        batch_dict: mapping with 'x_source', 'x_source_length' and
            'x_target' entries, passed to the model as keyword arguments.
        index: position of the item inside the batch.

    Returns:
        The string produced by `sentence_from_indices` for the predicted
        indices of that item.
    """
    # Inference only: disable autograd so no graph is built or retained
    # for a forward pass whose output is just decoded to text.
    with torch.no_grad():
        y_pred = model(x_source=batch_dict['x_source'],
                       x_source_lengths=batch_dict['x_source_length'],
                       target_sequence=batch_dict['x_target'])
    # presumably y_pred is (batch, sequence, vocabulary) — TODO confirm;
    # the code only requires that dimension 2 exists.
    _, predicted_indices = torch.max(y_pred, dim=2)
    return sentence_from_indices(predicted_indices.cpu().numpy()[index],
                                 vectorizer.target_vocab)

def get_all_sentences(vectorizer, batch_dict, index):
    """Collect the source, reference and predicted sentences for one item.

    Args:
        vectorizer: forwarded unchanged to each decoding helper.
        batch_dict: forwarded unchanged to each decoding helper.
        index: position of the item inside the batch.

    Returns:
        dict with keys "source", "truth" and "sampled" (in that order), each
        holding the string returned by the corresponding helper.
    """
    decoders = (
        ("source", get_source_sentence),
        ("truth", get_true_sentence),
        ("sampled", get_sampled_sentence),
    )
    return {label: decode(vectorizer, batch_dict, index)
            for label, decode in decoders}
    
def sentence_from_indices(indices, vocab, strict=True):
    """Turn a sequence of vocabulary indices into a space-joined string.

    Args:
        indices: iterable of integer token indices.
        vocab: object exposing `begin_seq_index`, `end_seq_index` and a
            `lookup_index(index)` method that returns the token string.
        strict: when true, begin-of-sequence indices are skipped and decoding
            stops at the first end-of-sequence index. When false, every
            index is looked up and emitted, markers included.

    Returns:
        str: the looked-up tokens joined by single spaces ("" when nothing
        was emitted).
    """
    tokens = []
    for index in indices:
        if strict and index == vocab.begin_seq_index:
            # Sequence-start marker carries no text.
            continue
        if strict and index == vocab.end_seq_index:
            # Everything after the end marker is ignored.
            break
        tokens.append(vocab.lookup_index(index))
    return " ".join(tokens)


# Qualitative check: decode one example per batch of the 'val' split and
# print the source text, the model's prediction and the reference side by side.
dataset.set_split('val')
batch_generator = generate_nmt_batches(dataset, 
                                       batch_size=args.batch_size, 
                                       device=args.device)

# NOTE(review): `test_sample` is assigned here but not read anywhere in this
# cell; confirm whether a later cell uses it before removing.
test_sample = '/content/drive/MyDrive/Pycon2021/Dataset Puisi - Manual - Sheet1.csv'

for batch_dict in batch_generator:
    # Only the first item (index 0) of each batch is decoded.
    results = get_all_sentences(vectorizer, batch_dict, 0)
    # Raw dict first, then a separator line and the three fields individually.
    print(results)
    print('-'*100)
    print("SOURCE : ", results['source'])
    print("SAMPLED: ", results['sampled'])
    print("TRUTH: ",results['truth'])

{'source': 'jam dua belas siang hari . matahari <UNK> di tengah langit . tak ada angin . tak mega . maria zaitun ke luar rumah pelacuran . tanpa <UNK> . tak ada lagi <UNK> . <UNK> <UNK> <UNK> muka . <UNK> ia berjalan . badannya <UNK> . sipilis membakar tubuhnya . penuh borok di <UNK> di leher , di <UNK> , dan di <UNK> . <UNK> merah . <UNK> kering . <UNK> <UNK> . sakit jantungnya <UNK> pula . ia pergi kepada dokter . banyak <UNK> lebih dulu menunggu . ia <UNK> di antara mereka . tiba tiba orang orang <UNK> dan menutup <UNK> mereka . ia <UNK> marah tapi <UNK> <UNK> <UNK> <UNK> . ia <UNK> <UNK> lebih dulu dan tak ada orang <UNK> . maria zaitun , <UNK> sudah banyak padaku , kata dokter . ya , <UNK> . sekarang <UNK> <UNK> ? tak ada . dokter <UNK> kepala dan <UNK> <UNK> . ia <UNK> waktu membuka <UNK> <UNK> <UNK> <UNK> di borok <UNK> . cukup , kata dokter . dan ia tak jadi <UNK> . lalu ia <UNK> kepada <UNK> <UNK> ia <UNK> vitamin c . dengan kaget <UNK> <UNK> kembali vitamin c ? dokter , palin