In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!git clone https://github.com/zouidine/AMT_LLMs.git

Cloning into 'AMT_LLMs'...
remote: Enumerating objects: 20, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (16/16), done.[K
Receiving objects: 100% (20/20), 44.45 KiB | 3.17 MiB/s, done.
Resolving deltas: 100% (3/3), done.
remote: Total 20 (delta 3), reused 0 (delta 0), pack-reused 0[K


In [3]:
pip install mosestokenizer

Collecting mosestokenizer
  Downloading mosestokenizer-1.2.1.tar.gz (37 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting docopt (from mosestokenizer)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting openfile (from mosestokenizer)
  Downloading openfile-0.0.7-py3-none-any.whl (2.4 kB)
Collecting uctools (from mosestokenizer)
  Downloading uctools-1.3.0.tar.gz (4.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting toolwrapper (from mosestokenizer)
  Downloading toolwrapper-2.1.0.tar.gz (3.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: mosestokenizer, docopt, toolwrapper, uctools
  Building wheel for mosestokenizer (setup.py) ... [?25l[?25hdone
  Created wheel for mosestokenizer: filename=mosestokenizer-1.2.1-py3-none-any.whl size=49171 sha256=544c6016e32469b65aadc540a42fdc80dc9aa7dd62841eae1ca78754915330b5
  Stored in directory: /root/.cache/

In [4]:
pip  install -U farasapy

Collecting farasapy
  Downloading farasapy-0.0.14-py3-none-any.whl (11 kB)
Installing collected packages: farasapy
Successfully installed farasapy-0.0.14


In [5]:
!pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.0.0->bert-score)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.0.0->bert-score)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.0.0->bert-score)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.0.0->bert-score)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3

# Prepeocessing

In [6]:
from farasa.segmenter import FarasaSegmenter
from mosestokenizer import *
import torch
import re

class Preprocessing():

    def __init__(self, lang):
        self.lang = lang
        self.word2index = {"<PAD>":0, "<SOS>":1, "<EOS>":2, "<UNK>":3}
        self.index2word = {0:"<PAD>", 1:"<SOS>", 2:"<EOS>", 3:"<UNK>"}
        self.word_count = {}
        self.n_words = 4

    # English preprocessing
    def en_clean(self, sentence):
        return re.sub(r"[^a-zA-Z0-9.?!' ]+", "", sentence)

    def lowercase(self, sentence):
      return sentence.lower()

    # Arabic preprocessing
    def ar_clean(self, sentence):
        #remove punctuations
        arabic_punctuations = '''`÷×؛<>_()*&^%][،/:"'{}~¦+|”…“–ـ\#=-,٬@—‘♫;٪อรอย$♪'''
        translator = str.maketrans('', '', arabic_punctuations)
        sentence = sentence.translate(translator)
        #hindi numbers to arabic numbers
        hindi_nums = "٠١٢٣٤٥٦٧٨٩"
        arabic_nums = "0123456789"
        hindi_to_arabic_map = str.maketrans(hindi_nums, arabic_nums)
        sentence = sentence.translate(hindi_to_arabic_map)
        #remove elongations
        sentence = re.sub(r'(.)\1+', r'\1', sentence)
        return sentence

    def normalize(self, sentence):
        #remove diacritics
        arabic_diacritics = re.compile("""
                                ّ    | # Tashdid
                                َ    | # Fatha
                                ً    | # Tanwin Fath
                                ُ    | # Damma
                                ٌ    | # Tanwin Damm
                                ِ    | # Kasra
                                ٍ    | # Tanwin Kasr
                                ْ    | # Sukun
                                ـ     # Tatwil/Kashida
                            """, re.VERBOSE)
        sentence = re.sub(arabic_diacritics, '', sentence)
        sentence = re.sub("[إأآا]", "ا", sentence)
        sentence = re.sub("ى", "ي", sentence)
        sentence = re.sub("ة", "ه", sentence)
        return sentence

    def clean(self, l_sen):
        if self.lang == 'ar':
            return [self.ar_clean(sen) for sen in l_sen]
        else: return [self.en_clean(sen) for sen in l_sen]

    def tokenize(self, l_sen):
        l = l_sen
        if self.lang == 'ar':
            tokenize = MosesTokenizer(lang="ar")
            farasa_segmenter = FarasaSegmenter(interactive=True)
            l = [farasa_segmenter.segment(sen) for sen in l]
            l = [self.normalize(sen) for sen in l]
            return [["<SOS>"] + tokenize(sen) + ["<EOS>"] for sen in l]
        else:
            tokenize = MosesTokenizer('en', no_escape=True)
            l = [self.lowercase(sen) for sen in l]
            tkns = [["<SOS>"] + tokenize(sen) + ["<EOS>"] for sen in l]
            return [[tkn for tkn in sen if tkn != "'"] for sen in tkns]

    def creat_vocabulary(self, sentences):
        for sentence in sentences:
            for word in sentence:
                if word not in self.word_count:
                    self.word_count[word] = 1
                else: self.word_count[word] += 1
                if word not in self.word2index and self.word_count[word]>=2:
                    self.word2index[word] = self.n_words
                    self.index2word[self.n_words] = word
                    self.n_words += 1

    def creat_tensors(self, l_sen_tkn):
        max_len = max([len(sen) for sen in l_sen_tkn])
        batch = len(l_sen_tkn)
        tensor_data = torch.zeros(batch, max_len, dtype=torch.long)
        tensor_mask = []
        for i in range(batch):
            ids = [self.word2index.get(w, self.word2index["<UNK>"]) for w in l_sen_tkn[i]]
            tensor_data[i, 0:len(ids)] = torch.tensor(ids, dtype=torch.long)
            tensor_mask.append(len(ids))
        return tensor_data, torch.tensor(tensor_mask)

#Model

In [7]:
import math
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence

def get_positional_encoding(d_model, max_length=100):
    positional_encoding = torch.zeros((max_length, d_model))
    for i in range(max_length):
        for j in range(d_model):
            if j % 2 == 0:
                positional_encoding[i, j] = math.sin(i/math.
                                                     pow(10000, j/d_model))
            else:
                positional_encoding[i, j] = math.cos(i/math.
                                                     pow(10000, (j - 1)/d_model))

    ## (1, max_length, d_model)
    positional_encoding = positional_encoding.unsqueeze(0)

    return positional_encoding


class MultiHeadAttention(nn.Module):
    """
    The Multi-Head Attention sublayer.
    """

    def __init__(self, d_model, n_heads, d_queries, d_values, dropout,
                 in_decoder=False):
        super(MultiHeadAttention, self).__init__()

        self.d_model = d_model
        self.n_heads = n_heads

        self.d_queries = d_queries
        self.d_values = d_values
        self.d_keys = d_queries

        self.in_decoder = in_decoder

        self.cast_queries = nn.Linear(d_model, n_heads*d_queries)

        self.cast_keys_values = nn.Linear(d_model, n_heads*(d_queries+d_values))

        self.cast_output = nn.Linear(n_heads*d_values, d_model)

        self.softmax = nn.Softmax(dim=-1)

        self.layer_norm = nn.LayerNorm(d_model)

        self.apply_dropout = nn.Dropout(dropout)

    def forward(self, query_sequences, key_value_sequences,
                key_value_sequence_lengths):
        batch_size = query_sequences.size(0)
        query_sequence_pad_length = query_sequences.size(1)
        key_value_sequence_pad_length = key_value_sequences.size(1)

        # Is this self-attention?
        self_attention = torch.equal(key_value_sequences, query_sequences)

        # Store input for adding later
        input_to_add = query_sequences.clone()

        # Apply layer normalization
        ## (N, query_sequence_pad_length, d_model)
        query_sequences = self.layer_norm(query_sequences)
        if self_attention:
            ## (N, key_value_sequence_pad_length, d_model)
            key_value_sequences = self.layer_norm(key_value_sequences)

        # Project input sequences to queries, keys, values
        ## (N, query_sequence_pad_length, n_heads*d_queries)
        queries = self.cast_queries(query_sequences)
        ## (N, key_value_sequence_pad_length, n_heads*d_keys)
        ## (N, key_value_sequence_pad_length, n_heads*d_values)
        keys, values = self.cast_keys_values(key_value_sequences
                                             ).split(split_size=self.n_heads*
                                                     self.d_keys, dim=-1)

        # Split the last dimension by the n_heads subspaces
        ## (N, query_sequence_pad_length, n_heads, d_queries)
        queries = queries.contiguous().view(batch_size, query_sequence_pad_length,
                                            self.n_heads, self.d_queries)
        ## (N, key_value_sequence_pad_length, n_heads, d_keys)
        keys = keys.contiguous().view(batch_size, key_value_sequence_pad_length,
                                      self.n_heads, self.d_keys)
        ## (N, key_value_sequence_pad_length, n_heads, d_values)
        values = values.contiguous().view(batch_size,
                                          key_value_sequence_pad_length,
                                          self.n_heads, self.d_values)

        # Re-arrange axes such that the last two dimensions are the sequence
        # lengths and the queries/keys/values. And then, for convenience,
        # convert to 3D tensors by merging the batch and n_heads dimensions
        # This is to prepare it for the batch matrix multiplication
        ## (N * n_heads, query_sequence_pad_length, d_queries)
        queries = queries.permute(0, 2, 1, 3
                                  ).contiguous().view(-1,
                                                      query_sequence_pad_length,
                                                      self.d_queries)
        ## (N * n_heads, key_value_sequence_pad_length, d_keys)
        keys = keys.permute(0, 2, 1, 3
                            ).contiguous().view(-1,
                                                key_value_sequence_pad_length,
                                                self.d_keys)
        ## (N * n_heads, key_value_sequence_pad_length, d_values)
        values = values.permute(0, 2, 1, 3
                                ).contiguous().view(-1,
                                                    key_value_sequence_pad_length,
                                                    self.d_values)

        # Perform multi-head attention

        # Perform dot-products
        ## (N * n_heads, query_sequence_pad_length, key_value_sequence_pad_length)
        attention_weights = torch.bmm(queries, keys.permute(0, 2, 1))

        # Scale dot-products
        ## (N * n_heads, query_sequence_pad_length, key_value_sequence_pad_length)
        attention_weights = (1. / math.sqrt(self.d_keys)) * attention_weights

        # Before computing softmax weights, prevent queries from attending to certain keys

        # MASK 1: keys that are pads
        ## (N * n_heads, query_sequence_pad_length, key_value_sequence_pad_length)
        not_pad_in_keys = torch.LongTensor(range(key_value_sequence_pad_length)
        ).unsqueeze(0).unsqueeze(0).expand_as(attention_weights).to(device)
        not_pad_in_keys = not_pad_in_keys < key_value_sequence_lengths.\
                            repeat_interleave(self.n_heads).unsqueeze(
                                1).unsqueeze(2).expand_as(attention_weights)

        attention_weights = attention_weights.masked_fill(~not_pad_in_keys,
                                                          -float('inf'))

        # MASK 2: if this is self-attention in the decoder,
        # keys chronologically ahead of queries
        if self.in_decoder and self_attention:
            # Therefore, a position [n, i, j] is valid only if j <= i
            # torch.tril(), i.e. lower triangle in a 2D matrix, sets j > i to 0
            not_future_mask = torch.ones_like(attention_weights
                                              ).tril().bool().to(device)

            # Mask away by setting such weights to a large negative number,
            # so that they evaluate to 0 under the softmax
            attention_weights = attention_weights.masked_fill(~not_future_mask,
                                                              -float('inf'))

        # Compute softmax along the key dimension
        attention_weights = self.softmax(attention_weights)

        # Apply dropout
        ## (N * n_heads, query_sequence_pad_length, key_value_sequence_pad_length)
        attention_weights = self.apply_dropout(attention_weights)

        # Calculate sequences as the weighted sums of values based on these softmax weights
        ## (N * n_heads, query_sequence_pad_length, d_values)
        sequences = torch.bmm(attention_weights, values)

        # Unmerge batch and n_heads dimensions and restore original order of axes
        ## (N, query_sequence_pad_length, n_heads, d_values)
        sequences = sequences.contiguous().view(batch_size, self.n_heads,
                                                query_sequence_pad_length,
                                                self.d_values).permute(0, 2, 1, 3)

        # Concatenate the n_heads subspaces (each with an output of size d_values)
        ## (N, query_sequence_pad_length, n_heads * d_values)
        sequences = sequences.contiguous().view(batch_size,
                                                query_sequence_pad_length, -1)

        # Transform the concatenated subspace-sequences into a single output of size d_model
        ## (N, query_sequence_pad_length, d_model)
        sequences = self.cast_output(sequences)

        # Apply dropout and residual connection
        sequences = self.apply_dropout(sequences) + input_to_add

        return sequences


class PositionWiseFCNetwork(nn.Module):
    """
    The Position-Wise Feed Forward Network sublayer.
    """

    def __init__(self, d_model, d_inner, dropout):
        super(PositionWiseFCNetwork, self).__init__()

        self.d_model = d_model
        self.d_inner = d_inner

        # Layer-norm layer
        self.layer_norm = nn.LayerNorm(d_model)

        # A linear layer to project from the input size to an intermediate size
        self.fc1 = nn.Linear(d_model, d_inner)

        # ReLU
        self.relu = nn.ReLU()

        # A linear layer to project from the intermediate size to the output size (same as the input size)
        self.fc2 = nn.Linear(d_inner, d_model)

        # Dropout layer
        self.apply_dropout = nn.Dropout(dropout)

    def forward(self, sequences):
        # Store input for adding later
        input_to_add = sequences.clone()  # (N, pad_length, d_model)

        # Apply layer-norm
        sequences = self.layer_norm(sequences)  # (N, pad_length, d_model)

        # Transform position-wise
        ## (N, pad_length, d_inner)
        sequences = self.apply_dropout(self.relu(self.fc1(sequences)))
        sequences = self.fc2(sequences)  # (N, pad_length, d_model)

        # Apply dropout and residual connection
        ## (N, pad_length, d_model)
        sequences = self.apply_dropout(sequences) + input_to_add

        return sequences


class Encoder(nn.Module):
    """
    The Encoder.
    """

    def __init__(self, vocab_size, positional_encoding, d_model, n_heads,
                 d_queries, d_values, d_inner, n_layers, dropout):
        super(Encoder, self).__init__()

        self.vocab_size = vocab_size
        self.positional_encoding = positional_encoding
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_queries = d_queries
        self.d_values = d_values
        self.d_inner = d_inner
        self.n_layers = n_layers
        self.dropout = dropout

        # An embedding layer
        self.embedding = nn.Embedding(vocab_size, d_model)

        # Set the positional encoding tensor to be un-update-able
        self.positional_encoding.requires_grad = False

        # Encoder layers
        self.encoder_layers = nn.ModuleList([self.make_encoder_layer() for i
                                             in range(n_layers)])

        # Dropout layer
        self.apply_dropout = nn.Dropout(dropout)

        # Layer-norm layer
        self.layer_norm = nn.LayerNorm(d_model)

    def make_encoder_layer(self):
        # A ModuleList of sublayers
        encoder_layer = nn.ModuleList([MultiHeadAttention(d_model=self.d_model,
                                                          n_heads=self.n_heads,
                                                          d_queries=self.d_queries,
                                                          d_values=self.d_values,
                                                          dropout=self.dropout,
                                                          in_decoder=False),
                                       PositionWiseFCNetwork(d_model=self.d_model,
                                                             d_inner=self.d_inner,
                                                             dropout=self.dropout)])

        return encoder_layer

    def forward(self, encoder_sequences, encoder_sequence_lengths):
        # pad-length of this batch only, varies across batches
        pad_length = encoder_sequences.size(1)

        # Sum vocab embeddings and position embeddings
        encoder_sequences = self.embedding(encoder_sequences) *\
                            math.sqrt(self.d_model) +\
                            self.positional_encoding[:, :pad_length, :].to(
                                device)  # (N, pad_length, d_model)

        # Dropout
        encoder_sequences = self.apply_dropout(encoder_sequences)

        # Encoder layers
        for encoder_layer in self.encoder_layers:
            # Sublayers
            encoder_sequences = encoder_layer[0](encoder_sequences,
                                                 encoder_sequences,
                                                 encoder_sequence_lengths)
            encoder_sequences = encoder_layer[1](encoder_sequences)

        # Apply layer-norm
        ## (N, pad_length, d_model)
        encoder_sequences = self.layer_norm(encoder_sequences)

        return encoder_sequences


class Decoder(nn.Module):
    """
    The Decoder.
    """

    def __init__(self, vocab_size, positional_encoding, d_model, n_heads,
                 d_queries, d_values, d_inner, n_layers, dropout):
        super(Decoder, self).__init__()

        self.vocab_size = vocab_size
        self.positional_encoding = positional_encoding
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_queries = d_queries
        self.d_values = d_values
        self.d_inner = d_inner
        self.n_layers = n_layers
        self.dropout = dropout

        # An embedding layer
        self.embedding = nn.Embedding(vocab_size, d_model)

        # Set the positional encoding tensor to be un-update-able
        self.positional_encoding.requires_grad = False

        # Decoder layers
        self.decoder_layers = nn.ModuleList([self.make_decoder_layer() for i
                                             in range(n_layers)])

        # Dropout layer
        self.apply_dropout = nn.Dropout(dropout)

        # Layer-norm layer
        self.layer_norm = nn.LayerNorm(d_model)

        # Output linear layer that will compute logits for the vocabulary
        self.fc = nn.Linear(d_model, vocab_size)

    def make_decoder_layer(self):
        # A ModuleList of sublayers
        decoder_layer = nn.ModuleList([MultiHeadAttention(d_model=self.d_model,
                                                          n_heads=self.n_heads,
                                                          d_queries=self.d_queries,
                                                          d_values=self.d_values,
                                                          dropout=self.dropout,
                                                          in_decoder=True),
                                       MultiHeadAttention(d_model=self.d_model,
                                                          n_heads=self.n_heads,
                                                          d_queries=self.d_queries,
                                                          d_values=self.d_values,
                                                          dropout=self.dropout,
                                                          in_decoder=True),
                                       PositionWiseFCNetwork(d_model=self.d_model,
                                                             d_inner=self.d_inner,
                                                             dropout=self.dropout)])

        return decoder_layer

    def forward(self, decoder_sequences, decoder_sequence_lengths,
                encoder_sequences, encoder_sequence_lengths):
        pad_length = decoder_sequences.size(1)

        # Sum vocab embeddings and position embeddings
        decoder_sequences = self.embedding(decoder_sequences) *\
                             math.sqrt(self.d_model) +\
                             self.positional_encoding[:, :pad_length, :].to(
                                 device)  # (N, pad_length, d_model)

        # Dropout
        decoder_sequences = self.apply_dropout(decoder_sequences)

        # Decoder layers
        for decoder_layer in self.decoder_layers:
            # Sublayers
            decoder_sequences = decoder_layer[0](decoder_sequences,
                                                 decoder_sequences,
                                                 decoder_sequence_lengths)
            decoder_sequences = decoder_layer[1](decoder_sequences,
                                                 encoder_sequences,
                                                 encoder_sequence_lengths)
            decoder_sequences = decoder_layer[2](decoder_sequences)

        # Apply layer-norm
        decoder_sequences = self.layer_norm(decoder_sequences)

        # Find logits over vocabulary
        ## (N, pad_length, vocab_size)
        decoder_sequences = self.fc(decoder_sequences)

        return decoder_sequences


class Transformer(nn.Module):
    """
    The Transformer network.
    """

    def __init__(self, in_vocab_size, out_vocab_size, positional_encoding,
                 d_model=256, n_heads=8, d_queries=64, d_values=64,
                 d_inner=1024, n_layers=4, dropout=0.1):
        super(Transformer, self).__init__()

        self.in_vocab_size = in_vocab_size
        self.out_vocab_size = out_vocab_size
        self.positional_encoding = positional_encoding
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_queries = d_queries
        self.d_values = d_values
        self.d_inner = d_inner
        self.n_layers = n_layers
        self.dropout = dropout

        # Encoder
        self.encoder = Encoder(vocab_size=in_vocab_size,
                               positional_encoding=positional_encoding,
                               d_model=d_model,
                               n_heads=n_heads,
                               d_queries=d_queries,
                               d_values=d_values,
                               d_inner=d_inner,
                               n_layers=n_layers,
                               dropout=dropout)

        # Decoder
        self.decoder = Decoder(vocab_size=out_vocab_size,
                               positional_encoding=positional_encoding,
                               d_model=d_model,
                               n_heads=n_heads,
                               d_queries=d_queries,
                               d_values=d_values,
                               d_inner=d_inner,
                               n_layers=n_layers,
                               dropout=dropout)

        # Initialize weights
        self.init_weights()

    def init_weights(self):
        """
        Initialize weights in the transformer model.
        """
        # Glorot uniform initialization with a gain of 1.
        for p in self.parameters():
            # Glorot initialization needs at least two dimensions on the tensor
            if p.dim() > 1:
                nn.init.xavier_uniform_(p, gain=1.)

        # Share weights between the embedding layers and the logit layer
        nn.init.normal_(self.encoder.embedding.weight, mean=0.,
                        std=math.pow(self.d_model, -0.5))
        nn.init.normal_(self.decoder.embedding.weight, mean=0.,
                        std=math.pow(self.d_model, -0.5))
        self.decoder.fc.weight = self.decoder.embedding.weight

        print("Model initialized.")

    def forward(self, encoder_sequences, decoder_sequences,
                encoder_sequence_lengths, decoder_sequence_lengths):
        # Encoder
        ## (N, encoder_sequence_pad_length, d_model)
        encoder_sequences = self.encoder(encoder_sequences,
                                         encoder_sequence_lengths)

        # Decoder
        ## (N, decoder_sequence_pad_length, vocab_size)
        decoder_sequences = self.decoder(decoder_sequences,
                                         decoder_sequence_lengths,
                                         encoder_sequences,
                                         encoder_sequence_lengths)

        return decoder_sequences


class LabelSmoothedCE(torch.nn.Module):
    """
    Cross Entropy loss with label-smoothing as a form of regularization.
    """

    def __init__(self, eps=0.1):
        super(LabelSmoothedCE, self).__init__()
        self.eps = eps

    def forward(self, inputs, targets, lengths):
        # Remove pad-positions and flatten
        inputs, _, _, _ = pack_padded_sequence(input=inputs,
                                               lengths=lengths.cpu(),
                                               batch_first=True,
                                               enforce_sorted=False)
        targets, _, _, _ = pack_padded_sequence(input=targets,
                                                lengths=lengths.cpu(),
                                                batch_first=True,
                                                enforce_sorted=False)

        # "Smoothed" one-hot vectors for the gold sequences
        ## (sum(lengths), n_classes), one-hot
        target_vector = torch.zeros_like(inputs).scatter(dim=1,
                                                         index=targets.unsqueeze(1),
                                                         value=1.).to(device)
        target_vector = target_vector * (1. - self.eps) +\
                         self.eps / target_vector.size(1)

        # Compute smoothed cross-entropy loss
        ## (sum(lengths))
        loss = (-1 * target_vector * F.log_softmax(inputs, dim=1)).sum(dim=1)

        # Compute mean loss
        loss = torch.mean(loss)

        return loss

#Train

In [8]:
from tqdm import tqdm
import torch

def train(model, train_loader, device, foldername=""):
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = LabelSmoothedCE()
    if foldername != "":
        final_path = foldername + "final_en_ar.pth"

    best_valid_loss = 1e10

    for epoch_no in range(10):
        model.train()
        avg_loss = 0
        with tqdm(train_loader) as it:
            for batch_no, (src, src_mask, trg, trg_mask) in enumerate(it,
                                                                      start=1):
                src, src_mask = src.to(device), src_mask.to(device)
                trg, trg_mask = trg.to(device), trg_mask.to(device)

                optimizer.zero_grad()
                output = model(src, trg, src_mask, trg_mask)
                loss = criterion(output, trg[:,1:], trg_mask-1)
                loss.backward()
                optimizer.step()
                avg_loss += loss.item()

                it.set_postfix(
                    ordered_dict={
                        "avg_epoch_loss": avg_loss / batch_no,
                        "epoch": epoch_no,
                    },
                    refresh=True,
                )
    if foldername != "":
        torch.save(model.state_dict(), final_path)

In [21]:
from nltk.translate.bleu_score import corpus_bleu

def translate(model, tensor, mask, preprocess, max_len=None):
    model.eval()
    outputs = []
    detokenize = MosesDetokenizer(trg_lang)

    encoder_output = model.encoder(tensor, mask)

    for i in range(tensor.shape[0]):

        out = ['<SOS>']

        # Our hypothesis to begin with is just <BOS>
        hypotheses = torch.LongTensor([[preprocess.word2index['<SOS>']]]
                                      ).to(device)  # (1, 1)
        hypotheses_lengths = torch.LongTensor([hypotheses.size(1)]).to(device)

        tkn = 0
        while(tkn != 2):
            output = model.decoder(hypotheses,
                                   hypotheses_lengths,
                                   encoder_output[i].unsqueeze(0),
                                   mask[i].unsqueeze(0))

            tkn = output[:,-1,:].argmax().item()
            out.append(preprocess.index2word[tkn])

            if (len(out)==max_len): break

            hypotheses = torch.cat([hypotheses,
                                    torch.LongTensor([[tkn]]).to(device)],
                                   dim=1)
            hypotheses_lengths += 1
        outputs.append(detokenize(out[1:-1]))

    return outputs

def evaluate(model, ref, tensor, mask, preprocessor):

    detokenize = MosesDetokenizer(trg_lang)
    references = [[detokenize(sen[1:-1])] for sen in ref]
    candidates = translate(model, tensor, mask, preprocessor, 100)

    bleu = corpus_bleu(references, candidates)

    print("BLEU Score:\t", round(100*bleu,2))
    return candidates

#Main

In [10]:
src_lang = 'en'
trg_lang = 'ar'
src_preprocess = Preprocessing(src_lang)
trg_preprocess = Preprocessing(trg_lang)
foldername = "/content/drive/MyDrive/Colab/11-AMT_LLMs/"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# To enhance reproducibility
seed = 1234
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
################################################################################
print("***** Loading Data ...")
train_data_path = '/content/drive/MyDrive/Colab/0-data/IWSLT2016/'
test_data_path = '/content/AMT_LLMs/data/'

src_train = open('{}train_{}.txt'.format(train_data_path, src_lang),
                 encoding='utf-8').read().strip().split('\n')
trg_train = open('{}train_{}.txt'.format(train_data_path, trg_lang),
                 encoding='utf-8').read().strip().split('\n')
src_train = src_preprocess.clean(src_train)
trg_train = trg_preprocess.clean(trg_train)
src_train_f = []
trg_train_f = []
for i in range(len(src_train)):
    if len(src_train[i].split()) < 20 and len(trg_train[i].split()) < 20:
        src_train_f.append(src_train[i])
        trg_train_f.append(trg_train[i])

src_test = open('{}en_examples.txt'.format(test_data_path),
                encoding='utf-8').read().strip().split('\n')
trg_test = open('{}ar_examples.txt'.format(test_data_path),
                encoding='utf-8').read().strip().split('\n')
src_test = src_preprocess.clean(src_test)
trg_test = trg_preprocess.clean(trg_test)
################################################################################
print("\n***** Tokenization ...")
src_train = src_preprocess.tokenize(src_train_f)
trg_train = trg_preprocess.tokenize(trg_train_f)
print("\tTrain Data size\t", len(src_train))

src_test = src_preprocess.tokenize(src_test)
trg_test = trg_preprocess.tokenize(trg_test)
################################################################################
print("\n***** Creating Vocabulary ...")

src_preprocess.creat_vocabulary(src_train)
trg_preprocess.creat_vocabulary(trg_train)

print("\t{} Vocabulary size:\t{}".format(src_lang, src_preprocess.n_words-4))
print("\t{} Vocabulary size:\t{}".format(trg_lang, trg_preprocess.n_words-4))
################################################################################
print("\n***** Creating tensors ...")
src_train_tensor, src_train_mask = src_preprocess.creat_tensors(src_train)
trg_train_tensor, trg_train_mask = trg_preprocess.creat_tensors(trg_train)

src_test_tensor, src_test_mask = src_preprocess.creat_tensors(src_test)
################################################################################
print("\n***** Creating Data loaders ...")
dataset = torch.utils.data.TensorDataset(src_train_tensor, src_train_mask,
                                         trg_train_tensor, trg_train_mask)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=128,
                                           shuffle=True, num_workers=0)
################################################################################
print("\n***** Creating Model ...")
INPUT_DIM = src_preprocess.n_words+1
OUTPUT_DIM = trg_preprocess.n_words+1
positional_encoding = get_positional_encoding(
    256, max_length=max(src_train_tensor.shape[1], trg_train_tensor.shape[1]))

model = Transformer(INPUT_DIM, OUTPUT_DIM, positional_encoding).to(device)
################################################################################
print("\n***** Training ...")
train(model, train_loader, device, foldername=foldername)

***** Loading Data ...

***** Tokenization ...




100%|██████████| 241M/241M [02:01<00:00, 1.99MiB/s]




	Train Data size	 150477

***** Creating Vocabulary ...
	en Vocabulary size:	25449
	ar Vocabulary size:	20736

***** Creating tensors ...

***** Creating Data loaders ...

***** Creating Model ...
Model initialized.

***** Training ...


100%|██████████| 1176/1176 [08:55<00:00,  2.20it/s, avg_epoch_loss=4.14, epoch=0]
100%|██████████| 1176/1176 [09:06<00:00,  2.15it/s, avg_epoch_loss=3.29, epoch=1]
100%|██████████| 1176/1176 [09:01<00:00,  2.17it/s, avg_epoch_loss=2.98, epoch=2]
100%|██████████| 1176/1176 [08:57<00:00,  2.19it/s, avg_epoch_loss=2.81, epoch=3]
100%|██████████| 1176/1176 [08:58<00:00,  2.18it/s, avg_epoch_loss=2.7, epoch=4]
100%|██████████| 1176/1176 [09:03<00:00,  2.17it/s, avg_epoch_loss=2.62, epoch=5]
100%|██████████| 1176/1176 [07:27<00:00,  2.63it/s, avg_epoch_loss=2.56, epoch=6]
100%|██████████| 1176/1176 [08:53<00:00,  2.20it/s, avg_epoch_loss=2.5, epoch=7]
100%|██████████| 1176/1176 [09:02<00:00,  2.17it/s, avg_epoch_loss=2.46, epoch=8]
100%|██████████| 1176/1176 [08:57<00:00,  2.19it/s, avg_epoch_loss=2.41, epoch=9]


In [22]:
print("\n***** Testing ...")
model.load_state_dict(torch.load(foldername + 'final_en_ar.pth'))
candidates = evaluate(model, trg_test, src_test_tensor.to(device),
                      src_test_mask.to(device), trg_preprocess)


***** Testing ...
BLEU Score:	 59.15


In [24]:
from transformers import BertTokenizer, BertForMaskedLM, BertModel
from bert_score import BERTScorer

scorer = BERTScorer(model_type='bert-base-uncased', lang='ar')
detokenize = MosesDetokenizer(trg_lang)
references = [detokenize(sen[1:-1]) for sen in trg_test]
_, _, F1 = scorer.score(candidates, references)
print("BERT Score:\t", round(100*F1.mean().item(),2))

BERT Score:	 82.95
