## Connecting to drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the project files are reachable from the runtime.
drive.mount('/content/drive')
# Switch into the project folder; later cells use relative paths such as './data/' and 'model/'.
%cd drive/MyDrive/Github/Natural-Language-Processing/NMT with attention

## Installing trax library

In [1]:
# Install Trax quietly (-q). The package listing printed by the import cell reports trax 1.3.4.
# NOTE(review): consider pinning the version (e.g. trax==1.3.4) for reproducibility — confirm it installs on the target runtime.
!pip -q install trax

You should consider upgrading via the '/opt/conda/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
import random
import numpy as np

import trax
from trax import layers as tl
from trax.fastmath import numpy as fastnp
from trax.supervised import training

# Print the installed Trax version so the notebook output records the environment it ran in.
!pip list | grep trax

INFO:tensorflow:tokens_length=568 inputs_length=512 targets_length=114 noise_density=0.15 mean_noise_span_length=3.0 
trax                     1.3.4
You should consider upgrading via the '/opt/conda/bin/python3 -m pip install --upgrade pip' command.[0m


# Data Section

## Loading dataset
This dataset contains a subset of (English, German) sentence pairs taken from medical texts collected from OPUS.

In [3]:
# Both splits are read from the same TFDS dataset and differ only in the `train` flag,
# so the arguments they share are written down once.
tfds_shared_kwargs = dict(
    data_dir='./data/',
    keys=('en', 'de'),       # (English, German) sentence pairs
    eval_holdout_size=0.01,  # hold out 1% of the data for evaluation
)

# Generator function for the training split.
# NOTE(review): TFDS may need to download the dataset on first use — confirm './data/' is populated.
train_stream_fn = trax.data.TFDS('opus/medical', train=True, **tfds_shared_kwargs)

# Generator function for the evaluation split.
eval_stream_fn = trax.data.TFDS('opus/medical', train=False, **tfds_shared_kwargs)

In [4]:
# Instantiate one example generator per split.
train_stream = train_stream_fn()
eval_stream = eval_stream_fn()

# Peek at one example from each split; the sentences inside each (en, de) tuple are raw bytes.
first_train_pair = next(train_stream)
first_eval_pair = next(eval_stream)
print('train data (en, de) tuple:', first_train_pair)
print('eval data (en, de) tuple:', first_eval_pair)

train data (en, de) tuple: (b'In the pregnant rat the AUC for calculated free drug at this dose was approximately 18 times the human AUC at a 20 mg dose.\n', b'Bei tr\xc3\xa4chtigen Ratten war die AUC f\xc3\xbcr die berechnete ungebundene Substanz bei dieser Dosis etwa 18-mal h\xc3\xb6her als die AUC beim Menschen bei einer 20 mg Dosis.\n')
eval data (en, de) tuple: (b'Lutropin alfa Subcutaneous use.\n', b'Pulver zur Injektion Lutropin alfa Subkutane Anwendung\n')


## Tokenizing dataset
Tokenizing involves splitting sentences into words (here, subwords) and then converting them to integers using a vocabulary.

In [5]:
# ende_32k.subword is a vocabulary that holds both English and German subwords
# (hint: byte pair encoding).
VOCAB_FILE = 'ende_32k.subword'
VOCAB_DIR = 'data/'

# Turn the raw text streams into streams of token ids; both splits use the same vocabulary.
vocab_kwargs = dict(vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)
tokenized_train_stream = trax.data.Tokenize(**vocab_kwargs)(train_stream)
tokenized_eval_stream = trax.data.Tokenize(**vocab_kwargs)(eval_stream)

# End-of-sentence marker: EOS sits at index 1 of the vocabulary.
EOS = 1
def append_eos(stream):
    """Yield each (inputs, targets) pair of `stream` with EOS appended to both sides.

    Args:
        stream: iterable of (inputs, targets) pairs of token-id sequences.

    Yields:
        A pair of NumPy arrays: the input ids followed by EOS and the target
        ids followed by EOS.
    """
    for source_ids, target_ids in stream:
        yield np.array([*source_ids, EOS]), np.array([*target_ids, EOS])


# Wrap both tokenized streams so that every sentence ends with EOS.
# The names are rebound to the wrapped generators, so this cell assumes the Tokenize lines above ran first.
tokenized_train_stream = append_eos(tokenized_train_stream)
tokenized_eval_stream = append_eos(tokenized_eval_stream)

## Filtering out too long sentences

In [6]:
# Drop overly long examples so training does not run out of memory.
# length_keys=[0, 1] applies the limit to both tuple members (inputs and targets).
keep_train_lengths = trax.data.FilterByLength(max_length=256, length_keys=[0, 1])
keep_eval_lengths = trax.data.FilterByLength(max_length=512, length_keys=[0, 1])

filtered_train_stream = keep_train_lengths(tokenized_train_stream)
filtered_eval_stream = keep_eval_lengths(tokenized_eval_stream)

## Building tokenize() and detokenize() functions for individual sentences

In [7]:
# Setup helper functions for tokenizing and detokenizing sentences
def tokenize(sentence, vocab_file=None, vocab_dir=None):
    """Encode one sentence as a (1, n_tokens) array of token ids ending with EOS.

    Args:
        sentence: the text to encode.
        vocab_file: vocabulary file name, forwarded to trax.data.tokenize.
        vocab_dir: directory holding the vocabulary file, forwarded likewise.

    Returns:
        NumPy array whose leading axis is a batch dimension of size 1.
    """
    eos_id = 1
    # trax.data.tokenize consumes a stream, so the sentence is wrapped in a one-item iterator.
    token_stream = trax.data.tokenize(iter([sentence]), vocab_file=vocab_file, vocab_dir=vocab_dir)
    token_ids = [*next(token_stream), eos_id]
    # Put the batch dimension in front of the token axis.
    return np.array(token_ids).reshape(1, -1)


def detokenize(integers, vocab_file=None, vocab_dir=None):
    """Decode token ids back into text, dropping EOS and everything after it.

    Args:
        integers: token ids, either a flat sequence or an array with a leading
            batch axis of size 1.
        vocab_file: vocabulary file name, forwarded to trax.data.detokenize.
        vocab_dir: directory holding the vocabulary file, forwarded likewise.

    Returns:
        Whatever trax.data.detokenize returns for the ids that precede the
        first EOS (all ids when no EOS is present).
    """
    EOS = 1
    # np.squeeze removes the batch axis, but it also collapses a single-token
    # input such as [EOS] to a 0-d array that list() cannot iterate;
    # np.atleast_1d keeps that case one-dimensional.
    integers = list(np.atleast_1d(np.squeeze(integers)))
    # Removing the EOS to decode only the original tokens
    if EOS in integers:
        integers = integers[:integers.index(EOS)]
    return trax.data.detokenize(integers, vocab_file=vocab_file, vocab_dir=vocab_dir)

In [8]:
# Round-trip a word that is not stored as a whole in the vocabulary file:
# the subwords 'hell' and 'o' are combined to form 'hello'.
hello_ids = tokenize('hello', vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)
print("tokenize('hello'): ", hello_ids)

hello_text = detokenize([17332, 140, 1], vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)
print("detokenize([17332, 140, 1]): ", hello_text)

tokenize('hello'):  [[17332   140     1]]
detokenize([17332, 140, 1]):  hello


## Bucketing
Bucketing places sentences of similar length in the same batch, so only minimal padding is needed to make all sentences in a batch the same length.

In [9]:
# Bucketing turns the example streams into streams of batches. Buckets are defined by
# boundaries and batch sizes: batch_sizes[i] is the batch size for items with
# length < boundaries[i]. Below that means 256 sentences per batch for length < 8,
# 128 for lengths between 8 and 16, and so on -- and only 2 for length > 512.
boundaries = [8, 16, 32, 64, 128, 256, 512]
batch_sizes = [256, 128, 64, 32, 16, 8, 4, 2]

# Bucket each split, then add loss weights that mask the padding.
# length_keys=[0, 1] counts both inputs and targets towards the length;
# id_to_mask=0 because the <pad> token sits at index 0 of the vocab file.
train_batch_stream = trax.data.AddLossWeights(id_to_mask=0)(
    trax.data.BucketByLength(boundaries, batch_sizes, length_keys=[0, 1])(filtered_train_stream)
)
eval_batch_stream = trax.data.AddLossWeights(id_to_mask=0)(
    trax.data.BucketByLength(boundaries, batch_sizes, length_keys=[0, 1])(filtered_eval_stream)
)

# Pull one training batch and show its shape: (batch length, sentence length).
input_batch, target_batch, mask_batch = next(train_batch_stream)
print("input_batch shape: ", input_batch.shape)
print("target_batch shape: ", target_batch.shape, '\n')

input_batch shape:  (32, 64)
target_batch shape:  (32, 64) 



In [10]:
# Inspect one randomly chosen sentence pair from the batch, as text and as token ids.
index = random.randrange(len(input_batch))
english_ids, german_ids = input_batch[index], target_batch[index]

english_text = detokenize(english_ids, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)
german_text = detokenize(german_ids, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)

print('English sentence:', english_text, '\n')
print('Tokenized version of english sentence: \n', english_ids, '\n')
print('German sentence: ', german_text, '\n')
print('Tokenized version of german sentence: \n', german_ids, '\n')

English sentence: The adjusted mean difference was -4.3 points (CI 95% -6.4; -2.1 points, p-value < 0.0001).
 

Tokenized version of english sentence: 
 [   29  9701  1516  2640    53  1581   219     3   199  1164    50  7082
     5  4207 11767    15   330     3   219  7108    15   150     3   135
  1164     2   719    15   980   909 33287   913   266     3  8074  3912
 33022 30650  4729   992     1     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0] 

German sentence:  Die angepasste mittlere Differenz betrug -4,3 Punkte (95 %-Konfidenzintervall: -6,4 bis -2,1 Punkte, p-Wert < 0,0001).
 

Tokenized version of german sentence: 
 [   57 30482  8385   191 14998     5 12919 20657  1581   219   227   199
  2927    50  4207 11770    15 11580  7770 13427  9436 19070     5  2801
    15   330   227   219   248  1581   150   227   135  2927     2   719
    15  1619   909 33287   913   266   227  8074  3912 

# Model Section

<img src = "NMT_Model.png">

## Function to build Encoder
This is a helper function that stacks trax layers to build encoder portion for the NMT model

In [11]:
def input_encoder_fn(input_vocab_size, d_model, num_encoder_layers):
    """Build the input encoder: an embedding followed by a stack of LSTMs.

    Args:
        input_vocab_size: vocabulary size handed to the embedding layer.
        d_model: embedding depth, also used as the number of units of every LSTM.
        num_encoder_layers: how many LSTM layers to stack.

    Returns:
        A tl.Serial combinator holding the embedding and the LSTM stack.
    """
    token_embedding = tl.Embedding(vocab_size=input_vocab_size, d_feature=d_model)
    # Each LSTM returns the full sequence as its output.
    lstm_stack = [tl.LSTM(n_units=d_model) for _ in range(num_encoder_layers)]
    return tl.Serial(token_embedding, lstm_stack)

## Function to build pre-attention decoder
Pre-attention decoder runs on the targets and creates activations that are used as queries in attention.

In [12]:
def pre_attention_decoder_fn(mode, target_vocab_size, d_model):
    """Build the pre-attention decoder that runs on the target tokens.

    Args:
        mode: mode string forwarded to tl.ShiftRight (e.g. 'train' or 'eval').
        target_vocab_size: vocabulary size handed to the embedding layer.
        d_model: embedding depth, also used as the number of LSTM units.

    Returns:
        A tl.Serial combinator of ShiftRight, Embedding and one LSTM.
    """
    # Shifting right inserts a start-of-sentence position and implements teacher
    # forcing: the actual previous target token is used to predict the next one
    # instead of the decoder's own previous output.
    # NOTE(review): per the Trax docs ShiftRight appears to be a no-op only in
    # 'predict' mode (not 'eval') — confirm against the installed version.
    shift_right = tl.ShiftRight(mode=mode)
    target_embedding = tl.Embedding(vocab_size=target_vocab_size, d_feature=d_model)
    # The LSTM returns the full sequence as its output.
    lstm = tl.LSTM(n_units=d_model)
    return tl.Serial(shift_right, target_embedding, lstm)

## Function to prepare Queries, Keys and Values for Attention layer


In [13]:
def prepare_attention_input(encoder_activations, decoder_activations, inputs):
    """Arrange queries, keys, values and a padding mask for the attention layer.

    Args:
        encoder_activations: (batch_size, padded_input_length, d_model) output of the input encoder.
        decoder_activations: (batch_size, padded_target_length, d_model) output of the pre-attention decoder.
        inputs: (batch_size, padded_input_length) padded input tokens.

    Returns:
        (queries, keys, values, mask): queries are the decoder activations, keys
        and values are both the encoder activations, and mask has shape
        (batch_size, 1, decoder_len, encoder_len) with zeros at padded input positions.
    """
    batch_size, encoder_len = inputs.shape[0], inputs.shape[1]
    decoder_len = decoder_activations.shape[1]

    # Real tokens are non-zero; padding is 0. The two singleton axes stand for the
    # attention heads and the decoder length.
    padding_mask = fastnp.reshape(inputs != 0, (batch_size, 1, 1, encoder_len))
    # Adding zeros broadcasts the mask along the decoder axis, giving
    # (batch_size, attention_heads, decoder_len, encoder_len) with attention_heads = 1.
    mask = padding_mask + fastnp.zeros((1, 1, decoder_len, 1))

    return decoder_activations, encoder_activations, encoder_activations, mask

## Neural Machine Translation(NMT) Model

In [14]:
# See NMT_Model.png to understand architecture
def NMT_Model(input_vocab_size=33300, target_vocab_size=33300, d_model=1024, num_encoder_layers=2,
              num_decoder_layers=2, num_attention_heads=4, attention_dropout=0.0, mode='train'):
    """Assemble the LSTM encoder-decoder translation model with attention.

    Args:
        input_vocab_size: vocabulary size of the input encoder's embedding.
        target_vocab_size: vocabulary size of the decoder embedding and of the output layer.
        d_model: embedding depth and number of units of every LSTM.
        num_encoder_layers: number of LSTM layers in the input encoder.
        num_decoder_layers: number of LSTM layers after the attention layer.
        num_attention_heads: number of heads of the attention layer.
        attention_dropout: dropout rate handed to the attention layer.
        mode: 'train', 'eval' or 'predict'; forwarded to the pre-attention decoder and the attention layer.

    Returns:
        A tl.Serial model that takes (input_tokens, target_tokens) and ends in
        Dense(target_vocab_size) followed by LogSoftmax.
    """
    input_encoder = input_encoder_fn(input_vocab_size, d_model, num_encoder_layers)
    pre_attention_decoder = pre_attention_decoder_fn(mode, target_vocab_size, d_model)

    # Copy the tokens, they are needed again later.
    # Stack: [input_tokens, target_tokens, input_tokens, target_tokens]
    duplicate_tokens = tl.Select([0, 1, 0, 1])
    # Encoder runs on the inputs, pre-attention decoder on the targets.
    # Stack: [encoder_activations, decoder_activations, input_tokens, target_tokens]
    encode_and_pre_decode = tl.Parallel(input_encoder, pre_attention_decoder)
    # Stack: [queries, keys, values, mask, target_tokens]
    to_attention_input = tl.Fn('PrepareAttentionInput', prepare_attention_input, n_out=4)
    # The attention layer is nested in a Residual so that its output is added to
    # the top of the stack, i.e. the queries (pre-attention decoder activations).
    # Stack: [attention_activations, mask, target_tokens]
    attention = tl.AttentionQKV(d_model, n_heads=num_attention_heads, dropout=attention_dropout, mode=mode)
    residual_attention = tl.Residual(attention)
    # Drop the attention mask. Stack: [attention_activations, target_tokens]
    drop_mask = tl.Select([0, 2])
    # RNN decoder followed by the projection onto the target vocabulary.
    decoder_rnn = [tl.LSTM(n_units=d_model) for _ in range(num_decoder_layers)]
    to_vocab = tl.Dense(target_vocab_size)
    log_softmax = tl.LogSoftmax()

    return tl.Serial(
        duplicate_tokens,
        encode_and_pre_decode,
        to_attention_input,
        residual_attention,
        drop_mask,
        decoder_rnn,
        to_vocab,
        log_softmax,
    )

In [15]:
# Build the model in training mode and print its layer structure.
model = NMT_Model(mode='train')
print(model)

Serial_in2_out2[
  Select[0,1,0,1]_in2_out4
  Parallel_in2_out2[
    Serial[
      Embedding_33300_1024
      LSTM_1024
      LSTM_1024
    ]
    Serial[
      ShiftRight(1)
      Embedding_33300_1024
      LSTM_1024
    ]
  ]
  PrepareAttentionInput_in3_out4
  Serial_in4_out2[
    Branch_in4_out3[
      None
      Serial_in4_out2[
        Parallel_in3_out3[
          Dense_1024
          Dense_1024
          Dense_1024
        ]
        PureAttention_in4_out2
        Dense_1024
      ]
    ]
    Add_in2
  ]
  Select[0,2]_in3_out2
  LSTM_1024
  LSTM_1024
  Dense_33300
  LogSoftmax
]


## Define train task and eval task

In [16]:
train_task = training.TrainTask(
    labeled_data = train_batch_stream,
    loss_layer = tl.CrossEntropyLoss(),
    optimizer = trax.optimizers.Adam(0.01),
    # Using learning rate scheduler to have 1000 warmup steps with a max value of 0.01
    lr_schedule= trax.lr.warmup_and_rsqrt_decay(1000, 0.01),
    n_steps_per_checkpoint= 10,
)

eval_task = training.EvalTask(
    labeled_data = eval_batch_stream,
    metrics = [tl.CrossEntropyLoss(), tl.Accuracy()],
)


output_dir = 'model/'
# remove old model if it exists.
!rm -f ~/output_dir/model.pkl.gz  

training_loop = training.Loop(model, train_task, eval_tasks=[eval_task], output_dir=output_dir)

## Training

In [17]:
#training_loop.run(20)

## Loading pretrained model

In [18]:
# Build the network in evaluation mode and fill it with pre-trained weights.
# NOTE(review): the file is read from the working directory, not from output_dir ('model/') — confirm this is the intended checkpoint.
eval_model = NMT_Model(mode='eval')
eval_model.init_from_file("model.pkl.gz", weights_only=True)
# Wrap the initialized model with tl.Accelerate; the decoding cells below use this `model`.
model = tl.Accelerate(eval_model)

## Producing output sentence from decoder output representation

## Function to get index of next token
This function takes the input sentence tokens and the tokens decoded so far (empty initially) and returns the index of the next token, selected from the log probabilities produced by the decoder, together with that token's log probability. Trax has a built-in function, `tl.logsoftmax_sample`, that selects a token from log probabilities; its `temperature` parameter controls the amount of noise added to the log probabilities inside `logsoftmax_sample`.


In [19]:
def next_symbol(NMT_Model, input_tokens, cur_output_tokens, temperature):
    """Pick the next output token given the input and the tokens decoded so far.

    Args:
        NMT_Model: callable model; given (input_tokens, decoder_tokens) it returns
            [log probabilities, target tokens].
        input_tokens: batched token ids of the sentence to translate.
        cur_output_tokens: list of token ids decoded so far (may be empty).
        temperature: forwarded to tl.logsoftmax_sample.

    Returns:
        (symbol, log_prob): the chosen token id as int and its log probability as float.
    """
    n_decoded = len(cur_output_tokens)
    # Pad up to a power of two, e.g. 13 tokens are padded to 16. The +1 avoids
    # log2(0) and guarantees that position n_decoded exists in the padded input.
    exponent = int(np.ceil(np.log2(n_decoded + 1)))
    padded_length = np.power(2, exponent)
    zero_padding = [0] * (padded_length - n_decoded)
    # Add the batch dimension in front.
    decoder_input = np.expand_dims(cur_output_tokens + zero_padding, axis=0)

    # output: (batch_size, decoder_length, vocab_size)
    output, _ = NMT_Model((input_tokens, decoder_input))
    # Log probabilities at the first position that has not been decoded yet.
    log_probs = output[0, n_decoded, :]
    symbol = int(tl.logsoftmax_sample(log_probs, temperature))
    return symbol, float(log_probs[symbol])

## Function to generate final translated sentence
This will call the next_symbol() function above several times until the next output is the end-of-sentence token (i.e. EOS).

In [20]:
def sampling_decode(input_sentence, NMT_Model=None, temperature=0.0, vocab_file=None, vocab_dir=None):
    """Translate a sentence by sampling one token at a time until EOS is produced.

    Args:
        input_sentence: text to translate.
        NMT_Model: model handed to next_symbol for every decoding step.
        temperature: sampling temperature forwarded to next_symbol.
        vocab_file: vocabulary file name used for tokenizing and detokenizing.
        vocab_dir: directory that holds the vocabulary file.

    Returns:
        (tokens, log_prob, sentence): the decoded token ids including the final
        EOS, the log probability of the last sampled token only (not of the whole
        sentence), and the detokenized translation.

    Note:
        Decoding stops only when EOS is sampled; there is no cap on the output length.
    """
    EOS = 1
    input_tokens = tokenize(input_sentence, vocab_file, vocab_dir)
    cur_output_tokens = []
    while True:
        # Ask the model for the next token given everything decoded so far.
        token, log_prob = next_symbol(NMT_Model, input_tokens, cur_output_tokens, temperature)
        cur_output_tokens.append(token)
        if token == EOS:
            break
    sentence = detokenize(cur_output_tokens, vocab_file, vocab_dir)
    return cur_output_tokens, log_prob, sentence

In [21]:
# Decode a sample sentence with temperature 0.0; the result is (tokens, log_prob, sentence).
sampling_decode("I love languages.", model, temperature=0.0, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)

([161, 12202, 5112, 3, 1], -0.0001735687255859375, 'Ich liebe Sprachen.')

## Minimum Bayes-Risk Decoding (MBR)
MBR works as follows:
1. take several random samples
2. score each sample against all other samples
3. select the one with the highest score<br>
<b>Scoring is done by computing overlap between a pair of samples. Here we consider only unigram overlap.</b> 

In [22]:
def generate_samples(sentence, num_samples, NMT_Model=None, temperature=1, vocab_file=None, vocab_dir=None):
    """Decode the same sentence `num_samples` times with sampling_decode.

    Args:
        sentence: text to translate.
        num_samples: number of translations to draw.
        NMT_Model: model forwarded to sampling_decode.
        temperature: sampling temperature forwarded to sampling_decode.
        vocab_file: vocabulary file name forwarded to sampling_decode.
        vocab_dir: vocabulary directory forwarded to sampling_decode.

    Returns:
        (samples, log_probs): the token lists and the log probabilities reported
        by sampling_decode, in the order the samples were drawn.
    """
    decoded = [
        sampling_decode(sentence, NMT_Model, temperature, vocab_file=vocab_file, vocab_dir=vocab_dir)
        for _ in range(num_samples)
    ]
    # The detokenized text (third element) is not needed here.
    samples = [tokens for tokens, _, _ in decoded]
    log_probs = [logp for _, logp, _ in decoded]
    return samples, log_probs

In [23]:
# Draw 4 samples with the default temperature; the result is (list of token lists, list of log probabilities).
generate_samples('I love languages.', 4, model, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)

([[161, 12202, 5112, 3, 1],
  [161, 12202, 10, 5112, 3, 1],
  [161, 12202, 5112, 3, 1],
  [161, 12202, 5112, 3, 1]],
 [-0.0001735687255859375,
  -0.0001087188720703125,
  -0.0001735687255859375,
  -0.0001735687255859375])

## Comparing overlaps
One of the simpler metrics is the Jaccard similarity, which is the size of the intersection of two sets divided by the size of their union.

In [24]:
def jaccard_similarity(candidate, reference):
    """Jaccard similarity (intersection over union) of two token sequences.

    Args:
        candidate: iterable of hashable tokens.
        reference: iterable of hashable tokens.

    Returns:
        len(intersection) / len(union) of the two unigram sets as a float.
        When both sequences are empty the union is empty as well; 0.0 is
        returned in that case instead of dividing by zero.
    """
    can_unigram_set, ref_unigram_set = set(candidate), set(reference)
    all_elems = can_unigram_set | ref_unigram_set
    if not all_elems:
        # Nothing to compare: treat it as no overlap.
        return 0.0
    joint_elems = can_unigram_set & ref_unigram_set
    return len(joint_elems) / len(all_elems)

## Calculating ROUGE score
The ROUGE score (similar to the F1 score) also measures the overlap between two samples. When the ROUGE score is calculated on unigrams it is called ROUGE-1.
$$score = 2* \frac{(precision * recall)}{(precision + recall)}$$

In [25]:
from collections import Counter
def rouge1_similarity(system, reference):
    """ROUGE-1 F-score between a system translation and a reference.

    Args:
        system: iterable of hashable tokens produced by the system.
        reference: iterable of hashable tokens of the reference.

    Returns:
        2 * precision * recall / (precision + recall), where precision and
        recall are the clipped unigram overlap divided by the number of system
        and reference tokens respectively. Returns 0.0 when either sequence is
        empty or when there is no overlap.
    """
    # Frequency tables of the system and reference tokens.
    sys_counter = Counter(system)
    ref_counter = Counter(reference)
    sys_total = sum(sys_counter.values())
    ref_total = sum(ref_counter.values())
    if sys_total == 0 or ref_total == 0:
        # Precision or recall would divide by zero; an empty side has no overlap.
        return 0.0

    # Counter intersection keeps, per token, the smaller of the two counts.
    overlap = sum((sys_counter & ref_counter).values())
    precision = overlap / sys_total
    recall = overlap / ref_total

    if precision + recall == 0:
        return 0.0
    return 2 * ((precision * recall) / (precision + recall))

## Overall score
Consider 4-sample list.

1. Get similarity score between sample 1 and sample 2
2. Get similarity score between sample 1 and sample 3
3. Get similarity score between sample 1 and sample 4
4. Get average score of the first 3 steps. This will be the overall score of sample 1.
5. Iterate and repeat until samples 1 to 4 have overall scores.

In [26]:
def average_overlap(similarity_fn, samples, *ignore):
    """Score every sample by its mean similarity to all the other samples.

    Args:
        similarity_fn: callable (candidate, sample) -> similarity score.
        samples: list of tokenized samples.
        *ignore: extra positional arguments are accepted and ignored, so this
            function can be called with the same arguments as weighted_avg_overlap.

    Returns:
        Dict mapping each sample index to its average similarity with the other
        samples. With a single sample there is nothing to compare against and
        its score is 0.0 (instead of a division by zero).
    """
    scores = {}
    for index_candidate, candidate in enumerate(samples):
        # Similarities against every other sample; the candidate itself is skipped.
        overlaps = [
            similarity_fn(candidate, sample)
            for index_sample, sample in enumerate(samples)
            if index_sample != index_candidate
        ]
        scores[index_candidate] = sum(overlaps, 0.0) / len(overlaps) if overlaps else 0.0
    return scores

In practice, it is also common to see the weighted mean being used to calculate the overall score instead of just the arithmetic mean.

In [27]:
def weighted_avg_overlap(similarity_fn, samples, log_probs):
    """Score every sample by its probability-weighted mean similarity to the others.

    Args:
        similarity_fn: callable (candidate, sample) -> similarity score.
        samples (list of lists): tokenized version of the translated sentences
        log_probs (list of float): log probability reported for each sample

    Returns:
        Dict mapping each sample index to the weighted average of its
        similarities with the other samples, where each other sample is weighted
        by exp(log_prob). When the weights sum to zero (a single sample, or all
        probabilities underflow to 0) the score is 0.0 instead of dividing by zero.
    """
    scores = {}
    for index_candidate, candidate in enumerate(samples):
        overlap, weight_sum = 0.0, 0.0
        for index_sample, (sample, logp) in enumerate(zip(samples, log_probs)):
            if index_candidate == index_sample:
                continue
            # Convert the log probability to linear scale.
            sample_p = float(np.exp(logp))
            weight_sum += sample_p
            overlap += sample_p * similarity_fn(candidate, sample)
        scores[index_candidate] = overlap / weight_sum if weight_sum > 0 else 0.0
    return scores

## Putting it all together

In [28]:
def mbr_decode(sentence, num_samples, score_fn, similarity_fn, NMT_Model=None, temperature=0.6, vocab_file=None, vocab_dir=None):
    """Translate a sentence with Minimum Bayes-Risk decoding.

    Args:
        sentence: text to translate.
        num_samples: number of candidate translations to sample.
        score_fn: callable (similarity_fn, samples, log_probs) -> dict of index -> score.
        similarity_fn: pairwise similarity handed to score_fn.
        NMT_Model: model forwarded to generate_samples.
        temperature: sampling temperature forwarded to generate_samples.
        vocab_file: vocabulary file name used for sampling and detokenizing.
        vocab_dir: directory that holds the vocabulary file.

    Returns:
        (translated_sentence, max_index, scores): the detokenized best sample,
        its index among the samples, and the score of every sample.
    """
    candidates, candidate_log_probs = generate_samples(
        sentence, num_samples, NMT_Model, temperature, vocab_file, vocab_dir
    )
    scores = score_fn(similarity_fn, candidates, candidate_log_probs)
    # The candidate with the highest score wins; the first one wins a tie.
    best_index = max(scores, key=lambda idx: scores[idx])
    best_sentence = detokenize(candidates[best_index], vocab_file, vocab_dir)
    return (best_sentence, best_index, scores)

In [29]:
# Sampling temperature used for the MBR examples.
TEMPERATURE = 0.6
sen = 'She speaks English and German.'
# MBR decoding over 4 samples with weighted-average scoring and Jaccard similarity;
# [0] keeps only the translated sentence from the returned tuple.
mbr_decode(sen, 4, weighted_avg_overlap, jaccard_similarity, model, TEMPERATURE, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)[0]

'Sie spricht Englisch und Deutsch.'

In [30]:
sen = 'Congratulations!'
# Same MBR decoding, this time with the plain average score and ROUGE-1 similarity.
# Relies on TEMPERATURE defined in the previous cell.
mbr_decode(sen, 4, average_overlap, rouge1_similarity, model, TEMPERATURE, vocab_file=VOCAB_FILE, vocab_dir=VOCAB_DIR)[0]

'Herzlichen Glückwunsch!'