[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tuanh118/uw-fsdl-bert-transfer-translation/blob/master/Notebook.ipynb)

## Set up the Colab VM and import libraries.


In [1]:
# Hack to reference other files in the repo in Colab.
!git clone https://github.com/tuanh118/uw-fsdl-bert-transfer-translation
import sys
sys.path.append('./uw-fsdl-bert-transfer-translation')

# Install required packages.
!pip install transformers

%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
from transformers import *
from sklearn.model_selection import train_test_split

import io
import os
import time

from CombinedBertTransformerModel import *
from DatasetSequence import *
from functools import partial
from util import *

Cloning into 'uw-fsdl-bert-transfer-translation'...
remote: Enumerating objects: 76, done.[K
remote: Counting objects: 100% (76/76), done.[K
remote: Compressing objects: 100% (63/63), done.[K
remote: Total 76 (delta 28), reused 47 (delta 11), pack-reused 0[K
Unpacking objects: 100% (76/76), done.
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/12/b5/ac41e3e95205ebf53439e4dd087c58e9fd371fd8e3724f2b9b4cdb8282e5/transformers-2.10.0-py3-none-any.whl (660kB)
[K     |████████████████████████████████| 665kB 12.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 58.4MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.

## Data retrieval

In [2]:
# Download the EuroParl French-English corpus (extract=True also unpacks the archive).
path_to_fr_en_tar = tf.keras.utils.get_file(
    'fr-en.tgz',
    origin='https://www.statmt.org/europarl/v7/fr-en.tgz',
    extract=True,
)

# The English and French sides of the corpus are looked up next to the downloaded archive.
corpus_dir = os.path.dirname(path_to_fr_en_tar)
path_to_fr_en_en_file = corpus_dir + "/europarl-v7.fr-en.en"
path_to_fr_en_fr_file = corpus_dir + "/europarl-v7.fr-en.fr"


Downloading data from https://www.statmt.org/europarl/v7/fr-en.tgz


## Data processing

In [3]:
def instantiate_tokenizer():
    """Return the pretrained 'bert-base-uncased' BertTokenizer."""
    pretrained_name = 'bert-base-uncased'
    return BertTokenizer.from_pretrained(pretrained_name)

def load_dataset(language_path, tokenizer, num_examples=None, max_tokens=500):
    """Load a text file and encode every line as a fixed-length row of token IDs.

    Each line is tokenized, wrapped as '[CLS]' ... '[SEP]', converted to IDs and
    post-padded with zeroes so that every row has exactly max_tokens entries.

    Args:
        language_path: Path of a UTF-8 text file holding one example per line.
        tokenizer: Object providing tokenize(text) and convert_tokens_to_ids(tokens).
        num_examples: How many leading lines to use; None uses every line.
        max_tokens: Length of each returned row, special tokens included.

    Returns:
        A tuple (ids, masks, segments) as produced by pad_sequences, one row per
        line and max_tokens columns each. masks is 1 where ids holds a real
        token and 0 where it holds padding; segments is 0 everywhere.
    """
    # Read lazily and stop once enough lines are collected, so a small
    # num_examples does not pull the whole corpus into memory. The context
    # manager guarantees the file handle is closed.
    read_limit = num_examples if (num_examples is not None and num_examples >= 0) else None
    lines = []
    # newline='\n' makes '\n' the only line terminator. Splitting on other
    # Unicode line boundaries (or dropping leading blank lines) would shift
    # line numbers and break the alignment between parallel corpus files.
    with io.open(language_path, encoding='UTF-8', newline='\n') as corpus_file:
        for raw_line in corpus_file:
            if read_limit is not None and len(lines) >= read_limit:
                break
            lines.append(raw_line.rstrip('\r\n'))
    # Final slice keeps plain list-slicing semantics for None / negative values.
    lines = lines[:num_examples]

    # Reserve two positions for '[CLS]' and '[SEP]' so that the closing '[SEP]'
    # is never cut off by the post-truncation in pad_sequences below.
    content_budget = max(max_tokens - 2, 0)
    tokenized_lines = [
        ['[CLS]'] + tokenizer.tokenize(line)[:content_budget] + ['[SEP]']
        for line in lines
    ]

    # Convert tokens to IDs.
    ids = [ tokenizer.convert_tokens_to_ids(tokenized_line) for tokenized_line in tokenized_lines ]

    # Padding mask: one 1 per real token; padding positions become 0 below.
    masks = [ [1] * len(tokenized_line) for tokenized_line in tokenized_lines ]
    # Segment IDs are always 0, so empty rows padded with zeroes are sufficient.
    segments = [ [] for _ in tokenized_lines ]

    # Pad (or truncate) every sequence at the end to exactly max_tokens entries.
    ids = tf.keras.preprocessing.sequence.pad_sequences(ids, maxlen=max_tokens, truncating="post", padding="post", dtype="int")
    masks = tf.keras.preprocessing.sequence.pad_sequences(masks, maxlen=max_tokens, truncating="post", padding="post", dtype="int")
    segments = tf.keras.preprocessing.sequence.pad_sequences(segments, maxlen=max_tokens, truncating="post", padding="post", dtype="int")

    return ids, masks, segments

# Hyperparameters and dataset size for this small-scale experiment.
BATCH_SIZE = 64
d_model = 32
num_examples = BATCH_SIZE * 5
max_tokens = 200
# Fixed seed so the train/validation split is identical on every run.
SEED = 42
tokenizer = instantiate_tokenizer()
vocab_size = len(tokenizer.vocab)

# The English file provides the inputs and the French file the targets; both are encoded with the same tokenizer.
# NOTE(review): masks and segments are not referenced again in the visible cells.
input_tensor, masks, segments = load_dataset(path_to_fr_en_en_file, tokenizer, num_examples, max_tokens)
target_tensor, _, _ = load_dataset(path_to_fr_en_fr_file, tokenizer, num_examples, max_tokens)

# Split the data into training and validation sets.  No test set for now since we're just experimenting.
# random_state makes the shuffled split reproducible across kernel restarts.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=SEED
)

# Do some printing to show that the processing worked.
def convert(tokenizer, tensor):
    """Print one 'ID ----> token' line for every non-zero ID in tensor."""
    for token_id in tensor:
        if token_id == 0:
            # Zero entries are skipped rather than looked up.
            continue
        print ("%d ----> %s" % (token_id, tokenizer.ids_to_tokens[token_id]))

# Show the ID-to-token listing of the first training pair, separated by a blank line.
first_examples = (
    ("ID to token mapping for first training example (input)", input_tensor_train[0]),
    ("ID to token mapping for first training example (target)", target_tensor_train[0]),
)
for position, (heading, example_ids) in enumerate(first_examples):
    if position > 0:
        print()
    print(heading)
    convert(tokenizer, example_ids)

def format_batch(x, y):
    """
    Reshape one (x, y) batch into the (inputs, outputs) pair used for training.

    Inputs are the list [x, y without its last column].
    Outputs are y without its first column, i.e. y shifted by one position.
    """
    y_without_last = y[:, :-1]
    y_without_first = y[:, 1:]
    return [x, y_without_last], y_without_first

# Both splits are wrapped the same way: identical batch size and batch formatter.
sequence_options = dict(batch_size=BATCH_SIZE, format_fn=format_batch)
train_dataset = DatasetSequence(input_tensor_train, target_tensor_train, **sequence_options)
validation_dataset = DatasetSequence(input_tensor_val, target_tensor_val, **sequence_options)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


ID to token mapping for first training example (input)
101 ----> [CLS]
2053 ----> no
16051 ----> amendments
2031 ----> have
2042 ----> been
3818 ----> proposed
8800 ----> relating
2000 ----> to
6928 ----> monday
1998 ----> and
9857 ----> tuesday
1012 ----> .
102 ----> [SEP]

ID to token mapping for first training example (target)
101 ----> [CLS]
1055 ----> s
1005 ----> '
12943 ----> ag
21205 ----> ##issa
3372 ----> ##nt
2139 ----> de
21860 ----> lund
2072 ----> ##i
3802 ----> et
9388 ----> mar
4305 ----> ##di
1010 ----> ,
15333 ----> je
1050 ----> n
1005 ----> '
9932 ----> ai
14674 ----> pas
2139 ----> de
12719 ----> modifications
1012 ----> .
102 ----> [SEP]


## Model preparation

In [4]:
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule

# Loss and metric are the project helpers with padding_label bound to 0.
loss = partial(sparse_categorical_crossentropy_ignoring_padding, padding_label=0)
accuracy = partial(sparse_categorical_accuracy_ignoring_padding, padding_label=0)
# functools.partial objects have no __name__ of their own, so one is assigned
# explicitly (presumably so Keras can label them in its logs - TODO confirm).
loss.__name__ = 'loss'
accuracy.__name__ = 'accuracy'

# Adam driven by the project's CustomSchedule learning-rate schedule.
learning_rate = CustomSchedule(d_model=d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

# All constructor arguments of the combined model, gathered in one place.
model_hyperparameters = dict(
    max_tokens=max_tokens,
    vocab_size=vocab_size,
    num_layers=2,
    units=32,
    d_model=d_model,
    num_heads=2,
    dropout=0,
    padding_label=0,
)
model = CombinedBertTransformerModel(**model_hyperparameters)
model.compile(optimizer=optimizer, loss=loss, metrics=[accuracy])
model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
tokenized_output_sentence (Inpu [(None, None)]       0                                            
__________________________________________________________________________________________________
tokenized_input_sentence (Input [(None, None)]       0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, None, 768),  109482240   tokenized_input_sentence[0][0]   
__________________________________________________________________________________________________
look_ahead_mask (Lambda)        (None, 1, None, None 0           tokenized_output_sentence[0][0]  
_____________________________________________________________________________________________

## Model training

In [5]:
# Train and evaluate the model using tf.keras.Model.fit()
# The returned History object is bound to a name so the per-epoch metrics stay
# available after training instead of being reachable only through the cell output.
history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    use_multiprocessing=False,
    workers=1,
    shuffle=True,
    epochs=10
)
# Keep the History as the cell's last expression so it is still displayed.
history

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fb7f2c57c18>