[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tuanh118/uw-fsdl-bert-transfer-translation/blob/master/Notebook.ipynb)

## Set up the Colab VM and import libraries.


In [1]:
# Hack to reference other files in the repo in Collab.
!git clone https://github.com/tuanh118/uw-fsdl-bert-transfer-translation
import sys
sys.path.append('./uw-fsdl-bert-transfer-translation')

# Install required packages.
!pip install transformers

%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
from transformers import *
from sklearn.model_selection import train_test_split

import io
import os
import time

from CombinedBertTransformerModel import *
from DatasetSequence import *
from functools import partial
from util import *

Cloning into 'uw-fsdl-bert-transfer-translation'...
remote: Enumerating objects: 118, done.[K
remote: Counting objects: 100% (118/118), done.[K
remote: Compressing objects: 100% (96/96), done.[K
remote: Total 118 (delta 50), reused 69 (delta 19), pack-reused 0[K
Receiving objects: 100% (118/118), 1.53 MiB | 24.05 MiB/s, done.
Resolving deltas: 100% (50/50), done.
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/48/35/ad2c5b1b8f99feaaf9d7cdadaeef261f098c6e1a6a2935d4d07662a6b780/transformers-2.11.0-py3-none-any.whl (674kB)
[K     |████████████████████████████████| 675kB 14.9MB/s 
Collecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 58.9MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300

## Data retrieval

In [2]:
# Fetch and unpack the EuroParl French-English corpus.
# To learn Spanish instead of French, switch "fr" to "es" everywhere in this cell.
europarl_archive_url = 'https://www.statmt.org/europarl/v7/fr-en.tgz'
path_to_fr_en_tar = tf.keras.utils.get_file('fr-en.tgz', origin=europarl_archive_url, extract=True)

# The two sides of the corpus are looked up in the directory that holds the downloaded archive.
europarl_dir = os.path.dirname(path_to_fr_en_tar)
path_to_fr_en_en_file = europarl_dir + "/europarl-v7.fr-en.en"
path_to_fr_en_fr_file = europarl_dir + "/europarl-v7.fr-en.fr"


Downloading data from https://www.statmt.org/europarl/v7/fr-en.tgz


## Data processing

In [3]:
def load_dataset(language_path, tokenizer, num_examples=None, max_tokens=500):
    """Load a text file and convert each line into a fixed-length row of token IDs.

    Args:
        language_path: Path to a UTF-8 encoded text file, one example per line.
        tokenizer: Object providing `cls_token`, `sep_token`, `pad_token_id`,
            `tokenize(line)` and `convert_tokens_to_ids(tokens)`.
        num_examples: Number of leading lines to keep; None keeps every line.
        max_tokens: Length of every returned row, start and end tokens included.

    Returns:
        The result of `pad_sequences`: one row per kept line, each padded or
        truncated to `max_tokens` IDs.
    """
    # Read the data.  The context manager closes the file handle as soon as the
    # text has been read instead of leaving it to the garbage collector.
    with io.open(language_path, encoding='UTF-8') as language_file:
        text = language_file.read()

    # NOTE(review): str.splitlines() also splits on Unicode line boundaries other
    # than "\n" (e.g. U+0085, U+2028), and strip() drops leading blank lines.
    # Confirm this keeps the two sides of a parallel corpus line-aligned.
    lines = text.strip().splitlines()[:num_examples]

    # Tokenize each line, leaving room for the special start and end tokens.
    tokenized_lines = [
        [tokenizer.cls_token] + tokenizer.tokenize(line)[:max_tokens - 2] + [tokenizer.sep_token]
        for line in lines
    ]

    # Convert tokens to IDs.
    ids = [tokenizer.convert_tokens_to_ids(tokenized_line) for tokenized_line in tokenized_lines]

    # Pad all ID sequences on the right, up to max_tokens, with the tokenizer's padding ID.
    ids = tf.keras.preprocessing.sequence.pad_sequences(
        ids, value=tokenizer.pad_token_id, maxlen=max_tokens, truncating="post", padding="post"
    )

    return ids

# Hyperparameters and dataset size.
BATCH_SIZE = 64
d_model = 128
num_examples = BATCH_SIZE * 500
max_tokens = 200
# Fixed seed for the train/validation split.  Without it every run draws a different
# split, so resuming from saved weights would validate on rows the checkpoint trained on.
RANDOM_SEED = 42
tokenizer = instantiate_tokenizer()
vocab_size = len(tokenizer.vocab)

# The English file is the model input and the French file is the target.
input_tensor = load_dataset(path_to_fr_en_en_file, tokenizer, num_examples, max_tokens)
target_tensor = load_dataset(path_to_fr_en_fr_file, tokenizer, num_examples, max_tokens)

# Split the data into training and validation sets.  No test set for now since we're just experimenting.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2, random_state=RANDOM_SEED
)

# Do some printing to show that the processing worked.
def convert(tokenizer, tensor):
    """Print every non-padding ID in `tensor` next to the token it maps to.

    Args:
        tokenizer: Object exposing `pad_token_id` and an `ids_to_tokens` mapping.
        tensor: Iterable of integer token IDs for a single example.
    """
    for t in tensor:
        # Skip padding by the tokenizer's own padding ID (the value load_dataset
        # pads with) rather than a hard-coded 0.
        if t != tokenizer.pad_token_id:
            print("%d ----> %s" % (t, tokenizer.ids_to_tokens[t]))

# Show the ID -> token mapping of the first training pair, input side first,
# with a blank line between the two listings.
first_training_pair = (
    ("ID to token mapping for first training example (input)", input_tensor_train[0]),
    ("ID to token mapping for first training example (target)", target_tensor_train[0]),
)
for index, (heading, example_ids) in enumerate(first_training_pair):
    if index > 0:
        print()
    print(heading)
    convert(tokenizer, example_ids)

def format_batch(x, y):
    """
    Turn a batch (x, y) into the (inputs, targets) pair passed to the model.

    Inputs are x together with y minus its last position along axis 1.
    Targets are y minus its first position along axis 1 (y shifted by one).
    """
    shifted_inputs = y[:, :-1]
    shifted_targets = y[:, 1:]
    return [x, shifted_inputs], shifted_targets

# Wrap both splits in DatasetSequence objects that share the same batch size and batch formatter.
sequence_options = dict(batch_size=BATCH_SIZE, format_fn=format_batch)
train_dataset = DatasetSequence(input_tensor_train, target_tensor_train, **sequence_options)
validation_dataset = DatasetSequence(input_tensor_val, target_tensor_val, **sequence_options)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


ID to token mapping for first training example (input)
101 ----> [CLS]
16378 ----> secondly
1010 ----> ,
1045 ----> i
2323 ----> should
2036 ----> also
2066 ----> like
2000 ----> to
3602 ----> note
2008 ----> that
1010 ----> ,
13644 ----> broadly
4092 ----> speaking
1010 ----> ,
2026 ----> my
2177 ----> group
6753 ----> supports
1996 ----> the
5292 ----> ha
4232 ----> ##ard
2121 ----> ##er
3189 ----> report
1998 ----> and
4671 ----> express
2135 ----> ##ly
11637 ----> highlights
1996 ----> the
2755 ----> fact
2008 ----> that
2045 ----> there
2001 ----> was
6581 ----> excellent
6792 ----> cooperation
2007 ----> with
1996 ----> the
9680 ----> rap
6442 ----> ##port
11236 ----> ##eur
1012 ----> .
102 ----> [SEP]

ID to token mapping for first training example (target)
101 ----> [CLS]
15333 ----> je
29536 ----> vo
6784 ----> ##ud
14995 ----> ##rai
2015 ----> ##s
4372 ----> en
28880 ----> ##suit
2063 ----> ##e
4189 ----> fair
2063 ----> ##e
2128 ----> re
7849 ----> ##mar
4226 ----> ##que
20

## Model preparation

In [4]:
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
learning_rate = CustomSchedule(d_model=d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

# Single source for the padding ID, so the loss, the metric and the model all
# agree with the value load_dataset pads with, instead of a hard-coded 0.
padding_label = tokenizer.pad_token_id

# partial objects have no __name__, so one is set explicitly on each of them.
# NOTE(review): presumably both helpers skip positions equal to padding_label; confirm in util.
loss = partial(sparse_categorical_crossentropy_ignoring_padding, padding_label=padding_label)
loss.__name__ = 'loss'
accuracy = partial(sparse_categorical_accuracy_ignoring_padding, padding_label=padding_label)
accuracy.__name__ = 'accuracy'

model = CombinedBertTransformerModel(
    max_tokens=max_tokens,
    vocab_size=vocab_size,
    num_layers=4,
    units=32,
    d_model=d_model,
    num_heads=4,
    dropout=0,
    padding_label=padding_label
)
model.compile(optimizer=optimizer, loss=loss, metrics=[accuracy])
model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
tokenized_output_sentence (Inpu [(None, 200)]        0                                            
__________________________________________________________________________________________________
tokenized_input_sentence (Input [(None, 200)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 200, 768), ( 109482240   tokenized_input_sentence[0][0]   
__________________________________________________________________________________________________
look_ahead_mask (Lambda)        (None, 1, None, 200) 0           tokenized_output_sentence[0][0]  
_____________________________________________________________________________________________

## Model training

In [0]:
# To continue from a previous run, uncomment the next line to load its saved weights first.
#model.load_weights('checkpoint_en_fr_20200531151710')

# Options for tf.keras.Model.fit(): validate on the held-out sequence and feed
# batches from a single worker without multiprocessing.
fit_options = dict(
    validation_data=validation_dataset,
    epochs=10,
    shuffle=True,
    workers=1,
    use_multiprocessing=False,
)

# Train the model; the returned History object is saved by a later cell.
history = model.fit(train_dataset, **fit_options)

Epoch 1/10
 81/400 [=====>........................] - ETA: 8:50 - loss: 2.7077 - accuracy: 0.7376

In [0]:
# Persist the training history and the learned weights of this run for later examination.
import datetime
import pickle

# One shared timestamp ties the history file and the checkpoint of a run together.
timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
history_path = 'history_en_fr_' + timestamp
checkpoint_path = 'checkpoint_en_fr_' + timestamp

# Write the metrics dict first, then the model weights.
with open(history_path, 'wb') as history_file:
    pickle.dump(history.history, history_file)
model.save_weights(checkpoint_path)