[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tuanh118/uw-fsdl-bert-transfer-translation/blob/add-collab-notebook/Notebook.ipynb)

## Set up the Colab VM and import libraries.


In [1]:
# Hack to reference other files in the repo in Collab.
!git clone https://github.com/tuanh118/uw-fsdl-bert-transfer-translation
import sys
sys.path.append('./uw-fsdl-bert-transfer-translation')

# Install required packages.
!pip install transformers

%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
from transformers import *
from sklearn.model_selection import train_test_split

import io
import os
import time

from CombinedBertTransformerModel import *

Cloning into 'uw-fsdl-bert-transfer-translation'...
remote: Enumerating objects: 59, done.[K
remote: Counting objects: 100% (59/59), done.[K
remote: Compressing objects: 100% (48/48), done.[K
remote: Total 59 (delta 19), reused 38 (delta 9), pack-reused 0[K
Unpacking objects: 100% (59/59), done.
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/12/b5/ac41e3e95205ebf53439e4dd087c58e9fd371fd8e3724f2b9b4cdb8282e5/transformers-2.10.0-py3-none-any.whl (660kB)
[K     |████████████████████████████████| 665kB 9.1MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 56.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43

## Data retrieval

In [3]:
# Fetch the EuroParl French-English archive (extract=True asks Keras to unpack it).
path_to_fr_en_tar = tf.keras.utils.get_file(
    'fr-en.tgz',
    origin='https://www.statmt.org/europarl/v7/fr-en.tgz',
    extract=True,
)

# The English and French text files are looked up in the directory that holds
# the downloaded archive.
path_to_fr_en_en_file, path_to_fr_en_fr_file = (
    os.path.dirname(path_to_fr_en_tar) + corpus_suffix
    for corpus_suffix in ("/europarl-v7.fr-en.en", "/europarl-v7.fr-en.fr")
)


Downloading data from https://www.statmt.org/europarl/v7/fr-en.tgz


## Data processing

In [4]:
# Sets up a BERT tokenizer.
def instantiate_tokenizer():
    """Return a BertTokenizer loaded via from_pretrained('bert-base-uncased')."""
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    return bert_tokenizer

# Given a path to a text file, load and tokenize each line using the provided tokenizer, then convert each token to an ID and pad all lines to have length max_tokens.
def load_dataset(language_path, tokenizer, num_examples=None, max_tokens=500):
    """Load a text file and turn every line into fixed-length token-ID rows.

    Args:
        language_path: Path to a UTF-8 text file with one example per line.
        tokenizer: Object exposing `tokenize(text)` and
            `convert_tokens_to_ids(tokens)`.
        num_examples: Keep only the first `num_examples` lines; `None` keeps
            every line.
        max_tokens: Length of every returned row, counting the '[CLS]' and
            '[SEP]' markers.

    Returns:
        A tuple `(ids, masks, segments)` of integer arrays produced by
        `pad_sequences`, each padded/truncated to `max_tokens` columns:
        `ids` holds the token IDs (zero-padded), `masks` is 1 where there is
        an actual token and 0 where there is padding, and `segments` is all
        zeroes.
    """
    # Read the data. The context manager closes the file handle as soon as
    # the content has been read.
    with io.open(language_path, encoding='UTF-8') as language_file:
        lines = language_file.read().strip().splitlines()[:num_examples]

    # Tokenize and wrap each line in the special start/end tokens. Two slots
    # are reserved for '[CLS]' and '[SEP]' so that the closing '[SEP]' is not
    # cut off by the truncation to max_tokens below.
    content_budget = max(max_tokens - 2, 0)
    tokenized_lines = [ ['[CLS]'] + tokenizer.tokenize(line)[:content_budget] + ['[SEP]'] for line in lines ]

    # Convert tokens to IDs.
    ids = [ tokenizer.convert_tokens_to_ids(tokenized_line) for tokenized_line in tokenized_lines ]

    # Generate padding masks and segment IDs. These have the same length as the ID sequences after padding.
    # Padding mask is 1 where there is an actual ID and 0 where there is padding. Segment ID is always 0,
    # so empty rows are enough: padding fills them with zeroes.
    masks = [ [1] * len(tokenized_line) for tokenized_line in tokenized_lines ]
    segments = [ [] for _ in tokenized_lines ]

    # Pad all sequences to the maximum length with zeroes.
    ids = tf.keras.preprocessing.sequence.pad_sequences(ids, maxlen=max_tokens, truncating="post", padding="post", dtype="int")
    masks = tf.keras.preprocessing.sequence.pad_sequences(masks, maxlen=max_tokens, truncating="post", padding="post", dtype="int")
    segments = tf.keras.preprocessing.sequence.pad_sequences(segments, maxlen=max_tokens, truncating="post", padding="post", dtype="int")

    return ids, masks, segments

# Small settings for quick experimentation: number of sentence pairs to load
# and the fixed row length (in tokens) of every example.
num_examples = 300
max_tokens = 50
tokenizer = instantiate_tokenizer()
input_tensor, masks, segments = load_dataset(path_to_fr_en_en_file, tokenizer, num_examples, max_tokens)
target_tensor, _, _ = load_dataset(path_to_fr_en_fr_file, tokenizer, num_examples, max_tokens)

# Split the data into training and validation sets.  No test set for now since we're just experimenting.
# A fixed random_state keeps the split, and everything printed from it, the same on every run.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2, random_state=42)

# Do some printing to show that the processing worked.
def convert(tokenizer, tensor):
    """Print one "<id> ----> <token>" line for each non-zero ID in `tensor`.

    Zero IDs are skipped. Tokens are looked up in `tokenizer.ids_to_tokens`.
    """
    nonzero_ids = (token_id for token_id in tensor if token_id != 0)
    for token_id in nonzero_ids:
        token = tokenizer.ids_to_tokens[token_id]
        print ("%d ----> %s" % (token_id, token))

# Show the ID -> token mapping of the first training pair, input first and
# target second, with a blank line between the two listings.
first_training_pair = (
    ("ID to token mapping for first training example (input)", input_tensor_train[0]),
    ("ID to token mapping for first training example (target)", target_tensor_train[0]),
)
for listing_index, (heading, example_ids) in enumerate(first_training_pair):
    if listing_index > 0:
        print()
    print(heading)
    convert(tokenizer, example_ids)

# Batching and model-size settings.
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train) // BATCH_SIZE
embedding_dim = 32
units = 32
vocab_size = len(tokenizer.vocab)

# Training batches: reshuffled over the whole training split, full batches only.
train_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(len(input_tensor_train)).batch(BATCH_SIZE, drop_remainder=True)
# Validation batches: keep the final partial batch. When the validation split
# holds fewer than BATCH_SIZE examples (as with the small experiment settings
# used in this notebook), dropping the remainder would leave no batches at all.
validation_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_val, target_tensor_val)).shuffle(len(input_tensor_val)).batch(BATCH_SIZE, drop_remainder=False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


ID to token mapping for first training example (input)
101 ----> [CLS]
2035 ----> all
1997 ----> of
2023 ----> this
2003 ----> is
1999 ----> in
10388 ----> accordance
2007 ----> with
1996 ----> the
6958 ----> principle
1997 ----> of
4942 ----> sub
5332 ----> ##si
9032 ----> ##dia
15780 ----> ##rity
1998 ----> and
2003 ----> is
3568 ----> therefore
2000 ----> to
2022 ----> be
6551 ----> greatly
10979 ----> welcomed
1012 ----> .
102 ----> [SEP]

ID to token mapping for first training example (target)
101 ----> [CLS]
2000 ----> to
4904 ----> ##ut
8292 ----> ce
6895 ----> ##ci
9193 ----> doi
2102 ----> ##t
3802 ----> et
2890 ----> ##re
2112 ----> part
2594 ----> ##ic
15859 ----> ##uli
7869 ----> ##ere
3672 ----> ##ment
16183 ----> sal
5657 ----> ##ue
4372 ----> en
2744 ----> term
2229 ----> ##es
2139 ----> de
4942 ----> sub
5332 ----> ##si
9032 ----> ##dia
17625 ----> ##rite
1012 ----> .
102 ----> [SEP]


## Model preparation

In [5]:
# Prepare training: build the metric, loss and optimizer (Adam with a fixed
# learning rate and gradient-norm clipping) that the model is compiled with.
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(
    learning_rate=3e-5,
    epsilon=1e-08,
    clipnorm=1.0,
)

# Sanity check: shape of the first input example in the first training batch.
print(next(train_dataset.as_numpy_iterator())[0][0].shape)

# Build the combined BERT + transformer model, compile it and show its layers.
model = CombinedBertTransformerModel(
    max_tokens=max_tokens,
    vocab_size=vocab_size,
    num_layers=2,
    units=32,
    d_model=32,
    num_heads=2,
    dropout=0.2,
    padding_label=0,
)
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)
model.summary()

(50,)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
tokenized_output_sentence (Inpu [(None, 50)]         0                                            
__________________________________________________________________________________________________
tokenized_input_sentence (Input [(None, 50)]         0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 50, 768), (N 109482240   tokenized_input_sentence[0][0]   
__________________________________________________________________________________________________
look_ahead_mask (Lambda)        (None, 1, None, 50)  0           tokenized_output_sentence[0][0]  
_____________________________________________________________________________________________

## Model training

In [6]:
# Train and evaluate the model using tf.keras.Model.fit()
# TODO This doesn't work yet.
# NOTE(review): the recorded run stopped with an AssertionError in the first
# epoch. Presumably this relates to how the (input, target) dataset elements
# map onto the model's two input layers -- confirm against
# CombinedBertTransformerModel. The change below does not address that failure.
#
# The datasets are finite and not repeated, so no explicit step counts are
# passed: Keras then runs each epoch (and each validation pass) until the
# dataset is exhausted, instead of asking for more batches than exist.
history = model.fit(
    train_dataset,
    epochs=2,
    validation_data=validation_dataset
)

Epoch 1/2


AssertionError: ignored