In [1]:
# !pip install transformers
# !pip install sentencepiece

# Create TFRecord files

In [1]:
from transformers import ( 
    T5Tokenizer
)
import time
import os
import re
from multiprocessing import Process, Manager
from pathlib import Path
import tensorflow as tf


# import python files
import sys
sys.path.append("../..")

from python_files.language_tokens import LanguageTokens
from python_files.dataset.tokenize_helper import TokenizeHelper
from python_files.dataset.cnn_daily_mail import CnnDailyMailData
from python_files.dataset.sueddeutsche import SueddeutscheData
from python_files.dataset.tf_record_writer import TfRecordWriter

In [2]:
# List the GPUs TensorFlow detects before any device is hidden.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Disable GPU 
(otherwise TensorFlow will crash when running on multiple threads)

In [3]:
# Hide every GPU from TensorFlow so that the following cells run on the CPU only.
try:
    tf.config.set_visible_devices([], 'GPU')
except (ValueError, RuntimeError) as err:
    # ValueError: an invalid device was specified.
    # RuntimeError: the runtime is already initialized, so the visible devices
    # can no longer be modified.
    print("Could not hide the GPUs: {}".format(err))

# Verify outside the try block: a failed check must be reported, not swallowed.
visible_devices = tf.config.get_visible_devices()
gpu_devices = [device for device in visible_devices if device.device_type == 'GPU']
if gpu_devices:
    print("WARNING: GPUs are still visible to TensorFlow: {}".format(gpu_devices))

In [4]:
# Show which devices remain visible to TensorFlow after the GPUs were hidden.
tf.config.get_visible_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [5]:
# Run configuration.
model_size = "t5-base"  # name of the pretrained T5 checkpoint
learning_rate = 3e-5
# NOTE(review): SHUFFEL_SIZE is not referenced by any cell visible in this notebook.
SHUFFEL_SIZE = 1024

In [6]:
# Record the TensorFlow version this notebook was executed with.
tf.__version__

'2.4.1'

In [7]:
# Load the tokenizer for the checkpoint named in `model_size` instead of
# repeating the checkpoint name, so the two cannot drift apart.
tokenizer = T5Tokenizer.from_pretrained(model_size)
# Language prefixes built on top of the tokenizer; "tf" is passed through to
# LanguageTokens unchanged (presumably selects TensorFlow tensors — confirm).
language_tokens = LanguageTokens(tokenizer, "tf")
# Number of prefix tokens; subtracted from the article length further below.
prefix_size = language_tokens.prefix_size

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

### Sueddeutsche

In [8]:
# times = []
# for i in range(12, 13):
#     tokenize_helper = TokenizeHelper(tokenizer, prefix_size, num_threads=i)
#     sueddeutsche_data = SueddeutscheData(tokenize_helper, parallel=True)
#     start_time = time.time()
#     test_sueddeutsche = sueddeutsche_data.get_tokenized_multilingual_ds("test")
#     elapsed = time.time() - start_time
#     print("{:.2f} sec".format(elapsed))
#     times.append(elapsed)                                
                                        

In [9]:
# test_sueddeutsche_ds = tf.data.Dataset.from_tensor_slices(test_sueddeutsche)

In [8]:
# Tokenization helper (num_threads=12) and the Sueddeutsche loader that uses it.
tokenize_helper = TokenizeHelper(tokenizer, prefix_size, num_threads=12)
sueddeutsche_data = SueddeutscheData(tokenize_helper, parallel=True)

In [9]:
# Tokenize the Sueddeutsche "test" split and report the wall-clock time it took.
start_time = time.time()
test_sueddeutsche = sueddeutsche_data.get_tokenized_multilingual_ds("test")
elapsed = time.time() - start_time
print("{:.2f} sec".format(elapsed))

16.66 sec


In [14]:
# Inspect the first component of the tokenized test split.
# NOTE(review): this prints every tensor in that component; consider showing only shapes.
test_sueddeutsche[0]

([<tf.Tensor: shape=(507,), dtype=int32, numpy=
  array([  316,  9387,     9,   221,   426, 13667,    15,  8655,     9,
           229,   732,  5702,  1559,     6,   732,   236, 10974,     7,
          1140,  1872,   624,     6,     3, 20348,    16,    74, 12822,
            10,  2432,  1300,  1458,  3789,     3,   547, 12526,     9,
         24688,    15,    17,    15,     6,    67, 12232,   155,  6125,
            77,    74,  3319,  4195,  6180,  1650, 19447,    46,    67,
         11019,    15,     3, 31616,     5,     3,  7577,  7852,     3,
            15,     7,   319,  4847,     3, 26621,    15, 10388,    15,
             6,   218,    67, 24688,    15,    17,    15,   289,    16,
         14103,     3,   295, 30054,    17,     3,   547,    10,   644,
          5083, 18475,    74, 12899,    23,     9,  1227, 15156,     9,
             6,    74,    34,     9, 10487,  1779,  6226,   195,    18,
            64, 12214,  3003,   776,    23,     6,  3310, 18246,     6,
           211, 

In [11]:
# Wrap the tokenized test split in a tf.data.Dataset (one element per sample)
# and display its element spec.
test_sueddeutsche_ds = tf.data.Dataset.from_tensor_slices(test_sueddeutsche)
test_sueddeutsche_ds

<TensorSliceDataset shapes: (((507,), (507,), (150,), (150,)), ((507,), (507,), (150,), (150,))), types: ((tf.int32, tf.int32, tf.int32, tf.int32), (tf.int32, tf.int32, tf.int32, tf.int32))>

In [13]:
# Write the test split to TFRecord files under ../../data/sueddeutsche_test.
# The argument 20 is presumably the number of writers/shards (the log reports
# indices 0-19) — confirm in TfRecordWriter.
tf_record_writer = TfRecordWriter(20)
tf_record_writer.write_to_tfrecord_file(test_sueddeutsche_ds, "../../data/sueddeutsche_test", "sueddeutsche_multilingual")

[11] Saved sueddeutsche_multilingual
[5] Saved sueddeutsche_multilingual
[17] Saved sueddeutsche_multilingual
[8] Saved sueddeutsche_multilingual
[10] Saved sueddeutsche_multilingual
[4] Saved sueddeutsche_multilingual
[6] Saved sueddeutsche_multilingual
[15] Saved sueddeutsche_multilingual
[3] Saved sueddeutsche_multilingual
[13] Saved sueddeutsche_multilingual
[7] Saved sueddeutsche_multilingual
[9] Saved sueddeutsche_multilingual
[12] Saved sueddeutsche_multilingual
[1] Saved sueddeutsche_multilingual
[14] Saved sueddeutsche_multilingual
[19] Saved sueddeutsche_multilingual
[18] Saved sueddeutsche_multilingual
[2] Saved sueddeutsche_multilingual
[16] Saved sueddeutsche_multilingual
[0] Saved sueddeutsche_multilingual


In [14]:
# Tokenize the Sueddeutsche "val" split.
val_sueddeutsche = sueddeutsche_data.get_tokenized_multilingual_ds("val")

In [15]:
# Convert the validation split to a tf.data.Dataset and write it to TFRecord
# files under ../../data/sueddeutsche_val.
val_sueddeutsche_ds = tf.data.Dataset.from_tensor_slices(val_sueddeutsche)
tf_record_writer = TfRecordWriter(20)
tf_record_writer.write_to_tfrecord_file(val_sueddeutsche_ds, "../../data/sueddeutsche_val", "sueddeutsche_multilingual")

[6] Saved sueddeutsche_multilingual
[15] Saved sueddeutsche_multilingual
[14] Saved sueddeutsche_multilingual
[1] Saved sueddeutsche_multilingual
[19] Saved sueddeutsche_multilingual
[10] Saved sueddeutsche_multilingual
[0] Saved sueddeutsche_multilingual
[4] Saved sueddeutsche_multilingual
[5] Saved sueddeutsche_multilingual
[7] Saved sueddeutsche_multilingual
[13] Saved sueddeutsche_multilingual
[8] Saved sueddeutsche_multilingual
[12] Saved sueddeutsche_multilingual
[2] Saved sueddeutsche_multilingual
[11] Saved sueddeutsche_multilingual
[16] Saved sueddeutsche_multilingual
[18] Saved sueddeutsche_multilingual
[17] Saved sueddeutsche_multilingual
[3] Saved sueddeutsche_multilingual
[9] Saved sueddeutsche_multilingual


In [16]:
# Re-create the helper and loader (same settings as for the test split) and
# tokenize the Sueddeutsche "train" split.
tokenize_helper = TokenizeHelper(tokenizer, prefix_size, num_threads=12)
sueddeutsche_data = SueddeutscheData(tokenize_helper, parallel=True)
train_sueddeutsche = sueddeutsche_data.get_tokenized_multilingual_ds("train")

In [17]:
# Convert the train split to a tf.data.Dataset, write it to TFRecord files under
# ../../data/sueddeutsche_train and report the wall-clock time of both steps.
start_time = time.time()
train_sueddeutsche_ds = tf.data.Dataset.from_tensor_slices(train_sueddeutsche)
# 16 is presumably the number of writers/shards (the log reports T:0-T:15) — confirm.
tf_record_writer = TfRecordWriter(16)
tf_record_writer.write_to_tfrecord_file(train_sueddeutsche_ds, "../../data/sueddeutsche_train", "sueddeutsche_multilingual")
elapsed = time.time() - start_time
print("{:.2f} sec".format(elapsed))

|T:8| [  999/13805] | ms/ds_point 216.65 |
|T:15| [  999/13805] | ms/ds_point 213.54 |
|T:1| [  999/13805] | ms/ds_point 221.78 |
|T:5| [  999/13805] | ms/ds_point 220.54 |
|T:7| [  999/13805] | ms/ds_point 220.49 |
|T:0| [  999/13805] | ms/ds_point 229.28 |
|T:9| [  999/13805] | ms/ds_point 229.73 |
|T:11| [  999/13805] | ms/ds_point 232.29 |
|T:2| [  999/13805] | ms/ds_point 237.38 |
|T:10| [  999/13805] | ms/ds_point 237.93 |
|T:3| [  999/13805] | ms/ds_point 242.56 |
|T:4| [  999/13805] | ms/ds_point 247.90 |
|T:14| [  999/13805] | ms/ds_point 248.41 |
|T:6| [  999/13805] | ms/ds_point 257.36 |
|T:12| [  999/13805] | ms/ds_point 260.45 |
|T:13| [  999/13805] | ms/ds_point 266.31 |
|T:8| [ 1999/13805] | ms/ds_point 227.53 |
|T:1| [ 1999/13805] | ms/ds_point 226.71 |
|T:0| [ 1999/13805] | ms/ds_point 226.18 |
|T:7| [ 1999/13805] | ms/ds_point 233.02 |
|T:9| [ 1999/13805] | ms/ds_point 225.36 |
|T:2| [ 1999/13805] | ms/ds_point 221.44 |
|T:10| [ 1999/13805] | ms/ds_point 222.43 |
|T:1

|T:11| [11999/13805] | ms/ds_point 245.48 |
|T:13| [11999/13805] | ms/ds_point 234.30 |
|T:12| [11999/13805] | ms/ds_point 241.20 |
|T:0| [12999/13805] | ms/ds_point 201.98 |
|T:7| [12999/13805] | ms/ds_point 234.54 |
|T:15| [12999/13805] | ms/ds_point 215.85 |
|T:8| [12999/13805] | ms/ds_point 252.20 |
|T:9| [12999/13805] | ms/ds_point 222.86 |
|T:5| [12999/13805] | ms/ds_point 246.58 |
|T:4| [12999/13805] | ms/ds_point 229.07 |
|T:1| [12999/13805] | ms/ds_point 224.16 |
|T:3| [12999/13805] | ms/ds_point 240.71 |
|T:10| [12999/13805] | ms/ds_point 239.45 |
|T:6| [12999/13805] | ms/ds_point 229.65 |
|T:2| [12999/13805] | ms/ds_point 233.71 |
|T:14| [12999/13805] | ms/ds_point 232.10 |
|T:11| [12999/13805] | ms/ds_point 237.69 |
|T:12| [12999/13805] | ms/ds_point 226.38 |
|T:13| [12999/13805] | ms/ds_point 241.66 |
[0] Saved sueddeutsche_multilingual
[7] Saved sueddeutsche_multilingual
[15] Saved sueddeutsche_multilingual
[8] Saved sueddeutsche_multilingual
[5] Saved sueddeutsche_multil

### CNN Daily Mail

In [10]:
# Tokenization helper (num_threads=16) and the CNN/Daily Mail loader that uses it.
tokenize_helper = TokenizeHelper(tokenizer, prefix_size, num_threads=16)
cnn_daily_mail_data = CnnDailyMailData(tokenize_helper, parallel=True)

In [11]:
# Tokenize the CNN/Daily Mail "test" split and wrap it in a tf.data.Dataset.
test_cnn_daily_mail_data = cnn_daily_mail_data.get_tokenized_multilingual_ds("test")
test_cnn_daily_mail_data_ds = tf.data.Dataset.from_tensor_slices(test_cnn_daily_mail_data)

In [11]:
# Write the test split to TFRecord files under ../../data/cnn_daily_mail_test.
tf_record_writer = TfRecordWriter(20)
tf_record_writer.write_to_tfrecord_file(test_cnn_daily_mail_data_ds, "../../data/cnn_daily_mail_test", "cnn_daily_mail_multilingual")

[12] Saved cnn_daily_mail_multilingual
[1] Saved cnn_daily_mail_multilingual
[11] Saved cnn_daily_mail_multilingual
[18] Saved cnn_daily_mail_multilingual
[9] Saved cnn_daily_mail_multilingual
[16] Saved cnn_daily_mail_multilingual
[0] Saved cnn_daily_mail_multilingual
[15] Saved cnn_daily_mail_multilingual
[6] Saved cnn_daily_mail_multilingual
[13] Saved cnn_daily_mail_multilingual
[5] Saved cnn_daily_mail_multilingual
[10] Saved cnn_daily_mail_multilingual
[7] Saved cnn_daily_mail_multilingual
[8] Saved cnn_daily_mail_multilingual
[2] Saved cnn_daily_mail_multilingual
[3] Saved cnn_daily_mail_multilingual
[4] Saved cnn_daily_mail_multilingual
[19] Saved cnn_daily_mail_multilingual
[14] Saved cnn_daily_mail_multilingual
[17] Saved cnn_daily_mail_multilingual


In [12]:
# Tokenize the CNN/Daily Mail "val" split and wrap it in a tf.data.Dataset.
val_cnn_daily_mail_data = cnn_daily_mail_data.get_tokenized_multilingual_ds("val")
val_cnn_daily_mail_data_ds = tf.data.Dataset.from_tensor_slices(val_cnn_daily_mail_data)

# Write the validation split to TFRecord files under ../../data/cnn_daily_mail_val.
tf_record_writer = TfRecordWriter(20)
tf_record_writer.write_to_tfrecord_file(val_cnn_daily_mail_data_ds, "../../data/cnn_daily_mail_val", "cnn_daily_mail_multilingual")

[18] Saved cnn_daily_mail_multilingual
[3] Saved cnn_daily_mail_multilingual
[1] Saved cnn_daily_mail_multilingual
[19] Saved cnn_daily_mail_multilingual
[17] Saved cnn_daily_mail_multilingual
[11] Saved cnn_daily_mail_multilingual
[5] Saved cnn_daily_mail_multilingual
[0] Saved cnn_daily_mail_multilingual
[2] Saved cnn_daily_mail_multilingual
[7] Saved cnn_daily_mail_multilingual
[15] Saved cnn_daily_mail_multilingual
[4] Saved cnn_daily_mail_multilingual
[14] Saved cnn_daily_mail_multilingual
[13] Saved cnn_daily_mail_multilingual
[9] Saved cnn_daily_mail_multilingual
[10] Saved cnn_daily_mail_multilingual
[8] Saved cnn_daily_mail_multilingual
[12] Saved cnn_daily_mail_multilingual
[16] Saved cnn_daily_mail_multilingual
[6] Saved cnn_daily_mail_multilingual


In [13]:
# Tokenize the CNN/Daily Mail "train" split.
train_cnn_daily_mail_data = cnn_daily_mail_data.get_tokenized_multilingual_ds("train")

In [14]:
# Wrap the tokenized train split in a tf.data.Dataset.
train_cnn_daily_mail_data_ds = tf.data.Dataset.from_tensor_slices(train_cnn_daily_mail_data)

# Write the train split to TFRecord files under ../../data/cnn_daily_mail_train.
tf_record_writer = TfRecordWriter(16)
tf_record_writer.write_to_tfrecord_file(train_cnn_daily_mail_data_ds, "../../data/cnn_daily_mail_train", "cnn_daily_mail_multilingual")

|T:7| [  999/17944] | ms/ds_point 218.61 |
|T:5| [  999/17944] | ms/ds_point 220.42 |
|T:6| [  999/17944] | ms/ds_point 232.11 |
|T:4| [  999/17944] | ms/ds_point 239.35 |
|T:8| [  999/17944] | ms/ds_point 247.85 |
|T:3| [  999/17944] | ms/ds_point 411.63 |
|T:2| [  999/17944] | ms/ds_point 424.15 |
|T:9| [  999/17944] | ms/ds_point 430.51 |
|T:1| [  999/17944] | ms/ds_point 436.64 |
|T:0| [  999/17944] | ms/ds_point 438.97 |
|T:7| [ 1999/17944] | ms/ds_point 239.98 |
|T:6| [ 1999/17944] | ms/ds_point 239.41 |
|T:8| [ 1999/17944] | ms/ds_point 223.44 |
|T:10| [  999/17944] | ms/ds_point 477.10 |
|T:5| [ 1999/17944] | ms/ds_point 266.55 |
|T:4| [ 1999/17944] | ms/ds_point 301.18 |
|T:11| [  999/17944] | ms/ds_point 613.35 |
|T:12| [  999/17944] | ms/ds_point 615.02 |
|T:9| [ 1999/17944] | ms/ds_point 190.81 |
|T:3| [ 1999/17944] | ms/ds_point 216.63 |
|T:14| [  999/17944] | ms/ds_point 616.82 |
|T:13| [  999/17944] | ms/ds_point 625.12 |
|T:15| [  999/17944] | ms/ds_point 592.69 |
|T:2|

|T:5| [12999/17944] | ms/ds_point 211.98 |
|T:13| [11999/17944] | ms/ds_point 190.67 |
|T:1| [11999/17944] | ms/ds_point 212.90 |
|T:11| [11999/17944] | ms/ds_point 190.13 |
|T:12| [11999/17944] | ms/ds_point 190.20 |
|T:0| [11999/17944] | ms/ds_point 217.82 |
|T:14| [11999/17944] | ms/ds_point 196.15 |
|T:4| [12999/17944] | ms/ds_point 205.80 |
|T:10| [12999/17944] | ms/ds_point 188.63 |
|T:6| [13999/17944] | ms/ds_point 210.38 |
|T:7| [13999/17944] | ms/ds_point 194.95 |
|T:8| [13999/17944] | ms/ds_point 208.17 |
|T:15| [11999/17944] | ms/ds_point 195.78 |
|T:9| [13999/17944] | ms/ds_point 185.92 |
|T:3| [12999/17944] | ms/ds_point 206.05 |
|T:13| [12999/17944] | ms/ds_point 184.16 |
|T:1| [12999/17944] | ms/ds_point 192.22 |
|T:12| [12999/17944] | ms/ds_point 188.01 |
|T:2| [12999/17944] | ms/ds_point 228.33 |
|T:11| [12999/17944] | ms/ds_point 197.57 |
|T:5| [13999/17944] | ms/ds_point 206.09 |
|T:14| [12999/17944] | ms/ds_point 195.60 |
|T:0| [12999/17944] | ms/ds_point 212.35 |
|

## Read Data

In [14]:
import numpy as np
from os import listdir
# Fixed lengths of the stored features: the article length includes the language
# prefix (the records store MAX_ARTICLE_LEN - prefix_size ids per article).
MAX_ARTICLE_LEN, MAX_HIGHLIGHT_LEN = 512, 150
# Number of samples per batch produced by get_tfrecord_dataset.
GLOBAL_BATCH_SIZE = 4
# NOTE(review): `bucket` is not used by any cell visible in this notebook.
bucket = ""

def get_tf_record_files(directory):
    """Return the paths of all ``*.tfrecord`` files directly inside ``directory``.

    The result is sorted by file name so that the order in which shards are
    read does not depend on the arbitrary order returned by ``listdir``.

    Args:
        directory: Path of the folder to scan (not searched recursively).

    Returns:
        A list of ``"<directory>/<file name>"`` strings, one per file whose
        name ends in ``.tfrecord``; empty if there is none.

    Raises:
        OSError: If ``directory`` cannot be listed (propagated from ``listdir``).
    """
    return [
        "{}/{}".format(directory, item)
        for item in sorted(listdir(directory))
        # endswith() requires the dot, so a file literally named "tfrecord" is skipped.
        if item.endswith(".tfrecord")
    ]

def get_tfrecord_dataset(folder):
    """Build a shuffled, batched dataset from the TFRecord files in ``folder``.

    Every record stores a German (``ger_*``) and an English (``en_*``) variant
    of the features ``x``, ``x_mask``, ``y`` and ``y_ids``. For each record one
    of the four (input language, target language) combinations is drawn at
    random, and the matching language prefix is prepended to the input ids
    (the mask is extended with ones for the prefix positions).

    Args:
        folder: Directory containing the ``*.tfrecord`` files to read.

    Returns:
        A ``tf.data.Dataset`` yielding batches of ``GLOBAL_BATCH_SIZE`` elements,
        each a 4-tuple ``(x_with_prefix, x_mask_with_prefix, y, y_ids)`` of
        ``tf.int32`` tensors.
    """
    x_len = MAX_ARTICLE_LEN - prefix_size
    features = {
        'ger_x': tf.io.FixedLenFeature([x_len], tf.int64),
        'ger_x_mask': tf.io.FixedLenFeature([x_len], tf.int64),
        'ger_y': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN], tf.int64),
        'ger_y_ids': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN], tf.int64),

        'en_x': tf.io.FixedLenFeature([x_len], tf.int64),
        'en_x_mask': tf.io.FixedLenFeature([x_len], tf.int64),
        'en_y': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN], tf.int64),
        'en_y_ids': tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN], tf.int64),
    }

    dataset = tf.data.TFRecordDataset(get_tf_record_files(folder))

    # Taken from the TensorFlow models repository: https://github.com/tensorflow/models/blob/befbe0f9fe02d6bc1efb1c462689d069dae23af1/official/nlp/bert/input_pipeline.py#L24
    def decode_record(record, features):
        """Decodes a record to a TensorFlow example."""
        example = tf.io.parse_single_example(record, features)

        # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
        # So cast all int64 to int32.
        for name in list(example.keys()):
            t = example[name]
            if t.dtype == tf.int64:
                t = tf.cast(t, tf.int32)
            example[name] = t
        return example

    # (language prefix, input-language key, target-language key) per direction.
    directions = [
        (language_tokens.de_de_prefix, 'ger', 'ger'),
        (language_tokens.en_de_prefix, 'en', 'ger'),
        (language_tokens.de_en_prefix, 'ger', 'en'),
        (language_tokens.en_en_prefix, 'en', 'en'),
    ]

    def select_data_from_record(record):
        """Pick one of the four language directions at random for this record."""
        prefix_mask = tf.ones(prefix_size, dtype=tf.int32)

        def make_branch(prefix, src, tgt):
            def branch():
                return (
                    tf.concat([prefix, record[src + '_x']], axis=0),
                    tf.concat([prefix_mask, record[src + '_x_mask']], axis=0),
                    record[tgt + '_y'],
                    record[tgt + '_y_ids'],
                )
            return branch

        # Dataset.map traces this function once into a graph, so a Python/NumPy
        # random number would be frozen at trace time and every record would get
        # the same direction. Draw the index with a TensorFlow op instead so it
        # is re-sampled for every element.
        choice = tf.random.uniform([], minval=0, maxval=len(directions), dtype=tf.int32)
        x, x_mask, y, y_ids = tf.switch_case(
            choice, [make_branch(*direction) for direction in directions]
        )
        # Return an explicit tuple so tf.data treats the result as four components.
        return x, x_mask, y, y_ids

    dataset = dataset.map(lambda record: decode_record(record, features))
    dataset = dataset.map(select_data_from_record)
    dataset = dataset.shuffle(10000)
    return dataset.batch(GLOBAL_BATCH_SIZE)


# The TFRecord files are written below "../../data/" by the cells further up,
# so read them from the same location.
root_folder = "../../data/"
# NOTE(review): the variable is called `validation_dataset` but the *train*
# folder is loaded — confirm whether "cnn_daily_mail_val" was intended.
validation_dataset = get_tfrecord_dataset(root_folder + "cnn_daily_mail_train")