In [1]:
import tensorflow as tf
print(tf.__version__)

import os
from transformers import ( 
    T5Tokenizer
)

import numpy as np

import pickle

2.4.1


In [2]:
# Base path that dataset folder names are appended to when TFRecords are located.
root_folder = "../.."

# Checkpoint name handed to T5Tokenizer.from_pretrained.
model_size = "t5-base"

# Token lengths of the article (task prefix included) and highlight features.
MAX_ARTICLE_LEN, MAX_HIGHLIGHT_LEN = 512, 150

# Number of examples per batch produced by the input pipeline.
BATCH_SIZE = 8

In [3]:
class LanguageTokens:
    """Tokenized task prefixes for the four summarization language directions.

    Each prompt of the form "summarize <source> to <target>: " is passed through
    ``tokenizer`` once, flattened to a 1-D sequence of ids and stored without a
    trailing end-of-sequence id, so it can be concatenated in front of other ids.

    Attributes:
        en_de_prefix: ids of "summarize English to German: ".
        de_en_prefix: ids of "summarize German to English: ".
        en_en_prefix: ids of "summarize English to English: ".
        de_de_prefix: ids of "summarize German to German: ".
        prefix_size: number of ids in each prefix (all four are asserted equal).
    """

    # End-of-sequence id used when the tokenizer does not expose `eos_token_id`.
    _FALLBACK_EOS_ID = 1

    def __init__(self, tokenizer, tf_or_pt: str) -> None:
        """Encode the four prefixes.

        Args:
            tokenizer: callable invoked as ``tokenizer(text, return_tensors=...)``
                whose result exposes ``input_ids``. Its ``eos_token_id`` attribute,
                when present and not None, identifies the id to strip.
            tf_or_pt: forwarded as ``return_tensors``. "tf" flattens with
                ``tf.reshape``; any other value relies on the returned ids
                object having its own ``reshape`` method (as with "pt").

        Raises:
            AssertionError: if the four prefixes do not have the same length.
        """
        eos_id = getattr(tokenizer, "eos_token_id", None)
        if eos_id is None:
            eos_id = self._FALLBACK_EOS_ID

        self.en_de_prefix = self._encode_prefix(tokenizer, "summarize English to German: ", tf_or_pt, eos_id)
        self.de_en_prefix = self._encode_prefix(tokenizer, "summarize German to English: ", tf_or_pt, eos_id)
        self.en_en_prefix = self._encode_prefix(tokenizer, "summarize English to English: ", tf_or_pt, eos_id)
        self.de_de_prefix = self._encode_prefix(tokenizer, "summarize German to German: ", tf_or_pt, eos_id)

        assert self.en_de_prefix.shape[0] == self.de_en_prefix.shape[0] == self.en_en_prefix.shape[0] == self.de_de_prefix.shape[0], "All perfixes must have the same size"
        self.prefix_size = self.en_de_prefix.shape[0]

    @staticmethod
    def _encode_prefix(tokenizer, prompt, tf_or_pt, eos_id):
        """Tokenize ``prompt``, flatten the ids to 1-D and drop one trailing ``eos_id``."""
        ids = tokenizer(prompt, return_tensors=tf_or_pt).input_ids
        if tf_or_pt == "tf":
            ids = tf.reshape(ids, (-1,))
        else:
            ids = ids.reshape(-1)

        # Every prefix is checked on its own, and an empty sequence is left untouched.
        if ids.shape[0] > 0 and int(ids[-1]) == eos_id:
            ids = ids[:-1]
        return ids
# Tokenizer of the configured checkpoint; the task prefixes are requested as TensorFlow tensors.
tokenizer = T5Tokenizer.from_pretrained(model_size)
language_tokens = LanguageTokens(tokenizer=tokenizer, tf_or_pt="tf")
prefix_size = language_tokens.prefix_size

# Short names of the four language directions.
language_token_order = "de_de en_de de_en en_en".split()

In [4]:
# Taken from the TensorFlow models repository: https://github.com/tensorflow/models/blob/befbe0f9fe02d6bc1efb1c462689d069dae23af1/official/nlp/bert/input_pipeline.py#L24
class TFRecordLoader:
    """Builds a batched ``tf.data`` pipeline from TFRecord files.

    Every record is parsed into German/English article and highlight features and
    expanded into four (input ids, input mask, target, target ids) groups, one per
    language direction, with the matching task prefix put in front of the article.
    """

    def __init__(self, root_folder, language_tokens, is_colab=True) -> None:
        """Store the loader configuration.

        Args:
            root_folder: base path that dataset folder names are appended to.
            language_tokens: object providing ``de_de_prefix``, ``en_de_prefix``,
                ``de_en_prefix``, ``en_en_prefix`` and ``prefix_size``.
            is_colab: when true, directories are listed with ``gsutil``;
                otherwise with ``os.listdir``.
        """
        self.root_folder = root_folder
        self.is_colab = is_colab
        self.language_tokens = language_tokens

    @staticmethod
    def listdir_gcp(directory):
        """Return the output lines of ``gsutil ls -r <directory>``.

        The command is run with an argument list (no shell), so ``directory`` is
        never interpreted by a shell and no IPython-only syntax is required.

        Raises:
            FileNotFoundError: if the ``gsutil`` executable cannot be found.
            subprocess.CalledProcessError: if ``gsutil`` exits with a non-zero status.
        """
        import subprocess  # local import: only needed when listing through gsutil

        completed = subprocess.run(
            ["gsutil", "ls", "-r", str(directory)],
            stdout=subprocess.PIPE,
            universal_newlines=True,
            check=True,
        )
        return completed.stdout.splitlines()

    def get_tf_record_files(self, directory):
        """Return the sorted paths under ``directory`` that end in ``.tfrecord``.

        Sorting makes the file order, and with it the record order of the
        dataset built from these files, independent of the listing order.
        """
        if self.is_colab:
            candidates = self.listdir_gcp(directory)
        else:
            candidates = [os.path.join(directory, name) for name in os.listdir(directory)]

        # Require a real ".tfrecord" suffix; a file merely named "tfrecord" is not a match.
        return sorted(str(path) for path in candidates if str(path).endswith(".tfrecord"))

    def decode_record(self, record, features):
        """Decodes a record to a TensorFlow example."""
        example = tf.io.parse_single_example(record, features)

        # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
        # So cast all int64 to int32.
        for name in list(example.keys()):
            tensor = example[name]
            if tensor.dtype == tf.int64:
                tensor = tf.cast(tensor, tf.int32)
            example[name] = tensor
        return example

    def select_data_from_record(self, record):
        """Expand one decoded record into the four language-direction groups.

        Returns:
            A list of 16 tensors: for each of de_de, en_de, de_en and en_en (in
            that order) the prefixed source ids, the source mask extended by
            ones for the prefix positions, the target and the target ids.
        """
        tokens = self.language_tokens
        # The mask positions covering the prefix are always attended to.
        prefix_mask = tf.ones(tokens.prefix_size, dtype=tf.int32)

        # (prefix ids, source language key, target language key)
        directions = [
            (tokens.de_de_prefix, 'ger', 'ger'),
            (tokens.en_de_prefix, 'en', 'ger'),
            (tokens.de_en_prefix, 'ger', 'en'),
            (tokens.en_en_prefix, 'en', 'en'),
        ]

        selected = []
        for prefix, source, target in directions:
            selected.extend([
                tf.concat([prefix, record[source + '_x']], axis=0),
                tf.concat([prefix_mask, record[source + '_x_mask']], axis=0),
                record[target + '_y'],
                record[target + '_y_ids'],
            ])
        return selected

    def get_tfrecord_dataset(self, folder):
        """Build the batched dataset for the TFRecords in ``root_folder + folder``.

        Article features are parsed with length ``MAX_ARTICLE_LEN`` minus the
        prefix size of this loader's ``language_tokens``; highlight features with
        length ``MAX_HIGHLIGHT_LEN``. Batches contain ``BATCH_SIZE`` examples.
        """
        article_feature = tf.io.FixedLenFeature(
            [MAX_ARTICLE_LEN - self.language_tokens.prefix_size], tf.int64)
        highlight_feature = tf.io.FixedLenFeature([MAX_HIGHLIGHT_LEN], tf.int64)
        features = {
            'ger_x': article_feature,
            'ger_x_mask': article_feature,
            'ger_y': highlight_feature,
            'ger_y_ids': highlight_feature,

            'en_x': article_feature,
            'en_x_mask': article_feature,
            'en_y': highlight_feature,
            'en_y_ids': highlight_feature,
        }

        files = self.get_tf_record_files(self.root_folder + folder)
        dataset = tf.data.TFRecordDataset(files)

        dataset = dataset.map(lambda record: self.decode_record(record, features))
        dataset = dataset.map(self.select_data_from_record)
        return dataset.batch(BATCH_SIZE)

In [5]:
tf_record_loader = TFRecordLoader(root_folder, language_tokens, is_colab=False)

test_dataset = tf_record_loader.get_tfrecord_dataset("/data/cnn_daily_mail_test/")
# prefetch() returns a new dataset instead of modifying the one it is called on,
# so the result has to be bound to the name that is used afterwards.
test_dataset = test_dataset.prefetch(1024)
test_dataset

<PrefetchDataset shapes: ((None, 512), (None, 512), (None, 150), (None, 150), (None, 512), (None, 512), (None, 150), (None, 150), (None, 512), (None, 512), (None, 150), (None, 150), (None, 512), (None, 512), (None, 150), (None, 150)), types: (tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32, tf.int32)>

In [6]:
def get_all_language_combinations(ds):
    """Yield the items of ``ds`` as four consecutive 4-tuples.

    Positions 0-3 form the first tuple, 4-7 the second, 8-11 the third and
    12-15 the fourth; ``ds`` only needs to support integer indexing.
    """
    group_size = 4
    for start in range(0, 4 * group_size, group_size):
        yield ds[start], ds[start + 1], ds[start + 2], ds[start + 3]


In [7]:
def decode(batch):
    """Decode every id sequence in ``batch`` to text with the notebook's ``tokenizer``.

    Special tokens are skipped and tokenization-space clean-up is disabled.
    Returns a list with one string per sequence, in the order of ``batch``.
    """
    texts = []
    for sequence in batch:
        text = tokenizer.decode(
            sequence,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        texts.append(text)
    return texts

In [8]:
def test_encoding(ds):
    """Print the decoded input ids of each language combination in the first element of ``ds``."""
    for first_element in ds.take(1):
        for combination in get_all_language_combinations(first_element):
            # Only the input ids are printed; the mask and targets are unpacked but unused.
            source_ids, _source_mask, _target, _target_ids = combination
            print(decode(source_ids))
        
# Print the decoded inputs of the first test batch, one list per language combination.
test_encoding(ds=test_dataset)

['summarize German to German: Eine Gruppe Chelsea-Fans stürmte diese Woche das Emirates Stadium von Arsenal in einem unbeschwerten Versuch, ein blaues Zeichen zu setzen, während sich die Premier-League-Clubs auf den Kampf am Sonntag vorbereiten. Die vier Fans bezeichneten sich selbst als "C-Team" und trugen Jose Mourinho, Diego Costa, Didier Drogba und Cesc Fabregas Masken, um ihre Identität zu verbergen, bevor sie vor Arsenals Haus im Norden Londons parkten. Als der berühmte Soundtrack des A-Teams einsetzt, behauptet ein Sprecher: "Im Jahr 2004 hat ein portugiesischer General, der als Spezialeinheit bekannt ist, eine Crack-Kommandoeinheit zusammengestellt. Die Gruppe der Chelsea-Fans zündete blaue Rauchbomben und verhüllte das Arsenal-Schild vor den Emiraten. Ein Mann mit einer Jose-Mourinho-Maske blickt beim Chelsea-Fankanal-Streich in die Kamera. Einer der Männer mit einer Diego-Costa-Maske feiert, als blauer Rauch die rote Seite Londons erreicht. "Diese Männer dominierten die Premi

["summarize English to German: A group of Chelsea fans stormed Arsenals Emirates Stadium this week in a lighthearted attempt to lay a blue marker down as the Premier League clubs prepare to do battle on Sunday. The four fans labelled themselves the 'C-Team' while wearing Jose Mourinho, Diego Costa, Didier Drogba and Cesc Fabregas masks to hide their identity before parking up outside Arsenal's north London home. As the famous A-Team soundtrack kicks in, a narrator claims: 'In 2004, a Portuguese general known as the special one put together a crack commando unit. The group of Chelsea fans set off blue smoke bombs and covered the Arsenal sign outside the Emirates . A man in a Jose Mourinho mask looks to the camera during the Chelsea Fans Channel prank . One of the men, wearing a Diego Costa mask, celebrates as blue smoke hits the red side of London . 'These men dominated the Premier League from their west London underground. 'Today's mission: To bring a touch of class to the war-torn are

['summarize German to English: Eine Gruppe Chelsea-Fans stürmte diese Woche das Emirates Stadium von Arsenal in einem unbeschwerten Versuch, ein blaues Zeichen zu setzen, während sich die Premier-League-Clubs auf den Kampf am Sonntag vorbereiten. Die vier Fans bezeichneten sich selbst als "C-Team" und trugen Jose Mourinho, Diego Costa, Didier Drogba und Cesc Fabregas Masken, um ihre Identität zu verbergen, bevor sie vor Arsenals Haus im Norden Londons parkten. Als der berühmte Soundtrack des A-Teams einsetzt, behauptet ein Sprecher: "Im Jahr 2004 hat ein portugiesischer General, der als Spezialeinheit bekannt ist, eine Crack-Kommandoeinheit zusammengestellt. Die Gruppe der Chelsea-Fans zündete blaue Rauchbomben und verhüllte das Arsenal-Schild vor den Emiraten. Ein Mann mit einer Jose-Mourinho-Maske blickt beim Chelsea-Fankanal-Streich in die Kamera. Einer der Männer mit einer Diego-Costa-Maske feiert, als blauer Rauch die rote Seite Londons erreicht. "Diese Männer dominierten die Prem

["summarize English to English: A group of Chelsea fans stormed Arsenals Emirates Stadium this week in a lighthearted attempt to lay a blue marker down as the Premier League clubs prepare to do battle on Sunday. The four fans labelled themselves the 'C-Team' while wearing Jose Mourinho, Diego Costa, Didier Drogba and Cesc Fabregas masks to hide their identity before parking up outside Arsenal's north London home. As the famous A-Team soundtrack kicks in, a narrator claims: 'In 2004, a Portuguese general known as the special one put together a crack commando unit. The group of Chelsea fans set off blue smoke bombs and covered the Arsenal sign outside the Emirates . A man in a Jose Mourinho mask looks to the camera during the Chelsea Fans Channel prank . One of the men, wearing a Diego Costa mask, celebrates as blue smoke hits the red side of London . 'These men dominated the Premier League from their west London underground. 'Today's mission: To bring a touch of class to the war-torn ar