In [1]:
!pip install -r requirements.txt

Collecting blis==0.7.4
  Downloading blis-0.7.4-cp36-cp36m-manylinux2014_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 6.7 MB/s eta 0:00:01
Collecting catalogue==2.0.1
  Downloading catalogue-2.0.1-py3-none-any.whl (9.6 kB)
Collecting click==7.1.2
  Downloading click-7.1.2-py2.py3-none-any.whl (82 kB)
[K     |████████████████████████████████| 82 kB 13.3 MB/s eta 0:00:01
[?25hCollecting contextvars==2.4
  Downloading contextvars-2.4.tar.gz (9.6 kB)
Collecting cymem==2.0.5
  Downloading cymem-2.0.5-cp36-cp36m-manylinux2014_x86_64.whl (35 kB)
Collecting dataclasses==0.8
  Downloading dataclasses-0.8-py3-none-any.whl (19 kB)
Collecting filelock==3.0.12
  Downloading filelock-3.0.12-py3-none-any.whl (7.6 kB)
Collecting gcloud==0.18.3
  Downloading gcloud-0.18.3.tar.gz (454 kB)
[K     |████████████████████████████████| 454 kB 13.4 MB/s eta 0:00:01
Collecting googleapis-common-protos==1.53.0
  Downloading googleapis_common_protos-1.53.0-py2.py3-none-any.whl (198 kB)

Collecting murmurhash==1.0.5
  Downloading murmurhash-1.0.5-cp36-cp36m-manylinux2014_x86_64.whl (20 kB)
Collecting nltk==3.5
  Downloading nltk-3.5.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 13.3 MB/s eta 0:00:01
Collecting oauth2client==4.1.3
  Downloading oauth2client-4.1.3-py2.py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 15.2 MB/s eta 0:00:01
Collecting pandas==1.1.5
  Downloading pandas-1.1.5-cp36-cp36m-manylinux1_x86_64.whl (9.5 MB)
[K     |████████████████████████████████| 9.5 MB 12.1 MB/s eta 0:00:01
Collecting pathy==0.4.0
  Downloading pathy-0.4.0-py3-none-any.whl (36 kB)
Collecting preshed==3.0.5
  Downloading preshed-3.0.5-cp36-cp36m-manylinux2014_x86_64.whl (126 kB)
[K     |████████████████████████████████| 126 kB 12.8 MB/s eta 0:00:01
Collecting pydantic==1.7.3
  Downloading pydantic-1.7.3-cp36-cp36m-manylinux2014_x86_64.whl (9.2 MB)
[K     |████████████████████████████████| 9.2 MB 10.0 MB/s eta 0:00:01
[?25hCollecting p

Collecting thinc==8.0.2
  Downloading thinc-8.0.2-cp36-cp36m-manylinux2014_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 12.7 MB/s eta 0:00:01
[?25hCollecting tokenizers==0.10.1
  Downloading tokenizers-0.10.1-cp36-cp36m-manylinux2010_x86_64.whl (3.2 MB)
[K     |████████████████████████████████| 3.2 MB 12.8 MB/s eta 0:00:01
[?25hCollecting torch==1.8.1
  Downloading torch-1.8.1-cp36-cp36m-manylinux1_x86_64.whl (804.1 MB)
[K     |████████████████████████████████| 804.1 MB 13.4 MB/s eta 0:00:01    |▋                               | 16.4 MB 12.0 MB/s eta 0:01:06     |█████▊                          | 144.5 MB 12.4 MB/s eta 0:00:54     |██████▉                         | 172.1 MB 12.0 MB/s eta 0:00:53     |██████████████                  | 349.4 MB 12.1 MB/s eta 0:00:38     |███████████████▉                | 398.0 MB 12.1 MB/s eta 0:00:34     |███████████████████             | 478.8 MB 13.7 MB/s eta 0:00:24     |███████████████████▉            | 498.6 MB 11.9 MB/

In [7]:
import tensorflow as tf
print(tf.__version__)

from os import listdir
from transformers import ( 
    T5Tokenizer, 
    TFT5ForConditionalGeneration
)

import numpy as np
import time
import re
import pickle

# import python files
import sys
sys.path.append("../..")

# spacy 
from spacy.lang.de import German 
from spacy.lang.en import English 

from python_files.language_tokens import LanguageTokens
from python_files.model_loader import ModelLoader
from python_files.tf_record_loader import TFRecordLoader


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

2.4.1


In [8]:
from python_files.dataset.sueddeutsche import SueddeutscheData
from python_files.dataset.tokenize_helper import TokenizeHelper
from python_files.dataset.tf_record_writer import TfRecordWriter

In [9]:
# Name of the pretrained checkpoint; passed to T5Tokenizer.from_pretrained below.
model_size = "t5-base"

# Token budget for an encoded article (presumably including the prefix tokens — TODO confirm).
MAX_ARTICLE_LEN = 512

# Token budget for an encoded highlight (summary).
MAX_HIGHLIGHT_LEN = 150

# NOTE(review): not referenced in the cells visible here — confirm whether it is still needed.
BATCH_SIZE = 8

# Repository root relative to this notebook (same location that is appended to sys.path).
root_folder = "../.."

In [10]:
# Show which GPUs TensorFlow can see before they are hidden in the next cell.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [11]:
# Run this notebook on the CPU only: hide every GPU from TensorFlow.
try:
    # Disable all GPUS
    tf.config.set_visible_devices([], 'GPU')
except (ValueError, RuntimeError) as err:
    # ValueError: invalid device; RuntimeError: visible devices cannot be
    # modified once the TensorFlow runtime has been initialized.
    # Keep going (best effort), but do not hide the fact that GPUs stay visible.
    print("Could not disable GPUs, continuing with current devices: {}".format(err))
else:
    # Verify outside the try block so a failed check is not silently swallowed.
    visible_devices = tf.config.get_visible_devices()
    for device in visible_devices:
        assert device.device_type != 'GPU'

In [12]:
# Load the pretrained T5 tokenizer for the checkpoint named by `model_size`.
tokenizer = T5Tokenizer.from_pretrained(model_size)
# Project helper built on top of the tokenizer; "tf" presumably selects TensorFlow tensors — TODO confirm.
language_tokens = LanguageTokens(tokenizer, "tf")
# Subtracted from the article token budget when encoding articles (presumably the number of prefix tokens).
prefix_size = language_tokens.prefix_size
# NOTE(review): looks like <input language>_<output language> task names — confirm; unused in the cells visible here.
language_token_order = ["de_de", "en_de", "de_en", "en_en"]

In [13]:
def transform(x):
    """Clean one raw line of the Sueddeutsche data files.

    Everything before the first "; " separator is discarded and the remaining
    "; "-separated pieces are joined with single spaces. Afterwards the text
    between the first and the last single quote of a line replaces the quoted
    span (i.e. one enclosing pair of quotes is removed).

    Args:
        x: Raw line (without trailing newline).

    Returns:
        The cleaned text; an empty string if the line has no "; " separator.
    """
    _, *remaining_fields = x.split("; ")
    joined = " ".join(remaining_fields)
    # Greedy match: strips the outermost quote pair, inner quotes are kept.
    return re.sub("'(.*)'", r"\1", joined)

def get_sueddeutsche_data(name, language):
    """Load the articles and highlights of one Sueddeutsche split.

    The files are read from ``../../data/sueddeutsche/``. English files carry
    an additional ``_cleaned`` suffix, all other languages use the plain name.
    Every line is stripped of trailing whitespace and passed through
    ``transform``.

    Args:
        name: Split name used in the file name (e.g. "test").
        language: Language code used in the file name (e.g. "de" or "en").

    Returns:
        A tuple ``(articles, highlights)`` of two equally long lists of strings.

    Raises:
        AssertionError: If the two files contain a different number of lines.
        OSError: If one of the files cannot be opened.
    """
    suffix = "_cleaned" if language == "en" else ""
    highlights_path = "../../data/sueddeutsche/highlights_{}_{}{}".format(language, name, suffix)
    article_path = "../../data/sueddeutsche/articles_{}_{}{}".format(language, name, suffix)

    # Context managers make sure the file handles are closed again.
    with open(article_path) as article_file:
        articles = [transform(line.rstrip()) for line in article_file]
    with open(highlights_path) as highlights_file:
        highlights = [transform(line.rstrip()) for line in highlights_file]

    assert len(articles) == len(highlights), "sueddeutsche articles:{} highlights:{}".format(len(articles),
                                                                                             len(highlights))
    return articles, highlights

In [14]:
# Load the German and the English test split; each call returns (articles, highlights).
de_x, de_y = get_sueddeutsche_data("test", "de")
en_x, en_y = get_sueddeutsche_data("test", "en")

In [15]:
# Sanity check: inspect the token ids produced for a short string that ends in a period.
tokenizer("x.")

{'input_ids': [3, 226, 5, 1], 'attention_mask': [1, 1, 1, 1]}

In [16]:
# Sanity check: token id 5 decodes to "."; this id is used as the sentence delimiter further down.
tokenizer.decode([5])

'.'

In [17]:
def split_at_values(lst, values):
    """Split a sequence into chunks that each end with a delimiter element.

    Every element of ``lst`` that is contained in ``values`` closes the current
    chunk and stays part of it. Whatever follows the last delimiter forms the
    final chunk, which is empty when ``lst`` ends with a delimiter (or is
    empty itself).

    Args:
        lst: Sequence to split (must support slicing).
        values: Container of delimiter elements.

    Returns:
        List of chunks; concatenating them yields ``lst`` again.
    """
    chunks = []
    chunk_start = 0
    for position, element in enumerate(lst):
        if element in values:
            # Close the chunk right after the delimiter.
            chunks.append(lst[chunk_start:position + 1])
            chunk_start = position + 1
    # Remainder after the last delimiter (possibly empty).
    chunks.append(lst[chunk_start:len(lst) + 1])
    return chunks

In [18]:
# Sanity check: inspect the encoding of a short string truncated/padded to the article token budget.
tokenizer("test sentence asd asljdasd lkasd", max_length=512-prefix_size, truncation=True, padding='max_length')


{'input_ids': [794, 7142, 38, 26, 38, 40, 354, 7664, 26, 3, 40, 1258, 7, 26, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [19]:
def shift_seq_right(seq):
    """Shift a 1-D id sequence one position to the right.

    A single int32 zero is prepended and the last element is dropped, so the
    result has the same length as ``seq``.

    Args:
        seq: 1-D tensor of token ids (assumed int32 so it can be concatenated
            with the int32 zero — TODO confirm against callers).

    Returns:
        The shifted tensor.
    """
    # presumably 0 doubles as the decoder start token here — verify against the model config
    leading_zero = tf.zeros([1], dtype=tf.int32)
    without_last = seq[:-1]
    return tf.concat([leading_zero, without_last], axis=0)

def tokenize_highlights(text, max_length=None):
    """Tokenize a highlight and build the right-shifted copy of its ids.

    Args:
        text: Highlight (summary) text.
        max_length: Number of tokens the encoding is truncated/padded to.
            Defaults to ``MAX_HIGHLIGHT_LEN`` from the configuration cell
            instead of a second hard-coded copy of that value.

    Returns:
        A tuple ``(y, y_ids)``: the token ids of the highlight and the same
        ids shifted one position to the right by ``shift_seq_right``.
    """
    if max_length is None:
        # Resolved at call time so the configuration cell stays the single source of truth.
        max_length = MAX_HIGHLIGHT_LEN
    y = tokenizer(text, return_tensors="tf", max_length=max_length, truncation=True, padding='max_length').input_ids
    # The tokenizer returns a batch of one; drop the batch dimension.
    y = tf.squeeze(y)
    y_ids = shift_seq_right(y)

    return y, y_ids

In [35]:
def switch_first_five_sentences(x, num_sentences=5):
    """Encode an article and move its leading sentences to the end.

    The text is tokenized and truncated to ``MAX_ARTICLE_LEN - prefix_size``
    tokens, split into sentences at the token id of ".", and the first
    ``num_sentences`` sentences are appended behind the remaining ones.

    The reordering only touches real tokens: the end-of-sequence token is
    detached before the split and re-appended afterwards, and padding is added
    last. Padding before reordering would place the EOS/pad tokens in the
    middle of the sequence and leave the attention mask pointing at the wrong
    positions.

    Args:
        x: Article text.
        num_sentences: How many leading sentences are moved to the end.

    Returns:
        A tuple ``(input_ids, attention_mask)`` of two 1-D integer tensors of
        length ``MAX_ARTICLE_LEN - prefix_size``; the mask is 1 for real tokens
        and 0 for padding.
    """
    max_len = MAX_ARTICLE_LEN - prefix_size
    # Truncate only; padding is added after the sentences have been reordered.
    encode_x = tokenizer(x, max_length=max_len, truncation=True)
    token_ids = list(encode_x.input_ids)

    # Keep the end-of-sequence token out of the reordering.
    eos_id = tokenizer.eos_token_id
    ends_with_eos = bool(token_ids) and token_ids[-1] == eos_id
    if ends_with_eos:
        token_ids = token_ids[:-1]

    # 5 is the id this tokenizer assigns to "." (tokenizer.decode([5]) == '.').
    split_input_ids = split_at_values(token_ids, {5})
    list_ids = split_input_ids[num_sentences:] + split_input_ids[:num_sentences]
    ret = [token_id for sentence in list_ids for token_id in sentence]
    if ends_with_eos:
        ret.append(eos_id)

    # Build the mask from the reordered sequence so ids and mask stay aligned.
    attention_mask = [1] * len(ret)
    pad_len = max_len - len(ret)
    ret += [tokenizer.pad_token_id] * pad_len
    attention_mask += [0] * pad_len
    return tf.convert_to_tensor(ret), tf.convert_to_tensor(attention_mask)


# Encode every German/English example of the test split.
# Inputs: articles with their first five sentences moved to the end, plus masks.
# Targets: highlight ids and their right-shifted copies.
x_de_list = []
x_de_mask_list = []
x_en_list = []
x_en_mask_list = []
y_de_list = [] 
y_de_ids_list = []
y_en_list = [] 
y_en_ids_list = []
for x_de_i, y_de_i, x_en_i, y_en_i in zip(de_x, de_y, en_x, en_y):
    x_de, x_de_mask = switch_first_five_sentences(x_de_i)
    # The English input must be built from the English article (was x_de_i,
    # which silently stored the German article as the English input).
    x_en, x_en_mask = switch_first_five_sentences(x_en_i)
    y_de, y_de_ids = tokenize_highlights(y_de_i)
    y_en, y_en_ids = tokenize_highlights(y_en_i)
    
    x_de_list.append(x_de)
    x_de_mask_list.append(x_de_mask)
    x_en_list.append(x_en)
    x_en_mask_list.append(x_en_mask)

    y_de_list.append(y_de)
    y_de_ids_list.append(y_de_ids)
    y_en_list.append(y_en)
    y_en_ids_list.append(y_en_ids)

In [36]:
# One dataset element per example: a pair of 4-tuples
# (article ids, attention mask, highlight ids, shifted highlight ids) — German first, English second.
test_sueddeutsche_ds = tf.data.Dataset.from_tensor_slices(((x_de_list, x_de_mask_list, y_de_list, y_de_ids_list),(x_en_list, x_en_mask_list, y_en_list, y_en_ids_list)))

In [37]:
# Write the dataset to TFRecord files with the project's writer helper.
# NOTE(review): 20 is presumably the number of output files (the log shows indices 0-19) — confirm.
tf_record_writer = TfRecordWriter(20)
tf_record_writer.write_to_tfrecord_file(test_sueddeutsche_ds, "../../data/sueddeutsche_test_switch", "sueddeutsche_multilingual")

[9] Saved sueddeutsche_multilingual
[8] Saved sueddeutsche_multilingual
[10] Saved sueddeutsche_multilingual
[13] Saved sueddeutsche_multilingual
[16] Saved sueddeutsche_multilingual
[19] Saved sueddeutsche_multilingual
[4] Saved sueddeutsche_multilingual
[18] Saved sueddeutsche_multilingual
[3] Saved sueddeutsche_multilingual
[2] Saved sueddeutsche_multilingual
[14] Saved sueddeutsche_multilingual
[5] Saved sueddeutsche_multilingual
[11] Saved sueddeutsche_multilingual
[0] Saved sueddeutsche_multilingual
[17] Saved sueddeutsche_multilingual
[7] Saved sueddeutsche_multilingual
[6] Saved sueddeutsche_multilingual
[15] Saved sueddeutsche_multilingual
[1] Saved sueddeutsche_multilingual
[12] Saved sueddeutsche_multilingual


## Write Tokenized Version