In [10]:
from transformers import ( 
    T5Tokenizer
)

import re
import tensorflow as tf

import numpy as np

import pickle

# import python files
import sys
sys.path.append("../..")

from python_files.language_tokens import LanguageTokens

In [11]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [12]:
try:
    # Hide every GPU from TensorFlow, then verify none is still visible.
    tf.config.set_visible_devices([], 'GPU')
    visible_devices = tf.config.get_visible_devices()
    for device in visible_devices:
        assert device.device_type != 'GPU'
except (ValueError, RuntimeError) as err:
    # Only the two documented failure modes of set_visible_devices are treated as
    # best-effort (invalid device / runtime already initialized). A bare `except:`
    # would also hide KeyboardInterrupt and a failing assertion above.
    print("Invalid device or cannot modify virtual devices once initialized.", err)

tf.config.get_visible_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]

In [13]:
def transform(x):
    """Strip the leading field and the outer single quotes from one raw data line.

    The line is split on "; "; the first piece is discarded and the remaining
    pieces are re-joined with a single space (so any "; " inside the text
    becomes " "). Then the first and the last single quote on the line are
    removed, keeping everything between them.

    Args:
        x: One line of a data file, without its trailing newline.

    Returns:
        The cleaned text. No end-of-sequence token is appended.
    """
    _prefix, *pieces = x.split("; ")
    text = " ".join(pieces)
    # NOTE(review): the pattern is greedy and not anchored, so on a line without
    # wrapping quotes it removes the first and last apostrophe of the text itself
    # (e.g. "it's Merkel's" -> "its Merkels") - confirm this is intended.
    return re.sub("'(.*)'", r"\1", text)

def get_sueddeutsche_data(name, language):
    """Load one split of the sueddeutsche data as parallel lists.

    Reads ``articles_<language>_<name>`` and ``highlights_<language>_<name>``
    from ``../../data/sueddeutsche/``, strips trailing whitespace from every
    line and passes it through ``transform``.

    Args:
        name: Suffix of the file names (the notebook uses "train", "val", "test").
        language: Language part of the file names (the notebook uses "de", "en").

    Returns:
        A tuple ``(articles, highlights)`` of two equally long lists of strings.

    Raises:
        AssertionError: If the two files do not contain the same number of lines.
    """
    article_path = "../../data/sueddeutsche/articles_{}_{}".format(language, name)
    highlights_path = "../../data/sueddeutsche/highlights_{}_{}".format(language, name)

    # Context managers close the file handles deterministically; the previous
    # `open(...).readlines()` left both files open until garbage collection.
    with open(article_path) as article_file:
        articles = [transform(line.rstrip()) for line in article_file]
    with open(highlights_path) as highlights_file:
        highlights = [transform(line.rstrip()) for line in highlights_file]

    assert len(articles) == len(highlights), "sueddeutsche articles:{} highlights:{}".format(len(articles),
                                                                                             len(highlights))
    return articles, highlights

In [14]:
# Load the German and English "train" splits as (articles, highlights) pairs.
de_x, de_y = get_sueddeutsche_data(name="train", language="de")
en_x, en_y = get_sueddeutsche_data(name="train", language="en")

In [15]:
de_y[4]

'Schwarz-Gelb ist noch nicht jene Traumkoalition, die sich die Beteiligten erhofft hatten. Welche Minister aus Merkels Regierungsmannschaft leisten trotzdem gute Arbeit? Und welche sind Fehlbesetzungen? Stimmen Sie ab!'

In [16]:
en_y[4]

'Black and yellow is not yet the dream coalition that those involved had hoped for. Which ministers in Merkels government team are still doing a good job? And what are miscues? Vote! In fact, its not.'

In [17]:
de_y[100]

'Microsoft-Deutschland-Chef Berg über den hiesigen Standort, die Bedeutung der Computer-Messe Cebit - und seine Jogging-Erlebnisse mit Konzernchef Ballmer.'

In [18]:
en_y[100]

"Microsoft Germany CEO Berg on the local location, the importance of the Cebit computer fair - and his jogging experiences with CEO Ballmer. In fact, it's not."

In [19]:
len(" In fact, it's not."), len(" In fact, its not.")

(19, 18)

In [20]:
en_y[100][-19:], en_y[4][-18:]

(" In fact, it's not.", ' In fact, its not.')

### Count phrase

In [21]:
# Tally how many English highlights end with each spelling of the trailing sentence.
count_1 = sum(1 for highlight in en_y if highlight.endswith(" In fact, it's not."))
count_2 = sum(1 for highlight in en_y if highlight.endswith(" In fact, its not."))

print("{} of {} highlight end with the phrase \"In fact, it's not.\"".format(count_1, len(en_y)))
print("{} of {} highlight end with the phrase \"In fact, its not.\"".format(count_2, len(en_y)))

168432 of 220887 highlight end with the phrase "In fact, it's not."
46870 of 220887 highlight end with the phrase "In fact, its not."


### Count beginnings in highlights

In [26]:
# Count the articles whose highlight equals the article's first two ". "-separated
# chunks re-joined with ". ".
# NOTE(review): the re-joined text has no final period and en_y still carries the
# trailing "In fact, ..." sentence at this point, so exact matches are rare - confirm
# this comparison measures what is intended.
count = sum(
    1
    for idx, article in enumerate(en_x)
    if en_y[idx] == ". ".join(article.split(". ")[:2])
)
count

29

In [13]:
data_list = ["train", "val", "test"]
# The two spellings of the sentence that many English lines end with
# (19 and 18 characters long respectively).
_TRAILING_PHRASES = (" In fact, it's not.", " In fact, its not.")


def _strip_trailing_phrase(line):
    """Return ``line`` without a trailing phrase from ``_TRAILING_PHRASES``, else unchanged."""
    for phrase in _TRAILING_PHRASES:
        if line.endswith(phrase):
            # Cut exactly the matched phrase. A fixed 19-character cut removed one
            # character of real text whenever the 18-character spelling matched.
            return line[:-len(phrase)]
    return line


def _write_indexed_lines(path, lines):
    """Write ``lines`` to ``path`` as ``<index>; <line>`` rows, replacing any existing file."""
    # Mode "w" keeps the cell idempotent; appending duplicated every row (with
    # restarted indices) each time the cell was run again.
    with open(path, "w") as file:
        for i, line in enumerate(lines):
            file.write(str(i) + "; " + line + "\n")


def remove_phrase(name):
    """Write cleaned copies of the English articles and highlights of one split.

    Loads the English data for ``name`` via ``get_sueddeutsche_data``, removes a
    trailing " In fact, it's not." / " In fact, its not." from every line that
    ends with one, and writes the result to ``highlights_en_<name>_cleaned`` and
    ``articles_en_<name>_cleaned`` in ``../../data/sueddeutsche/``.

    Args:
        name: Split suffix passed on to ``get_sueddeutsche_data`` and used in the
            output file names.

    Returns:
        None. The cleaned data is only written to disk.
    """
    x, y = get_sueddeutsche_data(name, "en")

    new_y = [_strip_trailing_phrase(line) for line in y]
    new_x = [_strip_trailing_phrase(line) for line in x]

    _write_indexed_lines("../../data/sueddeutsche/highlights_{}_{}_cleaned".format("en", name), new_y)
    _write_indexed_lines("../../data/sueddeutsche/articles_{}_{}_cleaned".format("en", name), new_x)
            
for item in data_list:
    remove_phrase(item)