# German Text Summary with the tranlated CNN Daily Mail Dataset
and T5 from Huggingface Pytorch

In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
from pathlib import Path
import torch
import re
import time

In [2]:
BATCH_SIZE = 16

SHUFFEL_SIZE = 1024

# device = "cpu"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

learning_rate = 3e-5

first_run = False 

In [3]:
print(device)

cuda:0


In [4]:
def write_data(iter_dataset, name, path="data/"):
    
    articles_file = Path(path + name + "/article").open("w")
    highlights_file = Path(path + name + "/highlights").open("w")

    for item in iter_dataset:
        articles_file.write(item["article"].decode("utf-8") + "\n")
        articles_file.flush()
        highlights_file.write(item["highlights"].decode("utf-8").replace("\n", " ") + "\n")
        highlights_file.flush()

if first_run:
    cnn_dailymail = tfds.load(name="cnn_dailymail")
    
    train_tfds = cnn_dailymail['train']
    test_tfds = cnn_dailymail['test']
    val_tfds = cnn_dailymail['validation']
    
    train_ds_iter = tfds.as_numpy(train_tfds)
    val_ds_iter = tfds.as_numpy(val_tfds)
    test_ds_iter = tfds.as_numpy(test_tfds)
    
    write_data(train_ds_iter, "train")
    write_data(test_ds_iter, "test")
    write_data(val_ds_iter, "val")

## Define Model

In [5]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)

task_specific_params = model.config.task_specific_params
if task_specific_params is not None:
    model.config.update(task_specific_params.get("summarization", {}))
    

optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate, weight_decay=0.0001)

## Read the tranlated German Dataset

In [6]:
name = "train"

article_path = "../data/%s/articles_german" % name
highlights_path = "../data/%s/highlights_german" % name

articles = [x.rstrip() for x in open(article_path).readlines()]
highlights = [x.rstrip() for x in open(highlights_path).readlines()]

len_articles = len(articles)

def get_dict_data(list_input):
    ret_dict = {}
    for input_item in list_input:
        data_point = input_item.split(";")
        data_id = data_point[0]
        data = ""
        for item in data_point[1:]:
            data += item
        ret_dict[int(data_id)] = data.strip()
    return ret_dict
    
articles = get_dict_data(articles)
highlights = get_dict_data(highlights)

In [7]:
cleaned_articles = []
cleaned_highlights = []

for i in range(len_articles):
    if i in articles.keys() and i in highlights.keys():
        cleaned_articles.append(articles[i])
        cleaned_highlights.append(highlights[i])
#     else:
#         print(i)

In [8]:
assert len(cleaned_articles) == len(cleaned_highlights)

val_size = int(len(cleaned_articles)*0.1)
train_x = cleaned_articles[val_size:] 
train_y = cleaned_highlights[val_size:] 
val_x = cleaned_articles[:val_size] 
val_y = cleaned_highlights[:val_size] 

## Show some Dataset Example 

In [9]:
import numpy as np
for i in range(5):
    rand_int = np.random.randint(len(train_x))
    print("\n--------")
    print(train_x[rand_int][:1500])
    print()
    print(train_y[rand_int])


--------
Von Helen Pow. VERÖFFENTLICHT:. 11: 09 EST, 23. Juli 2013. |. UPDATED:. 12: 20 EST, 23. Juli 2013. Tragisch: Paul Franklin Dart, rechts im Bild mit Freunden, versuchte den Schützen zu beruhigen, als er in den Kopf geschossen wurde. Ein verheirateter Armeeveteran wurde vor den Augen seiner Frau und seines Stiefsohnes während einer jährlichen Raftingtour mit der Familie am Samstag erschossen, nachdem ein Verwandter am Flussufer uriniert hatte, was den Grundbesitzer in Rage brachte. James Robert Crocker, 59, konfrontierte die Gruppe von Familie und Freunden mit einer 9 mm Handfeuerwaffe, nachdem sie eine kurze Pause an einer Kiesbar in Meramex eingelegt hatten, und einer der Nachtschwärmer ging in den Wald, um sich zu befreien. Nach einem kurzen Streit darüber, ob die Kiesbar privates oder öffentliches Eigentum war, schoss Crocker Paul Franklin Dart, 48, aus nur wenigen Metern Entfernung in den Kopf, während seine Frau Loretta und ihr Sohn Josh Kling, 24, entsetzt zuschauten. Da

## Define Pytorch Dataset

In [10]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, articles, highlights):
        self.x = articles
        self.y = highlights
        
    def __getitem__(self, index):
        x = tokenizer.encode_plus(model.config.prefix + self.transfrom(self.x[index]), max_length=512, return_tensors="pt", pad_to_max_length=True)
        y = tokenizer.encode(self.transfrom(self.y[index]), max_length=150, return_tensors="pt", pad_to_max_length=True)
        return x['input_ids'].view(-1), x['attention_mask'].view(-1), y.view(-1)
    
    @staticmethod
    def transfrom(x):
        x = x.lower()
        x = re.sub("'(.*)'", r"\1", x)
        return x
    
    def __len__(self):
        return len(self.x)

In [11]:
train_ds, val_ds = MyDataset(train_x, train_y), MyDataset(val_x, val_y)

In [12]:
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=BATCH_SIZE)
val_loader = torch.utils.data.DataLoader(val_ds, batch_size=BATCH_SIZE)
# test_loader = torch.utils.data.DataLoader(test_ds, batch_size=BATCH_SIZE)

## Define Step function

In [13]:
pad_token_id = tokenizer.pad_token_id
def step(inputs_ids, attention_mask, y):
    y_ids = y[:, :-1].contiguous()
    lm_labels = y[:, 1:].clone()
    lm_labels[y[:, 1:] == pad_token_id] = -100
    output = model(inputs_ids, attention_mask=attention_mask, decoder_input_ids=y_ids, lm_labels=lm_labels)
    return output[0] # loss

## Train

In [14]:
EPOCHS = 1
log_interval = 200
train_loss = []
val_loss = []
for epoch in range(EPOCHS):
    model.train() 
    start_time = time.time()
    for i, (inputs_ids, attention_mask, y) in enumerate(train_loader):
        inputs_ids = inputs_ids.to(device)
        attention_mask = attention_mask.to(device)
        y = y.to(device)
        
        
        optimizer.zero_grad()
        loss = step(inputs_ids, attention_mask, y)
        train_loss.append(loss.item())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
            
        if (i + 1) % log_interval == 0:
            with torch.no_grad():
                x, x_mask, y = next(iter(val_loader))
                x = x.to(device)
                x_mask = x_mask.to(device)
                y = y.to(device)
                
                v_loss = step(x, x_mask, y)
                v_loss = v_loss.item()
                
                
                elapsed = time.time() - start_time
                print('| epoch {:3d} | [{:5d}/{:5d}] | '
                  'ms/batch {:5.2f} | '
                  'loss {:5.2f} | val loss {:5.2f}'.format(
                    epoch, i, len(train_loader),
                    elapsed * 1000 / log_interval,
                    loss.item(), v_loss))
                start_time = time.time()
                val_loss.append(v_loss)
                
                

| epoch   0 | [  199/ 6053] | ms/batch 412.28 | loss  3.16 | val loss  3.19
| epoch   0 | [  399/ 6053] | ms/batch 409.88 | loss  2.98 | val loss  3.05
| epoch   0 | [  599/ 6053] | ms/batch 409.04 | loss  3.07 | val loss  2.99
| epoch   0 | [  799/ 6053] | ms/batch 411.56 | loss  3.05 | val loss  2.92
| epoch   0 | [  999/ 6053] | ms/batch 409.91 | loss  2.45 | val loss  2.89
| epoch   0 | [ 1199/ 6053] | ms/batch 412.41 | loss  2.58 | val loss  2.87
| epoch   0 | [ 1399/ 6053] | ms/batch 416.77 | loss  2.53 | val loss  2.84
| epoch   0 | [ 1599/ 6053] | ms/batch 419.44 | loss  2.44 | val loss  2.83
| epoch   0 | [ 1799/ 6053] | ms/batch 415.85 | loss  2.69 | val loss  2.78
| epoch   0 | [ 1999/ 6053] | ms/batch 415.14 | loss  2.66 | val loss  2.79
| epoch   0 | [ 2199/ 6053] | ms/batch 415.83 | loss  2.87 | val loss  2.77
| epoch   0 | [ 2399/ 6053] | ms/batch 408.96 | loss  2.88 | val loss  2.76
| epoch   0 | [ 2599/ 6053] | ms/batch 413.58 | loss  2.56 | val loss  2.77
| epoch   0 

## Evaluate

In [15]:
from rouge_score import rouge_scorer
from rouge_score import scoring

class RougeScore:
    '''
    mostly from https://github.com/google-research/text-to-text-transfer-transformer/blob/master/t5/evaluation/metrics.py 
    '''
    
    def __init__(self, score_keys=None)-> None:
        super().__init__()
        if score_keys is None:  
            self.score_keys = ["rouge1", "rouge2", "rougeLsum"]
        
        self.scorer = rouge_scorer.RougeScorer(self.score_keys)
        self.aggregator = scoring.BootstrapAggregator()
        
        
    @staticmethod
    def prepare_summary(summary):
            # Make sure the summary is not bytes-type
            # Add newlines between sentences so that rougeLsum is computed correctly.
            summary = summary.replace(" . ", " .\n")
            return summary
    
    def __call__(self, target, prediction):
        """Computes rouge score.''
        Args:
        targets: string
        predictions: string
        """

        target = self.prepare_summary(target)
        prediction = self.prepare_summary(prediction)
        
        self.aggregator.add_scores(self.scorer.score(target=target, prediction=prediction))

        return 
    
    def reset_states(self):
        self.rouge_list = []

    def result(self):
        result = self.aggregator.aggregate()
        
        for key in self.score_keys:
            score_text = "%s = %.2f, 95%% confidence [%.2f, %.2f]"%(
                key,
                result[key].mid.fmeasure*100,
                result[key].low.fmeasure*100,
                result[key].high.fmeasure*100
            )
            print(score_text)
        
        return {key: result[key].mid.fmeasure*100 for key in self.score_keys}

In [16]:
rouge_score = RougeScore()
predictions = []
for i, (input_ids, attention_mask, y) in enumerate(val_loader):
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    y = y.to(device)
        
    summaries = model.generate(input_ids=input_ids, attention_mask=attention_mask)
    pred = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]
    real = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in y]
    for pred_sent, real_sent in zip(pred, real):
        rouge_score(pred_sent, real_sent)
        predictions.append(str("pred sentence: " + pred_sent + "\n\n real sentence: " + real_sent))
    if i > 40:
        break
    
rouge_score.result()

rouge1 = 28.10, 95% confidence [27.34, 28.86]
rouge2 = 9.60, 95% confidence [9.00, 10.18]
rougeLsum = 17.42, 95% confidence [16.89, 18.02]


{'rouge1': 28.10321209302542,
 'rouge2': 9.599739806351863,
 'rougeLsum': 17.41800656285659}

In [17]:
for pred in predictions[:10]:
    print("------")
    print(pred)
    print("------")    

------
pred sentence: katholische diözese fargo in north dakota hat potenziell hunderte gemeindemitglieder in fargo, grand forks und jamestown den hepatitis-a-virus ausgesetzt. bischof john folda, leiterin des staatlichen immunisierungsprogramms, sagt, das risiko sei gering, aber die beamten halten es für wichtig, die menschen auf die mögliche exposition aufmerksam zu machen,

 real sentence: der bischof von north dakota, john folda, nimmt sich nach der diagnose eine auszeit. er hat sich die infektion durch kontaminiertes essen in italien zugezogen. kirchenmitglieder in fargo, grand forks und jamestown hätten sich anstecken können.
------
------
pred sentence: ralph mata war leutnant für innere angelegenheiten der miami-dade police department. drogenhandelsbehörden werfen die behörden vor, um einen mordkomplott zu planen und waffen zu kaufen. mata nutzte den angaben zufolge kontakte am flughafen, um die waffen auf reisen von miami in die dominikanische republik zu töten. es hat sich ge