# Installation of required packages

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd /content/drive/My Drive/GeSumGenEval

/content/drive/.shortcut-targets-by-id/1PcWXs_So5sTaP0wBAR77_ORSfd4aFtHq/GeSumGenEval


In [3]:
# install all the required packages first after a start of every new collab session
!pip install -r requirements.txt



In [4]:
import sys
import nltk

print(sys.executable)

nltk.download('punkt')
#nltk.download('stopwords')

/usr/bin/python3


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Data Cleaning

In [5]:
import re
import string
#from nltk.corpus import stopwords

punctuations = string.punctuation.replace('.', '')
#stop_words = stopwords.words("german")
def clean_text(x):
    # Lowercase the text
    x = x.strip().lower()
    # Remove stop words
    #x = ' '.join([word for word in x.split(' ') if word not in stop_words])
    # Remove unicode characters
    #x = x.encode('ascii', 'ignore').decode()
    # Remove URL
    x = re.sub(r'https*\S+', ' ', x)
    # Remove mentions
    #x = re.sub(r'@\S+', ' ', x)
    # Remove Hashtags
    #x = re.sub(r'#\S+', ' ', x)
    # Remove ticks and the next character
    #x = re.sub(r'\'\w+', '', x)
    # Remove punctuations
    x = re.sub('[%s]' % re.escape(punctuations), '', x)
    # Remove numbers
    #x = re.sub(r'\w*\d+\w*', '', x)
    # Replace the over spaces
    x = re.sub(r'\s{2,}', ' ', x)
    return x

# Summary Generation

In [6]:
import nltk
import random

def get_random_summary(source: str, num_sent=3, language='german') -> str:
    sentences = nltk.sent_tokenize(source, language)

    return " ".join(random.sample(sentences, num_sent))   

def get_lead_summary(source: str, num_sent=3, language='german') -> str:
    sentences = nltk.sent_tokenize(source, language)

    return " ".join(sentences[:3])

from summa.summarizer import summarize

def get_textrank_summary(source: str, ratio: float, language='german') -> str:
    # By default ratio value is 0.2.
    return summarize(source, language=language, ratio=ratio)

from itertools import combinations
def get_oracle_summary(source: str, reference: str, num_sent=3, language='german') -> str:
    sentences = nltk.sent_tokenize(source, language)
    max_score = 0
    oracle_summary = ""

    candidates = combinations(sentences, num_sent)
    for summary in candidates:
        summary = " ".join(summary)
        score = get_rouge([summary], [reference], False)[0]['rouge-l']['f']
        if score > max_score:
            max_score = score
            oracle_summary = summary

    return oracle_summary

# Summary Evaluation

In [7]:
from rouge import Rouge
from importlib import reload
import src.gerouge as gerouge
from summ_eval.bleu_metric import BleuMetric
from summ_eval.meteor_metric import MeteorMetric
from summ_eval.bert_score_metric import BertScoreMetric

gerouge = reload(gerouge)

def get_rouge(hypothesis, references, avg = True, language='german'):
    if language == 'german':
        rouge = gerouge.GeRouge(minimal_mode=True)
    else:
        rouge = Rouge()

    rouge_scores = rouge.get_scores(hypothesis, references, avg, True)
    if avg:
        return {k: v['f'] for k, v in rouge_scores.items()}
    else:
        return rouge_scores

def get_bleu(hypothesis, references):
    metric = BleuMetric(force=True)
    return metric.evaluate_batch(hypothesis, references)['bleu']

def get_meteor(hypothesis, references):
    metric = MeteorMetric()
    return metric.evaluate_batch(hypothesis, references)['meteor']

def get_bert_score(hypothesis, references):
    metric = BertScoreMetric(lang='de', model_type='dbmdz/bert-base-german-cased', num_layers=9, verbose=False, idf=True, rescale_with_baseline=False)
    return metric.evaluate_batch(hypothesis, references)['bert_score_f1']

Downloading the meteor jar


# Data Preprocessing for BertSum

In [12]:
import nltk
import json
import pandas as pd

def data_prep_for_BertSum(dataset: pd.DataFrame, language: str, save_path: str, corpus_type: str) -> None:
    print(f"Sentence splitting, tokenizing and converting '{corpus_type}' split to json...")
    dataset_json = []
    p_ct = 0
    shard_size = 2000
    for index, row in dataset.iterrows():
        src_tokens = []
        tgt_tokens = []

        src_sentences = nltk.sent_tokenize(row['text'], language)
        for sent in src_sentences:
            src_tokens.append(nltk.word_tokenize(sent, language))

        tgt_sentences = nltk.sent_tokenize(row['summary'], language)
        for sent in tgt_sentences:
            tgt_tokens.append(nltk.word_tokenize(sent, language))

        dataset_json.append({'src': src_tokens, 'tgt': tgt_tokens})
        if (len(dataset_json) >= shard_size):
                pt_file = "{:s}/{:s}.{:d}.json".format(save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    # save.write('\n'.join(dataset_json))
                    save.write(json.dumps(dataset_json))
                    p_ct += 1
                    dataset_json = []

    if (len(dataset_json) > 0):
        pt_file = "{:s}/{:s}.{:d}.json".format(save_path, corpus_type, p_ct)
        with open(pt_file, 'w') as save:
            # save.write('\n'.join(dataset_json))
            save.write(json.dumps(dataset_json))
            p_ct += 1
            dataset_json = []

# MLSUM

In [13]:
# Let's import the library. We typically only need at most four methods:
from datasets import list_datasets, list_metrics, load_dataset, load_metric
from pprint import pprint
import pandas as pd

def load_mlsum_to_csv(corpus_type: str) -> pd.DataFrame:
    # Downloading and loading a dataset
    hf_split: str = corpus_type
    if hf_split == "valid":
        hf_split = "validation"
    mlsum_dataset = load_dataset('mlsum', 'de', split=hf_split)

    # Saving dataframe in the form of csv
    df = pd.DataFrame(mlsum_dataset, columns=["text","summary"])
    df.to_csv(f"data/mlsum/{corpus_type}.csv", index=False)

    return df

for corpus_type in ['train', 'valid', 'test']:
    mlsum_dataset = load_mlsum_to_csv(corpus_type)
    data_prep_for_BertSum(mlsum_dataset, 'german', "json_data/mlsum", corpus_type)

Reusing dataset mlsum (/root/.cache/huggingface/datasets/mlsum/de/1.0.0/fa51ffa9847464afce0f114ce70ab956e57905627bb24435851ddb91312a2238)


Sentence splitting, tokenizing and converting 'train' split to json...


Reusing dataset mlsum (/root/.cache/huggingface/datasets/mlsum/de/1.0.0/fa51ffa9847464afce0f114ce70ab956e57905627bb24435851ddb91312a2238)


Sentence splitting, tokenizing and converting 'valid' split to json...


Reusing dataset mlsum (/root/.cache/huggingface/datasets/mlsum/de/1.0.0/fa51ffa9847464afce0f114ce70ab956e57905627bb24435851ddb91312a2238)


Sentence splitting, tokenizing and converting 'test' split to json...


In [11]:
import itertools
#merged_mlsum_dataset = itertools.chain(mlsum_dataset['train'], mlsum_dataset['test'], mlsum_dataset['validation'])
mlsum_dataset = pd.read_csv("data/mlsum/test.csv").iterrows()
mlsum_src, mlsum_rnd_sum, mlsum_lead_sum, mlsum_textrank_sum, mlsum_tgt = map(list,zip(*[(
                                                      row['text'],
                                                      get_random_summary(row['text']),
                                                      get_lead_summary(row['text']),
                                                      get_textrank_summary(row['text'], 0.05),
                                                      row['summary']
                                                    ) for index, row in mlsum_dataset]))
print(len(mlsum_src))
print(len(mlsum_rnd_sum))
print(len(mlsum_lead_sum))
print(len(mlsum_textrank_sum))
print(len(mlsum_tgt))

10701
10701
10701
10701
10701


In [8]:
import pandas as pd

mlsum_oracle_sum = pd.read_csv("results/mlsum/oracle_step0_hypo.csv")['hypothesis'].fillna('').tolist()
mlsum_oracle_tgt = pd.read_csv("results/mlsum/oracle_step0_ref.csv")['references'].fillna('').tolist()
mlsum_bertsum_sum = pd.read_csv("results/mlsum/bertsum_step50000_hypo.csv")['hypothesis'].fillna('').tolist()
mlsum_bertsum_tgt = pd.read_csv("results/mlsum/bertsum_step50000_ref.csv")['references'].fillna('').tolist()

In [None]:
oracle_rouge = get_rouge(mlsum_oracle_sum, mlsum_oracle_tgt)
oracle_bleu = get_bleu(mlsum_oracle_sum, mlsum_oracle_tgt)
oracle_meteor = get_meteor(mlsum_oracle_sum, mlsum_oracle_tgt)
oracle_bert_score = get_bert_score(mlsum_oracle_sum, mlsum_oracle_tgt)
print("Done")

In [16]:
bertsum_rouge = get_rouge(mlsum_bertsum_sum, mlsum_bertsum_tgt)
bertsum_bleu = get_bleu(mlsum_bertsum_sum, mlsum_bertsum_tgt)
bertsum_meteor = get_meteor(mlsum_bertsum_sum, mlsum_bertsum_tgt)
bertsum_bert_score = get_bert_score(mlsum_bertsum_sum, mlsum_bertsum_tgt)
print("Done")

hash_code: bert-base-german-dbmdz-cased_L9_idf_version=0.3.9(hug_trans=4.5.1)
Done


In [17]:
rnd_rouge = get_rouge(mlsum_rnd_sum, mlsum_tgt)
rnd_bleu = get_bleu(mlsum_rnd_sum, mlsum_tgt)
rnd_meteor = get_meteor(mlsum_rnd_sum, mlsum_tgt)
rnd_bert_score = get_bert_score(mlsum_rnd_sum, mlsum_tgt)
print("Done")

hash_code: bert-base-german-dbmdz-cased_L9_idf_version=0.3.9(hug_trans=4.5.1)
Done


In [18]:
lead_rouge =  get_rouge(mlsum_lead_sum, mlsum_tgt)
lead_bleu = get_bleu(mlsum_lead_sum, mlsum_tgt)
lead_meteor = get_meteor(mlsum_lead_sum, mlsum_tgt)
lead_bert_score = get_bert_score(mlsum_lead_sum, mlsum_tgt)
print("Done")

hash_code: bert-base-german-dbmdz-cased_L9_idf_version=0.3.9(hug_trans=4.5.1)
Done


In [None]:
tr_rouge = get_rouge(mlsum_textrank_sum, mlsum_tgt)
tr_bleu = get_bleu(mlsum_textrank_sum, mlsum_tgt)
tr_meteor = get_meteor(mlsum_textrank_sum, mlsum_tgt)
tr_bert_score = get_bert_score(mlsum_textrank_sum, mlsum_tgt)
print("Done")

In [20]:
import pandas as pd

mlsum_eval_df = pd.DataFrame([
    ["Random-3",rnd_rouge['rouge-1'],rnd_rouge['rouge-2'],rnd_rouge['rouge-l'],rnd_bleu,rnd_meteor,rnd_bert_score],
    ["Lead-3",lead_rouge['rouge-1'],lead_rouge['rouge-2'],lead_rouge['rouge-l'],lead_bleu,lead_meteor,lead_bert_score],
    ["TextRank",tr_rouge['rouge-1'],tr_rouge['rouge-2'],tr_rouge['rouge-l'],tr_bleu,tr_meteor,tr_bert_score],
    ["Oracle",oracle_rouge['rouge-1'],oracle_rouge['rouge-2'],oracle_rouge['rouge-l'],oracle_bleu,oracle_meteor,oracle_bert_score],
    ["BertSum",bertsum_rouge['rouge-1'],bertsum_rouge['rouge-2'],bertsum_rouge['rouge-l'],bertsum_bleu,bertsum_meteor,bertsum_bert_score]
], columns=["Summary","ROUGE-1","ROUGE-2","ROUGE-L","BLEU","METEOR","BERT-Score"])

print(mlsum_eval_df.head(5))
mlsum_eval_df.to_csv("results/mlsum/eval.csv", index=False)

    Summary   ROUGE-1   ROUGE-2   ROUGE-L       BLEU    METEOR  BERT-Score
0  Random-3  0.142578  0.051765  0.125814   3.357033  0.103051    0.565692
1    Lead-3  0.366559  0.276914  0.330138  17.374910  0.238569    0.668897
2  TextRank  0.192702  0.077680  0.162778   4.427745  0.090128    0.453015
3    Oracle  0.553824  0.427898  0.468219  40.446977  0.312505    0.694175
4   BertSum  0.395197  0.271116  0.274531  19.787231  0.274527    0.659321


In [22]:
import pandas as pd

mlsum_eval_df = pd.read_csv("results/mlsum/eval.csv")
mlsum_eval_new_df = pd.DataFrame([
    ["Oracle",oracle_rouge['rouge-1'],oracle_rouge['rouge-2'],oracle_rouge['rouge-l'],oracle_bleu,oracle_meteor,oracle_bert_score],
    ["BertSum",bertsum_rouge['rouge-1'],bertsum_rouge['rouge-2'],bertsum_rouge['rouge-l'],bertsum_bleu,bertsum_meteor,bertsum_bert_score]
], columns=["Summary","ROUGE-1","ROUGE-2","ROUGE-L","BLEU","METEOR","BERT-Score"])
mlsum_eval_df = mlsum_eval_df.append(mlsum_eval_new_df, ignore_index=True)

print(mlsum_eval_df.head(5))
mlsum_eval_df.to_csv("results/mlsum/eval.csv", index=False)

    Summary   ROUGE-1   ROUGE-2   ROUGE-L       BLEU    METEOR  BERT-Score
0  Random-3  0.146089  0.049990  0.107071   3.645260  0.098402    0.544482
1    Lead-3  0.359463  0.263057  0.282831  17.080397  0.228436    0.654696
2  TextRank  0.194648  0.074695  0.146609   4.922404  0.087147    0.454724
3    Oracle  0.553824  0.427898  0.468219  40.446977  0.312505    0.708408
4   BertSum  0.395197  0.271116  0.274531  19.787231  0.274527    0.681540


# GeWiki

In [21]:
# fetch GeWiki data splits from their github repo: https://github.com/domfr/GeWiki

# Uncomment below lines, to fetch the GeWiki data, unzipping it, and merging mutiple files into one based on "src" or "tgt" for train, eval and test splits

#!wget -nv -i data/gewiki_urls.txt -O data/gewiki/gewiki.zip
#!unzip data/gewiki/gewiki.zip -d data/gewiki/
#!awk 'BEGINFILE {print "[SEP]"}{print}' data/gewiki/test/*.src > data/gewiki/test_src.txt
#!awk 'BEGINFILE {print "[SEP]"}{print}' data/gewiki/test/*.tgt > data/gewiki/test_tgt.txt
#!awk 'BEGINFILE {print "[SEP]"}{print}' data/gewiki/eval/*.src > data/gewiki/validation_src.txt
#!awk 'BEGINFILE {print "[SEP]"}{print}' data/gewiki/eval/*.tgt > data/gewiki/validation_tgt.txt
#!awk 'BEGINFILE {print "[SEP]"}{print}' data/gewiki/train/*.src > data/gewiki/train_src.txt
#!awk 'BEGINFILE {print "[SEP]"}{print}' data/gewiki/train/*.tgt > data/gewiki/train_tgt.txt

from collections import defaultdict
from tqdm import tqdm
import pandas as pd

def merge_src_tgt_to_csv(num_of_files: int, csv_name: str) -> None:
    results = defaultdict(list)

    with open(f"data/gewiki/{csv_name}_src.txt", "r") as src:
        src = src.read()
    with open(f"data/gewiki/{csv_name}_tgt.txt", "r") as tgt:
        tgt = tgt.read()

    src_list = src.split("[SEP]")
    tgt_list = tgt.split("[SEP]")
    for i in tqdm(range(1, num_of_files + 1)):
        results["text"].append(src_list[i])
        results["summary"].append(tgt_list[i])
    df = pd.DataFrame(results)
    df.to_csv(f"data/gewiki/{csv_name}.csv", False)

# Creating Train CSV
# merge_src_tgt_to_csv(220000, "train")

# Creating Eval CSV
# merge_src_tgt_to_csv(10000, "valid")

# Creating Test CSV
# merge_src_tgt_to_csv(10000, "test")

for corpus_type in ['train', 'valid', 'test']:
    gewiki_dataset = pd.read_csv(f"data/gewiki/{corpus_type}.csv")
    data_prep_for_BertSum(gewiki_dataset, 'german', "json_data/gewiki", corpus_type)

Sentence splitting, tokenizing and converting 'train' split to json...
Sentence splitting, tokenizing and converting 'valid' split to json...
Sentence splitting, tokenizing and converting 'test' split to json...


In [22]:
import pandas as pd

# generating summaries only for the test set
gewiki_dataset = pd.read_csv("data/gewiki/test.csv").iterrows()
gewiki_src, gewiki_rnd_sum, gewiki_lead_sum, gewiki_textrank_sum, gewiki_tgt = map(list,zip(*[(
                                                      row['text'],
                                                      get_random_summary(row['text']),
                                                      get_lead_summary(row['text']),
                                                      get_textrank_summary(row['text'], 0.06),
                                                      row['summary']
                                                    ) for index, row in gewiki_dataset]))
print(len(gewiki_src))
print(len(gewiki_rnd_sum))
print(len(gewiki_lead_sum))
print(len(gewiki_textrank_sum))
print(len(gewiki_tgt))

10000
10000
10000
10000
10000


In [23]:
import pandas as pd

gewiki_oracle_sum = pd.read_csv("results/gewiki/oracle_step0_hypo.csv")['hypothesis'].fillna('').tolist()
gewiki_oracle_tgt = pd.read_csv("results/gewiki/oracle_step0_ref.csv")['references'].fillna('').tolist()
gewiki_bertsum_sum = pd.read_csv("results/gewiki/bertsum_step7000_hypo.csv")['hypothesis'].fillna('').tolist()
gewiki_bertsum_tgt = pd.read_csv("results/gewiki/bertsum_step7000_ref.csv")['references'].fillna('').tolist()

FileNotFoundError: ignored

In [None]:
oracle_rouge = get_rouge(gewiki_oracle_sum, gewiki_oracle_tgt)
oracle_bleu = get_bleu(gewiki_oracle_sum, gewiki_oracle_tgt)
oracle_meteor = get_meteor(gewiki_oracle_sum, gewiki_oracle_tgt)
oracle_bert_score = get_bert_score(gewiki_oracle_sum, gewiki_oracle_tgt)
print("Done")

In [None]:
bertsum_rouge = get_rouge(gewiki_bertsum_sum, gewiki_bertsum_tgt)
bertsum_bleu = get_bleu(gewiki_bertsum_sum, gewiki_bertsum_tgt)
bertsum_meteor = get_meteor(gewiki_bertsum_sum, gewiki_bertsum_tgt)
bertsum_bert_score = get_bert_score(gewiki_bertsum_sum, gewiki_bertsum_tgt)
print("Done")

In [None]:
rnd_rouge = get_rouge(gewiki_rnd_sum, gewiki_tgt)
rnd_bleu = get_bleu(gewiki_rnd_sum, gewiki_tgt)
rnd_meteor = get_meteor(gewiki_rnd_sum, gewiki_tgt)
rnd_bert_score = get_bert_score(gewiki_rnd_sum, gewiki_tgt)
print("Done")

In [None]:
lead_rouge = get_rouge(gewiki_lead_sum, gewiki_tgt)
lead_bleu = get_bleu(gewiki_lead_sum, gewiki_tgt)
lead_meteor = get_meteor(gewiki_lead_sum, gewiki_tgt)
lead_bert_score = get_bert_score(gewiki_lead_sum, gewiki_tgt)
print("Done")

In [None]:
tr_rouge = get_rouge(gewiki_textrank_sum, gewiki_tgt)
tr_bleu = get_bleu(gewiki_textrank_sum, gewiki_tgt)
tr_meteor = get_meteor(gewiki_textrank_sum, gewiki_tgt)
tr_bert_score = get_bert_score(gewiki_textrank_sum, gewiki_tgt)
print("Done")

In [None]:
import pandas as pd

gewiki_eval_df = pd.DataFrame([
    ["Random-3",rnd_rouge['rouge-1'],rnd_rouge['rouge-2'],rnd_rouge['rouge-l'],rnd_bleu,rnd_meteor,rnd_bert_score],
    ["Lead-3",lead_rouge['rouge-1'],lead_rouge['rouge-2'],lead_rouge['rouge-l'],lead_bleu,lead_meteor,lead_bert_score],
    ["TextRank",tr_rouge['rouge-1'],tr_rouge['rouge-2'],tr_rouge['rouge-l'],tr_bleu,tr_meteor,tr_bert_score],
    ["Oracle",oracle_rouge['rouge-1'],oracle_rouge['rouge-2'],oracle_rouge['rouge-l'],oracle_bleu,oracle_meteor,oracle_bert_score],
    ["BertSum",bertsum_rouge['rouge-1'],bertsum_rouge['rouge-2'],bertsum_rouge['rouge-l'],bertsum_bleu,bertsum_meteor,bertsum_bert_score]
], columns=["Summary","ROUGE-1","ROUGE-2","ROUGE-L","BLEU","METEOR","BERT-Score"])

print(gewiki_eval_df.head(5))
gewiki_eval_df.to_csv("results/gewiki/eval.csv", index=False)

In [None]:
import pandas as pd

gewiki_eval_df = pd.read_csv("results/gewiki/eval.csv")
gewiki_eval_new_df = pd.DataFrame([
    ["Oracle",oracle_rouge['rouge-1'],oracle_rouge['rouge-2'],oracle_rouge['rouge-l'],oracle_bleu,oracle_meteor,oracle_bert_score],
    ["BertSum",bertsum_rouge['rouge-1'],bertsum_rouge['rouge-2'],bertsum_rouge['rouge-l'],bertsum_bleu,bertsum_meteor,bertsum_bert_score]
], columns=["Summary","ROUGE-1","ROUGE-2","ROUGE-L","BLEU","METEOR","BERT-Score"])
gewiki_eval_df = gewiki_eval_df.append(gewiki_eval_new_df, ignore_index=True)

print(gewiki_eval_df.head(5))
gewiki_eval_df.to_csv("results/gewiki/eval.csv", index=False)