# Installation of required packages

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper) so the project folder can be reached.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the project folder: later cells use relative paths such as data/, results/ and models/.
cd /content/drive/My Drive/GeSumGenEval

/content/drive/.shortcut-targets-by-id/1PcWXs_So5sTaP0wBAR77_ORSfd4aFtHq/GeSumGenEval


In [None]:
# Install all required packages first, at the start of every new Colab session.
!pip install -r requirements.txt

In [None]:
import sys
import nltk

# Show which Python interpreter the kernel is running on.
print(sys.executable)

# Download the NLTK resources: the 'punkt' tokenizer models and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')

/usr/bin/python3
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Capture the output lines of `nvidia-smi` and join them into a single string.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# The word 'failed' in the output is treated as "no GPU attached to this runtime".
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, 
and then re-execute this cell.


# Data Cleaning

In [None]:
import re
import string
#from nltk.corpus import stopwords

# Every ASCII punctuation character except the full stop, which is kept in cleaned text.
punctuations = string.punctuation.replace('.', '')


def clean_text(x):
    """Normalise a raw text string.

    Steps, in order: lowercase, replace URLs (anything starting with
    ``http``/``https``) by a blank, delete all ASCII punctuation except ``.``,
    collapse runs of two or more whitespace characters into one blank, and
    finally strip leading/trailing whitespace.

    Stop words, non-ASCII characters, mentions, hashtags and numbers are
    not removed.

    Args:
        x: The text to clean.

    Returns:
        The cleaned text.
    """
    x = x.lower()
    # Remove URLs
    x = re.sub(r'https*\S+', ' ', x)
    # Remove punctuation (full stops are kept, see `punctuations`)
    x = re.sub('[%s]' % re.escape(punctuations), '', x)
    # Collapse repeated whitespace
    x = re.sub(r'\s{2,}', ' ', x)
    # Strip last: removing a URL or punctuation at either end of the text
    # would otherwise leave a dangling blank behind.
    return x.strip()

# Summary Generation

In [None]:
import nltk
import random

def get_random_summary(source: str, num_sent=3, language='german') -> str:
    """Build a baseline summary from randomly chosen sentences of `source`.

    Args:
        source: The document text.
        num_sent: Number of sentences to draw; if the document has fewer
            sentences, all of them are used.
        language: Language passed to the NLTK sentence tokenizer.

    Returns:
        The drawn sentences joined by newlines, in the order produced by
        ``random.sample`` (not necessarily document order).
    """
    sentences = nltk.sent_tokenize(source, language)
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the sample size for short documents.
    sample_size = min(num_sent, len(sentences))

    return "\n".join(random.sample(sentences, sample_size))

def get_lead_summary(source: str, num_sent=3, language='german') -> str:
    """Build a lead baseline summary from the first sentences of `source`.

    Args:
        source: The document text.
        num_sent: Number of leading sentences to keep.
        language: Language passed to the NLTK sentence tokenizer.

    Returns:
        The first `num_sent` sentences joined by newlines (fewer if the
        document is shorter).
    """
    sentences = nltk.sent_tokenize(source, language)

    # Honour num_sent instead of a hard-coded 3.
    return "\n".join(sentences[:num_sent])

from summa.summarizer import summarize

def get_textrank_summary(source: str, ratio: float, language='german') -> str:
    """Summarise `source` with summa's `summarize` and put one sentence per line.

    Args:
        source: The document text.
        ratio: Passed to `summarize` as its `ratio` argument.
        language: Used for both `summarize` and the NLTK sentence tokenizer.

    Returns:
        The sentences of the extracted summary joined by newlines.
    """
    extracted = summarize(source, language=language, ratio=ratio)
    return "\n".join(nltk.sent_tokenize(extracted, language))

def get_text_with_breaks(reference: str, language='german') -> str:
    """Return `reference` with every sentence on its own line.

    The text is split with the NLTK sentence tokenizer for `language` and the
    resulting sentences are joined by newline characters.
    """
    return "\n".join(nltk.sent_tokenize(reference, language))

def get_word_len(source: str, language='german') -> int:
    """Return the number of word tokens in `source`.

    Args:
        source: The text to measure.
        language: Language passed to the NLTK word tokenizer.

    Returns:
        The length of the token list produced by `nltk.word_tokenize`.
    """
    # Use the word tokenizer: the sentence tokenizer used previously made this
    # function return a sentence count despite its name.
    words = nltk.word_tokenize(source, language)

    return len(words)

from itertools import combinations
def get_oracle_summary(source: str, reference: str, num_sent=3, language='german') -> str:
    """Pick the `num_sent`-sentence extract of `source` with the best ROUGE-L F score.

    Every combination of `num_sent` sentences is scored against `reference`
    with `get_rouge`, so the number of candidates grows combinatorially with
    the number of sentences in `source`.

    Args:
        source: The document text.
        reference: The gold summary to score candidates against.
        num_sent: Number of sentences per candidate summary.
        language: Language for sentence splitting and for `get_rouge`.

    Returns:
        The best candidate with sentences joined by newlines, or an empty
        string when no candidate scores above 0 (including the case where the
        document has fewer than `num_sent` sentences).
    """
    sentences = nltk.sent_tokenize(source, language)
    max_score = 0
    oracle_summary = ""

    for candidate_sentences in combinations(sentences, num_sent):
        candidate = "\n".join(candidate_sentences)
        # With avg=False, get_rouge returns an iterable of three per-pair
        # score lists (ROUGE-1, ROUGE-2, ROUGE-L); it cannot be subscripted
        # directly, so materialise it first.
        rouge_columns = list(get_rouge([candidate], [reference], avg=False, language=language))
        if len(rouge_columns) < 3 or not rouge_columns[2]:
            # The pair was dropped by the scorer (e.g. empty reference).
            continue
        score = rouge_columns[2][0]
        if score > max_score:
            max_score = score
            oracle_summary = candidate

    return oracle_summary

# Summary Evaluation

## Evaluation metrics

In [None]:
#! echo $PYTHONPATH
%env PYTHONPATH="/env/python:/usr/local/lib/python3.7/dist-packages/summ_eval"
#! echo $PYTHONPATH

from rouge import Rouge
from importlib import reload
import src.gerouge as gerouge
from summ_eval.bleu_metric import BleuMetric
from summ_eval.meteor_metric import MeteorMetric
from summ_eval.bert_score_metric import BertScoreMetric
from summ_eval.mover_score_metric import MoverScoreMetric
import summ_eval.supert_metric as supert_metric
from summ_eval.sentence_transformers import SentenceTransformer
from blanc import BlancTune
from collections import Counter
import os

# Re-encode the stop-word list: read the Latin-1 file with one entry per line
# and write the entries as a single space-separated UTF-8 line.
# data/spaced_stop_words.txt is read again by get_mover_score and by the
# Jensen-Shannon cell.
spaced_stop_words = ""
with open('data/smart_stop.txt', 'r', encoding='latin-1') as f:
    stop_words = f.read().splitlines()
    spaced_stop_words = " ".join(stop_words)
    #spaced_stop_words = spaced_stop_words.decode('latin-1').encode('utf-8')

with open('data/spaced_stop_words.txt', 'w', encoding='utf-8') as f:
    f.write(spaced_stop_words)

def ignore_empty(hyps, refs):
    """Drop every (hypothesis, reference) pair in which either side has length 0.

    Args:
        hyps: Sequence of hypothesis strings.
        refs: Sequence of reference (or source) strings, aligned with `hyps`.

    Returns:
        A 2-tuple ``(kept_hyps, kept_refs)`` of equally long tuples, suitable
        for ``hyps, refs = ignore_empty(hyps, refs)``. Both are empty tuples
        when no pair survives the filter.
    """
    kept_pairs = [
        (hyp, ref) for hyp, ref in zip(hyps, refs)
        if len(hyp) > 0 and len(ref) > 0
    ]
    if not kept_pairs:
        # zip(*[]) yields nothing, which would break two-way unpacking at the
        # call sites; return two explicit empty tuples instead.
        return (), ()

    kept_hyps, kept_refs = zip(*kept_pairs)
    return kept_hyps, kept_refs

def get_rouge(hypothesis, references, avg=True, ignore_empty=True, language='german'):
    """Compute ROUGE F scores for hypothesis/reference pairs.

    Args:
        hypothesis: Hypothesis summaries.
        references: Reference summaries aligned with `hypothesis`.
        avg: Passed to the scorer's `get_scores`; selects the return shape.
        ignore_empty: Passed through to the scorer's `get_scores`.
            (Inside this function the name shadows the module-level helper.)
        language: 'german' uses `gerouge.GeRouge(minimal_mode=True)`, anything
            else uses `Rouge()`.

    Returns:
        With `avg` true: a dict mapping each metric key returned by the scorer
        to its 'f' value. Otherwise: a `map` object yielding three lists, the
        per-pair F scores for 'rouge-1', 'rouge-2' and 'rouge-l'.
    """
    scorer = gerouge.GeRouge(minimal_mode=True) if language == 'german' else Rouge()
    scores = scorer.get_scores(hypothesis, references, avg=avg, ignore_empty=ignore_empty)

    if not avg:
        per_pair = [
            (entry['rouge-1']['f'], entry['rouge-2']['f'], entry['rouge-l']['f'])
            for entry in scores
        ]
        # Transpose rows of (r1, r2, rl) into three per-metric lists.
        return map(list, zip(*per_pair))

    return {metric: values['f'] for metric, values in scores.items()}

def get_bleu(hypothesis, references, avg=True):
    """Score hypotheses against references with `BleuMetric(force=True)`.

    The scorer's 'bleu' values are divided by 100 before being returned.

    Returns:
        With `avg` true: one aggregated score computed after dropping pairs
        with an empty side. Otherwise: a list with one score per input pair.
    """
    scorer = BleuMetric(force=True)
    if not avg:
        per_pair = scorer.evaluate_batch(hypothesis, references, aggregate=avg)
        return [(entry['bleu']/100) for entry in per_pair]

    hypothesis, references = ignore_empty(hypothesis, references)
    aggregated = scorer.evaluate_batch(hypothesis, references, aggregate=avg)
    return (aggregated['bleu']/100)
    
def get_meteor(hypothesis, references, avg=True):
    """Score hypotheses against references with `MeteorMetric`.

    Returns:
        With `avg` true: the aggregated 'meteor' value computed after dropping
        pairs with an empty side. Otherwise: a list with the 'meteor' value of
        every input pair.
    """
    scorer = MeteorMetric()
    if not avg:
        per_pair = scorer.evaluate_batch(hypothesis, references, aggregate=avg)
        return [entry['meteor'] for entry in per_pair]

    hypothesis, references = ignore_empty(hypothesis, references)
    aggregated = scorer.evaluate_batch(hypothesis, references, aggregate=avg)
    return aggregated['meteor']

def get_bert_score(hypothesis, references, avg=True):
    """Score hypotheses against references with `BertScoreMetric`.

    The metric is configured for German ('dbmdz/bert-base-german-cased',
    9 layers, IDF weighting on, no baseline rescaling).

    Returns:
        With `avg` true: the aggregated 'bert_score_f1' value computed after
        dropping pairs with an empty side. Otherwise: a list with the
        'bert_score_f1' value of every input pair.
    """
    scorer = BertScoreMetric(
        lang='de',
        model_type='dbmdz/bert-base-german-cased',
        num_layers=9,
        verbose=False,
        idf=True,
        rescale_with_baseline=False,
    )
    if not avg:
        per_pair = scorer.evaluate_batch(hypothesis, references, aggregate=avg)
        return [entry['bert_score_f1'] for entry in per_pair]

    hypothesis, references = ignore_empty(hypothesis, references)
    aggregated = scorer.evaluate_batch(hypothesis, references, aggregate=avg)
    return aggregated['bert_score_f1']

def get_mover_score(hypothesis, references, avg=True):
    """Score hypotheses against references with `MoverScoreMetric` (version 2).

    Sets the MOVERSCORE_MODEL environment variable to
    'dbmdz/bert-base-german-cased' before creating the metric and passes
    'data/spaced_stop_words.txt' as its stop-word file.

    Returns:
        With `avg` true: the aggregated 'mover_score' value computed after
        dropping pairs with an empty side. Otherwise: a list with the
        'mover_score' value of every input pair.
    """
    os.environ['MOVERSCORE_MODEL'] = "dbmdz/bert-base-german-cased"
    scorer = MoverScoreMetric(version=2, stop_wordsf='data/spaced_stop_words.txt')
    if not avg:
        per_pair = scorer.evaluate_batch(hypothesis, references, aggregate=avg)
        return [entry['mover_score'] for entry in per_pair]

    hypothesis, references = ignore_empty(hypothesis, references)
    aggregated = scorer.evaluate_batch(hypothesis, references, aggregate=avg)
    return aggregated['mover_score']

def get_blanc(hypothesis, sources, avg=True, device='cuda'):
    """Score summaries against their source documents with `BlancTune`.

    Args:
        hypothesis: Summaries to score.
        sources: Source documents aligned with `hypothesis`.
        avg: If true, return one mean score; otherwise one score per pair.
        device: Device string handed to `BlancTune` (default 'cuda', the value
            that was previously hard-coded).

    Returns:
        With `avg` true: the sum of the per-pair scores divided by the number
        of pairs left after dropping pairs with an empty side (0 if none are
        left). Otherwise: a list with the score of every input pair.
    """
    # Best configuration parameter values for german language taken from https://arxiv.org/abs/2105.06027
    blanc_mod = BlancTune(device=device, inference_batch_size=128, finetune_batch_size=24,
                          model_name='dbmdz/bert-base-german-cased', gap=2,
                          min_token_length_normal=4, min_token_length_lead=2,
                          min_token_length_followup=1)

    if not avg:
        # eval_pairs takes the documents first, then the summaries.
        return list(blanc_mod.eval_pairs(sources, hypothesis))

    hypothesis, sources = ignore_empty(hypothesis, sources)
    if len(sources) == 0:
        return 0
    scores = blanc_mod.eval_pairs(sources, hypothesis)
    return sum(scores) / float(len(sources))

def get_supert(hypothesis, sources, avg=True):
    """Score summaries against their source documents with `SupertMetric`.

    Returns:
        With `avg` true: the aggregated 'supert' value computed after dropping
        pairs with an empty side. Otherwise: a list with the 'supert' value of
        every input pair.
    """
    scorer = supert_metric.SupertMetric()
    # Optional multilingual encoder override, currently disabled:
    #scorer.bert_model = SentenceTransformer('xlm-r-100langs-bert-base-nli-stsb-mean-tokens')
    if not avg:
        per_pair = scorer.evaluate_batch(hypothesis, sources, aggregate=avg)
        return [entry['supert'] for entry in per_pair]

    hypothesis, sources = ignore_empty(hypothesis, sources)
    aggregated = scorer.evaluate_batch(hypothesis, sources, aggregate=avg)
    return aggregated['supert']

env: PYTHONPATH="/env/python:/usr/local/lib/python3.7/dist-packages/summ_eval"


## Quality Estimation

In [None]:
import pandas as pd
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
import numpy as np

# Select the torch device used by get_qe: the GPU when one is available, otherwise the CPU.
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
# Length that get_qe pads/truncates every token-id sequence to.
MAX_LEN = 200

def _encode_for_qe(hypothesis):
    """Turn summaries into padded token-id and attention-mask tensors for the QE models."""
    # `encode` tokenizes the text, adds the '[CLS]' and '[SEP]' special tokens
    # and maps the tokens to their ids.
    input_ids = [
        tokenizer.encode(sent, add_special_tokens=True)
        for sent in hypothesis
    ]

    # Pad (and truncate) every sequence at the end to MAX_LEN ids.
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN,
                              dtype="long", truncating="post", padding="post")

    # Mask is 1.0 for every id > 0 and 0.0 for the padding id 0.
    attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]

    return torch.tensor(input_ids), torch.tensor(attention_masks)


def get_qe(hypothesis, avg=True):
    """Predict quality-estimation labels for each summary with eight saved models.

    One model per quality dimension is loaded from ``models/cnndm/<key>.pt``
    and run over all summaries in batches of 16.

    Args:
        hypothesis: Iterable of summary strings.
        avg: Unused; accepted so the signature matches the other metrics.

    Returns:
        A dict mapping each quality dimension ('expert_coherence', ...,
        'crowd_relevance') to a list with one predicted label per summary,
        where a label is the argmax class index plus one.
    """
    quality_dims = [
        'expert_coherence',
        'expert_consistency',
        'expert_fluency',
        'expert_relevance',
        'crowd_coherence',
        'crowd_consistency',
        'crowd_fluency',
        'crowd_relevance',
    ]
    output = {key: [] for key in quality_dims}

    # The encoded inputs do not depend on the model, so build them once
    # instead of once per quality dimension.
    prediction_inputs, prediction_masks = _encode_for_qe(hypothesis)
    prediction_data = TensorDataset(prediction_inputs, prediction_masks)
    prediction_dataloader = DataLoader(prediction_data,
                                       sampler=SequentialSampler(prediction_data),
                                       batch_size=16)

    for key, value in output.items():
        # Model class must be defined somewhere (the whole model object is unpickled).
        # NOTE: torch.load unpickles arbitrary objects; only load trusted files.
        # map_location + .to(device) keep the model on the same device the
        # batches are moved to below.
        model = torch.load(f'models/cnndm/{key}.pt', map_location=device)
        model.to(device)
        model.eval()

        print('Predicting labels for {:,} test sentences...'.format(len(prediction_inputs)))

        for batch in prediction_dataloader:
            b_input_ids, b_input_mask = tuple(t.to(device) for t in batch)

            # No gradients are needed for prediction; this saves memory and time.
            with torch.no_grad():
                outputs = model(b_input_ids, token_type_ids=None,
                                attention_mask=b_input_mask)

            logits = outputs[0].detach().cpu().numpy()
            pred_labels_i = np.argmax(logits, axis=1).flatten()
            value.extend(pred_labels_i + 1)

    return output

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB
Loading BERT tokenizer...


## Jensen Shannon

In [None]:
from nltk.stem.snowball import GermanStemmer
import re
import copy
import scipy
from statistics import mean

# Load the space-separated stop words written by the evaluation-metrics cell into a set.
with open('data/spaced_stop_words.txt', 'r', encoding='utf-8') as f:
    STOP_WORDS = set(f.read().strip().split(' '))
    
# Despite the variable name, this is NLTK's Snowball GermanStemmer.
porter = GermanStemmer()

def text_to_distribution(txt, do_stem_porter=False, remove_stopwords=False):
    """Count token occurrences in `txt`.

    Tokens are runs of word characters/apostrophes, or a single one of the
    punctuation marks ``. , ! ? ;``.

    Args:
        txt: Text to tokenize.
        do_stem_porter: If true, stem every token with the module-level stemmer.
        remove_stopwords: If true, drop tokens contained in `STOP_WORDS`
            (checked after stemming).

    Returns:
        A dict mapping each remaining token to its number of occurrences.
    """
    tokens = re.findall(r"[\w']+|[.,!?;]", txt)
    if do_stem_porter:
        tokens = list(map(porter.stem, tokens))
    if remove_stopwords:
        tokens = [tok for tok in tokens if tok not in STOP_WORDS]

    counts = {}
    for tok in tokens:
        counts[tok] = counts.get(tok, 0) + 1
    return counts


def combine_distributions(distr1, distr2):
    """Align two count dicts on their shared key set.

    The key order is: all keys of `distr1` in their order, followed by the
    keys that occur only in `distr2`. Missing keys count as 0. The inputs are
    not modified.

    Returns:
        Two equally long lists ``(v1, v2)`` with the counts of `distr1` and
        `distr2` for every key of the union.
    """
    union_keys = list(distr1) + [key for key in distr2 if key not in distr1]
    v1 = [distr1.get(key, 0) for key in union_keys]
    v2 = [distr2.get(key, 0) for key in union_keys]
    return v1, v2


def good_len_summ(summary, low=-1, high=1000000):
    """Tell whether the whitespace-normalised summary length lies in [low, high].

    The summary is stripped and every whitespace run is collapsed to a single
    blank before its length in characters is compared with the bounds.
    """
    normalised = ' '.join(summary.strip().split())
    return low <= len(normalised) <= high

def js_divergence(hypothesis, sources, low=-1, high=1000000, do_stem_porter=True, remove_stopwords=True):
    """Compute the Jensen-Shannon divergence between each summary and its source.

    Args:
        hypothesis: Summaries.
        sources: Source documents aligned with `hypothesis`.
        low, high: Summaries for which `good_len_summ` fails with these bounds
            are skipped, so the result can be shorter than the input.
        do_stem_porter, remove_stopwords: Passed to `text_to_distribution`.

    Returns:
        A list with the squared Jensen-Shannon distance of the token count
        distributions for every summary that was not skipped.
    """
    # Import the function explicitly: a bare `import scipy` does not guarantee
    # that the scipy.spatial.distance submodule has been loaded.
    from scipy.spatial.distance import jensenshannon

    divergences = []
    for i in range(len(hypothesis)):
        summ = hypothesis[i]
        if not good_len_summ(summ, low=low, high=high):
            continue
        text = sources[i]
        distr_summ = text_to_distribution(summ, do_stem_porter=do_stem_porter, remove_stopwords=remove_stopwords)
        distr_text = text_to_distribution(text, do_stem_porter=do_stem_porter, remove_stopwords=remove_stopwords)
        v1, v2 = combine_distributions(distr_summ, distr_text)
        # jensenshannon returns the distance; squaring it gives the divergence.
        divergences.append(jensenshannon(v1, v2) ** 2)

    return divergences

def get_jensenshannon(hypothesis, sources, avg=True):
    """Jensen-Shannon divergence between summaries and their source documents.

    Returns:
        With `avg` true: the mean of the `js_divergence` scores computed after
        dropping pairs with an empty side. Otherwise: the list returned by
        `js_divergence` for the unfiltered input.
    """
    if not avg:
        return js_divergence(hypothesis, sources)

    hypothesis, sources = ignore_empty(hypothesis, sources)
    return mean(js_divergence(hypothesis, sources))

# Data Preprocessing for BertSum

In [None]:
import nltk
import json
import pandas as pd

def _write_bertsum_shard(records: list, save_path: str, corpus_type: str, shard_idx: int) -> None:
    """Write one shard of tokenized records to '<save_path>/<corpus_type>.<shard_idx>.json'."""
    pt_file = "{:s}/{:s}.{:d}.json".format(save_path, corpus_type, shard_idx)
    with open(pt_file, 'w') as save:
        save.write(json.dumps(records))


def data_prep_for_BertSum(dataset: pd.DataFrame, language: str, save_path: str, corpus_type: str,
                          shard_size: int = 2000) -> None:
    """Tokenize a dataset and write it as sharded JSON files.

    Every row becomes ``{'src': [...], 'tgt': [...]}``, where both values are
    lists of sentences and every sentence is a list of word tokens.

    Args:
        dataset: Frame with the columns 'text' and 'summary'.
        language: Language passed to the NLTK sentence and word tokenizers.
        save_path: Existing directory the shard files are written to.
        corpus_type: Split name used as file-name prefix.
        shard_size: Maximum number of rows per file (default 2000, the value
            that was previously hard-coded).
    """
    print(f"Sentence splitting, tokenizing and converting '{corpus_type}' split to json...")
    dataset_json = []
    p_ct = 0
    for _, row in dataset.iterrows():
        src_tokens = [nltk.word_tokenize(sent, language)
                      for sent in nltk.sent_tokenize(row['text'], language)]
        tgt_tokens = [nltk.word_tokenize(sent, language)
                      for sent in nltk.sent_tokenize(row['summary'], language)]

        dataset_json.append({'src': src_tokens, 'tgt': tgt_tokens})
        if len(dataset_json) >= shard_size:
            _write_bertsum_shard(dataset_json, save_path, corpus_type, p_ct)
            p_ct += 1
            dataset_json = []

    # Flush the last, partially filled shard.
    if dataset_json:
        _write_bertsum_shard(dataset_json, save_path, corpus_type, p_ct)

# Data Preprocessing for MatchSum

In [None]:
import nltk
import json
import pandas as pd

def data_prep_for_MatchSum(dataset: pd.DataFrame, language: str, save_path: str, corpus_type: str) -> None:
    """Write a dataset as JSON lines with sentence-split text and summary.

    For every row of `dataset` one line ``{"text": [...], "summary": [...]}``
    is written to '<save_path>/<corpus_type>.jsonl', where both values are the
    sentence lists produced by the NLTK sentence tokenizer for `language`.
    """
    print(f"Sentence splitting and converting '{corpus_type}' split to json...")

    jsonl_path = "{:s}/{:s}.jsonl".format(save_path, corpus_type)
    with open(jsonl_path, 'w') as out:
        for _, record in dataset.iterrows():
            payload = {
                'text': nltk.sent_tokenize(record['text'], language),
                'summary': nltk.sent_tokenize(record['summary'], language),
            }
            out.write(json.dumps(payload) + "\n")

# MLSUM

## Data Loading

In [None]:
# Let's import the library. We typically only need at most four methods:
from datasets import list_datasets, list_metrics, load_dataset, load_metric
from pprint import pprint
import pandas as pd

def load_mlsum_to_csv(corpus_type: str) -> pd.DataFrame:
    """Load one split of the German MLSUM dataset as a DataFrame.

    Args:
        corpus_type: Split name; 'valid' is translated to the dataset's
            'validation' split, any other value is passed through unchanged.

    Returns:
        A frame with the columns 'text' and 'summary'. Despite the function
        name nothing is written to disk (the CSV export is disabled).
    """
    hf_split: str = "validation" if corpus_type == "valid" else corpus_type
    split_data = load_dataset('mlsum', 'de', split=hf_split)

    frame = pd.DataFrame(split_data, columns=["text","summary"])
    #frame.to_csv(f"data/mlsum/{corpus_type}.csv", index=False)
    return frame

#for corpus_type in ['train', 'valid', 'test']:
#    mlsum_dataset = load_mlsum_to_csv(corpus_type)
#    data_prep_for_BertSum(mlsum_dataset, 'german', "json_data/mlsum", corpus_type)
#    data_prep_for_MatchSum(mlsum_dataset, 'german', "json_data/mlsum", corpus_type)

In [None]:
import itertools
import pandas as pd

#merged_mlsum_dataset = itertools.chain(mlsum_dataset['train'], mlsum_dataset['test'], mlsum_dataset['validation'])
# Read the test split from CSV (columns 'text' and 'summary' are used below).
mlsum_dataset = pd.read_csv("data/mlsum/test.csv")
# For every row build a 7-tuple (source with line breaks, random summary,
# lead summary, TextRank summary at ratio 0.06, reference with line breaks,
# get_word_len of text, get_word_len of summary), then transpose the rows into
# seven parallel lists; the last two are stored as new columns of the frame.
# NOTE(review): get_random_summary is unseeded, so the random baseline differs
# between runs.
mlsum_src, mlsum_rnd_sum, mlsum_lead_sum, mlsum_textrank_sum, mlsum_tgt, mlsum_dataset["text_word_len"], mlsum_dataset["sum_word_len"] = map(list,zip(*[(
                                                      get_text_with_breaks(row['text']),
                                                      get_random_summary(row['text']),
                                                      get_lead_summary(row['text']),
                                                      get_textrank_summary(get_text_with_breaks(row['text']), 0.06),
                                                      get_text_with_breaks(row['summary']),
                                                      get_word_len(row['text']), 
                                                      get_word_len(row['summary']) 
                                                    ) for index, row in mlsum_dataset.iterrows()]))
# Sanity check: all lists should have one entry per row.
print(len(mlsum_src))
print(len(mlsum_rnd_sum))
print(len(mlsum_lead_sum))
print(len(mlsum_textrank_sum))
print(len(mlsum_tgt))

#pd.DataFrame(mlsum_src).to_csv("results/mlsum/source.csv", index=False)
#pd.DataFrame(mlsum_rnd_sum).to_csv("results/mlsum/random_hypo.csv", index=False)
#pd.DataFrame(mlsum_lead_sum).to_csv("results/mlsum/lead_hypo.csv", index=False)
#pd.DataFrame(mlsum_textrank_sum).to_csv("results/mlsum/textrank_hypo.csv", index=False)
#pd.DataFrame(mlsum_tgt).to_csv("results/mlsum/reference.csv", index=False)

10000
10000
10000
10000
10000


In [None]:
import pandas as pd

# Load the stored hypotheses/references of the Oracle, BertSum and MatchSum
# systems; missing values are replaced by empty strings.
mlsum_oracle_sum = pd.read_csv("results/mlsum/oracle_hypo.csv")['hypothesis'].fillna('').tolist()
mlsum_oracle_tgt = pd.read_csv("results/mlsum/oracle_ref.csv")['references'].fillna('').tolist()
mlsum_bertsum_sum = pd.read_csv("results/mlsum/bertsum_hypo.csv")['hypothesis'].fillna('').tolist()
mlsum_bertsum_tgt = pd.read_csv("results/mlsum/bertsum_ref.csv")['references'].fillna('').tolist()
mlsum_matchsum_sum = pd.read_csv("results/mlsum/matchsum_hypo.csv")['hypothesis'].fillna('').tolist()
mlsum_matchsum_tgt = pd.read_csv("results/mlsum/matchsum_ref.csv")['references'].fillna('').tolist()

# Sanity check: every hypothesis list should be as long as its reference list.
print(len(mlsum_oracle_sum))
print(len(mlsum_oracle_tgt))
print(len(mlsum_bertsum_sum))
print(len(mlsum_bertsum_tgt))
print(len(mlsum_matchsum_sum))
print(len(mlsum_matchsum_tgt))

10701
10701
10701
10701
10701
10701


## Individual evaluation results per evaluation method

In [None]:
import pandas as pd

# Per-item ROUGE-1/2/L F scores for every system; ignore_empty=False is passed
# to the scorer for every call.
r1_rnd, r2_rnd, rl_rnd = get_rouge(mlsum_rnd_sum, mlsum_tgt, avg=False, ignore_empty=False)
r1_lead, r2_lead, rl_lead = get_rouge(mlsum_lead_sum, mlsum_tgt, avg=False, ignore_empty=False)
r1_tr, r2_tr, rl_tr = get_rouge(mlsum_textrank_sum, mlsum_tgt, avg=False, ignore_empty=False)
r1_bs, r2_bs, rl_bs = get_rouge(mlsum_bertsum_sum, mlsum_bertsum_tgt, avg=False, ignore_empty=False)
r1_ms, r2_ms, rl_ms = get_rouge(mlsum_matchsum_sum, mlsum_matchsum_tgt, avg=False, ignore_empty=False)
r1_oracle, r2_oracle, rl_oracle = get_rouge(mlsum_oracle_sum, mlsum_oracle_tgt, avg=False, ignore_empty=False)

# NOTE(review): the column 'r2-Matchsum' is spelled differently from
# 'r1-MatchSum'/'rl-MatchSum' — confirm whether consumers of the CSV rely on it.
# NOTE(review): pd.DataFrame needs all columns to have the same length; the
# recorded outputs of the loading cells show 10000 vs. 10701 items — verify.
rouge_eval = pd.DataFrame({
    'r1-Random-3': r1_rnd, 'r2-Random-3': r2_rnd, 'rl-Random-3': rl_rnd,
    'r1-Lead-3': r1_lead, 'r2-Lead-3': r2_lead, 'rl-Lead-3': rl_lead,
    'r1-TextRank': r1_tr, 'r2-TextRank': r2_tr, 'rl-TextRank': rl_tr,
    'r1-BertSum': r1_bs, 'r2-BertSum': r2_bs, 'rl-BertSum': rl_bs,
    'r1-MatchSum': r1_ms, 'r2-Matchsum': r2_ms, 'rl-MatchSum': rl_ms,
    'r1-Oracle': r1_oracle, 'r2-Oracle': r2_oracle, 'rl-Oracle': rl_oracle
})
rouge_eval.to_csv("results/mlsum/rouge_eval.csv", index=False)

In [None]:
# Per-item BLEU scores, one column per system, saved as CSV.
bleu_eval = pd.DataFrame({
    'Random-3': get_bleu(mlsum_rnd_sum, mlsum_tgt, avg=False),
    'Lead-3': get_bleu(mlsum_lead_sum, mlsum_tgt, avg=False),
    'TextRank': get_bleu(mlsum_textrank_sum, mlsum_tgt, avg=False),
    'BertSum': get_bleu(mlsum_bertsum_sum, mlsum_bertsum_tgt, avg=False),
    'MatchSum': get_bleu(mlsum_matchsum_sum, mlsum_matchsum_tgt, avg=False),
    'Oracle': get_bleu(mlsum_oracle_sum, mlsum_oracle_tgt, avg=False)
})
bleu_eval.to_csv("results/mlsum/bleu_eval.csv", index=False)

In [None]:
# Per-item METEOR scores, one column per system, saved as CSV.
meteor_eval = pd.DataFrame({
    'Random-3': get_meteor(mlsum_rnd_sum, mlsum_tgt, avg=False),
    'Lead-3': get_meteor(mlsum_lead_sum, mlsum_tgt, avg=False),
    'TextRank': get_meteor(mlsum_textrank_sum, mlsum_tgt, avg=False),
    'BertSum': get_meteor(mlsum_bertsum_sum, mlsum_bertsum_tgt, avg=False),
    'MatchSum': get_meteor(mlsum_matchsum_sum, mlsum_matchsum_tgt, avg=False),
    'Oracle': get_meteor(mlsum_oracle_sum, mlsum_oracle_tgt, avg=False)
})
meteor_eval.to_csv("results/mlsum/meteor_eval.csv", index=False)

In [None]:
# Per-item BERTScore F1 values, one column per system, saved as CSV.
bert_score_eval = pd.DataFrame({
    'Random-3': get_bert_score(mlsum_rnd_sum, mlsum_tgt, avg=False),
    'Lead-3': get_bert_score(mlsum_lead_sum, mlsum_tgt, avg=False),
    'TextRank': get_bert_score(mlsum_textrank_sum, mlsum_tgt, avg=False),
    'BertSum': get_bert_score(mlsum_bertsum_sum, mlsum_bertsum_tgt, avg=False),
    'MatchSum': get_bert_score(mlsum_matchsum_sum, mlsum_matchsum_tgt, avg=False),
    'Oracle': get_bert_score(mlsum_oracle_sum, mlsum_oracle_tgt, avg=False)
})
bert_score_eval.to_csv("results/mlsum/bert_score_eval.csv", index=False)

In [None]:
# Per-item MoverScore values, one column per system, saved as CSV.
mover_score_eval = pd.DataFrame({
    'Random-3': get_mover_score(mlsum_rnd_sum, mlsum_tgt, avg=False),
    'Lead-3': get_mover_score(mlsum_lead_sum, mlsum_tgt, avg=False),
    'TextRank': get_mover_score(mlsum_textrank_sum, mlsum_tgt, avg=False),
    'BertSum': get_mover_score(mlsum_bertsum_sum, mlsum_bertsum_tgt, avg=False),
    'MatchSum': get_mover_score(mlsum_matchsum_sum, mlsum_matchsum_tgt, avg=False),
    'Oracle': get_mover_score(mlsum_oracle_sum, mlsum_oracle_tgt, avg=False)
})
mover_score_eval.to_csv("results/mlsum/mover_score_eval.csv", index=False)

In [None]:
# Per-item BLANC scores (reference-free: summaries are scored against mlsum_src).
# NOTE(review): BertSum/MatchSum/Oracle hypotheses are paired with mlsum_src by
# position; the recorded list lengths differ (10701 vs. 10000) — confirm alignment.
blanc_eval = pd.DataFrame({
    'Random-3': get_blanc(mlsum_rnd_sum, mlsum_src, avg=False),
    'Lead-3': get_blanc(mlsum_lead_sum, mlsum_src, avg=False),
    'TextRank': get_blanc(mlsum_textrank_sum, mlsum_src, avg=False),
    'BertSum': get_blanc(mlsum_bertsum_sum, mlsum_src, avg=False),
    'MatchSum': get_blanc(mlsum_matchsum_sum, mlsum_src, avg=False),
    'Oracle': get_blanc(mlsum_oracle_sum, mlsum_src, avg=False)
})
blanc_eval.to_csv("results/mlsum/blanc_eval.csv", index=False)

In [None]:
# Per-item Jensen-Shannon divergence between each summary and mlsum_src.
# NOTE(review): BertSum/MatchSum/Oracle hypotheses are paired with mlsum_src by
# position; the recorded list lengths differ (10701 vs. 10000) — confirm alignment.
js_eval = pd.DataFrame({
    'Random-3': get_jensenshannon(mlsum_rnd_sum, mlsum_src, avg=False),
    'Lead-3': get_jensenshannon(mlsum_lead_sum, mlsum_src, avg=False),
    'TextRank': get_jensenshannon(mlsum_textrank_sum, mlsum_src, avg=False),
    'BertSum': get_jensenshannon(mlsum_bertsum_sum, mlsum_src, avg=False),
    'MatchSum': get_jensenshannon(mlsum_matchsum_sum, mlsum_src, avg=False),
    'Oracle': get_jensenshannon(mlsum_oracle_sum, mlsum_src, avg=False)
})
js_eval.to_csv("results/mlsum/js_eval.csv", index=False)

In [None]:
% cd /usr/local/lib/python3.7/dist-packages/summ_eval/
supert_eval = pd.DataFrame({
    'Random-3': get_supert(mlsum_rnd_sum, mlsum_src, avg=False),
    'Lead-3': get_supert(mlsum_lead_sum, mlsum_src, avg=False),
    'TextRank': get_supert(mlsum_textrank_sum, mlsum_src, avg=False),
    'BertSum': get_supert(mlsum_bertsum_sum, mlsum_src, avg=False),
    'MatchSum': get_supert(mlsum_matchsum_sum, mlsum_src, avg=False),
    'Oracle': get_supert(mlsum_oracle_sum, mlsum_src, avg=False)
})
% cd /content/drive/My Drive/GeSumGenEval
supert_eval.to_csv("results/mlsum/supert_eval.csv", index=False)

In [None]:
# Predicted quality-estimation labels per system; each result is a dict that
# maps a quality dimension to a list with one label per summary.
qe_rnd = get_qe(mlsum_rnd_sum, avg=False)
qe_lead = get_qe(mlsum_lead_sum, avg=False)
qe_tr = get_qe(mlsum_textrank_sum, avg=False)
qe_bs = get_qe(mlsum_bertsum_sum, avg=False)
qe_ms = get_qe(mlsum_matchsum_sum, avg=False)
qe_oracle = get_qe(mlsum_oracle_sum, avg=False)

In [None]:
# Same keys as the dict returned by get_qe.
quality_dim = [
        'expert_coherence',
        'expert_consistency',
        'expert_fluency',
        'expert_relevance',
        'crowd_coherence',
        'crowd_consistency',
        'crowd_fluency',
        'crowd_relevance',
]
# One column per (system, quality dimension) pair, saved as CSV.
# NOTE(review): column assignment requires equally long lists for all systems —
# verify against the list lengths printed by the loading cells.
qe_eval = pd.DataFrame()
for dim in quality_dim:
    qe_eval['Random-3 (' + dim + ')'] = qe_rnd[dim]
    qe_eval['Lead-3 (' + dim + ')'] = qe_lead[dim]
    qe_eval['TextRank (' + dim + ')'] = qe_tr[dim]
    qe_eval['BertSum (' + dim + ')'] = qe_bs[dim]
    qe_eval['MatchSum (' + dim + ')'] = qe_ms[dim]
    qe_eval['Oracle (' + dim + ')'] = qe_oracle[dim]
qe_eval.to_csv("results/mlsum/qe_eval.csv", index=False)

## Aggregate evaluation results per generation method

In [None]:
% cd /usr/local/lib/python3.7/dist-packages/summ_eval/

/usr/local/lib/python3.7/dist-packages/summ_eval


In [None]:
# Corpus-level scores of the Oracle summaries (reference-based metrics use
# mlsum_oracle_tgt, reference-free metrics use mlsum_src).
oracle_rouge = get_rouge(mlsum_oracle_sum, mlsum_oracle_tgt)
oracle_bleu = get_bleu(mlsum_oracle_sum, mlsum_oracle_tgt)
oracle_meteor = get_meteor(mlsum_oracle_sum, mlsum_oracle_tgt)
oracle_bert_score = get_bert_score(mlsum_oracle_sum, mlsum_oracle_tgt)
oracle_mover_score = get_mover_score(mlsum_oracle_sum, mlsum_oracle_tgt)
oracle_blanc = get_blanc(mlsum_oracle_sum, mlsum_src)
oracle_js = get_jensenshannon(mlsum_oracle_sum, mlsum_src)
oracle_supert = get_supert(mlsum_oracle_sum, mlsum_src)

In [None]:
# Corpus-level scores of the BertSum summaries (reference-based metrics use
# mlsum_bertsum_tgt, reference-free metrics use mlsum_src).
bertsum_rouge = get_rouge(mlsum_bertsum_sum, mlsum_bertsum_tgt)
bertsum_bleu = get_bleu(mlsum_bertsum_sum, mlsum_bertsum_tgt)
bertsum_meteor = get_meteor(mlsum_bertsum_sum, mlsum_bertsum_tgt)
bertsum_bert_score = get_bert_score(mlsum_bertsum_sum, mlsum_bertsum_tgt)
bertsum_mover_score = get_mover_score(mlsum_bertsum_sum, mlsum_bertsum_tgt)
bertsum_blanc = get_blanc(mlsum_bertsum_sum, mlsum_src)
bertsum_js = get_jensenshannon(mlsum_bertsum_sum, mlsum_src)
bertsum_supert = get_supert(mlsum_bertsum_sum, mlsum_src)

In [None]:
# Corpus-level scores of the MatchSum summaries (reference-based metrics use
# mlsum_matchsum_tgt, reference-free metrics use mlsum_src).
matchsum_rouge = get_rouge(mlsum_matchsum_sum, mlsum_matchsum_tgt)
matchsum_bleu = get_bleu(mlsum_matchsum_sum, mlsum_matchsum_tgt)
matchsum_meteor = get_meteor(mlsum_matchsum_sum, mlsum_matchsum_tgt)
matchsum_bert_score = get_bert_score(mlsum_matchsum_sum, mlsum_matchsum_tgt)
matchsum_mover_score = get_mover_score(mlsum_matchsum_sum, mlsum_matchsum_tgt)
matchsum_blanc = get_blanc(mlsum_matchsum_sum, mlsum_src)
matchsum_js = get_jensenshannon(mlsum_matchsum_sum, mlsum_src)
matchsum_supert = get_supert(mlsum_matchsum_sum, mlsum_src)

In [None]:
# Corpus-level scores of the Random-3 baseline (reference-based metrics use
# mlsum_tgt, reference-free metrics use mlsum_src).
rnd_rouge = get_rouge(mlsum_rnd_sum, mlsum_tgt)
rnd_bleu = get_bleu(mlsum_rnd_sum, mlsum_tgt)
rnd_meteor = get_meteor(mlsum_rnd_sum, mlsum_tgt)
rnd_bert_score = get_bert_score(mlsum_rnd_sum, mlsum_tgt)
rnd_mover_score = get_mover_score(mlsum_rnd_sum, mlsum_tgt)
rnd_blanc = get_blanc(mlsum_rnd_sum, mlsum_src)
rnd_js = get_jensenshannon(mlsum_rnd_sum, mlsum_src)
rnd_supert = get_supert(mlsum_rnd_sum, mlsum_src)

In [None]:
# Corpus-level scores of the Lead-3 baseline (reference-based metrics use
# mlsum_tgt, reference-free metrics use mlsum_src).
lead_rouge =  get_rouge(mlsum_lead_sum, mlsum_tgt)
lead_bleu = get_bleu(mlsum_lead_sum, mlsum_tgt)
lead_meteor = get_meteor(mlsum_lead_sum, mlsum_tgt)
lead_bert_score = get_bert_score(mlsum_lead_sum, mlsum_tgt)
lead_mover_score = get_mover_score(mlsum_lead_sum, mlsum_tgt)
lead_blanc = get_blanc(mlsum_lead_sum, mlsum_src)
lead_js = get_jensenshannon(mlsum_lead_sum, mlsum_src)
lead_supert = get_supert(mlsum_lead_sum, mlsum_src)

In [None]:
# Corpus-level scores of the TextRank summaries (reference-based metrics use
# mlsum_tgt, reference-free metrics use mlsum_src).
tr_rouge = get_rouge(mlsum_textrank_sum, mlsum_tgt)
tr_bleu = get_bleu(mlsum_textrank_sum, mlsum_tgt)
tr_meteor = get_meteor(mlsum_textrank_sum, mlsum_tgt)
tr_bert_score = get_bert_score(mlsum_textrank_sum, mlsum_tgt)
tr_mover_score = get_mover_score(mlsum_textrank_sum, mlsum_tgt)
tr_blanc = get_blanc(mlsum_textrank_sum, mlsum_src)
tr_js = get_jensenshannon(mlsum_textrank_sum, mlsum_src)
tr_supert = get_supert(mlsum_textrank_sum, mlsum_src)

In [None]:
import pandas as pd

# Summary table of the reference-based corpus-level scores, one row per system.
# The MoverScore, BLANC, JS and SUPERT aggregates computed above are not part
# of this table.
mlsum_eval_df = pd.DataFrame([
    ["Random-3",rnd_rouge['rouge-1'],rnd_rouge['rouge-2'],rnd_rouge['rouge-l'],rnd_bleu,rnd_meteor,rnd_bert_score],
    ["Lead-3",lead_rouge['rouge-1'],lead_rouge['rouge-2'],lead_rouge['rouge-l'],lead_bleu,lead_meteor,lead_bert_score],
    ["TextRank",tr_rouge['rouge-1'],tr_rouge['rouge-2'],tr_rouge['rouge-l'],tr_bleu,tr_meteor,tr_bert_score],
    ["Oracle",oracle_rouge['rouge-1'],oracle_rouge['rouge-2'],oracle_rouge['rouge-l'],oracle_bleu,oracle_meteor,oracle_bert_score],
    ["BertSum",bertsum_rouge['rouge-1'],bertsum_rouge['rouge-2'],bertsum_rouge['rouge-l'],bertsum_bleu,bertsum_meteor,bertsum_bert_score],
    ["MatchSum",matchsum_rouge['rouge-1'],matchsum_rouge['rouge-2'],matchsum_rouge['rouge-l'],matchsum_bleu,matchsum_meteor,matchsum_bert_score]
], columns=["Summary","ROUGE-1","ROUGE-2","ROUGE-L","BLEU","METEOR","BERT-Score"])

print(mlsum_eval_df)
# NOTE(review): the path is relative; an earlier cell changes the working
# directory to the summ_eval package — confirm where this file ends up.
mlsum_eval_df.to_csv("results/mlsum/eval.csv", index=False)

## Summary Selection for Human Evaluation

In [None]:
import random

# Row positions whose 'text_word_len' lies in (500, 600] ("short") or (900, 1100] ("long").
short_summ_idx = [key for key, value in enumerate(mlsum_dataset["text_word_len"]) if value>500 and value<=600]
long_summ_idx = [key for key, value in enumerate(mlsum_dataset["text_word_len"]) if value>900 and value<=1100]

print(len(short_summ_idx))
print(len(long_summ_idx))

# Draw 15 items from each bucket.
# NOTE(review): no random seed is set, so the selection changes on every run;
# random.sample raises ValueError if a bucket has fewer than 15 entries.
random_short_idx = random.sample(short_summ_idx, 15)
random_long_idx = random.sample(long_summ_idx, 15)

#print(len(random_short_idx))
#print(len(random_long_idx))

In [None]:
# Collect source, reference and the three baseline summaries for every selected item.
# NOTE(review): `value+2` presumably converts the 0-based row position into the
# row number of the CSV file (1-based plus header line) — confirm.
rnd_short_summ_list = [(value+2, mlsum_src[value], mlsum_tgt[value], mlsum_rnd_sum[value], mlsum_lead_sum[value], mlsum_textrank_sum[value]) for value in random_short_idx]
rnd_long_summ_list = [(value+2, mlsum_src[value], mlsum_tgt[value], mlsum_rnd_sum[value], mlsum_lead_sum[value], mlsum_textrank_sum[value]) for value in random_long_idx]

#print(len(rnd_summ_list))
# The reference summary is stored in the 'expert' column.
human_eval_short = pd.DataFrame(rnd_short_summ_list, columns=['index', 'source', 'expert', 'random', 'lead', 'textrank'])
human_eval_long = pd.DataFrame(rnd_long_summ_list, columns=['index', 'source', 'expert', 'random', 'lead', 'textrank'])

human_eval_short.to_csv("results/mlsum/human_eval_short.csv", index=False)
human_eval_long.to_csv("results/mlsum/human_eval_long.csv", index=False)

In [None]:
import plotly.express as px

# Distribution of source-article lengths (in words).
fig1 = px.histogram(mlsum_dataset, x="text_word_len", labels={'text_word_len':'No. of words in source article'})
fig1.show()

# Distribution of gold-summary lengths. The column is named `sum_word_len`, so the
# axis is labelled in words to match (it previously said "Sentences").
# NOTE(review): confirm `sum_word_len` really holds word counts.
fig2 = px.histogram(mlsum_dataset, x="sum_word_len", labels={'sum_word_len':'No. of words in gold summary'})
fig2.show()

# GeWiki

## Data Loading

In [None]:
# fetch GeWiki data splits from their github repo: https://github.com/domfr/GeWiki

# Uncomment the lines below to fetch the GeWiki data, unzip it, and merge the multiple files into one file per "src" or "tgt" for the train, eval and test splits

#!wget -nv -i data/gewiki_urls.txt -O data/gewiki/gewiki.zip
#!unzip data/gewiki/gewiki.zip -d data/gewiki/
#!awk 'BEGINFILE {print "[SEP]"}{print}' data/gewiki/test/*.src > data/gewiki/test_src.txt
#!awk 'BEGINFILE {print "[SEP]"}{print}' data/gewiki/test/*.tgt > data/gewiki/test_tgt.txt
#!awk 'BEGINFILE {print "[SEP]"}{print}' data/gewiki/eval/*.src > data/gewiki/validation_src.txt
#!awk 'BEGINFILE {print "[SEP]"}{print}' data/gewiki/eval/*.tgt > data/gewiki/validation_tgt.txt
#!awk 'BEGINFILE {print "[SEP]"}{print}' data/gewiki/train/*.src > data/gewiki/train_src.txt
#!awk 'BEGINFILE {print "[SEP]"}{print}' data/gewiki/train/*.tgt > data/gewiki/train_tgt.txt

from collections import defaultdict
from tqdm import tqdm
import pandas as pd

def merge_src_tgt_to_csv(num_of_files: int, csv_name: str) -> None:
    """Pair the merged GeWiki source/target dumps and write them as one CSV.

    Reads ``data/gewiki/{csv_name}_src.txt`` and ``data/gewiki/{csv_name}_tgt.txt``,
    splits both on the ``[SEP]`` marker and writes the first ``num_of_files``
    documents to ``data/gewiki/{csv_name}.csv`` with the columns ``text`` and
    ``summary`` (no index column).

    Args:
        num_of_files: Number of (source, target) pairs to export.
        csv_name: Split name used as the file-name stem, e.g. ``"train"``.

    Raises:
        IndexError: If either input file holds fewer than ``num_of_files`` documents.
        OSError: If one of the input files cannot be opened.
    """
    # Read explicitly as UTF-8 so German umlauts do not depend on the platform locale.
    with open(f"data/gewiki/{csv_name}_src.txt", "r", encoding="utf-8") as src_file:
        src_list = src_file.read().split("[SEP]")
    with open(f"data/gewiki/{csv_name}_tgt.txt", "r", encoding="utf-8") as tgt_file:
        tgt_list = tgt_file.read().split("[SEP]")

    # Element 0 is whatever precedes the first "[SEP]" marker, so documents start at 1.
    num_of_files = max(num_of_files, 0)
    available = min(len(src_list), len(tgt_list)) - 1
    if num_of_files > available:
        raise IndexError(
            f"requested {num_of_files} documents for '{csv_name}' but only "
            f"{available} source/target pairs are available"
        )

    df = pd.DataFrame({
        "text": src_list[1:num_of_files + 1],
        "summary": tgt_list[1:num_of_files + 1],
    })
    # `index` must be passed by keyword: the second positional parameter of
    # DataFrame.to_csv is `sep`, not `index`.
    df.to_csv(f"data/gewiki/{csv_name}.csv", index=False)

# Creating Train CSV
# merge_src_tgt_to_csv(220000, "train")

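# Creating Eval CSV
# NOTE: the awk commands above write validation_src.txt / validation_tgt.txt, but this call
# reads valid_src.txt / valid_tgt.txt -- rename the files (or adjust the commands) first.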
# merge_src_tgt_to_csv(10000, "valid")

# Creating Test CSV
# merge_src_tgt_to_csv(10000, "test")

#for corpus_type in ['train', 'valid', 'test']:
#    gewiki_dataset = pd.read_csv(f"data/gewiki/{corpus_type}.csv")
#    data_prep_for_BertSum(gewiki_dataset, 'german', "json_data/gewiki", corpus_type)
#    data_prep_for_MatchSum(gewiki_dataset, 'german', "json_data/gewiki", corpus_type)

In [None]:
import pandas as pd

# generating summaries only for the test set
# NOTE: `gewiki_dataset` is the one-shot row iterator of the test split, not a
# DataFrame; it is exhausted once the summaries below have been generated.
gewiki_dataset = pd.read_csv("data/gewiki/test.csv").iterrows()


def _gewiki_row_outputs(row):
    """Return (source, random, lead, TextRank, reference) texts for one test row."""
    return (
        get_text_with_breaks(row['text']),
        get_random_summary(row['text']),
        get_lead_summary(row['text']),
        get_textrank_summary(get_text_with_breaks(row['text']), 0.1),
        get_text_with_breaks(row['summary']),
    )


# Transpose the per-row tuples into five parallel lists.
gewiki_src, gewiki_rnd_sum, gewiki_lead_sum, gewiki_textrank_sum, gewiki_tgt = map(
    list,
    zip(*[_gewiki_row_outputs(row) for index, row in gewiki_dataset]),
)

# Sanity check: all five lists must have one entry per test document.
print(
    len(gewiki_src),
    len(gewiki_rnd_sum),
    len(gewiki_lead_sum),
    len(gewiki_textrank_sum),
    len(gewiki_tgt),
    sep="\n",
)

#pd.DataFrame(gewiki_rnd_sum).to_csv("results/gewiki/random_hypo.csv", index=False)
#pd.DataFrame(gewiki_lead_sum).to_csv("results/gewiki/lead_hypo.csv", index=False)
#pd.DataFrame(gewiki_textrank_sum).to_csv("results/gewiki/textrank_hypo.csv", index=False)
#pd.DataFrame(gewiki_tgt).to_csv("results/gewiki/reference.csv", index=False)

10000
10000
10000
10000
10000


In [None]:
import pandas as pd

def _read_filled_column(csv_path, column):
    """Load one column of a results CSV as a list, with missing cells as ''."""
    frame = pd.read_csv(csv_path)
    return frame[column].fillna('').tolist()


# Hypotheses and references for the Oracle, BertSum and MatchSum systems.
gewiki_oracle_sum = _read_filled_column("results/gewiki/oracle_hypo.csv", 'hypothesis')
gewiki_oracle_tgt = _read_filled_column("results/gewiki/oracle_ref.csv", 'references')
gewiki_bertsum_sum = _read_filled_column("results/gewiki/bertsum_hypo.csv", 'hypothesis')
gewiki_bertsum_tgt = _read_filled_column("results/gewiki/bertsum_ref.csv", 'references')
gewiki_matchsum_sum = _read_filled_column("results/gewiki/matchsum_hypo.csv", 'hypothesis')
gewiki_matchsum_tgt = _read_filled_column("results/gewiki/matchsum_ref.csv", 'references')

# Sanity check: every list should cover the whole test set.
print(
    len(gewiki_oracle_sum),
    len(gewiki_oracle_tgt),
    len(gewiki_bertsum_sum),
    len(gewiki_bertsum_tgt),
    len(gewiki_matchsum_sum),
    len(gewiki_matchsum_tgt),
    sep="\n",
)

10000
10000
10000
10000
10000
10000


## Individual evaluation results per evaluation method

In [None]:
import pandas as pd

# Per-document ROUGE-1/2/L for every system (avg=False; the call unpacks into
# three results per system). Baselines are scored against the shared test
# references, the model outputs against their own reference files.
r1_rnd, r2_rnd, rl_rnd = get_rouge(gewiki_rnd_sum, gewiki_tgt, avg=False, ignore_empty=False)
r1_lead, r2_lead, rl_lead = get_rouge(gewiki_lead_sum, gewiki_tgt, avg=False, ignore_empty=False)
r1_tr, r2_tr, rl_tr = get_rouge(gewiki_textrank_sum, gewiki_tgt, avg=False, ignore_empty=False)
r1_bs, r2_bs, rl_bs = get_rouge(gewiki_bertsum_sum, gewiki_bertsum_tgt, avg=False, ignore_empty=False)
r1_ms, r2_ms, rl_ms = get_rouge(gewiki_matchsum_sum, gewiki_matchsum_tgt, avg=False, ignore_empty=False)
r1_oracle, r2_oracle, rl_oracle = get_rouge(gewiki_oracle_sum, gewiki_oracle_tgt, avg=False, ignore_empty=False)

# One column per (ROUGE variant, system). The MatchSum ROUGE-2 column was
# previously spelled 'r2-Matchsum', inconsistent with its r1/rl siblings.
rouge_eval = pd.DataFrame({
    'r1-Random-3': r1_rnd, 'r2-Random-3': r2_rnd, 'rl-Random-3': rl_rnd,
    'r1-Lead-3': r1_lead, 'r2-Lead-3': r2_lead, 'rl-Lead-3': rl_lead,
    'r1-TextRank': r1_tr, 'r2-TextRank': r2_tr, 'rl-TextRank': rl_tr,
    'r1-BertSum': r1_bs, 'r2-BertSum': r2_bs, 'rl-BertSum': rl_bs,
    'r1-MatchSum': r1_ms, 'r2-MatchSum': r2_ms, 'rl-MatchSum': rl_ms,
    'r1-Oracle': r1_oracle, 'r2-Oracle': r2_oracle, 'rl-Oracle': rl_oracle
})
rouge_eval.to_csv("results/gewiki/rouge_eval.csv", index=False)

In [None]:
# (column label, hypotheses, references) in output-column order.
bleu_inputs = [
    ('Random-3', gewiki_rnd_sum, gewiki_tgt),
    ('Lead-3', gewiki_lead_sum, gewiki_tgt),
    ('TextRank', gewiki_textrank_sum, gewiki_tgt),
    ('BertSum', gewiki_bertsum_sum, gewiki_bertsum_tgt),
    ('MatchSum', gewiki_matchsum_sum, gewiki_matchsum_tgt),
    ('Oracle', gewiki_oracle_sum, gewiki_oracle_tgt),
]

# Non-averaged BLEU scores, one column per system.
bleu_eval = pd.DataFrame({
    label: get_bleu(hypotheses, references, avg=False)
    for label, hypotheses, references in bleu_inputs
})
bleu_eval.to_csv("results/gewiki/bleu_eval.csv", index=False)

In [None]:
# (column label, hypotheses, references) in output-column order.
meteor_inputs = [
    ('Random-3', gewiki_rnd_sum, gewiki_tgt),
    ('Lead-3', gewiki_lead_sum, gewiki_tgt),
    ('TextRank', gewiki_textrank_sum, gewiki_tgt),
    ('BertSum', gewiki_bertsum_sum, gewiki_bertsum_tgt),
    ('MatchSum', gewiki_matchsum_sum, gewiki_matchsum_tgt),
    ('Oracle', gewiki_oracle_sum, gewiki_oracle_tgt),
]

# Non-averaged METEOR scores, one column per system.
meteor_eval = pd.DataFrame({
    label: get_meteor(hypotheses, references, avg=False)
    for label, hypotheses, references in meteor_inputs
})
meteor_eval.to_csv("results/gewiki/meteor_eval.csv", index=False)

In [None]:
# (column label, hypotheses, references) in output-column order.
bert_score_inputs = [
    ('Random-3', gewiki_rnd_sum, gewiki_tgt),
    ('Lead-3', gewiki_lead_sum, gewiki_tgt),
    ('TextRank', gewiki_textrank_sum, gewiki_tgt),
    ('BertSum', gewiki_bertsum_sum, gewiki_bertsum_tgt),
    ('MatchSum', gewiki_matchsum_sum, gewiki_matchsum_tgt),
    ('Oracle', gewiki_oracle_sum, gewiki_oracle_tgt),
]

# Non-averaged BERTScore values, one column per system.
bert_score_eval = pd.DataFrame({
    label: get_bert_score(hypotheses, references, avg=False)
    for label, hypotheses, references in bert_score_inputs
})
bert_score_eval.to_csv("results/gewiki/bert_score_eval.csv", index=False)

In [None]:
# (column label, hypotheses, references) in output-column order.
mover_score_inputs = [
    ('Random-3', gewiki_rnd_sum, gewiki_tgt),
    ('Lead-3', gewiki_lead_sum, gewiki_tgt),
    ('TextRank', gewiki_textrank_sum, gewiki_tgt),
    ('BertSum', gewiki_bertsum_sum, gewiki_bertsum_tgt),
    ('MatchSum', gewiki_matchsum_sum, gewiki_matchsum_tgt),
    ('Oracle', gewiki_oracle_sum, gewiki_oracle_tgt),
]

# Non-averaged MoverScore values, one column per system.
mover_score_eval = pd.DataFrame({
    label: get_mover_score(hypotheses, references, avg=False)
    for label, hypotheses, references in mover_score_inputs
})
mover_score_eval.to_csv("results/gewiki/mover_score_eval.csv", index=False)

In [None]:
# BLANC compares each summary with the source article (gewiki_src), not a reference.
# NOTE(review): this assumes the BertSum/MatchSum/Oracle hypothesis files are in
# the same document order as gewiki_src -- confirm.
blanc_hypotheses = [
    ('Random-3', gewiki_rnd_sum),
    ('Lead-3', gewiki_lead_sum),
    ('TextRank', gewiki_textrank_sum),
    ('BertSum', gewiki_bertsum_sum),
    ('MatchSum', gewiki_matchsum_sum),
    ('Oracle', gewiki_oracle_sum),
]

# Non-averaged BLANC scores, one column per system.
blanc_eval = pd.DataFrame({
    label: get_blanc(hypotheses, gewiki_src, avg=False)
    for label, hypotheses in blanc_hypotheses
})
blanc_eval.to_csv("results/gewiki/blanc_eval.csv", index=False)

In [None]:
# Jensen-Shannon scores compare each summary with the source article (gewiki_src).
# NOTE(review): this assumes the BertSum/MatchSum/Oracle hypothesis files are in
# the same document order as gewiki_src -- confirm.
js_hypotheses = [
    ('Random-3', gewiki_rnd_sum),
    ('Lead-3', gewiki_lead_sum),
    ('TextRank', gewiki_textrank_sum),
    ('BertSum', gewiki_bertsum_sum),
    ('MatchSum', gewiki_matchsum_sum),
    ('Oracle', gewiki_oracle_sum),
]

# Non-averaged Jensen-Shannon scores, one column per system.
js_eval = pd.DataFrame({
    label: get_jensenshannon(hypotheses, gewiki_src, avg=False)
    for label, hypotheses in js_hypotheses
})
js_eval.to_csv("results/gewiki/js_eval.csv", index=False)

  p = p / np.sum(p, axis=0)


In [None]:
supert_eval = pd.read_csv("results/gewiki/supert_eval.csv")
% cd /usr/local/lib/python3.7/dist-packages/summ_eval/
#supert_eval['BertSum'] = get_supert(gewiki_bertsum_sum, gewiki_src, avg=False)
supert_eval['MatchSum'] = get_supert(gewiki_matchsum_sum, gewiki_src, avg=False)
supert_eval['Oracle'] = get_supert(gewiki_oracle_sum, gewiki_src, avg=False)
% cd /content/drive/My Drive/GeSumGenEval
supert_eval.to_csv("results/gewiki/supert_eval.csv", index=False)

/usr/local/lib/python3.7/dist-packages/summ_eval


Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_bert-large-nli-stsb-mean-tokens.zip/0_BERT were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_bert-large-nli-stsb-mean-tokens.zip/0_BERT were not used when initializing BertModel: ['classifier.weight', 'cla

/content/drive/.shortcut-targets-by-id/1PcWXs_So5sTaP0wBAR77_ORSfd4aFtHq/GeSumGenEval


In [None]:
% cd /usr/local/lib/python3.7/dist-packages/summ_eval/
supert_eval = pd.DataFrame({
    'Random-3': get_supert(gewiki_rnd_sum, gewiki_src, avg=False),
    'Lead-3': get_supert(gewiki_lead_sum, gewiki_src, avg=False),
    'TextRank': get_supert(gewiki_textrank_sum, gewiki_src, avg=False),
    'BertSum': get_jensenshannon(gewiki_bertsum_sum, gewiki_src, avg=False),
    'MatchSum': get_jensenshannon(gewiki_matchsum_sum, gewiki_src, avg=False),
    'Oracle': get_jensenshannon(gewiki_oracle_sum, gewiki_src, avg=False)
})
% cd /content/drive/My Drive/GeSumGenEval
supert_eval.to_csv("results/gewiki/supert_eval.csv", index=False)

## Aggregate evaluation results per generation method

In [None]:
% cd /usr/local/lib/python3.7/dist-packages/summ_eval/

/usr/local/lib/python3.7/dist-packages/summ_eval


In [None]:
# Aggregate scores for the Oracle summaries on the GeWiki test set.
# NOTE(review): every metric except SUPERT is commented out, so oracle_rouge,
# oracle_bleu, oracle_meteor and oracle_bert_score (read by the GeWiki results table
# further down) must already exist in the kernel. The MLSUM results table earlier in
# the notebook reads the same variable names, so on a fresh run the GeWiki table would
# pick up MLSUM values unless these lines are re-enabled.
#oracle_rouge = get_rouge(gewiki_oracle_sum, gewiki_oracle_tgt)
#oracle_bleu = get_bleu(gewiki_oracle_sum, gewiki_oracle_tgt)
#oracle_meteor = get_meteor(gewiki_oracle_sum, gewiki_oracle_tgt)
#oracle_bert_score = get_bert_score(gewiki_oracle_sum, gewiki_oracle_tgt)
#oracle_mover_score = get_mover_score(gewiki_oracle_sum, gewiki_oracle_tgt)
#oracle_blanc = get_blanc(gewiki_oracle_sum, gewiki_src)
#oracle_js = get_jensenshannon(gewiki_oracle_sum, gewiki_src)
# SUPERT is scored against the source articles rather than the references.
oracle_supert = get_supert(gewiki_oracle_sum, gewiki_src)
print(oracle_supert)

100%|██████████| 1.24G/1.24G [00:31<00:00, 39.1MB/s]
Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_bert-large-nli-stsb-mean-tokens.zip/0_BERT were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0.7557766571427685


In [None]:
# Aggregate scores for the BertSum summaries on the GeWiki test set.
# NOTE(review): every metric except SUPERT is commented out, so bertsum_rouge,
# bertsum_bleu, bertsum_meteor and bertsum_bert_score (read by the GeWiki results table
# further down) must already exist in the kernel. The MLSUM results table earlier in
# the notebook reads the same variable names, so on a fresh run the GeWiki table would
# pick up MLSUM values unless these lines are re-enabled.
#bertsum_rouge = get_rouge(gewiki_bertsum_sum, gewiki_bertsum_tgt)
#bertsum_bleu = get_bleu(gewiki_bertsum_sum, gewiki_bertsum_tgt)
#bertsum_meteor = get_meteor(gewiki_bertsum_sum, gewiki_bertsum_tgt)
#bertsum_bert_score = get_bert_score(gewiki_bertsum_sum, gewiki_bertsum_tgt)
#bertsum_mover_score = get_mover_score(gewiki_bertsum_sum, gewiki_bertsum_tgt)
#bertsum_blanc = get_blanc(gewiki_bertsum_sum, gewiki_src)
#bertsum_js = get_jensenshannon(gewiki_bertsum_sum, gewiki_src)
# SUPERT is scored against the source articles rather than the references.
bertsum_supert = get_supert(gewiki_bertsum_sum, gewiki_src)
print(bertsum_supert)

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_bert-large-nli-stsb-mean-tokens.zip/0_BERT were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0.8008041634642045


In [None]:
# Aggregate scores for the MatchSum summaries on the GeWiki test set.
# NOTE(review): every metric except SUPERT is commented out, so matchsum_rouge,
# matchsum_bleu, matchsum_meteor and matchsum_bert_score (read by the GeWiki results
# table further down) must already exist in the kernel. The MLSUM results table earlier
# in the notebook reads the same variable names, so on a fresh run the GeWiki table
# would pick up MLSUM values unless these lines are re-enabled.
#matchsum_rouge = get_rouge(gewiki_matchsum_sum, gewiki_matchsum_tgt)
#matchsum_bleu = get_bleu(gewiki_matchsum_sum, gewiki_matchsum_tgt)
#matchsum_meteor = get_meteor(gewiki_matchsum_sum, gewiki_matchsum_tgt)
#matchsum_bert_score = get_bert_score(gewiki_matchsum_sum, gewiki_matchsum_tgt)
#matchsum_mover_score = get_mover_score(gewiki_matchsum_sum, gewiki_matchsum_tgt)
#matchsum_blanc = get_blanc(gewiki_matchsum_sum, gewiki_src)
#matchsum_js = get_jensenshannon(gewiki_matchsum_sum, gewiki_src)
# SUPERT is scored against the source articles rather than the references.
matchsum_supert = get_supert(gewiki_matchsum_sum, gewiki_src)
print(matchsum_supert)

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_bert-large-nli-stsb-mean-tokens.zip/0_BERT were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0.8000698033867725


In [None]:
# Aggregate scores for the Random-3 baseline on the GeWiki test set.
# NOTE(review): every metric except SUPERT is commented out, so rnd_rouge, rnd_bleu,
# rnd_meteor and rnd_bert_score (read by the GeWiki results table further down) must
# already exist in the kernel. The MLSUM results table earlier in the notebook reads
# the same variable names, so on a fresh run the GeWiki table would pick up MLSUM
# values unless these lines are re-enabled.
#rnd_rouge = get_rouge(gewiki_rnd_sum, gewiki_tgt)
#rnd_bleu = get_bleu(gewiki_rnd_sum, gewiki_tgt)
#rnd_meteor = get_meteor(gewiki_rnd_sum, gewiki_tgt)
#rnd_bert_score = get_bert_score(gewiki_rnd_sum, gewiki_tgt)
#rnd_mover_score = get_mover_score(gewiki_rnd_sum, gewiki_tgt)
#rnd_js = get_jensenshannon(gewiki_rnd_sum, gewiki_src)
# SUPERT is scored against the source articles rather than the references.
rnd_supert = get_supert(gewiki_rnd_sum, gewiki_src)
print(rnd_supert)

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_bert-large-nli-stsb-mean-tokens.zip/0_BERT were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0.7554327657610459


In [None]:
# Aggregate scores for the Lead-3 baseline on the GeWiki test set.
# NOTE(review): every metric except SUPERT is commented out, so lead_rouge, lead_bleu,
# lead_meteor and lead_bert_score (read by the GeWiki results table further down) must
# already exist in the kernel. The MLSUM results table earlier in the notebook reads
# the same variable names, so on a fresh run the GeWiki table would pick up MLSUM
# values unless these lines are re-enabled.
#lead_rouge = get_rouge(gewiki_lead_sum, gewiki_tgt)
#lead_bleu = get_bleu(gewiki_lead_sum, gewiki_tgt)
#lead_meteor = get_meteor(gewiki_lead_sum, gewiki_tgt)
#lead_bert_score = get_bert_score(gewiki_lead_sum, gewiki_tgt)
#lead_mover_score = get_mover_score(gewiki_lead_sum, gewiki_tgt)
#lead_js = get_jensenshannon(gewiki_lead_sum, gewiki_src)
# SUPERT is scored against the source articles rather than the references.
lead_supert = get_supert(gewiki_lead_sum, gewiki_src)
print(lead_supert)

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_bert-large-nli-stsb-mean-tokens.zip/0_BERT were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0.8266020193139256


In [None]:
# Aggregate scores for the TextRank baseline on the GeWiki test set.
# NOTE(review): every metric except SUPERT is commented out, so tr_rouge, tr_bleu,
# tr_meteor and tr_bert_score (read by the GeWiki results table further down) must
# already exist in the kernel. The MLSUM section assigns and reads the same tr_* names,
# so on a fresh run the GeWiki table would pick up MLSUM values unless these lines
# are re-enabled.
#tr_rouge = get_rouge(gewiki_textrank_sum, gewiki_tgt)
#tr_bleu = get_bleu(gewiki_textrank_sum, gewiki_tgt)
#tr_meteor = get_meteor(gewiki_textrank_sum, gewiki_tgt)
#tr_bert_score = get_bert_score(gewiki_textrank_sum, gewiki_tgt)
#tr_mover_score = get_mover_score(gewiki_textrank_sum, gewiki_tgt)
#tr_blanc = get_blanc(gewiki_textrank_sum, gewiki_src)
#tr_js = get_jensenshannon(gewiki_textrank_sum, gewiki_src)
# SUPERT is scored against the source articles rather than the references.
tr_supert = get_supert(gewiki_textrank_sum, gewiki_src)
print(tr_supert)

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_bert-large-nli-stsb-mean-tokens.zip/0_BERT were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0.7485024953768681


In [None]:
import pandas as pd

gewiki_eval_df = pd.DataFrame([
    ["Random-3",rnd_rouge['rouge-1'],rnd_rouge['rouge-2'],rnd_rouge['rouge-l'],rnd_bleu,rnd_meteor,rnd_bert_score],
    ["Lead-3",lead_rouge['rouge-1'],lead_rouge['rouge-2'],lead_rouge['rouge-l'],lead_bleu,lead_meteor,lead_bert_score],
    ["TextRank",tr_rouge['rouge-1'],tr_rouge['rouge-2'],tr_rouge['rouge-l'],tr_bleu,tr_meteor,tr_bert_score],
    ["Oracle",oracle_rouge['rouge-1'],oracle_rouge['rouge-2'],oracle_rouge['rouge-l'],oracle_bleu,oracle_meteor,oracle_bert_score],
    ["BertSum",bertsum_rouge['rouge-1'],bertsum_rouge['rouge-2'],bertsum_rouge['rouge-l'],bertsum_bleu,bertsum_meteor,bertsum_bert_score],
    ["MatchSum",matchsum_rouge['rouge-1'],matchsum_rouge['rouge-2'],matchsum_rouge['rouge-l'],matchsum_bleu,matchsum_meteor,matchsum_bert_score]
], columns=["Summary","ROUGE-1","ROUGE-2","ROUGE-L","BLEU","METEOR","BERT-Score"])

print(gewiki_eval_df)
gewiki_eval_df.to_csv("results/gewiki/eval.csv", index=False)

    Summary   ROUGE-1   ROUGE-2   ROUGE-L      BLEU    METEOR  BERT-Score
0  Random-3  0.186970  0.061312  0.148257  2.353785  0.118363    0.568395
1    Lead-3  0.212807  0.076299  0.167209  2.719979  0.127272    0.587119
2  TextRank  0.237438  0.086741  0.176155  2.761369  0.137756    0.569675
3    Oracle  0.383840  0.201094  0.306631  8.574861  0.140905    0.576921
4   BertSum  0.288809  0.125421  0.223787  4.829666  0.153491    0.624984
5  MatchSum  0.252191  0.100162  0.202067  3.975334  0.128689    0.608604
