# Greedy-BERT model for long text summarization

## Start of [1]: Import data

In [None]:
import pandas as pd

Load the train/dev/test splits and show the head of the training data

In [None]:
# Never truncate the column list when a DataFrame is rendered.
pd.set_option("display.max_columns", None)

# Read the three dataset splits; the validation split lives in dev.csv.
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in ("Dataset/train.csv", "Dataset/dev.csv", "Dataset/test.csv")
)

# Final expression of the cell: the first rows of the training split.
train.head()

In [None]:
# Lift pandas' row-count and cell-width display limits before rendering.
for display_option in ("display.max_rows", 'display.max_colwidth'):
    pd.set_option(display_option, None)

# Final expression of the cell: path / book id / source of every training chapter.
train[["chapter_path", "book_id", "source"]]

In [None]:
# Print the raw text of the first training chapter.
first_chapter_text = train.iloc[0]['chapter']
print(first_chapter_text)

## End of [1]: Import data

## Start of [2]: Span recognition (sentence splitting in this model)

In [None]:
import nltk
# Fetch the 'punkt' resource up front; presumably the sentence tokenizer used
# in this section depends on it — confirm against the NLTK version in use.
nltk.download('punkt')

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
# Quick check of the sentence splitter on "!", "...", "?" and an
# abbreviation written without a following space ("Mr.Ryo").
sent_tokenize("Hello! Hello world... It's true that Mr.Ryo is here? That's true!")

In [None]:
# Chapter texts of each split as plain lists, with every newline replaced by
# a single space.
train_chapters = [chapter.replace("\n", " ") for chapter in train['chapter'].tolist()]
val_chapters = [chapter.replace("\n", " ") for chapter in val['chapter'].tolist()]
test_chapters = [chapter.replace("\n", " ") for chapter in test['chapter'].tolist()]

print(train_chapters[0])

In [None]:
# One list of sentences per chapter, for each split.
train_chapters_sentences = [sent_tokenize(chapter) for chapter in train_chapters]
val_chapters_sentences = [sent_tokenize(chapter) for chapter in val_chapters]
test_chapters_sentences = [sent_tokenize(chapter) for chapter in test_chapters]

In [None]:
# Compare one test chapter with its sentence split.
sample_doc = 11
print(test_chapters[sample_doc])
print(test_chapters_sentences[sample_doc])

## End of [2]: Span recognition

## Start of [3]: (BERT + cosine) score function for sentence similarity

In [None]:
import torch
from sentence_transformers import SentenceTransformer, util

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Shared sentence-embedding model read by sentence_similarity.
sentence_embedding_model = SentenceTransformer(
    'paraphrase-multilingual-MiniLM-L12-v2',  # multi-language model
    device=device,
)

def sentence_similarity(sentence_A, sentence_B):
    """Return the cosine similarity of the embeddings of two sentences.

    Each sentence is encoded on its own (as a one-element batch) with the
    module-level ``sentence_embedding_model``; the single entry of the
    resulting 1x1 similarity matrix is returned as a Python float.
    """
    embedding_A, embedding_B = (
        sentence_embedding_model.encode([sentence], convert_to_tensor=False)
        for sentence in (sentence_A, sentence_B)
    )
    similarity_matrix = util.cos_sim(embedding_A, embedding_B)
    return similarity_matrix[0][0].item()

# Sanity check: similarity of two neighbouring sentences of the first chapter.
first_chapter_sentences = train_chapters_sentences[0]
print(first_chapter_sentences[2])
print(first_chapter_sentences[3])
print(sentence_similarity(first_chapter_sentences[2], first_chapter_sentences[3]))

## End of [3]: (BERT + cosine) score function for sentence similarity

## Start of [5']: Summary function & validation score

In [None]:
# Calculate average summary length vs chapter length ratio
train_chapters_length = train['chapter_length'].tolist()
train_summaries_length = train['summary_length'].tolist()
val_chapters_length = val['chapter_length'].tolist()
test_chapters_length = test['chapter_length'].tolist()

# Mean of the per-chapter summary/chapter length ratios over the training split.
ratio_total = 0
for chapter_len, summary_len in zip(train_chapters_length, train_summaries_length):
    ratio_total += summary_len / chapter_len
average_summary_length_ratio = ratio_total / len(train_chapters_length)
print(average_summary_length_ratio)

# Scratch check of the whitespace-collapsing word split used for length counting.
print(' '.join("    lsdfl    sdjfkjs  ".split(" ")).split())

In [None]:
import copy
import math
from transformers import BertTokenizer, BertForNextSentencePrediction
# Load the pretrained 'bert-base-uncased' tokenizer and next-sentence-prediction
# model once at module level; next_sentence_score reads both as globals.
nsp_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # nsp: next sentence prediction
nsp_model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

def next_sentence_score(sentence_A, sentence_B):
    """Return 1.0 if the NSP model rates sentence_B as following sentence_A, else 0.0.

    The pair is tokenized with the module-level ``nsp_tokenizer`` and scored
    by the module-level ``nsp_model``. The result is a hard 0/1 decision:
    1.0 when the first logit is strictly greater than the second.

    Args:
        sentence_A: The first sentence of the pair.
        sentence_B: The candidate follow-up sentence.

    Returns:
        1.0 or 0.0 as a Python float.
    """
    # Truncate the pair to 512 tokens, BERT's maximum input length; without
    # truncation an over-long pair fails inside the model instead of being scored.
    nsp_tokenized = nsp_tokenizer(
        sentence_A,
        sentence_B,
        return_tensors='pt',
        truncation=True,
        max_length=512,
    )
    # Inference only: no gradients are needed, and no labels are passed because
    # the loss was never used. Without labels the logits are the first output.
    with torch.no_grad():
        nsp_logits = nsp_model(**nsp_tokenized)[0]
    # Logit 0 is the "B continues A" class, logit 1 the "B is unrelated" class.
    if nsp_logits[0][0].item() > nsp_logits[0][1].item():
        return 1.0
    return 0.0

In [None]:
# Train summary generation
train_generated_summaries = []
for i in range(len(train_chapters)):
    print("Train doc {}: {} sentences".format(i, len(train_chapters_sentences[i])))
    summary_length_limit = math.ceil(train_chapters_length[i] * average_summary_length_ratio)

    pairwise_sentence_scores = np.zeros((len(train_chapters_sentences[i]), len(train_chapters_sentences[i])))
    for j in range(len(train_chapters_sentences[i])):
        for k in range(len(train_chapters_sentences[i])):
            pairwise_sentence_scores[j, k] = next_sentence_score(train_chapters_sentences[i][j], train_chapters_sentences[i][k])
    
    sentence_sum_similarity = [(0, j) for j in range(len(train_chapters_sentences[i]))]
    for j in range(len(train_chapters_sentences[i])):
        for k in range(len(train_chapters_sentences[i])):
            #print("j = {} - k = {}".format(j, k))
            sentence_sum_similarity[j] = (sentence_sum_similarity[j][0] + pairwise_sentence_scores[j][k], j)
    sentence_sum_similarity.sort(reverse = True)
    generated_summary_length = 0
    chosen_sentences_index = []
    for j in range(len(train_chapters_sentences[i])):
        chosen_sentences_index.append(sentence_sum_similarity[j][1])
        generated_summary_length += len(' '.join(train_chapters_sentences[i][sentence_sum_similarity[j][1]].split(" ")).split())
        if generated_summary_length >= summary_length_limit:
            break
    chosen_sentences_index.sort()
    generated_summary = ""
    for j in range(len(chosen_sentences_index)):
        if j > 0:
            generated_summary += " "
        generated_summary += train_chapters_sentences[i][chosen_sentences_index[j]]
    train_generated_summaries.append(generated_summary)

In [None]:
# Inspect the generated summary of the first training chapter.
print(train_generated_summaries[0])

In [None]:
# Train reference summaries
train_reference_summaries = train['summary_text'].tolist()
train_reference_summaries = [train_reference_summaries[i].replace("\n", " ") for i in range(len(train_reference_summaries))]

In [None]:
from rouge_score import rouge_scorer
# One shared scorer for the rouge1, rouge2 and rougeL metrics, with stemming enabled.
rougescorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

In [None]:
# Train ROUGE score
train_rouge1 = 0
train_rouge2 = 0
train_rougeL = 0
for i in range(len(train_generated_summaries)):
    print(i)
    scores = rougescorer.score(train_reference_summaries[i], train_generated_summaries[i])
    for key in scores:
        print("{}: {}".format(key, scores[key]))
        if key == "rouge1":
            train_rouge1 += scores[key][2] # take fmeasure value
        elif key == "rouge2":
            train_rouge2 += scores[key][2] # take fmeasure value
        else:
            train_rougeL += scores[key][2] # take fmeasure value
train_rouge1 /= len(train_generated_summaries)
train_rouge2 /= len(train_generated_summaries)
train_rougeL /= len(train_generated_summaries)
print("Train: rouge1 = {}, rouge2 = {}, rougeL = {}".format(train_rouge1, train_rouge2, train_rougeL))

In [None]:
import bert_score
from bert_score import BERTScorer

# Shared BERTScore scorer for English text with baseline rescaling enabled;
# reused by every BERT-score evaluation cell.
bertscorer = BERTScorer(lang='en', rescale_with_baseline=True) # Default using 'roberta-large' model
# Alternative scorer configuration, currently disabled:
#bertscorer = BERTScorer(model_type='xlnet-large-cased', rescale_with_baseline=True, lang="en")

In [None]:
# Train BERT score
average_bertscore_F1 = 0

for i in range(len(train_generated_summaries)):
    print("Train doc {}:".format(i))
    
    ref_summary = copy.deepcopy(train_reference_summaries[i])
    candidate_summary = copy.deepcopy(train_generated_summaries[i])

    # Method 1: Split by sentence
    ref_sentences = sent_tokenize(ref_summary)
    candidate_sentences = sent_tokenize(candidate_summary)
    chapter_bertscore_F1 = 0
    for j in range(len(ref_sentences)):
        ref_summary_split = ref_sentences[j]
        max_candidate_score = -1
        for k in range(len(candidate_sentences)):
            candidate_summary_split = candidate_sentences[k]
            P, R, F1 = bertscorer.score([candidate_summary_split], [ref_summary_split])
            #print("Candidate length = {}, reference length = {}".format(len(word_tokenize(candidate_summary_split)), len(word_tokenize(ref_summary_split))))
            max_candidate_score = max(max_candidate_score, F1.mean().item())
        chapter_bertscore_F1 += max_candidate_score
    chapter_bertscore_F1 /= len(ref_sentences)
    average_bertscore_F1 += chapter_bertscore_F1
    print("Chapter BERTScore F1 = {}".format(chapter_bertscore_F1))

    '''
    # Method 2: Split by n words
    ref_tokenized = word_tokenize(ref_summary)
    candidate_tokenized = word_tokenize(candidate_summary)
    split_length = 250
    ref_pointer = 0
    ref_split_count = 0
    chapter_bertscore_F1 = 0
    while ref_pointer < len(ref_tokenized):
        ref_summary_split = ' '.join(ref_tokenized[ref_pointer:min(len(ref_tokenized), ref_pointer + split_length)])
        max_candidate_score = -1
        candidate_pointer = 0
        while candidate_pointer < len(candidate_tokenized):
            candidate_summary_split = ' '.join(candidate_tokenized[candidate_pointer:min(len(candidate_tokenized), candidate_pointer + split_length)])
            P, R, F1 = bertscorer.score([candidate_summary_split], [ref_summary_split])
            #print("Candidate length = {}, reference length = {}".format(len(word_tokenize(candidate_summary_split)), len(word_tokenize(ref_summary_split))))
            max_candidate_score = max(max_candidate_score, F1.mean().item())
            candidate_pointer += split_length
        chapter_bertscore_F1 += max_candidate_score
        ref_pointer += split_length
        ref_split_count += 1
    chapter_bertscore_F1 /= ref_split_count
    average_bertscore_F1 += chapter_bertscore_F1
    print("Chapter BERTScore F1 = {}".format(chapter_bertscore_F1))
    '''
average_bertscore_F1 /= len(train_generated_summaries)
print("Train average BERTscore F1 = {}".format(average_bertscore_F1))

In [None]:
from summaqa import evaluate_corpus
from summaqa import QG_masked
# Shared masked-question generator, used by the SummaQA evaluation cells.
question_generator = QG_masked()
from summaqa import QA_Metric
# Shared QA metric, used by the SummaQA evaluation cells to score answers.
qa_metric = QA_Metric()

In [None]:
# Train summaQA
# Sentence-level SummaQA: generate masked questions from every chapter
# sentence, answer them from each sentence of the generated summary, and keep
# the best F1 / probability per chapter sentence. Chapter scores are the mean
# over sentences that produced at least one question.
# (Removed as dead code: the corpus-level evaluate_corpus call, a sum-based
# aggregate, and a fixed 250-word-window variant. None of them was executed.)
average_summaqa_prob = 0
average_summaqa_F1 = 0

for i in range(len(train_generated_summaries)):
    print("Train doc {}:".format(i))

    chapter_sentences = train_chapters_sentences[i]
    candidate_sentences = sent_tokenize(train_generated_summaries[i])
    chapter_split_count = 0
    chapter_summaqa_prob = 0
    chapter_summaqa_F1 = 0
    for j, chapter_summary_split in enumerate(chapter_sentences):
        masked_questions, answer_spans = question_generator.get_questions(chapter_summary_split)
        # Sentences that yield no question do not take part in the average.
        if len(masked_questions) == 0:
            continue
        chapter_split_count += 1
        print("Chapter split #{}:".format(j))
        print(chapter_summary_split)
        print("Questions:")
        print(masked_questions)
        print("Answers:")
        print(answer_spans)
        # Loop-invariant: tokenize the chapter sentence once, not once per candidate.
        chapter_split_length = len(word_tokenize(chapter_summary_split))
        max_candidate_F1 = 0
        max_candidate_prob = 0
        for k, candidate_summary_split in enumerate(candidate_sentences):
            summaqa_scores = qa_metric.compute(masked_questions, answer_spans, candidate_summary_split)
            max_candidate_F1 = max(max_candidate_F1, summaqa_scores['avg_fscore'])
            max_candidate_prob = max(max_candidate_prob, summaqa_scores['avg_prob'])
            print("Candidate split #{} (Chapter split length = {}, candidate length = {}): SummaQA score = {}".format(k, chapter_split_length, len(word_tokenize(candidate_summary_split)), summaqa_scores))
            print(candidate_summary_split)
        print("Chapter split max prob = {}".format(max_candidate_prob))
        print("Chapter split max F1 = {}".format(max_candidate_F1))
        chapter_summaqa_prob += max_candidate_prob
        chapter_summaqa_F1 += max_candidate_F1
    if chapter_split_count > 0:
        chapter_summaqa_prob /= chapter_split_count
        chapter_summaqa_F1 /= chapter_split_count
    else:
        # NOTE(review): a chapter with no questions at all is scored as a
        # perfect 1, which raises the corpus average — confirm this is intended.
        chapter_summaqa_prob = 1
        chapter_summaqa_F1 = 1
    average_summaqa_prob += chapter_summaqa_prob
    average_summaqa_F1 += chapter_summaqa_F1
    print("Chapter SummaQA prob = {}".format(chapter_summaqa_prob))
    print("Chapter SummaQA F1 = {}".format(chapter_summaqa_F1))

average_summaqa_prob /= len(train_generated_summaries)
average_summaqa_F1 /= len(train_generated_summaries)
print("Train average SummaQA prob = {}".format(average_summaqa_prob))
print("Train average SummaQA F1 = {}".format(average_summaqa_F1))

In [None]:
# Validation summary generation
# Greedy extractive summary per validation chapter:
#   1. score every ordered sentence pair with next_sentence_score,
#   2. rank sentences by the sum of their row of scores,
#   3. take sentences in rank order until the length budget is reached,
#   4. emit the chosen sentences in their original document order.
# NOTE: step 1 makes one model call per ordered pair (n * n calls for n sentences).
import numpy as np  # needed for the score matrix; not imported by any earlier cell

val_generated_summaries = []
for i in range(len(val_chapters)):
    sentences = val_chapters_sentences[i]
    sentence_count = len(sentences)
    print("Validation doc {}: {} sentences".format(i, sentence_count))
    # Length budget: chapter length scaled by the average summary/chapter ratio.
    summary_length_limit = math.ceil(val_chapters_length[i] * average_summary_length_ratio)

    pairwise_sentence_scores = np.zeros((sentence_count, sentence_count))
    for j, sentence_j in enumerate(sentences):
        for k, sentence_k in enumerate(sentences):
            pairwise_sentence_scores[j, k] = next_sentence_score(sentence_j, sentence_k)

    # (total score, index) pairs, best first; ties are broken by the higher index.
    row_totals = pairwise_sentence_scores.sum(axis=1)
    sentence_sum_similarity = sorted(
        ((row_totals[j], j) for j in range(sentence_count)), reverse=True
    )

    generated_summary_length = 0
    chosen_sentences_index = []
    for _, sentence_index in sentence_sum_similarity:
        chosen_sentences_index.append(sentence_index)
        # Whitespace-separated token count of the sentence just added.
        generated_summary_length += len(sentences[sentence_index].split())
        if generated_summary_length >= summary_length_limit:
            break

    # Restore document order before joining the chosen sentences.
    chosen_sentences_index.sort()
    generated_summary = " ".join(sentences[j] for j in chosen_sentences_index)
    val_generated_summaries.append(generated_summary)

In [None]:
# Validation reference summaries
# Reference texts with every newline replaced by a single space.
val_reference_summaries = [
    summary.replace("\n", " ") for summary in val['summary_text'].tolist()
]

In [None]:
# Validation ROUGE score
# Accumulate the F-measure of each ROUGE variant over all generated
# validation summaries, then average.
rouge_f_sums = {"rouge1": 0, "rouge2": 0, "rougeL": 0}
for i, generated in enumerate(val_generated_summaries):
    print(i)
    scores = rougescorer.score(val_reference_summaries[i], generated)
    for key, score in scores.items():
        print("{}: {}".format(key, score))
        # Any key other than rouge1 / rouge2 is accumulated as rougeL.
        bucket = key if key in ("rouge1", "rouge2") else "rougeL"
        rouge_f_sums[bucket] += score[2] # take fmeasure value
summary_count = len(val_generated_summaries)
val_rouge1 = rouge_f_sums["rouge1"] / summary_count
val_rouge2 = rouge_f_sums["rouge2"] / summary_count
val_rougeL = rouge_f_sums["rougeL"] / summary_count
print("Validation: rouge1 = {}, rouge2 = {}, rougeL = {}".format(val_rouge1, val_rouge2, val_rougeL))

In [None]:
# Validation BERT score
# Sentence-level matching: for every sentence of the reference summary take
# the best BERTScore F1 against any sentence of the generated summary,
# average those maxima per chapter, then average the chapter scores.
# (A disabled variant that compared fixed 250-word windows instead of
# sentences used to sit here as a never-executed string literal; removed.)
average_bertscore_F1 = 0

for i in range(len(val_generated_summaries)):
    print("Validation doc {}:".format(i))

    # Strings are immutable, so they are tokenized directly without copying.
    ref_sentences = sent_tokenize(val_reference_summaries[i])
    candidate_sentences = sent_tokenize(val_generated_summaries[i])

    chapter_bertscore_F1 = 0
    for ref_summary_split in ref_sentences:
        # Stays at -1 when the generated summary has no sentences to compare.
        max_candidate_score = -1
        for candidate_summary_split in candidate_sentences:
            _, _, F1 = bertscorer.score([candidate_summary_split], [ref_summary_split])
            max_candidate_score = max(max_candidate_score, F1.mean().item())
        chapter_bertscore_F1 += max_candidate_score
    # A reference with no sentences scores 0 instead of raising ZeroDivisionError.
    if ref_sentences:
        chapter_bertscore_F1 /= len(ref_sentences)
    average_bertscore_F1 += chapter_bertscore_F1
    print("Chapter BERTScore F1 = {}".format(chapter_bertscore_F1))

average_bertscore_F1 /= len(val_generated_summaries)
print("Validation average BERTscore F1 = {}".format(average_bertscore_F1))

In [None]:
# Validation summaQA
# Sentence-level SummaQA: generate masked questions from every chapter
# sentence, answer them from each sentence of the generated summary, and keep
# the best F1 / probability per chapter sentence. Chapter scores are the mean
# over sentences that produced at least one question.
# (Removed as dead code: the corpus-level evaluate_corpus call, a sum-based
# aggregate, and a fixed 250-word-window variant. None of them was executed.)
average_summaqa_prob = 0
average_summaqa_F1 = 0

for i in range(len(val_generated_summaries)):
    print("Validation doc {}:".format(i))

    chapter_sentences = val_chapters_sentences[i]
    candidate_sentences = sent_tokenize(val_generated_summaries[i])
    chapter_split_count = 0
    chapter_summaqa_prob = 0
    chapter_summaqa_F1 = 0
    for j, chapter_summary_split in enumerate(chapter_sentences):
        masked_questions, answer_spans = question_generator.get_questions(chapter_summary_split)
        # Sentences that yield no question do not take part in the average.
        if len(masked_questions) == 0:
            continue
        chapter_split_count += 1
        print("Chapter split #{}:".format(j))
        print(chapter_summary_split)
        print("Questions:")
        print(masked_questions)
        print("Answers:")
        print(answer_spans)
        # Loop-invariant: tokenize the chapter sentence once, not once per candidate.
        chapter_split_length = len(word_tokenize(chapter_summary_split))
        max_candidate_F1 = 0
        max_candidate_prob = 0
        for k, candidate_summary_split in enumerate(candidate_sentences):
            summaqa_scores = qa_metric.compute(masked_questions, answer_spans, candidate_summary_split)
            max_candidate_F1 = max(max_candidate_F1, summaqa_scores['avg_fscore'])
            max_candidate_prob = max(max_candidate_prob, summaqa_scores['avg_prob'])
            print("Candidate split #{} (Chapter split length = {}, candidate length = {}): SummaQA score = {}".format(k, chapter_split_length, len(word_tokenize(candidate_summary_split)), summaqa_scores))
            print(candidate_summary_split)
        print("Chapter split max prob = {}".format(max_candidate_prob))
        print("Chapter split max F1 = {}".format(max_candidate_F1))
        chapter_summaqa_prob += max_candidate_prob
        chapter_summaqa_F1 += max_candidate_F1
    if chapter_split_count > 0:
        chapter_summaqa_prob /= chapter_split_count
        chapter_summaqa_F1 /= chapter_split_count
    else:
        # NOTE(review): a chapter with no questions at all is scored as a
        # perfect 1, which raises the corpus average — confirm this is intended.
        chapter_summaqa_prob = 1
        chapter_summaqa_F1 = 1
    average_summaqa_prob += chapter_summaqa_prob
    average_summaqa_F1 += chapter_summaqa_F1
    print("Chapter SummaQA prob = {}".format(chapter_summaqa_prob))
    print("Chapter SummaQA F1 = {}".format(chapter_summaqa_F1))

average_summaqa_prob /= len(val_generated_summaries)
average_summaqa_F1 /= len(val_generated_summaries)
print("Validation average SummaQA prob = {}".format(average_summaqa_prob))
print("Validation average SummaQA F1 = {}".format(average_summaqa_F1))

## End of [5']: Summary function & validation score

## Start of [6']: Run test & evaluate

In [18]:
# Test summary generation
# Greedy extractive summary per test chapter:
#   1. score every ordered sentence pair with next_sentence_score,
#   2. rank sentences by the sum of their row of scores,
#   3. take sentences in rank order until the length budget is reached,
#   4. emit the chosen sentences in their original document order.
# NOTE: step 1 makes one model call per ordered pair (n * n calls for n sentences).
import numpy as np  # needed for the score matrix; not imported by any earlier cell

test_generated_summaries = []
for i in range(len(test_chapters)):
    sentences = test_chapters_sentences[i]
    sentence_count = len(sentences)
    print("Test doc {}: {} sentences".format(i, sentence_count))
    # Length budget: chapter length scaled by the average summary/chapter ratio.
    summary_length_limit = math.ceil(test_chapters_length[i] * average_summary_length_ratio)

    pairwise_sentence_scores = np.zeros((sentence_count, sentence_count))
    for j, sentence_j in enumerate(sentences):
        for k, sentence_k in enumerate(sentences):
            pairwise_sentence_scores[j, k] = next_sentence_score(sentence_j, sentence_k)

    # (total score, index) pairs, best first; ties are broken by the higher index.
    row_totals = pairwise_sentence_scores.sum(axis=1)
    sentence_sum_similarity = sorted(
        ((row_totals[j], j) for j in range(sentence_count)), reverse=True
    )

    generated_summary_length = 0
    chosen_sentences_index = []
    for _, sentence_index in sentence_sum_similarity:
        chosen_sentences_index.append(sentence_index)
        # Whitespace-separated token count of the sentence just added.
        generated_summary_length += len(sentences[sentence_index].split())
        if generated_summary_length >= summary_length_limit:
            break

    # Restore document order before joining the chosen sentences.
    chosen_sentences_index.sort()
    generated_summary = " ".join(sentences[j] for j in chosen_sentences_index)
    test_generated_summaries.append(generated_summary)

Test doc 0: 77 sentences
Test doc 1: 146 sentences
Test doc 2: 67 sentences
Test doc 3: 12 sentences
Test doc 4: 21 sentences
Test doc 5: 123 sentences
Test doc 6: 7 sentences
Test doc 7: 18 sentences
Test doc 8: 31 sentences
Test doc 9: 63 sentences
Test doc 10: 202 sentences
Test doc 11: 4 sentences
Test doc 12: 5 sentences
Test doc 13: 68 sentences
Test doc 14: 21 sentences
Test doc 15: 12 sentences
Test doc 16: 34 sentences
Test doc 17: 79 sentences
Test doc 18: 38 sentences
Test doc 19: 29 sentences
Test doc 20: 12 sentences
Test doc 21: 21 sentences
Test doc 22: 69 sentences
Test doc 23: 33 sentences
Test doc 24: 21 sentences
Test doc 25: 7 sentences
Test doc 26: 18 sentences
Test doc 27: 31 sentences
Test doc 28: 40 sentences
Test doc 29: 63 sentences
Test doc 30: 44 sentences
Test doc 31: 50 sentences
Test doc 32: 9 sentences
Test doc 33: 20 sentences
Test doc 34: 9 sentences
Test doc 35: 28 sentences
Test doc 36: 42 sentences
Test doc 37: 9 sentences
Test doc 38: 68 sentences


KeyboardInterrupt: 

In [None]:
# Test reference summaries
# Reference texts with every newline replaced by a single space.
test_reference_summaries = [
    summary.replace("\n", " ") for summary in test['summary_text'].tolist()
]

In [None]:
# Test ROUGE score
# Accumulate the F-measure of each ROUGE variant over all generated test
# summaries, then average. Only as many documents as were generated are scored.
rouge_f_sums = {"rouge1": 0, "rouge2": 0, "rougeL": 0}
for i, generated in enumerate(test_generated_summaries):
    print(i)
    scores = rougescorer.score(test_reference_summaries[i], generated)
    for key, score in scores.items():
        print("{}: {}".format(key, score))
        # Any key other than rouge1 / rouge2 is accumulated as rougeL.
        bucket = key if key in ("rouge1", "rouge2") else "rougeL"
        rouge_f_sums[bucket] += score[2] # take fmeasure value
summary_count = len(test_generated_summaries)
test_rouge1 = rouge_f_sums["rouge1"] / summary_count
test_rouge2 = rouge_f_sums["rouge2"] / summary_count
test_rougeL = rouge_f_sums["rougeL"] / summary_count
print("Test: rouge1 = {}, rouge2 = {}, rougeL = {}".format(test_rouge1, test_rouge2, test_rougeL))

In [None]:
# Test BERT score
# Sentence-level matching: for every sentence of the reference summary take
# the best BERTScore F1 against any sentence of the generated summary,
# average those maxima per chapter, then average the chapter scores.
# (A disabled variant that compared fixed 250-word windows instead of
# sentences used to sit here as a never-executed string literal; removed.)
average_bertscore_F1 = 0

for i in range(len(test_generated_summaries)):
    print("Test doc {}:".format(i))

    # Strings are immutable, so they are tokenized directly without copying.
    ref_sentences = sent_tokenize(test_reference_summaries[i])
    candidate_sentences = sent_tokenize(test_generated_summaries[i])

    chapter_bertscore_F1 = 0
    for ref_summary_split in ref_sentences:
        # Stays at -1 when the generated summary has no sentences to compare.
        max_candidate_score = -1
        for candidate_summary_split in candidate_sentences:
            _, _, F1 = bertscorer.score([candidate_summary_split], [ref_summary_split])
            max_candidate_score = max(max_candidate_score, F1.mean().item())
        chapter_bertscore_F1 += max_candidate_score
    # A reference with no sentences scores 0 instead of raising ZeroDivisionError.
    if ref_sentences:
        chapter_bertscore_F1 /= len(ref_sentences)
    average_bertscore_F1 += chapter_bertscore_F1
    print("Chapter BERTScore F1 = {}".format(chapter_bertscore_F1))

average_bertscore_F1 /= len(test_generated_summaries)
print("Test average BERTscore F1 = {}".format(average_bertscore_F1))

In [None]:
# Test summaQA
#srcs = test_chapters[:len(test_generated_summaries)]
#gens = test_generated_summaries
#srcs = [' '.join(' '.join(srcs[i].split(" ")).split()[:300]) for i in range(len(srcs))]
#gens = [' '.join(' '.join(gens[i].split(" ")).split()[:300]) for i in range(len(gens))]
#evaluate_corpus(srcs, gens)

# SummaQA evaluation of the generated test summaries.
#
# For every generated summary, each sentence of the matching test chapter is
# passed to `question_generator.get_questions`; chapter sentences that yield no
# masked question are skipped.  Every sentence of the generated summary is then
# scored against those questions with `qa_metric.compute`, and the best
# 'avg_fscore' and best 'avg_prob' over the summary sentences are kept (the two
# maxima are tracked independently, so they may come from different summary
# sentences).  A chapter's score is the mean over its question-bearing
# sentences; the reported test score is the mean over all generated summaries.
#
# Two earlier experiments were disabled in the original cell and are not
# reproduced here: summing the scores over summary sentences instead of taking
# the maximum, and splitting both texts into fixed 250-word windows instead of
# sentences.

average_summaqa_prob = 0
average_summaqa_F1 = 0

for i, candidate_summary in enumerate(test_generated_summaries):
    print("Test doc {}:".format(i))

    # Method 1: Split by sentence
    chapter_sentences = test_chapters_sentences[i]
    candidate_sentences = sent_tokenize(candidate_summary)
    # Token counts are only reported in the log line below; compute each one
    # once per document instead of re-tokenizing for every
    # (chapter sentence, summary sentence) pair.
    candidate_lengths = [len(word_tokenize(sentence)) for sentence in candidate_sentences]

    chapter_split_count = 0  # number of chapter sentences that produced questions
    chapter_summaqa_prob = 0
    chapter_summaqa_F1 = 0
    for j, chapter_summary_split in enumerate(chapter_sentences):
        masked_questions, answer_spans = question_generator.get_questions(chapter_summary_split)
        if len(masked_questions) == 0:
            # No question could be generated from this sentence, so it does not
            # take part in the chapter average.
            continue
        chapter_split_count += 1
        print("Chapter split #{}:".format(j))
        print(chapter_summary_split)
        print("Questions:")
        print(masked_questions)
        print("Answers:")
        print(answer_spans)

        chapter_split_length = len(word_tokenize(chapter_summary_split))
        max_candidate_F1 = 0
        max_candidate_prob = 0
        for k, candidate_summary_split in enumerate(candidate_sentences):
            summaqa_scores = qa_metric.compute(masked_questions, answer_spans, candidate_summary_split)
            max_candidate_F1 = max(max_candidate_F1, summaqa_scores['avg_fscore'])
            max_candidate_prob = max(max_candidate_prob, summaqa_scores['avg_prob'])
            print("Candidate split #{} (Chapter split length = {}, candidate length = {}): SummaQA score = {}".format(k, chapter_split_length, candidate_lengths[k], summaqa_scores))
            print(candidate_summary_split)
        print("Chapter split max prob = {}".format(max_candidate_prob))
        print("Chapter split max F1 = {}".format(max_candidate_F1))
        chapter_summaqa_prob += max_candidate_prob
        chapter_summaqa_F1 += max_candidate_F1

    if chapter_split_count > 0:
        chapter_summaqa_prob /= chapter_split_count
        chapter_summaqa_F1 /= chapter_split_count
    else:
        # NOTE(review): a chapter in which no sentence produced a question is
        # given the maximum score of 1, which raises the test average --
        # confirm this is the intended convention.
        chapter_summaqa_prob = 1
        chapter_summaqa_F1 = 1
    average_summaqa_prob += chapter_summaqa_prob
    average_summaqa_F1 += chapter_summaqa_F1
    print("Chapter SummaQA prob = {}".format(chapter_summaqa_prob))
    print("Chapter SummaQA F1 = {}".format(chapter_summaqa_F1))

num_test_docs = len(test_generated_summaries)
if num_test_docs > 0:
    average_summaqa_prob /= num_test_docs
    average_summaqa_F1 /= num_test_docs
else:
    # Avoid a ZeroDivisionError when there is nothing to evaluate; the
    # averages keep their initial value of 0.
    print("No generated summaries to evaluate; SummaQA averages left at 0.")
print("Test average SummaQA prob = {}".format(average_summaqa_prob))
print("Test average SummaQA F1 = {}".format(average_summaqa_F1))

## End of [6']: Run test & evaluate