# MCS-GA-One-and-Two-Words-tfidf_CNN-Dailymail model for text summarization

## Start of [1]: Import data

In [None]:
import pandas as pd

Load the validation and test splits and show the head of the validation data

In [None]:
# Display every column and never truncate cell text, so whole articles can be read.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
# Only the validation and test splits are loaded in this notebook.
val = pd.read_csv("Dataset/cnn_dailymail/validation.csv")
test = pd.read_csv("Dataset/cnn_dailymail/test.csv")
val.head()

In [None]:
# Print the full text of the first validation article.
print(val['article'].iloc[0])

## End of [1]: Import data

## Start of [2]: Span recognition (sentence splitting in this model)

In [None]:
import nltk
# Fetch the 'punkt' resource through NLTK's downloader; presumably this is the
# data that sent_tokenize / word_tokenize (imported in the next cell) rely on.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed NLTK version.
nltk.download('punkt')

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
# Quick sanity check of the sentence splitter on a string with tricky punctuation
# (an ellipsis, "Mr." with no following space, mixed "!" and "?").
sent_tokenize("Hello! Hello world... It's true that Mr.Ryo is here? That's true!")

In [None]:
def _split_into_paragraphs(articles):
    """Split each article on blank lines and replace the remaining newlines with spaces.

    Returns one list of paragraph strings per article, in the input order.
    """
    return [
        [paragraph.replace("\n", " ") for paragraph in article.split("\n\n")]
        for article in articles
    ]


# The paragraph view must be built from the raw text, i.e. BEFORE newlines are
# flattened, otherwise the "\n\n" paragraph separators would already be gone.
val_chapters = val['article'].tolist()
val_chapters_paragraphs = _split_into_paragraphs(val_chapters)
val_chapters = [article.replace("\n", " ") for article in val_chapters]

test_chapters = test['article'].tolist()
test_chapters_paragraphs = _split_into_paragraphs(test_chapters)
test_chapters = [article.replace("\n", " ") for article in test_chapters]

print(val_chapters_paragraphs[0])

In [None]:
# Flat view: each article as a single list of sentences.
val_chapters_sentences = [sent_tokenize(chapter) for chapter in val_chapters]
test_chapters_sentences = [sent_tokenize(chapter) for chapter in test_chapters]

# Nested view: article -> paragraph -> list of sentences.
val_chapters_paragraphs_sentences = [
    [sent_tokenize(paragraph) for paragraph in paragraphs]
    for paragraphs in val_chapters_paragraphs
]
test_chapters_paragraphs_sentences = [
    [sent_tokenize(paragraph) for paragraph in paragraphs]
    for paragraphs in test_chapters_paragraphs
]

print(val_chapters_paragraphs_sentences[0])

## End of [2]: Span recognition

## Start of [3]: (BERT + cosine) score function for sentence similarity

In [None]:
import torch
from sentence_transformers import SentenceTransformer, util

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)
# Multi-language sentence-embedding model, loaded onto the selected device.
sentence_embedding_model = SentenceTransformer(
    'paraphrase-multilingual-MiniLM-L12-v2',
    device=device,
)

def sentence_similarity(sentence_A, sentence_B):
    """Return the cosine similarity of two sentences as a Python float.

    Each sentence is encoded on its own (as a one-element batch) with the
    module-level ``sentence_embedding_model``; the two embeddings are then
    compared with ``util.cos_sim`` and the single entry of the resulting
    1x1 matrix is returned.
    """
    embedding_A, embedding_B = (
        sentence_embedding_model.encode([text], convert_to_tensor=False)
        for text in (sentence_A, sentence_B)
    )
    similarity_matrix = util.cos_sim(embedding_A, embedding_B)
    return similarity_matrix[0][0].item()

# Demo: similarity between the 3rd and 4th sentences of the first validation article.
demo_sentence_a = val_chapters_sentences[0][2]
print(demo_sentence_a)
demo_sentence_b = val_chapters_sentences[0][3]
print(demo_sentence_b)
print(sentence_similarity(demo_sentence_a, demo_sentence_b))

## End of [3]: (BERT + cosine) score function for sentence similarity

## Start of [5']: Metrics preparation

In [None]:
# Validation reference summaries: the 'highlights' column with newlines
# flattened to spaces, one string per article.
val_reference_summaries = [
    highlights.replace("\n", " ") for highlights in val['highlights'].tolist()
]

In [None]:
# Mean, over the validation set, of
#   (reference summary length in tokens) / (article length in tokens).
ratio_total = 0
for reference_summary, chapter in zip(val_reference_summaries, val_chapters):
    reference_token_count = len(word_tokenize(reference_summary))
    chapter_token_count = len(word_tokenize(chapter))
    ratio_total += reference_token_count / chapter_token_count
average_summary_length_ratio = ratio_total / len(val_chapters)
#average_summary_length_ratio = 0.1
print(average_summary_length_ratio)

In [None]:
import copy
import math
import numpy as np
import os
from sklearn.preprocessing import normalize

# Small constant (1e-6). It is not referenced by any of the cells visible in
# this notebook section; presumably kept as a floating-point tolerance.
EPS = 0.000001

In [None]:
from rouge_score import rouge_scorer
# Shared scorer for the three ROUGE variants reported in the test section
# (rouge1, rouge2, rougeL), created with stemming switched on.
rougescorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

In [None]:
# Toy example: score two short strings to inspect the structure of the ROUGE output.
# NOTE(review): sent_B is passed first and sent_A second — check the argument
# order of RougeScorer.score (target vs. prediction) if the roles matter here.
sent_A = "1 I love machine learning"
sent_B = "1 think love i machine learning.."
print(rougescorer.score(sent_B, sent_A))

In [None]:
import bert_score
from bert_score import BERTScorer

# Shared BERTScore scorer for English text with baseline rescaling enabled;
# it is reused for every sentence pair in the test BERTScore cell.
bertscorer = BERTScorer(lang='en', rescale_with_baseline=True) # Default using 'roberta-large' model
# Alternative scorer configuration (disabled):
#bertscorer = BERTScorer(model_type='xlnet-large-cased', rescale_with_baseline=True, lang="en")

In [None]:
# evaluate_corpus is only referenced from commented-out code in the SummaQA test cell.
from summaqa import evaluate_corpus
from summaqa import QG_masked
# Question generator: its get_questions(text) is used below to obtain
# (masked_questions, answer_spans) for a piece of source text.
question_generator = QG_masked()
from summaqa import QA_Metric
# QA metric: its compute(questions, answers, text) result is read below through
# the 'avg_fscore' and 'avg_prob' keys.
qa_metric = QA_Metric()

## End of [5']: Metrics preparation

## Start of [6']: Run test & evaluate

In [None]:
# Test reference summaries
test_reference_summaries = test['highlights'].tolist()
test_reference_summaries = [test_reference_summaries[i].replace("\n", " ") for i in range(len(test_reference_summaries))]

In [None]:
# Test summary generation
#
# For every test article the external MCS-GA executable chooses the sentences:
#   1. write the sentence count, the word budget, the per-sentence word counts
#      and the sentences themselves to the solver's input file;
#   2. run the solver;
#   3. read the chosen sentence indices and the reported profit from the
#      solver's output file;
#   4. join the chosen sentences in document order.
solver_command = "./MCS-GA-One-and-Two-Words-tfidf"
solver_input_path = "MCS-GA-One-and-Two-Words-tfidf-input.txt"
solver_output_path = "MCS-GA-One-and-Two-Words-tfidf-output.txt"

test_generated_summaries = []
for i, chapter in enumerate(test_chapters):
    sentences = test_chapters_sentences[i]
    num_sentences = len(sentences)
    print("Test doc {}: {} sentences".format(i, num_sentences))
    # Word budget: article length scaled by the average summary/article ratio
    # measured on the validation set.
    summary_length_limit = math.ceil(len(word_tokenize(chapter)) * average_summary_length_ratio)
    sentences_length = [len(word_tokenize(sentence)) for sentence in sentences]
    #summary_length_limit = sum(sentences_length)

    # `with` guarantees the input file is flushed and closed before the solver
    # starts, and is not leaked if a write fails part-way.
    with open(solver_input_path, "w") as input_file:
        print("{} {}".format(num_sentences, summary_length_limit), file=input_file)
        # Every length is followed by one space; the line ends after the last one.
        print("".join("{} ".format(length) for length in sentences_length), file=input_file)
        for sentence in sentences:
            print("{}".format(sentence), file=input_file)

    # Remove the previous document's output first: otherwise a failed solver run
    # would go unnoticed and the stale indices would be reused for this document.
    if os.path.exists(solver_output_path):
        os.remove(solver_output_path)
    exit_status = os.system(solver_command)
    if not os.path.exists(solver_output_path):
        raise RuntimeError(
            "{} produced no output file for test doc {} (exit status {})".format(
                solver_command, i, exit_status
            )
        )

    # Output layout: number of chosen sentences, one index per line, then the profit.
    with open(solver_output_path, "r") as output_file:
        num_lines = int(output_file.readline())
        chosen_sentences_index = [int(output_file.readline()) for _ in range(num_lines)]
        current_profit = float(output_file.readline())
    print("Profit = {0:.9f}".format(current_profit))

    # Restore document order before stitching the summary together.
    chosen_sentences_index.sort()
    generated_summary = " ".join(sentences[index] for index in chosen_sentences_index)
    test_generated_summaries.append(generated_summary)

In [None]:
# Inspect the summary generated for the first test document.
print(test_generated_summaries[0])

In [None]:
# Test ROUGE score
# Add up the F-measure of each ROUGE variant over all generated summaries,
# then report the per-document mean.
rouge_fmeasure_totals = {"rouge1": 0, "rouge2": 0, "rougeL": 0}
for doc_index, candidate in enumerate(test_generated_summaries):
    print(doc_index)
    scores = rougescorer.score(test_reference_summaries[doc_index], candidate)
    for metric_name, metric_score in scores.items():
        print("{}: {}".format(metric_name, metric_score))
        # Anything that is not rouge1/rouge2 is accumulated as rougeL.
        bucket = metric_name if metric_name in ("rouge1", "rouge2") else "rougeL"
        rouge_fmeasure_totals[bucket] += metric_score.fmeasure
num_scored_summaries = len(test_generated_summaries)
test_rouge1 = rouge_fmeasure_totals["rouge1"] / num_scored_summaries
test_rouge2 = rouge_fmeasure_totals["rouge2"] / num_scored_summaries
test_rougeL = rouge_fmeasure_totals["rougeL"] / num_scored_summaries
print("Test: rouge1 = {}, rouge2 = {}, rougeL = {}".format(test_rouge1, test_rouge2, test_rougeL))

In [None]:
# Print test result to file
# One generated summary per line. The `with` block closes the file even when a
# write raises, so partial results are flushed and the handle is not leaked.
with open("MCS-GA-One-and-Two-Words-tfidf_CNN-Dailymail_test-generated-result.txt", "w") as test_result_file:
    for summary in test_generated_summaries:
        print(summary, file=test_result_file)

In [None]:
# Test BERT score
average_bertscore_F1 = 0

for i, candidate_summary in enumerate(test_generated_summaries):
    print("Test doc {}:".format(i))

    ref_summary = test_reference_summaries[i]

    # Method 1: Split by sentence
    # Each reference sentence is paired with its best-scoring candidate
    # sentence; the document score is the mean of those best F1 values.
    ref_sentences = sent_tokenize(ref_summary)
    candidate_sentences = sent_tokenize(candidate_summary)
    chapter_bertscore_F1 = 0
    for ref_summary_split in ref_sentences:
        # Stays at -1 when the candidate summary has no sentences.
        max_candidate_score = -1
        for candidate_summary_split in candidate_sentences:
            P, R, F1 = bertscorer.score([candidate_summary_split], [ref_summary_split])
            #print("Candidate length = {}, reference length = {}".format(len(word_tokenize(candidate_summary_split)), len(word_tokenize(ref_summary_split))))
            max_candidate_score = max(max_candidate_score, F1.mean().item())
        chapter_bertscore_F1 += max_candidate_score
    chapter_bertscore_F1 /= len(ref_sentences)
    average_bertscore_F1 += chapter_bertscore_F1
    print("Chapter BERTScore F1 = {}".format(chapter_bertscore_F1))

    # Disabled alternative, kept as an inert string literal.
    '''
    # Method 2: Split by n words
    ref_tokenized = word_tokenize(ref_summary)
    candidate_tokenized = word_tokenize(candidate_summary)
    split_length = 250
    ref_pointer = 0
    ref_split_count = 0
    chapter_bertscore_F1 = 0
    while ref_pointer < len(ref_tokenized):
        ref_summary_split = ' '.join(ref_tokenized[ref_pointer:min(len(ref_tokenized), ref_pointer + split_length)])
        max_candidate_score = -1
        candidate_pointer = 0
        while candidate_pointer < len(candidate_tokenized):
            candidate_summary_split = ' '.join(candidate_tokenized[candidate_pointer:min(len(candidate_tokenized), candidate_pointer + split_length)])
            P, R, F1 = bertscorer.score([candidate_summary_split], [ref_summary_split])
            #print("Candidate length = {}, reference length = {}".format(len(word_tokenize(candidate_summary_split)), len(word_tokenize(ref_summary_split))))
            max_candidate_score = max(max_candidate_score, F1.mean().item())
            candidate_pointer += split_length
        chapter_bertscore_F1 += max_candidate_score
        ref_pointer += split_length
        ref_split_count += 1
    chapter_bertscore_F1 /= ref_split_count
    average_bertscore_F1 += chapter_bertscore_F1
    print("Chapter BERTScore F1 = {}".format(chapter_bertscore_F1))
    '''
average_bertscore_F1 /= len(test_generated_summaries)
print("Test average BERTscore F1 = {}".format(average_bertscore_F1))

In [None]:
# Test summaQA
#
# For each test document: generate masked questions from every source sentence,
# answer them against each sentence of the generated summary, keep the best
# score per source sentence, and average over the source sentences that
# produced at least one question. The document scores are then averaged over
# all generated summaries.
#
# Disabled corpus-level variant (first 300 whitespace tokens of each text):
#srcs = test_chapters[:len(test_generated_summaries)]
#gens = test_generated_summaries
#srcs = [' '.join(' '.join(srcs[i].split(" ")).split()[:300]) for i in range(len(srcs))]
#gens = [' '.join(' '.join(gens[i].split(" ")).split()[:300]) for i in range(len(gens))]
#evaluate_corpus(srcs, gens)

# Running sums over documents; divided by the document count at the end.
average_summaqa_prob = 0
average_summaqa_F1 = 0

for i in range(len(test_generated_summaries)):
    print("Test doc {}:".format(i))
    
    # chapter_content is only read by the disabled "Method 2" below.
    chapter_content = copy.deepcopy(test_chapters[i])
    candidate_summary = copy.deepcopy(test_generated_summaries[i])

    # Method 1: Split by sentence
    chapter_sentences = test_chapters_sentences[i]
    candidate_sentences = sent_tokenize(candidate_summary)
    # Number of source sentences that yielded at least one question.
    chapter_split_count = 0
    chapter_summaqa_prob = 0
    chapter_summaqa_F1 = 0
    for j in range(len(chapter_sentences)):
        chapter_summary_split = chapter_sentences[j]
        article = chapter_summary_split
        masked_questions, answer_spans = question_generator.get_questions(article)
        # Source sentences without any generated question do not count towards the average.
        if len(masked_questions) == 0:
            continue
        chapter_split_count += 1
        print("Chapter split #{}:".format(j))
        print(chapter_summary_split)
        print("Questions:")
        print(masked_questions)
        print("Answers:")
        print(answer_spans)
        # Best scores over all summary sentences; both stay 0 when the summary has no sentences.
        max_candidate_F1 = 0
        max_candidate_prob = 0
        #chapter_split_sum_F1 = 0
        #chapter_split_sum_prob = 0
        for k in range(len(candidate_sentences)):
            candidate_summary_split = candidate_sentences[k]
            summaqa_scores = qa_metric.compute(masked_questions, answer_spans, candidate_summary_split)
            # The two maxima are tracked independently, so they may come from different summary sentences.
            max_candidate_F1 = max(max_candidate_F1, summaqa_scores['avg_fscore'])
            max_candidate_prob = max(max_candidate_prob, summaqa_scores['avg_prob'])
            #chapter_split_sum_F1 += summaqa_scores['avg_fscore']
            #chapter_split_sum_prob += summaqa_scores['avg_prob']
            print("Candidate split #{} (Chapter split length = {}, candidate length = {}): SummaQA score = {}".format(k, len(word_tokenize(chapter_summary_split)), len(word_tokenize(candidate_summary_split)), summaqa_scores))
            print(candidate_summary_split)
        print("Chapter split max prob = {}".format(max_candidate_prob))
        print("Chapter split max F1 = {}".format(max_candidate_F1))
        #print("Chapter split sum prob = {}".format(chapter_split_sum_prob))
        #print("Chapter split sum F1 = {}".format(chapter_split_sum_F1))
        chapter_summaqa_prob += max_candidate_prob
        chapter_summaqa_F1 += max_candidate_F1
        #chapter_summaqa_prob += chapter_split_sum_prob
        #chapter_summaqa_F1 += chapter_split_sum_F1
    if chapter_split_count > 0:
        chapter_summaqa_prob /= chapter_split_count
        chapter_summaqa_F1 /= chapter_split_count
    else:
        # No question could be generated from any source sentence: the document
        # is given the score 1 for both measures.
        # NOTE(review): this raises the corpus average for such documents — confirm it is intended.
        chapter_summaqa_prob = 1
        chapter_summaqa_F1 = 1
    average_summaqa_prob += chapter_summaqa_prob
    average_summaqa_F1 += chapter_summaqa_F1
    print("Chapter SummaQA prob = {}".format(chapter_summaqa_prob))
    print("Chapter SummaQA F1 = {}".format(chapter_summaqa_F1))
    
    # Disabled alternative, kept as an inert string literal.
    '''
    # Method 2: Split by n words
    chapter_tokenized = word_tokenize(chapter_content)
    candidate_tokenized = word_tokenize(candidate_summary)
    split_length = 250
    chapter_pointer = 0
    chapter_split_count = 0
    chapter_summaqa_prob = 0
    chapter_summaqa_F1 = 0
    while chapter_pointer < len(chapter_tokenized):
        chapter_summary_split = ' '.join(chapter_tokenized[chapter_pointer:min(len(chapter_tokenized), chapter_pointer + split_length)])
        article = chapter_summary_split
        masked_questions, answer_spans = question_generator.get_questions(article)
        print("Chapter split #{}:".format(chapter_split_count))
        print(chapter_summary_split)
        print("Questions:")
        print(masked_questions)
        print("Answers:")
        print(answer_spans)
        #max_candidate_score = -1
        chapter_split_sum_F1 = 0
        chapter_split_sum_prob = 0
        candidate_pointer = 0
        while candidate_pointer < len(candidate_tokenized):
            candidate_summary_split = ' '.join(candidate_tokenized[candidate_pointer:min(len(candidate_tokenized), candidate_pointer + split_length)])
            summaqa_scores = qa_metric.compute(masked_questions, answer_spans, candidate_summary_split)
            #max_candidate_score = max(max_candidate_score, summaqa_scores['avg_fscore'])
            chapter_split_sum_F1 += summaqa_scores['avg_fscore']
            chapter_split_sum_prob += summaqa_scores['avg_prob']
            print("Candidate split #{} (Chapter split length = {}, candidate length = {}): SummaQA score = {}".format(math.floor(candidate_pointer / split_length), len(word_tokenize(chapter_summary_split)), len(word_tokenize(candidate_summary_split)), summaqa_scores))
            print(candidate_summary_split)
            candidate_pointer += split_length
        print("Chapter split sum prob = {}".format(chapter_split_sum_prob))
        print("Chapter split sum F1 = {}".format(chapter_split_sum_F1))
        chapter_summaqa_prob += chapter_split_sum_prob
        chapter_summaqa_F1 += chapter_split_sum_F1
        #chapter_summaqa_F1 += max_candidate_score
        chapter_pointer += split_length
        chapter_split_count += 1
    chapter_summaqa_prob /= chapter_split_count
    chapter_summaqa_F1 /= chapter_split_count
    average_summaqa_prob += chapter_summaqa_prob
    average_summaqa_F1 += chapter_summaqa_F1
    print("Chapter SummaQA prob = {}".format(chapter_summaqa_prob))
    print("Chapter SummaQA F1 = {}".format(chapter_summaqa_F1))
    '''

# Mean over all scored test documents.
average_summaqa_prob /= len(test_generated_summaries)
average_summaqa_F1 /= len(test_generated_summaries)
print("Test average SummaQA prob = {}".format(average_summaqa_prob))
print("Test average SummaQA F1 = {}".format(average_summaqa_F1))

## End of [6']: Run test & evaluate