# SI 630 Final Project

In [103]:
import requests
import io
import zipfile
import pandas as pd
import numpy as np
import os
import xml.etree.ElementTree as ET
import json
import re
import string
import html
from rouge import Rouge

## Import data

In [13]:
! pwd

/Users/xinye/Desktop/si618wn23-main/630project


In [None]:
url = "https://cs.stanford.edu/~myasu/projects/scisumm_net/scisummnet_release1.1__20190413.zip"
response = requests.get(url)
with zipfile.ZipFile(io.BytesIO(response.content)) as zipObj:
     # Extract all the contents of zip file in different directory
     zipObj.extractall("nlp_data")
     print("File is unzipped in nlp_data folder") 

In [13]:
# get all raw text, break all papers into two parts -- Abstract and rest of document
# first get all filepaths
xmlfiles = []
citations = []
summary= []
for subdir, dirs, files in os.walk(r'/home/xinye/nlp_data/scisummnet_release1.1__20190413/top1000_complete'):
    for filename in files:
        filepath = subdir + os.sep + filename
        if filepath.endswith(".xml"):
            xmlfiles.append(filepath)
        if filepath.endswith(".json"):
            citations.append(filepath)
        if filepath.endswith(".txt"):
            summary.append(filepath)
            

In [14]:
#next parse all XML documents

def parse_xml_abstract(fp):
    """ parse an XML journal article into an abstract and the rest of the text
    """
    try:
        tree = ET.parse(fp)
    except Exception as e:
        return "","",str(e)
    root = tree.getroot()
    
    ab = []
    bod = []
    
    for child in root:
        if child.tag == "ABSTRACT":
            for block in child:
                ab.append(block.text)
        else:
            for block in child:
                bod.append(block.text)
                
    #convert from list --> string
    abstract = "\n".join(ab)
    body = "\n".join(bod)
    
    #decode html entities
    abstract = html.unescape(abstract)
    body = html.unescape(body)
    
    return abstract,body,""

In [15]:
raw_cols = []
for fpn in range(len(xmlfiles)):
    ab,bod,err = parse_xml_abstract(xmlfiles[fpn])
    if err:
        #print(fp, err)
        continue
    f = open(citations[fpn]) 

    # returns JSON object as  
    # a dictionary 
    data = json.load(f) 
    only_text = []
    for entry in data:
        only_text.append(entry['clean_text'])

    raw_cols.append([ab,bod,only_text,xmlfiles[fpn]])

df = pd.DataFrame(raw_cols, columns=["abstract","body","citations", "filepath"])
# only use the first 20 papers

In [16]:
# get all summary text using the summary path

summary_text = []
for fpn in range(len(summary)):
    f = open(summary[fpn]) 
    data = f.read()
    summary_text.append(data)

df['summary'] = summary_text

In [18]:
# drop rows with missing values or empty abstracts
df = df.dropna()
df = df[df.abstract != ""]
df.head(5)

Unnamed: 0,abstract,body,citations,filepath,summary
0,2: Test set performance of our systems: and ou...,"The open source Moses (Koehn et al., 2007) MT ...",[Factored translation models have also been us...,/home/xinye/nlp_data/scisummnet_release1.1__20...,Experiments in Domain Adaptation for Statistic...
1,"At present, adapting an Information Extraction...","Most of the world’s information is recorded, p...",[Our work is related to previous work on domai...,/home/xinye/nlp_data/scisummnet_release1.1__20...,"On-Demand Information Extraction\nAt present, ..."
2,"In this paper, we have proposed novel methods ...","In this paper, we have proposed novel methods ...",[These are much finer grained than Penn Treeba...,/home/xinye/nlp_data/scisummnet_release1.1__20...,Supertagging: An Approach To Almost Parsing\nI...
3,Statistical machine translation systems are us...,"In statistical machine translation (SMT), tran...","[Transductive learning method (Ueffing et al, ...",/home/xinye/nlp_data/scisummnet_release1.1__20...,Transductive learning for statistical machine ...
4,We combine the strengths of Bayesian modeling ...,Most state-of-the-art statistical machine tran...,[This is in line with earlier work on consiste...,/home/xinye/nlp_data/scisummnet_release1.1__20...,Bayesian Learning of Non-Compositional Phrases...


In [19]:
df.to_csv('summaries_sample.csv')

In [67]:
df = pd.read_csv('summaries_sample.csv')
df.shape

(924, 6)

In [24]:
df = df.iloc[:80]

In [68]:
# split the data in 80:10:10 for train:valid:test dataset
from sklearn.model_selection import train_test_split

in_df = df.sample(len(df), random_state=630)
train_df, rem_df = train_test_split(in_df, train_size=0.8)
val_df, test_df = train_test_split(rem_df, test_size=0.5)

# Extractive model 1:  Textrank

During the experiment, we found it is hard to let the pagerank algorithms converge. So we decided to use shorter sequence length.

In [44]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
from rouge import Rouge

def extractive_summarizer(text, num_sentences=3):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = TextRankSummarizer()
    summary = summarizer(parser.document, num_sentences)
    
    summarized_text = '. '.join([str(sentence) for sentence in summary]) + '.'
    return summarized_text

In [4]:
# Test the summarizer
text = df['body'][1]
print(extractive_summarizer(text))

We asked a subject to judge usefulness in three grades; A) very useful – for the query, many people might want to use this table for the further investigation of the topic, B) useful – at least, for some purpose, some people might want to use this table for further investigation and C) not useful – no one will be interested in using this table for further investigation.. Compared to the results in the ‘useful’ category, the tables for these two topics have more slots filled and the NE types of the fillers have fewer mistakes.. However, the results are limited to a pair of participants and because of the nature of the procedure, the discovered relations are static relations like a country and its presidents rather than events..


In [7]:
# Split the dataset
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

In [29]:
def evaluate(df, num_sentences=3):
    rouge = Rouge()
    rouge_scores = []
    
    for index, row in df.iterrows():
        text = row['body']
        reference = row['summary']
        summary = extractive_summarizer(text, num_sentences)
        
        score = rouge.get_scores(summary, reference, avg=True)
        rouge_scores.append(score)
    
    return rouge_scores

In [9]:
rouge_scores = evaluate(test_df)

In [10]:
rouge_scores

[{'rouge-1': {'f': 0.3115264747682962,
   'p': 0.29411764705882354,
   'r': 0.33112582781456956},
  'rouge-2': {'f': 0.06896551225911732,
   'p': 0.0650887573964497,
   'r': 0.07333333333333333},
  'rouge-l': {'f': 0.18947367921052644,
   'p': 0.18947368421052632,
   'r': 0.18947368421052632}},
 {'rouge-1': {'f': 0.2686567115430609,
   'p': 0.23195876288659795,
   'r': 0.3191489361702128},
  'rouge-2': {'f': 0.0240240191506832,
   'p': 0.02072538860103627,
   'r': 0.02857142857142857},
  'rouge-l': {'f': 0.21008402864875372,
   'p': 0.1937984496124031,
   'r': 0.22935779816513763}},
 {'rouge-1': {'f': 0.24365481744337664,
   'p': 0.2857142857142857,
   'r': 0.21238938053097345},
  'rouge-2': {'f': 0.010204076742115006,
   'p': 0.011976047904191617,
   'r': 0.008888888888888889},
  'rouge-l': {'f': 0.16733067246615146,
   'p': 0.20588235294117646,
   'r': 0.14093959731543623}},
 {'rouge-1': {'f': 0.2787286014832528,
   'p': 0.3313953488372093,
   'r': 0.24050632911392406},
  'rouge-2': 

In [134]:
def compute_metrics(rouge_scores):
    rouge_1_precision = np.mean([score['rouge-1']['p'] for score in rouge_scores])
    rouge_1_recall = np.mean([score['rouge-1']['r'] for score in rouge_scores])
    rouge_1_f1 = np.mean([score['rouge-1']['f'] for score in rouge_scores])
    
    rouge_2_precision = np.mean([score['rouge-2']['p'] for score in rouge_scores])
    rouge_2_recall = np.mean([score['rouge-2']['r'] for score in rouge_scores])
    rouge_2_f1 = np.mean([score['rouge-2']['f'] for score in rouge_scores])
    
    rouge_l_precision = np.mean([score['rouge-l']['p'] for score in rouge_scores])
    rouge_l_recall = np.mean([score['rouge-l']['r'] for score in rouge_scores])
    rouge_l_f1 = np.mean([score['rouge-l']['f'] for score in rouge_scores])
    
    return rouge_1_precision, rouge_1_recall, rouge_1_f1, rouge_2_precision, rouge_2_recall, rouge_2_f1, rouge_l_precision, rouge_l_recall, rouge_l_f1

In [11]:

rouge_1_precision, rouge_1_recall, rouge_1_f1, rouge_2_precision, rouge_2_recall, rouge_2_f1, rouge_l_precision, rouge_l_recall, rouge_l_f1 = compute_metrics(rouge_scores)

print(f"ROUGE-1 Precision: {rouge_1_precision:.4f}, Recall: {rouge_1_recall:.4f}, F1: {rouge_1_f1:.4f}")
print(f"ROUGE-2 Precision: {rouge_2_precision:.4f}, Recall: {rouge_2_recall:.4f}, F1: {rouge_2_f1:.4f}")
print(f"ROUGE-L Precision: {rouge_l_precision:.4f}, Recall: {rouge_l_recall:.4f}, F1: {rouge_l_f1:.4f}")

ROUGE-1 Precision: 0.2281, Recall: 0.2965, F1: 0.2464
ROUGE-2 Precision: 0.0321, Recall: 0.0415, F1: 0.0343
ROUGE-L Precision: 0.1768, Recall: 0.2003, F1: 0.1808


As an unsupervised extractive summarization algorithm, I will then use training data to experiment with different hyperparameters and preprocessing techniques.

## Experiment and Adjust the number of sentences using training data

In [None]:
# Experiment with different numbers of sentences on the validation dataset
min_sentences = 2
max_sentences = 7
best_num_sentences = 0
best_avg_rouge_1_f1 = 0

for num_sentences in range(min_sentences, max_sentences + 1):
    rouge_scores = evaluate(train_df, num_sentences=num_sentences)
    _, _, rouge_1_f1, _, _, _, _, _, _ = compute_metrics(rouge_scores)
    avg_rouge_1_f1 = np.mean(rouge_1_f1)

    print(f"Number of sentences: {num_sentences}, Avg. ROUGE-1 F1: {avg_rouge_1_f1:.4f}")

    if avg_rouge_1_f1 > best_avg_rouge_1_f1:
        best_num_sentences = num_sentences
        best_avg_rouge_1_f1 = avg_rouge_1_f1

print(f"\nThe best number of sentences is {best_num_sentences} with an average ROUGE-1 F1 score of {best_avg_rouge_1_f1:.4f}")

In [None]:
rouge_scores_test = evaluate(test_df, num_sentences=best_num_sentences)
rouge_1_precision, rouge_1_recall, rouge_1_f1, rouge_2_precision, rouge_2_recall, rouge_2_f1, rouge_l_precision, rouge_l_recall, rouge_l_f1 = compute_metrics(rouge_scores_test)

print(f"\nTest dataset performance using {best_num_sentences} sentences:")
print(f"ROUGE-1 Precision: {rouge_1_precision:.4f}, Recall: {rouge_1_recall:.4f}, F1: {rouge_1_f1:.4f}")
print(f"ROUGE-2 Precision: {rouge_2_precision:.4f}, Recall: {rouge_2_recall:.4f}, F1: {rouge_2_f1:.4f}")
print(f"ROUGE-L Precision: {rouge_l_precision:.4f}, Recall: {rouge_l_recall:.4f}, F1: {rouge_l_f1:.4f}")

Therefore, 3 is the best sentense length for textrank model; the performance is what have been calculated in the original model

Conclusion: It is hard to directly use textrake to input the whole paper and generate summarization.

## Vocabulary-based pre-processing: Feature engineering, select only top n keywords

Trying to analyze the training data to identify the most relevant words, phrases, or entities that contribute to a good summary. Use this information to pre-process the input text and filter out irrelevant content before applying TextRank.

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

def top_n_keywords(document, n=10):
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform([document])
    feature_array = np.array(vectorizer.get_feature_names())
    tfidf_sorting = np.argsort(tfidf_matrix.toarray()).flatten()[::-1]
    top_n = feature_array[tfidf_sorting][:n]
    return top_n

def preprocess_text(text, top_n):
    tokens = text.split()
    filtered_tokens = [token for token in tokens if token.lower() in top_n]
    return ' '.join(filtered_tokens)

In [24]:
# Train the model to identify top_n keywords based on training data
train_documents = train_df['body'].apply(str).tolist()
combined_train_documents = ' '.join(train_documents)
top_n = 50  # You can experiment with different values for top_n
top_n_keywords = top_n_keywords(combined_train_documents, n=top_n)

# Modify the extractive_summarizer to use preprocessed text
def extractive_summarizer(text, num_sentences=3, top_n_keywords=None):
    if top_n_keywords:
        text = preprocess_text(text, top_n_keywords)
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = TextRankSummarizer()
    summary = summarizer(parser.document, num_sentences)
    
    summarized_text = '. '.join([str(sentence) for sentence in summary]) + '.'
    return summarized_text

In [25]:
rouge_scores_test = evaluate(test_df, num_sentences=3)
rouge_1_precision, rouge_1_recall, rouge_1_f1, rouge_2_precision, rouge_2_recall, rouge_2_f1, rouge_l_precision, rouge_l_recall, rouge_l_f1 = compute_metrics(rouge_scores_test)

print(f"\nTest dataset performance using {best_num_sentences} sentences:")
print(f"ROUGE-1 Precision: {rouge_1_precision:.4f}, Recall: {rouge_1_recall:.4f}, F1: {rouge_1_f1:.4f}")
print(f"ROUGE-2 Precision: {rouge_2_precision:.4f}, Recall: {rouge_2_recall:.4f}, F1: {rouge_2_f1:.4f}")
print(f"ROUGE-L Precision: {rouge_l_precision:.4f}, Recall: {rouge_l_recall:.4f}, F1: {rouge_l_f1:.4f}")


Test dataset performance using 3 sentences:
ROUGE-1 Precision: 0.2450, Recall: 0.2865, F1: 0.2536
ROUGE-2 Precision: 0.0428, Recall: 0.0473, F1: 0.0432
ROUGE-L Precision: 0.1918, Recall: 0.1948, F1: 0.1871


##  lemmatization

In [26]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/xinye/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/xinye/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/xinye/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [31]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def preprocess_text(text, top_n, lemmatize=False):
    tokens = nltk.word_tokenize(text)
    
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        tagged_tokens = nltk.pos_tag(tokens)
        lemmatized_tokens = [lemmatizer.lemmatize(token, pos=get_wordnet_pos(pos)) for token, pos in tagged_tokens]
        tokens = lemmatized_tokens

    filtered_tokens = [token for token in tokens if token.lower() in top_n]
    return ' '.join(filtered_tokens)

def extractive_summarizer(text, num_sentences=3, top_n_keywords=None, lemmatize=False):
    if top_n_keywords is not None and len(top_n_keywords) > 0:
        text = preprocess_text(text, top_n_keywords, lemmatize=lemmatize)
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = TextRankSummarizer()
    summary = summarizer(parser.document, num_sentences)
    
    summarized_text = '. '.join([str(sentence) for sentence in summary]) + '.'
    return summarized_text


In [32]:
def evaluate(df, num_sentences=3, lemmatize=False):
    rouge = Rouge()
    rouge_scores = []
    
    for index, row in df.iterrows():
        text = row['body']
        reference = row['summary']
        summary = extractive_summarizer(text, num_sentences, top_n_keywords=top_n_keywords, lemmatize=lemmatize)
        
        score = rouge.get_scores(summary, reference, avg=True)
        rouge_scores.append(score)
    
    return rouge_scores

rouge_scores_lemmatized = evaluate(test_df, num_sentences=best_num_sentences, lemmatize=True)
rouge_1_precision, rouge_1_recall, rouge_1_f1, rouge_2_precision, rouge_2_recall, rouge_2_f1, rouge_l_precision, rouge_l_recall, rouge_l_f1 = compute_metrics(rouge_scores_lemmatized)

print(f"\nTest dataset performance using {best_num_sentences} sentences and lemmatization:")
print(f"ROUGE-1 Precision: {rouge_1_precision:.4f}, Recall: {rouge_1_recall:.4f}, F1: {rouge_1_f1:.4f}")
print(f"ROUGE-2 Precision: {rouge_2_precision:.4f}, Recall: {rouge_2_recall:.4f}, F1: {rouge_2_f1:.4f}")
print(f"ROUGE-L Precision: {rouge_l_precision:.4f}, Recall: {rouge_l_recall:.4f}, F1: {rouge_l_f1:.4f}")


Test dataset performance using 3 sentences and lemmatization:
ROUGE-1 Precision: 0.0375, Recall: 0.0659, F1: 0.0425
ROUGE-2 Precision: 0.0021, Recall: 0.0031, F1: 0.0020
ROUGE-L Precision: 0.1893, Recall: 0.0619, F1: 0.0914


Lemmatization is will reduce the performance of the textrank model, indicating it might not for summarization task, or for this special clean dataframe.

# Extractive Model 2: LexRank and Sentence-transfomers!

https://huggingface.co/sentence-transformers

https://github.com/UKPLab/sentence-transformers

https://github.com/UKPLab/sentence-transformers/tree/master/examples/applications/text-summarization

Here we used sentence-transformers and lexrank to generate extractive summarization for the paper input.

In [30]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
import nltk

In [32]:
def degree_centrality_scores(similarity_matrix):
    row_sums = np.sum(similarity_matrix, axis=1)
    row_sums[row_sums == 0] = 1 # avoid divide by 0
    normalized_similarity_matrix = similarity_matrix / row_sums[:, np.newaxis] # Normalization
    
    # eigenvector
    eigvals, eigvecs = np.linalg.eig(normalized_similarity_matrix.T)
    eigvec = np.real(eigvecs[:, np.argmax(eigvals)])
    centrality_scores = eigvec / np.sum(eigvec)
    
    return centrality_scores

In [47]:
def lexrank_summarize(document, num_sentences=8):
    # The code below are adpated based on the code example provided by sententransformer author Nils Reimers:
    # https://github.com/UKPLab/sentence-transformers/blob/master/examples/applications/text-summarization/text-summarization.py
    model = SentenceTransformer('all-MiniLM-L6-v2')
    sentences = nltk.sent_tokenize(document)
    embeddings = model.encode(sentences, convert_to_tensor=True)
    
    scores = util.cos_sim(embeddings, embeddings)
    scores_np = scores.cpu().numpy()
    centrality_scores = degree_centrality_scores(scores_np)
    most_central_sentence_indices = np.argsort(-centrality_scores)

    top_indices = most_central_sentence_indices[:num_sentences]
    top_indices.sort()
    summary = ". ".join([sentences[i] for i in top_indices])

    return summary

In [45]:
def evaluate(df):
    rouge = Rouge()
    rouge_scores = []
    
    for index, row in df.iterrows():
        text = row['body']
        reference = row['summary']
        summary =lexrank_summarize(text)
        
        score = rouge.get_scores(summary, reference, avg=True)
        rouge_scores.append(score)
    
    return rouge_scores

rouge_scores = evaluate(test_df)
rouge_1_precision, rouge_1_recall, rouge_1_f1, rouge_2_precision, rouge_2_recall, rouge_2_f1, rouge_l_precision, rouge_l_recall, rouge_l_f1 = compute_metrics(rouge_scores)

print(f"ROUGE-1 Precision: {rouge_1_precision:.4f}, Recall: {rouge_1_recall:.4f}, F1: {rouge_1_f1:.4f}")
print(f"ROUGE-2 Precision: {rouge_2_precision:.4f}, Recall: {rouge_2_recall:.4f}, F1: {rouge_2_f1:.4f}")
print(f"ROUGE-L Precision: {rouge_l_precision:.4f}, Recall: {rouge_l_recall:.4f}, F1: {rouge_l_f1:.4f}")

ROUGE-1 Precision: 0.2829, Recall: 0.4166, F1: 0.3233
ROUGE-2 Precision: 0.0701, Recall: 0.1037, F1: 0.0797
ROUGE-L Precision: 0.2374, Recall: 0.3056, F1: 0.2595


# Extractive model3: clsutering and sentence-transformers
https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L3-v2

@inproceedings{reimers-2019-sentence-bert,
  title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
  author = "Reimers, Nils and Gurevych, Iryna",
  booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
  month = "11",
  year = "2019",
  publisher = "Association for Computational Linguistics",
  url = "https://arxiv.org/abs/1908.10084",
}

## basline

In [28]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

In [50]:
def strass_summarization(text, num_clusters=5):
    sentences = nltk.sent_tokenize(text)
    model = SentenceTransformer('paraphrase-MiniLM-L3-v2', device='cpu')
    sentence_embeddings = model.encode(sentences, batch_size=16)

    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(sentence_embeddings)

    closest_sentences_indices = []
    for i in range(num_clusters):
        centroid = kmeans.cluster_centers_[i]
        closest_sentence_idx = np.argmin(np.linalg.norm(sentence_embeddings - centroid, axis=1))
        closest_sentences_indices.append(closest_sentence_idx)

    important_sentences = [sentences[idx] for idx in sorted(closest_sentences_indices)]
    return ' '.join(important_sentences)

In [77]:
def evaluate(df):
    rouge = Rouge()
    rouge_scores = []
    
    for index, row in df.iterrows():
        text = row['body']
        reference = row['summary']
        summary = strass_summarization(text)
        
        score = rouge.get_scores(summary, reference, avg=True)
        rouge_scores.append(score)
    
    return rouge_scores

rouge_scores_strass= evaluate(test_df)
rouge_1_precision, rouge_1_recall, rouge_1_f1, rouge_2_precision, rouge_2_recall, rouge_2_f1, rouge_l_precision, rouge_l_recall, rouge_l_f1 = compute_metrics(rouge_scores_strass)

print(f"ROUGE-1 Precision: {rouge_1_precision:.4f}, Recall: {rouge_1_recall:.4f}, F1: {rouge_1_f1:.4f}")
print(f"ROUGE-2 Precision: {rouge_2_precision:.4f}, Recall: {rouge_2_recall:.4f}, F1: {rouge_2_f1:.4f}")
print(f"ROUGE-L Precision: {rouge_l_precision:.4f}, Recall: {rouge_l_recall:.4f}, F1: {rouge_l_f1:.4f}")

ROUGE-1 Precision: 0.2742, Recall: 0.3416, F1: 0.2948
ROUGE-2 Precision: 0.0613, Recall: 0.0749, F1: 0.0655
ROUGE-L Precision: 0.2146, Recall: 0.2522, F1: 0.2266


## experiment with clusters

In [34]:
def strass_summarization(text, num_clusters):
    sentences = nltk.sent_tokenize(text)
    model = SentenceTransformer('paraphrase-MiniLM-L3-v2', device='cpu')
    sentence_embeddings = model.encode(sentences, batch_size=16)
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(sentence_embeddings)

    closest_sentences_indices = []
    for i in range(num_clusters):
        centroid = kmeans.cluster_centers_[i]
        closest_sentence_idx = np.argmin(np.linalg.norm(sentence_embeddings - centroid, axis=1))
        closest_sentences_indices.append(closest_sentence_idx)

    important_sentences = [sentences[idx] for idx in sorted(closest_sentences_indices)]
    return ' '.join(important_sentences)

In [35]:
def evaluate(df, num_clusters):
    rouge = Rouge()
    rouge_scores = []
    
    for index, row in df.iterrows():
        text = row['body']
        reference = row['summary']
        summary = strass_summarization(text, num_clusters=num_clusters)
        
        score = rouge.get_scores(summary, reference, avg=True)
        rouge_scores.append(score)
    
    return rouge_scores

In [91]:

min_cluster = 2
max_cluster = 10
best_num_cluster = 0
best_avg_rouge_1_f1 = 0

for num in range(min_cluster, max_cluster + 1):
    rouge_scores = evaluate(val_df, num_clusters=num)
    _, _, rouge_1_f1, _, _, _, _, _, _ = compute_metrics(rouge_scores)
    avg_rouge_1_f1 = np.mean(rouge_1_f1)

    print(f"Number of clusters: {num}, Avg. ROUGE-1 F1: {avg_rouge_1_f1:.4f}")

    if avg_rouge_1_f1 > best_avg_rouge_1_f1:
        best_num_cluster = num
        best_avg_rouge_1_f1 = avg_rouge_1_f1

print(f"\nThe best number of clusters is {best_num_cluster} with an average ROUGE-1 F1 score of {best_avg_rouge_1_f1:.4f}")

Number of clusters: 2, Avg. ROUGE-1 F1: 0.2383
Number of clusters: 3, Avg. ROUGE-1 F1: 0.2601
Number of clusters: 4, Avg. ROUGE-1 F1: 0.2678
Number of clusters: 5, Avg. ROUGE-1 F1: 0.2740
Number of clusters: 6, Avg. ROUGE-1 F1: 0.2774
Number of clusters: 7, Avg. ROUGE-1 F1: 0.2721
Number of clusters: 8, Avg. ROUGE-1 F1: 0.2738
Number of clusters: 9, Avg. ROUGE-1 F1: 0.2730
Number of clusters: 10, Avg. ROUGE-1 F1: 0.2708

The best number of clusters is 6 with an average ROUGE-1 F1 score of 0.2774


In [89]:
rouge_scores_strass= evaluate(test_df, num_clusters=6)
rouge_1_precision, rouge_1_recall, rouge_1_f1, rouge_2_precision, rouge_2_recall, rouge_2_f1, rouge_l_precision, rouge_l_recall, rouge_l_f1 = compute_metrics(rouge_scores_strass)

print(f"ROUGE-1 Precision: {rouge_1_precision:.4f}, Recall: {rouge_1_recall:.4f}, F1: {rouge_1_f1:.4f}")
print(f"ROUGE-2 Precision: {rouge_2_precision:.4f}, Recall: {rouge_2_recall:.4f}, F1: {rouge_2_f1:.4f}")
print(f"ROUGE-L Precision: {rouge_l_precision:.4f}, Recall: {rouge_l_recall:.4f}, F1: {rouge_l_f1:.4f}")

ROUGE-1 Precision: 0.2564, Recall: 0.3618, F1: 0.2909
ROUGE-2 Precision: 0.0542, Recall: 0.0741, F1: 0.0608
ROUGE-L Precision: 0.2006, Recall: 0.2632, F1: 0.2226


# Extractive Model 4: lexrank+clustering

In [62]:
from sklearn.cluster import KMeans
def lexrank_summarize(document, num_sentences=10):
    model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
    sentences = nltk.sent_tokenize(document)
    embeddings = model.encode(sentences, convert_to_tensor=True)
    
    scores = util.cos_sim(embeddings, embeddings)
    scores_np = scores.cpu().numpy()
    
    # cluster sentence embeddings
    num_clusters = min(len(sentences), 9)
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(embeddings)
    
    # get most central sentence from each cluster
    most_central_sentence_indices = []
    for i in range(num_clusters):
        cluster_embeddings = embeddings[kmeans.labels_ == i]
        cluster_scores = util.cos_sim(cluster_embeddings, cluster_embeddings).cpu().numpy()
        cluster_centrality_scores = degree_centrality_scores(cluster_scores)
        most_central_sentence_index = np.argmax(cluster_centrality_scores)
        most_central_sentence_indices.append(np.where(kmeans.labels_ == i)[0][most_central_sentence_index])
        
    top_indices = np.argsort(most_central_sentence_indices)[:num_sentences]
    summary = ". ".join([sentences[i] for i in top_indices])

    return summary

In [62]:
def evaluate(df):
    rouge = Rouge()
    rouge_scores = []
    
    for index, row in df.iterrows():
        text = row['body']
        reference = row['summary']
        summary = lexrank_summarize(text)
        
        score = rouge.get_scores(summary, reference, avg=True)
        rouge_scores.append(score)
    
    return rouge_scores

rouge_scores = evaluate(test_df)
rouge_1_precision, rouge_1_recall, rouge_1_f1, rouge_2_precision, rouge_2_recall, rouge_2_f1, rouge_l_precision, rouge_l_recall, rouge_l_f1 = compute_metrics(rouge_scores)

print(f"ROUGE-1 Precision: {rouge_1_precision:.4f}, Recall: {rouge_1_recall:.4f}, F1: {rouge_1_f1:.4f}")
print(f"ROUGE-2 Precision: {rouge_2_precision:.4f}, Recall: {rouge_2_recall:.4f}, F1: {rouge_2_f1:.4f}")
print(f"ROUGE-L Precision: {rouge_l_precision:.4f}, Recall: {rouge_l_recall:.4f}, F1: {rouge_l_f1:.4f}")

ROUGE-1 Precision: 0.3118, Recall: 0.4365, F1: 0.3508
ROUGE-2 Precision: 0.1302, Recall: 0.1694, F1: 0.1411
ROUGE-L Precision: 0.2754, Recall: 0.3564, F1: 0.3023


# Extractive+Abstractive: lexrank+clustering+gpt2

In [73]:
from sklearn.cluster import KMeans
def lexrank_summarize(document, num_sentences=20):
    model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
    sentences = nltk.sent_tokenize(document)
    
    if not sentences:  # exclude empty sentences list
        return ''
    
    embeddings = model.encode(sentences, convert_to_tensor=True)
    
    scores = util.cos_sim(embeddings, embeddings)
    scores_np = scores.cpu().numpy()
    
    num_clusters = min(len(sentences), 9)
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(embeddings)
    
    most_central_sentence_indices = []
    for i in range(num_clusters):
        cluster_embeddings = embeddings[kmeans.labels_ == i]
        cluster_scores = util.cos_sim(cluster_embeddings, cluster_embeddings).cpu().numpy()
        cluster_centrality_scores = degree_centrality_scores(cluster_scores)
        most_central_sentence_index = np.argmax(cluster_centrality_scores)
        most_central_sentence_indices.append(np.where(kmeans.labels_ == i)[0][most_central_sentence_index])
        
    top_indices = np.argsort(most_central_sentence_indices)[:num_sentences]
    summary = ". ".join([sentences[i] for i in top_indices])

    return summary

In [74]:
test_df['body'] = test_df['body'].fillna('')
train_df['body'] = train_df['body'].fillna('')
val_df['body'] = val_df['body'].fillna('')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['body'] = test_df['body'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['body'] = train_df['body'].fillna('')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df['body'] = val_df['body'].fillna('')


In [75]:
train_df['body_extractive'] = train_df['body'].apply(lambda x: lexrank_summarize(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['body_extractive'] = train_df['body'].apply(lambda x: lexrank_summarize(x))


In [76]:
test_df['body_extractive'] = test_df['body'].apply(lambda x: lexrank_summarize(x))
val_df['body_extractive'] = val_df['body'].apply(lambda x: lexrank_summarize(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['body_extractive'] = test_df['body'].apply(lambda x: lexrank_summarize(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df['body_extractive'] = val_df['body'].apply(lambda x: lexrank_summarize(x))


In [78]:
train_df.to_csv('train_df.csv')
val_df.to_csv('val_df.csv')
test_df.to_csv('test_df.csv')

In [153]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Seq2SeqTrainingArguments, Seq2SeqTrainer
from rouge_score import rouge_scorer
from transformers import Trainer

In [154]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # Set the padding token
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Tokenize the data
train_encodings = tokenizer(train_df['body_extractive'].tolist(), train_df['summary'].tolist(), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_df['body_extractive'].tolist(), val_df['summary'].tolist(), truncation=True, padding=True, max_length=512)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


In [155]:
class GPT2SummarizationTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs, labels=labels)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)  #  default for pad_token_id
        loss = loss_fct(logits.view(-1, logits.shape[-1]), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

class PaperDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = item["input_ids"].clone()
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = PaperDataset(train_encodings)
val_dataset = PaperDataset(val_encodings)


In [98]:
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    evaluation_strategy='epoch',
    logging_dir='./logs',
    save_strategy='epoch',
    fp16=True, 
    gradient_accumulation_steps=2,
    load_best_model_at_end=True,
)

trainer = GPT2SummarizationTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

trainer.train()



Epoch,Training Loss,Validation Loss
0,No log,0.000951
2,0.004000,0.000668
2,0.003500,0.000665
4,0.003500,0.000539
4,0.002200,0.000649


TrainOutput(global_step=1845, training_loss=0.0032731205467286147, metrics={'train_runtime': 232.0274, 'train_samples_per_second': 15.925, 'train_steps_per_second': 7.952, 'total_flos': 964167598080000.0, 'train_loss': 0.0032731205467286147, 'epoch': 4.99})

In [99]:
test_encodings = tokenizer(test_df['body_extractive'].tolist(), truncation=True, padding=True, max_length=512)
test_dataset = PaperDataset(test_encodings)

In [100]:
def generate_summaries(model, dataset, tokenizer):
    summaries = []
    model.eval()

    for item in dataset:
        with torch.no_grad():
            input_ids = item["input_ids"].unsqueeze(0).to(model.device)
            attention_mask = item["attention_mask"].unsqueeze(0).to(model.device)
            output = model.generate(input_ids, attention_mask=attention_mask)
            summary = tokenizer.decode(output[0], skip_special_tokens=True)
            summaries.append(summary)

    return summaries

In [101]:
test_summaries = generate_summaries(model, test_dataset, tokenizer)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Input length of input_ids is 512, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Input length of input_ids is 512, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='l

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Input length of input_ids is 512, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Input length of input_ids is 512, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='l

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Input length of input_ids is 512, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Input length of input_ids is 512, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='l

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Input length of input_ids is 512, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Input length of input_ids is 512, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Input length of input_ids is 512,

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Input length of input_ids is 512, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Input length of input_ids is 512, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='l

In [102]:
rouge_scores = []

for gen_summary, ref_summary in zip(test_summaries, test_df['summary']):
    scores = rouge.get_scores(ref_summary, gen_summary)[0]
    rouge_scores.append(scores)

rouge_1_precision, rouge_1_recall, rouge_1_f1, rouge_2_precision, rouge_2_recall, rouge_2_f1, rouge_l_precision, rouge_l_recall, rouge_l_f1 = compute_metrics(rouge_scores)

print(f"ROUGE-1 Precision: {rouge_1_precision:.4f}, Recall: {rouge_1_recall:.4f}, F1: {rouge_1_f1:.4f}")
print(f"ROUGE-2 Precision: {rouge_2_precision:.4f}, Recall: {rouge_2_recall:.4f}, F1: {rouge_2_f1:.4f}")
print(f"ROUGE-L Precision: {rouge_l_precision:.4f}, Recall: {rouge_l_recall:.4f}, F1: {rouge_l_f1:.4f}")



ROUGE-1 Precision: 0.3982, Recall: 0.2965, F1: 0.3268
ROUGE-2 Precision: 0.1274, Recall: 0.1075, F1: 0.1113
ROUGE-L Precision: 0.3200, Recall: 0.2564, F1: 0.2762
