## **Ignore Warnings**

In [40]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning) # Ignore all UserWarnings
warnings.filterwarnings("ignore", message=".*DeprecationWarning.*") # Ignore warnings containing "DeprecationWarning"

## **Installing required libraries**

In [41]:
!pip install nltk
!pip install sumy
!pip install rouge



## **Importing Required Libraries**

In [42]:
import numpy as np
import pandas as pd
import os
import nltk
import re
import zipfile
# from google.colab import drive             #UNCOMMENT IF USING DRIVE
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

## **Downloading required NLTK components**

In [43]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/itadmin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/itadmin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/itadmin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## **Connect to drive**

Run this cell only if you are running it on Colab and ensure the zip file of validation dataset is uploaded on colab

In [44]:
# drive.mount('/content/drive')
# # Path to the directory containing the dataset on Google Drive
# dataset_path = '/content/drive/MyDrive/NLP project/validation.zip' #path to your dataset
# use_colab=True

In [45]:
use_colab=False

## **Unzip to extract files**

In [46]:
if(use_colab):
    # Unzip the uploaded file
    zip_file_name = "/content/drive/MyDrive/NLP project/validation.zip" # replace with the folder path on drive
    output_folder = '/content/unzipped_files/'

    with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
        zip_ref.extractall(output_folder)

## **Loading Annual Reports in Data Frame**

In [47]:
# Specify the path to the folder containing the text files
if(use_colab): folder_path = r'/content/unzipped_files/validation/annual_reports'
else: folder_path = r'toy dataset/annual_reports'  # Replace with the actual path to your folder

# Create an empty list to store the data
data = []

# Iterate through each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):  # Check if it's a text file
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding = 'utf-8') as f:
            text = f.read()
        data.append({'filename': filename, 'text': text})

# Create a dataframe from the list of dictionaries
df = pd.DataFrame(data)


In [48]:
df

Unnamed: 0,filename,text
0,30779.txt,"National Express Group PLC, Annual Report 201..."
1,30777.txt,25695 19 March 2018 3:29 PM Proof 7 256...
2,30778.txt,Staffline Group plc \nAnnual Report 2017\n En...


## **Preprocessing**

In [49]:
def remove_spaces_links(df):
    df["text"] = df["text"].str.replace("\n", " ")
    df["text"] = df["text"].str.replace(r"\s+", " ")
    df.loc[:,'text'] = df['text'].str.replace(r'\s*https?://\S+(\s+|$)', ' ').str.strip()
    return df

# def replace_special_symbols(df):
#     df["text"] = df["text"].str.replace("@", "")
#     df["text"] = df["text"].str.replace("#", "")
#     df["text"] = df["text"].str.replace("$", "")
#     df["text"] = df["text"].str.replace("~", "")
#     return df

def lower_case_text(df):
    df['text'] = df['text'].str.lower()
    return df

# def lemmatize_text(df):
#     lemmatizer = WordNetLemmatizer()
#     df['text'] = df['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
#     return df


In [50]:
df = remove_spaces_links(df)
# df = replace_special_symbols(df)
df=lower_case_text(df)
# df =lemmatize_text(df)

  df["text"] = df["text"].str.replace(r"\s+", " ")
  df.loc[:,'text'] = df['text'].str.replace(r'\s*https?://\S+(\s+|$)', ' ').str.strip()


In [51]:
df

Unnamed: 0,filename,text
0,30779.txt,"national express group plc, annual report 2017..."
1,30777.txt,25695 19 march 2018 3:29 pm proof 7 25695 19 m...
2,30778.txt,staffline group plc annual report 2017 ena bl ...


## **Generate and save the summaries**

### **LSA summarizer**

In [52]:
def summarize_with_lsa(text, sentences_count=70):
    # Parse the text
    parser = PlaintextParser.from_string(text, Tokenizer("english"))

    # Count the number of sentences in the text
    sentence_count = len(list(parser.document.sentences))

    # Calculate 7% of the sentence count
    sentences_count = int(0.07 * sentence_count)

    # Create an LSA Summarizer
    summarizer = LsaSummarizer()

    # Get the summary
    summary = summarizer(parser.document, sentences_count)

    # Convert the summary to a list of sentences
    summary_sentences = [str(sentence) for sentence in summary]

    # Join the sentences to form the final summary
    final_summary = ' '.join(summary_sentences)

    return final_summary

In [53]:
def summarize_and_save(row, text_column="text", filename_column="filename"):
    """Summarizes the text from a dataframe row and saves it with the specified filename."""

    summary = summarize_with_lsa(row[text_column])  # Summarize using LSA
    filename = row[filename_column]
    summary_filename = f"{filename[:-4]}_summary.txt"

    os.makedirs("toy_gen_sum", exist_ok=True)# will create a gen_sum folder in your current directory of project
    with open(f"toy_gen_sum/{summary_filename}", "w",encoding='utf-8') as f:
        f.write(summary)

# Example usage with dataframe:
for i, row in df.iterrows():
    summarize_and_save(row)  # Pass the entire row to the function

## **Load generated Summaries**

In [54]:
folder_path = 'gen_sum'
data = []

for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r',encoding="utf-8") as f:
            text = f.read()
        data.append({'filename': filename, 'summary': text})

gen_summaries = pd.DataFrame(data)

In [55]:
gen_summaries

Unnamed: 0,filename,summary
0,31474_summary.txt,we apply world-class materials science and man...
1,32340_summary.txt,bring energy to life annual report and account...
2,32333_summary.txt,we have also strengthened the board with the a...
3,31617_summary.txt,"certain figures contained in this document, in..."
4,30943_summary.txt,"since then, it has consistently maintained its..."
...,...,...
358,32480_summary.txt,integrated reporting has become modern best pr...
359,31005_summary.txt,"further explanations can be found in notes 4, ..."
360,32345_summary.txt,"babcock provides skilled, bespoke engineering ..."
361,30785_summary.txt,→ housing operating margins reduced to 5.2% (2...


## **Load gold summaries**

In [56]:
if(use_colab): folder_path = r'/content/unzipped_files/validation/gold_summaries'
else: folder_path = r'toy dataset/gold_summaries'  # Replace with the actual path to your folder

data = []

for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r',encoding="utf-8") as f:
            text = f.read()
        data.append({'filename': filename, 'gold_summary': text})

gold_summaries = pd.DataFrame(data)

In [57]:
gold_summaries

Unnamed: 0,filename,gold_summary
0,30779_4.txt,Driving our business forward through our thre...
1,30777_1.txt,25695 19 March 2018 3:29 PM Proof 7\n02...
2,30778_2.txt,3 Overview Strategic Report Corporate Governa...
3,30778_1.txt,10\nStaffline Group plc Annual Report 2017\nC...
4,30779_2.txt,"1,745\n2,094\n2,321 2017\n2016\n2015\n111.0\n..."
5,30779_1.txt,Our strategy is \ndelivering results\nNation...
6,30777_2.txt,25695 19 March 2018 3:29 PM Proof 7\nFi...
7,30777_3.txt,25695 19 March 2018 3:29 PM Proof 7\nA....
8,30779_3.txt,Delivering \nshareholder value\nDear fellow ...
9,30778_3.txt,6\nStaffline Group plc Annual Report 2017\nA ...


## **Sort by filename**

In [58]:
df_sorted = gold_summaries.sort_values(by='filename')

In [59]:
df_sorted

Unnamed: 0,filename,gold_summary
1,30777_1.txt,25695 19 March 2018 3:29 PM Proof 7\n02...
6,30777_2.txt,25695 19 March 2018 3:29 PM Proof 7\nFi...
7,30777_3.txt,25695 19 March 2018 3:29 PM Proof 7\nA....
3,30778_1.txt,10\nStaffline Group plc Annual Report 2017\nC...
2,30778_2.txt,3 Overview Strategic Report Corporate Governa...
9,30778_3.txt,6\nStaffline Group plc Annual Report 2017\nA ...
5,30779_1.txt,Our strategy is \ndelivering results\nNation...
4,30779_2.txt,"1,745\n2,094\n2,321 2017\n2016\n2015\n111.0\n..."
8,30779_3.txt,Delivering \nshareholder value\nDear fellow ...
0,30779_4.txt,Driving our business forward through our thre...


In [60]:
gen_summaries

Unnamed: 0,filename,summary
0,31474_summary.txt,we apply world-class materials science and man...
1,32340_summary.txt,bring energy to life annual report and account...
2,32333_summary.txt,we have also strengthened the board with the a...
3,31617_summary.txt,"certain figures contained in this document, in..."
4,30943_summary.txt,"since then, it has consistently maintained its..."
...,...,...
358,32480_summary.txt,integrated reporting has become modern best pr...
359,31005_summary.txt,"further explanations can be found in notes 4, ..."
360,32345_summary.txt,"babcock provides skilled, bespoke engineering ..."
361,30785_summary.txt,→ housing operating margins reduced to 5.2% (2...


## **Merge to create single frame**

In [61]:
gen_summaries['base_filename'] = gen_summaries['filename'].str.split('_').str[0]
df_sorted['base_filename'] = df_sorted['filename'].str.split('_').str[0]
merged_df = pd.merge(gen_summaries, df_sorted, on='base_filename', how='inner')
result_df = merged_df[['filename_x', 'filename_y', 'summary', 'gold_summary']]

In [62]:
result_df

Unnamed: 0,filename_x,filename_y,summary,gold_summary
0,30777_summary.txt,30777_1.txt,throughout this report you will see illustrati...,25695 19 March 2018 3:29 PM Proof 7\n02...
1,30777_summary.txt,30777_2.txt,throughout this report you will see illustrati...,25695 19 March 2018 3:29 PM Proof 7\nFi...
2,30777_summary.txt,30777_3.txt,throughout this report you will see illustrati...,25695 19 March 2018 3:29 PM Proof 7\nA....
3,30779_summary.txt,30779_1.txt,"in addition, unless otherwise stated, all pre-...",Our strategy is \ndelivering results\nNation...
4,30779_summary.txt,30779_2.txt,"in addition, unless otherwise stated, all pre-...","1,745\n2,094\n2,321 2017\n2016\n2015\n111.0\n..."
5,30779_summary.txt,30779_3.txt,"in addition, unless otherwise stated, all pre-...",Delivering \nshareholder value\nDear fellow ...
6,30779_summary.txt,30779_4.txt,"in addition, unless otherwise stated, all pre-...",Driving our business forward through our thre...
7,30778_summary.txt,30778_1.txt,key priorities for 2018 are as follows: • cont...,10\nStaffline Group plc Annual Report 2017\nC...
8,30778_summary.txt,30778_2.txt,key priorities for 2018 are as follows: • cont...,3 Overview Strategic Report Corporate Governa...
9,30778_summary.txt,30778_3.txt,key priorities for 2018 are as follows: • cont...,6\nStaffline Group plc Annual Report 2017\nA ...


## **Evaluating using Rouge Score**

In [63]:
from rouge import Rouge
import sys

# Increase recursion limit
sys.setrecursionlimit(10**6)
rouge = Rouge()

def calculate_rouge(row):
    scores = rouge.get_scores(row['summary'], row['gold_summary'])[0]
    return scores

result_df['rouge_scores'] = result_df.apply(calculate_rouge, axis=1)

print(result_df['rouge_scores'])

0    {'rouge-1': {'r': 0.5161290322580645, 'p': 0.1...
1    {'rouge-1': {'r': 0.8163265306122449, 'p': 0.0...
2    {'rouge-1': {'r': 0.5401459854014599, 'p': 0.1...
3    {'rouge-1': {'r': 0.49345650500384913, 'p': 0....
4    {'rouge-1': {'r': 0.6536796536796536, 'p': 0.0...
5    {'rouge-1': {'r': 0.5520169851380042, 'p': 0.1...
6    {'rouge-1': {'r': 0.4782608695652174, 'p': 0.0...
7    {'rouge-1': {'r': 0.35527299925205685, 'p': 0....
8    {'rouge-1': {'r': 0.6415094339622641, 'p': 0.1...
9    {'rouge-1': {'r': 0.4784853700516351, 'p': 0.2...
Name: rouge_scores, dtype: object


In [64]:
rouge_scores = result_df['rouge_scores']
rouge_scores

0    {'rouge-1': {'r': 0.5161290322580645, 'p': 0.1...
1    {'rouge-1': {'r': 0.8163265306122449, 'p': 0.0...
2    {'rouge-1': {'r': 0.5401459854014599, 'p': 0.1...
3    {'rouge-1': {'r': 0.49345650500384913, 'p': 0....
4    {'rouge-1': {'r': 0.6536796536796536, 'p': 0.0...
5    {'rouge-1': {'r': 0.5520169851380042, 'p': 0.1...
6    {'rouge-1': {'r': 0.4782608695652174, 'p': 0.0...
7    {'rouge-1': {'r': 0.35527299925205685, 'p': 0....
8    {'rouge-1': {'r': 0.6415094339622641, 'p': 0.1...
9    {'rouge-1': {'r': 0.4784853700516351, 'p': 0.2...
Name: rouge_scores, dtype: object

## **Calculate Average Rouge Score**

In [67]:
# Initialize accumulated scores
total_rouge_1 = {'r': 0, 'p': 0, 'f': 0}
total_rouge_2 = {'r': 0, 'p': 0, 'f': 0}
total_rouge_l = {'r': 0, 'p': 0, 'f': 0}

# Accumulate scores
for scores_dict in rouge_scores:
    total_rouge_1['r'] += scores_dict['rouge-1']['r']
    total_rouge_1['p'] += scores_dict['rouge-1']['p']
    total_rouge_1['f'] += scores_dict['rouge-1']['f']
    
    total_rouge_2['r'] += scores_dict['rouge-2']['r']
    total_rouge_2['p'] += scores_dict['rouge-2']['p']
    total_rouge_2['f'] += scores_dict['rouge-2']['f']
    
    total_rouge_l['r'] += scores_dict['rouge-l']['r']
    total_rouge_l['p'] += scores_dict['rouge-l']['p']
    total_rouge_l['f'] += scores_dict['rouge-l']['f']

# Calculate average scores
num_scores = len(rouge_scores)

avg_rouge_1 = {key: total_rouge_1[key] / num_scores for key in total_rouge_1}
avg_rouge_2 = {key: total_rouge_2[key] / num_scores for key in total_rouge_2}
avg_rouge_l = {key: total_rouge_l[key] / num_scores for key in total_rouge_l}

print("Average ROUGE-1 scores:", avg_rouge_1)
print("Average ROUGE-2 scores:", avg_rouge_2)
print("Average ROUGE-L scores:", avg_rouge_l)

Average ROUGE-1 scores: {'r': 0.5525283364924449, 'p': 0.16171647839945638, 'f': 0.22423115450922074}
Average ROUGE-2 scores: {'r': 0.28344372765576503, 'p': 0.05946354702270201, 'f': 0.08380787187892189}
Average ROUGE-L scores: {'r': 0.5264932837735914, 'p': 0.1534997855122378, 'f': 0.21281131088835611}


## **Calculate average for csv file generated via Java tool**

In [68]:
import pandas as pd

# Assuming 'your_file.csv' is the name of your CSV file
df = pd.read_csv('toy dataset results.csv') #replace it with yo

# Filter data based on ROUGE types
rouge_1_data = df[df['ROUGE-Type'].str.contains('ROUGE-1')]
rouge_2_data = df[df['ROUGE-Type'].str.contains('ROUGE-2')]
rouge_l_data = df[df['ROUGE-Type'].str.contains('ROUGE-L')]
rouge_su_data = df[df['ROUGE-Type'].str.contains('ROUGE-SU')]

print(type(rouge_1_data))

print(rouge_l_data)

# Calculate averages for each ROUGE type
# print("op",rouge_l_data['Avg_F-Score'])
avg_rouge_1 = rouge_1_data['Avg_F-Score'].mean()
avg_rouge_2 = rouge_2_data['Avg_F-Score'].mean()
avg_rouge_l = rouge_l_data['Avg_F-Score'].mean()
avg_rouge_su = rouge_su_data['Avg_F-Score'].mean()

print(type(avg_rouge_1))

print(avg_rouge_1)

# Print the results
print(f'Average ROUGE-1 F-Score: {avg_rouge_1:.5f}')
print(f'Average ROUGE-2 F-Score: {avg_rouge_2:.5f}')
print(f'Average ROUGE-L F-Score: {avg_rouge_l:.5f}')
print(f'Average ROUGE-SU F-Score: {avg_rouge_su:.5f}')


<class 'pandas.core.frame.DataFrame'>
                 ROUGE-Type  Task Name System Name  Avg_Recall  Avg_Precision  \
0   ROUGE-L+StopWordRemoval      30779       1.TXT     0.32858        0.55514   
1   ROUGE-L+StopWordRemoval      30779       3.TXT     0.15085        0.66910   
2   ROUGE-L+StopWordRemoval      30779       4.TXT     0.11849        0.58537   
3   ROUGE-L+StopWordRemoval      30779       2.TXT     0.07296        1.00000   
16  ROUGE-L+StopWordRemoval      30778       3.TXT     0.29525        0.61216   
17  ROUGE-L+StopWordRemoval      30778       1.TXT     0.44388        0.43166   
18  ROUGE-L+StopWordRemoval      30778       2.TXT     0.11830        0.81818   
28  ROUGE-L+StopWordRemoval      30777       3.TXT     0.14130        0.58592   
29  ROUGE-L+StopWordRemoval      30777       1.TXT     0.21467        0.59848   
30  ROUGE-L+StopWordRemoval      30777       2.TXT     0.06590        0.92381   

    Avg_F-Score  Num Reference Summaries  
0       0.41282            