## **Ignore Warnings**

In [26]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning) # Ignore all UserWarnings
warnings.filterwarnings("ignore", message=".*DeprecationWarning.*") # Ignore warnings containing "DeprecationWarning"

## **Installing required libraries**

In [27]:
!pip install nltk
!pip install sumy
!pip install rouge



## **Importing Required Libraries**

In [2]:
import numpy as np
import pandas as pd
import os
import nltk
import re
import zipfile
# from google.colab import drive             #UNCOMMENT IF USING DRIVE
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

## **Downloading required NLTK components**

In [29]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/itadmin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/itadmin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/itadmin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## **Connect to drive**

Run this cell only if you are running it on Colab and ensure the zip file of validation dataset is uploaded on colab

In [30]:
use_colab=False

In [31]:
# drive.mount('/content/drive')
# # Path to the directory containing the dataset on Google Drive
# dataset_path = '/content/drive/MyDrive/NLP project/validation.zip' #path to your dataset
# use_colab=True

## **Unzip to extract files**

In [32]:
if(use_colab):
    # Unzip the uploaded file
    zip_file_name = "/content/drive/MyDrive/NLP project/validation.zip" # replace with the folder path on drive
    output_folder = '/content/unzipped_files/'

    with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
        zip_ref.extractall(output_folder)

## **Loading Annual Reports in Data Frame**

In [33]:
# Specify the path to the folder containing the text files
if(use_colab): folder_path = r'/content/unzipped_files/validation/annual_reports'
else: folder_path = r'validation/annual_reports'  # Replace with the actual path to your folder

# Create an empty list to store the data
data = []

# Iterate through each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):  # Check if it's a text file
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding = 'utf-8') as f:
            text = f.read()
        data.append({'filename': filename, 'text': text})

# Create a dataframe from the list of dictionaries
df = pd.DataFrame(data)


In [34]:
df

Unnamed: 0,filename,text
0,32561.txt,Interactive PDF\nUser guide\nThis PDF allows ...
1,30858.txt,Annual Report and Accounts 2017\nEmpowering \...
2,32148.txt,Connecting \neverybody to \nlive a better \n...
3,32657.txt,Unquenchable \nthirst for \nimprovement\nA.G....
4,32773.txt,Circle Property Plc Annual Report and Account...
...,...,...
358,30820.txt,Delivering \nquality\nAnnual Report and Acco...
359,30927.txt,Ibstock plc Annual Report and Accounts 2017\n...
360,32176.txt,Annual Report 2017\nTate & Lyle Annual Report...
361,31654.txt,easyHotel plc\nAccelerated \ngrowth \nAnnual...


## **Preprocessing**

In [35]:
def remove_spaces_links(df):
    df["text"] = df["text"].str.replace("\n", " ")
    df["text"] = df["text"].str.replace(r"\s+", " ")
    df.loc[:,'text'] = df['text'].str.replace(r'\s*https?://\S+(\s+|$)', ' ').str.strip()
    return df

def replace_special_symbols(df):
    df["text"] = df["text"].str.replace("@", "")
    df["text"] = df["text"].str.replace("#", "")
    df["text"] = df["text"].str.replace("$", "")
    df["text"] = df["text"].str.replace("~", "")
    return df

def lower_case_text(df):
    df['text'] = df['text'].str.lower()
    return df

# def lemmatize_text(df):
#     lemmatizer = WordNetLemmatizer()
#     df['text'] = df['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
#     return df


In [36]:
df = remove_spaces_links(df)
df = replace_special_symbols(df)
df=lower_case_text(df)
# df =lemmatize_text(df)

  df["text"] = df["text"].str.replace(r"\s+", " ")
  df.loc[:,'text'] = df['text'].str.replace(r'\s*https?://\S+(\s+|$)', ' ').str.strip()
  df["text"] = df["text"].str.replace("$", "")


In [37]:
df

Unnamed: 0,filename,text
0,32561.txt,interactive pdf user guide this pdf allows you...
1,30858.txt,annual report and accounts 2017 empowering the...
2,32148.txt,connecting everybody to live a better today an...
3,32657.txt,unquenchable thirst for improvement a.g. barr ...
4,32773.txt,circle property plc annual report and accounts...
...,...,...
358,30820.txt,delivering quality annual report and accounts ...
359,30927.txt,ibstock plc annual report and accounts 2017 de...
360,32176.txt,annual report 2017 tate & lyle annual report 2...
361,31654.txt,easyhotel plc accelerated growth annual report...


In [6]:
def count_sentences(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
        sentences = nltk.sent_tokenize(text)
        return len(sentences)

def average_sentences(folder_path):
    total_sentences = 0
    total_files = 0

    for file_name in os.listdir(folder_path):
        if file_name.endswith('.txt'):
            file_path = os.path.join(folder_path, file_name)
            total_sentences += count_sentences(file_path)
            total_files += 1

    if total_files == 0:
        return 0

    return total_sentences / total_files

folder_path = 'validation/gold_summaries'
average = average_sentences(folder_path)
print(f"Average number of sentences per file: {average}")


Average number of sentences per file: 42.9632


## **Generate and save the summaries**

### **LSA summarizer**

In [38]:
def summarize_with_lsa(text, sentences_count=50):
    # Parse the text
    parser = PlaintextParser.from_string(text, Tokenizer("english"))

    # # Count the number of sentences in the text
    # sentence_count = len(list(parser.document.sentences))

    # # Calculate 7% of the sentence count
    # sentences_count = int(0.021 * sentence_count)

    # Create an LSA Summarizer
    summarizer = LsaSummarizer()

    # Get the summary
    summary = summarizer(parser.document, sentences_count)

    # Convert the summary to a list of sentences
    summary_sentences = [str(sentence) for sentence in summary]

    # Join the sentences to form the final summary
    final_summary = ' '.join(summary_sentences)

    return final_summary

In [39]:
def summarize_and_save(row, text_column="text", filename_column="filename"):
    """Summarizes the text from a dataframe row and saves it with the specified filename."""

    summary = summarize_with_lsa(row[text_column])  # Summarize using LSA
    filename = row[filename_column]
    summary_filename = f"{filename[:-4]}_summary.txt"

    os.makedirs("gen_50_sum", exist_ok=True)# will create a gen_sum folder in your current directory of project
    with open(f"gen_50_sum/{summary_filename}", "w",encoding='utf-8') as f:
        f.write(summary)

# Example usage with dataframe:
for i, row in df.iterrows():
    summarize_and_save(row)  # Pass the entire row to the function

## **Load generated Summaries**

In [40]:
folder_path = 'gen_50_sum'
data = []

for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r',encoding="utf-8") as f:
            text = f.read()
        data.append({'filename': filename, 'summary': text})

gen_summaries = pd.DataFrame(data)

In [41]:
gen_summaries

Unnamed: 0,filename,summary
0,31474_summary.txt,"we work in the electronics, energy, healthcare..."
1,32340_summary.txt,you can read more about these on page 8. i am ...
2,32333_summary.txt,our global values put customers at the heart o...
3,31617_summary.txt,"certain figures contained in this document, in..."
4,30943_summary.txt,"since then, it has consistently maintained its..."
...,...,...
358,32480_summary.txt,"for example, we met our annual leakage reducti..."
359,31005_summary.txt,"further explanations can be found in notes 4, ..."
360,32345_summary.txt,long‑term successful relationships with our cu...
361,30785_summary.txt,this has been most evident recently where we h...


## **Load gold summaries**

In [42]:
if(use_colab): folder_path = r'/content/unzipped_files/validation/gold_summaries'
else: folder_path = r'validation/gold_summaries'  # Replace with the actual path to your folder

data = []

for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r',encoding="utf-8") as f:
            text = f.read()
        data.append({'filename': filename, 'gold_summary': text})

gold_summaries = pd.DataFrame(data)

In [43]:
gold_summaries

Unnamed: 0,filename,gold_summary
0,31277_2.txt,2 Velocys plc Annual report and accounts 2017...
1,32939_1.txt,16\nmanx telecom plc\nannual report and accou...
2,30887_1.txt,“BAE Systems delivered a \ngood performance ...
3,32097_2.txt,25361.02 13-6-17 Proof Four\nUNDERLYING...
4,31005_2.txt,Building sustainable growth\nCommercial highl...
...,...,...
1245,32025_3.txt,CHAIRMAN’S STATEMENT\nOverview\nIn my last ye...
1246,31064_3.txt,Our Business \nThe Group has performed well a...
1247,32236_1.txt,SysGroup Plc Annual Report & Accounts 2017 9...
1248,32848_2.txt,* Underlying results are stated before \nacqu...


## **Sort by filename**

In [44]:
df_sorted = gold_summaries.sort_values(by='filename')

In [45]:
df_sorted

Unnamed: 0,filename,gold_summary
188,30777_1.txt,25695 19 March 2018 3:29 PM Proof 7\n02...
741,30777_2.txt,25695 19 March 2018 3:29 PM Proof 7\nFi...
886,30777_3.txt,25695 19 March 2018 3:29 PM Proof 7\nA....
509,30778_1.txt,10\nStaffline Group plc Annual Report 2017\nC...
290,30778_2.txt,3 Overview Strategic Report Corporate Governa...
...,...,...
1167,33154_3.txt,Dixons Carphone plc Annual Report and Accoun...
463,33155_1.txt,"IN CONVERSATION WITH \nDAVID ATKINS, CHIEF EX..."
226,33155_2.txt,HIGHLIGHTS\n2015 overview\nPortfolio value\n(...
560,33155_3.txt,Setting Hammerson’s \nvalues and standards\nC...


In [46]:
gen_summaries

Unnamed: 0,filename,summary
0,31474_summary.txt,"we work in the electronics, energy, healthcare..."
1,32340_summary.txt,you can read more about these on page 8. i am ...
2,32333_summary.txt,our global values put customers at the heart o...
3,31617_summary.txt,"certain figures contained in this document, in..."
4,30943_summary.txt,"since then, it has consistently maintained its..."
...,...,...
358,32480_summary.txt,"for example, we met our annual leakage reducti..."
359,31005_summary.txt,"further explanations can be found in notes 4, ..."
360,32345_summary.txt,long‑term successful relationships with our cu...
361,30785_summary.txt,this has been most evident recently where we h...


## **Merge to create single frame**

In [47]:
gen_summaries['base_filename'] = gen_summaries['filename'].str.split('_').str[0]
df_sorted['base_filename'] = df_sorted['filename'].str.split('_').str[0]
merged_df = pd.merge(gen_summaries, df_sorted, on='base_filename', how='inner')
result_df = merged_df[['filename_x', 'filename_y', 'summary', 'gold_summary']]

In [48]:
result_df

Unnamed: 0,filename_x,filename_y,summary,gold_summary
0,31474_summary.txt,31474_1.txt,"we work in the electronics, energy, healthcare...",We continue to make good progress with the \n...
1,31474_summary.txt,31474_2.txt,"we work in the electronics, energy, healthcare...","1. Throughout the Annual Report, including th..."
2,31474_summary.txt,31474_3.txt,"we work in the electronics, energy, healthcare...",Financial performance showed progress with \n...
3,32340_summary.txt,32340_1.txt,you can read more about these on page 8. i am ...,Chief Executive’s review\nThe past year was a...
4,32340_summary.txt,32340_2.txt,you can read more about these on page 8. i am ...,1. UK Electricity Transmission 29\n2. UK Gas ...
...,...,...,...,...
1245,30785_summary.txt,30785_3.txt,this has been most evident recently where we h...,Chairman’s statement\n“ We continue to take a...
1246,30785_summary.txt,30785_5.txt,this has been most evident recently where we h...,"Q&A with CEO, David Miles\n92%\nof tenants ra..."
1247,32556_summary.txt,32556_1.txt,• due to its diverse operations and operating ...,Strategic Report\nwww.biffa.co.uk\n10\nBiffa ...
1248,32556_summary.txt,32556_2.txt,• due to its diverse operations and operating ...,FY17 Net Revenue\n1\n (%)\n£898.8m\nFY17 Unde...


## **Evaluating using Rouge Score**

In [49]:
from rouge import Rouge
import sys

# Increase recursion limit
sys.setrecursionlimit(10**6)
rouge = Rouge()

def calculate_rouge(row):
    scores = rouge.get_scores(row['summary'], row['gold_summary'])[0]
    return scores

result_df['rouge_scores'] = result_df.apply(calculate_rouge, axis=1)

print(result_df['rouge_scores'])

0       {'rouge-1': {'r': 0.3155487804878049, 'p': 0.2...
1       {'rouge-1': {'r': 0.41954022988505746, 'p': 0....
2       {'rouge-1': {'r': 0.3247863247863248, 'p': 0.0...
3       {'rouge-1': {'r': 0.29500580720092917, 'p': 0....
4       {'rouge-1': {'r': 0.2802056555269923, 'p': 0.1...
                              ...                        
1245    {'rouge-1': {'r': 0.31004366812227074, 'p': 0....
1246    {'rouge-1': {'r': 0.3740831295843521, 'p': 0.2...
1247    {'rouge-1': {'r': 0.31911532385466035, 'p': 0....
1248    {'rouge-1': {'r': 0.3191489361702128, 'p': 0.1...
1249    {'rouge-1': {'r': 0.301707779886148, 'p': 0.21...
Name: rouge_scores, Length: 1250, dtype: object


In [50]:
rouge_scores = result_df['rouge_scores']
rouge_scores

0       {'rouge-1': {'r': 0.3155487804878049, 'p': 0.2...
1       {'rouge-1': {'r': 0.41954022988505746, 'p': 0....
2       {'rouge-1': {'r': 0.3247863247863248, 'p': 0.0...
3       {'rouge-1': {'r': 0.29500580720092917, 'p': 0....
4       {'rouge-1': {'r': 0.2802056555269923, 'p': 0.1...
                              ...                        
1245    {'rouge-1': {'r': 0.31004366812227074, 'p': 0....
1246    {'rouge-1': {'r': 0.3740831295843521, 'p': 0.2...
1247    {'rouge-1': {'r': 0.31911532385466035, 'p': 0....
1248    {'rouge-1': {'r': 0.3191489361702128, 'p': 0.1...
1249    {'rouge-1': {'r': 0.301707779886148, 'p': 0.21...
Name: rouge_scores, Length: 1250, dtype: object

## **Calculate Average Rouge Score**

In [51]:
# Initialize accumulated scores
total_rouge_1 = {'r': 0, 'p': 0, 'f': 0}
total_rouge_2 = {'r': 0, 'p': 0, 'f': 0}
total_rouge_l = {'r': 0, 'p': 0, 'f': 0}

# Accumulate scores
for scores_dict in rouge_scores:
    total_rouge_1['r'] += scores_dict['rouge-1']['r']
    total_rouge_1['p'] += scores_dict['rouge-1']['p']
    total_rouge_1['f'] += scores_dict['rouge-1']['f']
    
    total_rouge_2['r'] += scores_dict['rouge-2']['r']
    total_rouge_2['p'] += scores_dict['rouge-2']['p']
    total_rouge_2['f'] += scores_dict['rouge-2']['f']
    
    total_rouge_l['r'] += scores_dict['rouge-l']['r']
    total_rouge_l['p'] += scores_dict['rouge-l']['p']
    total_rouge_l['f'] += scores_dict['rouge-l']['f']

# Calculate average scores
num_scores = len(rouge_scores)

avg_rouge_1 = {key: total_rouge_1[key] / num_scores for key in total_rouge_1}
avg_rouge_2 = {key: total_rouge_2[key] / num_scores for key in total_rouge_2}
avg_rouge_l = {key: total_rouge_l[key] / num_scores for key in total_rouge_l}

print("Average ROUGE-1 scores:", avg_rouge_1)
print("Average ROUGE-2 scores:", avg_rouge_2)
print("Average ROUGE-L scores:", avg_rouge_l)

Average ROUGE-1 scores: {'r': 0.3741334800701818, 'p': 0.18614018763328846, 'f': 0.2272364107556849}
Average ROUGE-2 scores: {'r': 0.1284550714805288, 'p': 0.06091093955358742, 'f': 0.07156293941774251}
Average ROUGE-L scores: {'r': 0.34539175254106946, 'p': 0.17088887929771346, 'f': 0.20869331539647543}


## **Calculate average for csv file generated via Java tool**

In [56]:
import pandas as pd

# Assuming 'your_file.csv' is the name of your CSV file
df = pd.read_csv('results_50.csv')

# Filter data based on ROUGE types
rouge_1_data = df[df['ROUGE-Type'].str.contains('ROUGE-1')]
rouge_2_data = df[df['ROUGE-Type'].str.contains('ROUGE-2')]
rouge_l_data = df[df['ROUGE-Type'].str.contains('ROUGE-L')]
rouge_su_data = df[df['ROUGE-Type'].str.contains('ROUGE-SU')]

print(type(rouge_1_data))

print(rouge_l_data)

# Calculate averages for each ROUGE type
# print("op",rouge_l_data['Avg_F-Score'])
avg_rouge_1 = rouge_1_data['Avg_F-Score'].mean()
avg_rouge_2 = rouge_2_data['Avg_F-Score'].mean()
avg_rouge_l = rouge_l_data['Avg_F-Score'].mean()
avg_rouge_su = rouge_su_data['Avg_F-Score'].mean()

print(type(avg_rouge_1))

print(avg_rouge_1)

# Print the results
print(f'Average ROUGE-1 F-Score: {avg_rouge_1:.5f}')
print(f'Average ROUGE-2 F-Score: {avg_rouge_2:.5f}')
print(f'Average ROUGE-L F-Score: {avg_rouge_l:.5f}')
print(f'Average ROUGE-SU F-Score: {avg_rouge_su:.5f}')


<class 'pandas.core.frame.DataFrame'>
                            ROUGE-Type  Task Name  System Name  Avg_Recall  \
0     ROUGE-L+StopWordRemoval+Stemming      33090  SUMMARY.TXT     0.73464   
4     ROUGE-L+StopWordRemoval+Stemming      30817  SUMMARY.TXT     0.50286   
8     ROUGE-L+StopWordRemoval+Stemming      30816  SUMMARY.TXT     0.65796   
12    ROUGE-L+StopWordRemoval+Stemming      30819  SUMMARY.TXT     0.54218   
16    ROUGE-L+StopWordRemoval+Stemming      33097  SUMMARY.TXT     0.36671   
...                                ...        ...          ...         ...   
1408  ROUGE-L+StopWordRemoval+Stemming      31127  SUMMARY.TXT     0.47277   
1412  ROUGE-L+StopWordRemoval+Stemming      30830  SUMMARY.TXT     0.43723   
1416  ROUGE-L+StopWordRemoval+Stemming      30954  SUMMARY.TXT     0.49718   
1420  ROUGE-L+StopWordRemoval+Stemming      31008  SUMMARY.TXT     0.40217   
1424  ROUGE-L+StopWordRemoval+Stemming      32217  SUMMARY.TXT     0.71432   

      Avg_Precision  Avg_