## **Ignore Warnings**

In [None]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning) # Ignore all UserWarnings
warnings.filterwarnings("ignore", message=".*DeprecationWarning.*") # Ignore warnings containing "DeprecationWarning"

## **Installing required libraries**

In [None]:
!pip install nltk
!pip install sumy
!pip install rouge



## **Downloading required NLTK components**

In [None]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## **Importing Required Libraries**

In [None]:
import numpy as np
import pandas as pd
import os
import nltk
import re
import zipfile
from google.colab import drive
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

## **Connect to drive**

In [None]:
drive.mount('/content/drive')
# Path to the directory containing the dataset on Google Drive
dataset_path = '/content/drive/MyDrive/NLP project/validation.zip' #path to your dataset

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **Unzip to extract files**

In [None]:
# Unzip the uploaded file
zip_file_name = "/content/drive/MyDrive/NLP project/validation.zip"
output_folder = '/content/unzipped_files/'

with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
    zip_ref.extractall(output_folder)

## **Loading Annual Reports in Data Frame**

In [None]:
# Specify the path to the folder containing the text files
folder_path = r'/content/unzipped_files/validation/annual_reports'  # Replace with the actual path to your folder

# Create an empty list to store the data
data = []

# Iterate through each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):  # Check if it's a text file
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding = 'utf-8') as f:
            text = f.read()
        data.append({'filename': filename, 'text': text})

# Create a dataframe from the list of dictionaries
df = pd.DataFrame(data)


In [None]:
df

Unnamed: 0,filename,text
0,31277.txt,Pivoting to \nrenewable fuels\nAnnual report ...
1,31831.txt,Excellence in ventilation\nVolution Group plc...
2,32556.txt,Annual \nReport 2017 Introduction to Biffa Co...
3,32825.txt,Annual \nReport \n2017 We are a leading inte...
4,32061.txt,VAN ELLE HOLDINGS PLC ANNUAL REPORT AND ACCOU...
...,...,...
358,33038.txt,Annual\nReport&\nForm20\n-\nF\n2006 Filename...
359,32376.txt,SCAPA GROUP PLC ANNUAL REPORT AND ACCOUNTS 20...
360,31040.txt,Polypipe Group plc\nAnnual Report and Account...
361,32340.txt,Bring Energy to Life\nAnnual Report \nand Ac...


## **Preprocessing**

In [None]:
def remove_spaces_links(df):
    df["text"] = df["text"].str.replace("\n", " ")
    df["text"] = df["text"].str.replace(r"\s+", " ")
    df.loc[:,'text'] = df['text'].str.replace(r'\s*https?://\S+(\s+|$)', ' ').str.strip()
    return df

# def replace_special_symbols(df):
#     df["text"] = df["text"].str.replace("@", "")
#     df["text"] = df["text"].str.replace("#", "")
#     df["text"] = df["text"].str.replace("$", "")
#     df["text"] = df["text"].str.replace("~", "")
#     return df

def lower_case_text(df):
    df['text'] = df['text'].str.lower()
    return df

# def lemmatize_text(df):
#     lemmatizer = WordNetLemmatizer()
#     df['text'] = df['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
#     return df


In [None]:
df = remove_spaces_links(df)
# df = replace_special_symbols(df)
df=lower_case_text(df)
# df =lemmatize_text(df)

  df["text"] = df["text"].str.replace(r"\s+", " ")
  df.loc[:,'text'] = df['text'].str.replace(r'\s*https?://\S+(\s+|$)', ' ').str.strip()


In [None]:
df

Unnamed: 0,filename,text
0,31277.txt,pivoting to renewable fuels annual report and ...
1,31831.txt,excellence in ventilation volution group plc a...
2,32556.txt,annual report 2017 introduction to biffa conte...
3,32825.txt,annual report 2017 we are a leading integrated...
4,32061.txt,van elle holdings plc annual report and accoun...
...,...,...
358,33038.txt,annual report& form20 - f 2006 filename: 82456...
359,32376.txt,scapa group plc annual report and accounts 201...
360,31040.txt,polypipe group plc annual report and accounts ...
361,32340.txt,bring energy to life annual report and account...


## **Generate and save the summaries**

### **LSA summarizer**

In [None]:
def summarize_with_lsa(text, sentences_count=70):
    # Parse the text
    parser = PlaintextParser.from_string(text, Tokenizer("english"))

    # Count the number of sentences in the text
    sentence_count = len(list(parser.document.sentences))

    # Calculate 7% of the sentence count
    sentences_count = int(0.07 * sentence_count)

    # Create an LSA Summarizer
    summarizer = LsaSummarizer()

    # Get the summary
    summary = summarizer(parser.document, sentences_count)

    # Convert the summary to a list of sentences
    summary_sentences = [str(sentence) for sentence in summary]

    # Join the sentences to form the final summary
    final_summary = ' '.join(summary_sentences)

    return final_summary

In [167]:
def summarize_and_save(row, text_column="text", filename_column="filename"):
    """Summarizes the text from a dataframe row and saves it with the specified filename."""

    summary = summarize_with_lsa(row[text_column])  # Summarize using LSA
    filename = row[filename_column]
    summary_filename = f"{filename[:-4]}_summary.txt"

    os.makedirs("gen_sum", exist_ok=True)
    with open(f"gen_sum/{summary_filename}", "w",encoding='utf-8') as f:
        f.write(summary)

# Example usage with dataframe:
for i, row in df.iterrows():
    summarize_and_save(row)  # Pass the entire row to the function

KeyboardInterrupt: 

## **Load generated Summaries**

In [None]:
folder_path = 'gen_sum'
data = []

for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r',encoding="utf-8") as f:
            text = f.read()
        data.append({'filename': filename, 'summary': text})

gen_summaries = pd.DataFrame(data)

In [None]:
gen_summaries

## **Load gold summaries**

In [None]:
folder_path = '/content/unzipped_files/validation/gold_summaries'
data = []

for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r',encoding="utf-8") as f:
            text = f.read()
        data.append({'filename': filename, 'gold_summary': text})

gold_summaries = pd.DataFrame(data)

In [None]:
gold_summaries

## **Sort by filename**

In [None]:
df_sorted = gold_summaries.sort_values(by='filename')

In [None]:
df_sorted

In [None]:
gen_summaries

## **Merge to create single frame**

In [None]:
gen_summaries['base_filename'] = gen_summaries['filename'].str.split('_').str[0]
df_sorted['base_filename'] = df_sorted['filename'].str.split('_').str[0]
merged_df = pd.merge(gen_summaries, df_sorted, on='base_filename', how='inner')
result_df = merged_df[['filename_x', 'filename_y', 'summary', 'gold_summary']]

In [None]:
result_df

## **Evaluating using Rouge Score**

In [None]:
from rouge import Rouge
import sys

# Increase recursion limit
sys.setrecursionlimit(10**6)
rouge = Rouge()

def calculate_rouge(row):
    scores = rouge.get_scores(row['summary'], row['gold_summary'])[0]
    return scores

result_df['rouge_scores'] = result_df.apply(calculate_rouge, axis=1)

print(result_df['rouge_scores'])

In [None]:
rouge_scores = result_df['rouge_scores']
rouge_scores

## **Calculate Average Rouge Score**

In [None]:
# Initialize accumulated scores
total_rouge_1 = {"recall": 0, "precision": 0, "fscore": 0}
total_rouge_2 = {"recall": 0, "precision": 0, "fscore": 0}
total_rouge_l = {"recall": 0, "precision": 0, "fscore": 0}

# Accumulate scores
for scores_dict in rouge_scores:
    total_rouge_1["recall"] += scores_dict['rouge-1']["recall"]
    total_rouge_1["precision"] += scores_dict['rouge-1']["precision"]
    total_rouge_1["fscore"] += scores_dict['rouge-1']["fscore"]

    total_rouge_2["recall"] += scores_dict['rouge-2']["recall"]
    total_rouge_2["precision"] += scores_dict['rouge-2']["precision"]
    total_rouge_2["fscore"] += scores_dict['rouge-2']["fscore"]

    total_rouge_l["recall"] += scores_dict['rouge-l']["recall"]
    total_rouge_l["precision"] += scores_dict['rouge-l']["precision"]
    total_rouge_l["fscore"] += scores_dict['rouge-l']["fscore"]

# Calculate average scores
num_scores = len(rouge_scores)

avg_rouge_1 = {key: total_rouge_1[key] / num_scores for key in total_rouge_1}
avg_rouge_2 = {key: total_rouge_2[key] / num_scores for key in total_rouge_2}
avg_rouge_l = {key: total_rouge_l[key] / num_scores for key in total_rouge_l}

print("Average ROUGE-1 scores:", avg_rouge_1)
print("Average ROUGE-2 scores:", avg_rouge_2)
print("Average ROUGE-L scores:", avg_rouge_l)

## **Calculate average for csv file generated via Java tool**

In [None]:
import pandas as pd

# Assuming 'your_file.csv' is the name of your CSV file
df = pd.read_csv('/content/results.csv')

# Filter data based on ROUGE types
rouge_1_data = df[df['ROUGE-Type'].str.contains('ROUGE-1')]
rouge_2_data = df[df['ROUGE-Type'].str.contains('ROUGE-2')]
rouge_l_data = df[df['ROUGE-Type'].str.contains('ROUGE-L')]
rouge_su_data = df[df['ROUGE-Type'].str.contains('ROUGE-SU')]

print(type(rouge_1_data))

print(rouge_l_data)

# Calculate averages for each ROUGE type
# print("op",rouge_l_data['Avg_F-Score'])
avg_rouge_1 = rouge_1_data['Avg_F-Score'].mean()
avg_rouge_2 = rouge_2_data['Avg_F-Score'].mean()
avg_rouge_l = rouge_l_data['Avg_F-Score'].mean()
avg_rouge_su = rouge_su_data['Avg_F-Score'].mean()

print(type(avg_rouge_1))

print(avg_rouge_1)

# Print the results
print(f'Average ROUGE-1 F-Score: {avg_rouge_1:.5f}')
print(f'Average ROUGE-2 F-Score: {avg_rouge_2:.5f}')
print(f'Average ROUGE-L F-Score: {avg_rouge_l:.5f}')
print(f'Average ROUGE-SU F-Score: {avg_rouge_su:.5f}')
