Import Dependencies

In [1]:
import os
import pandas as pd
from tqdm import tqdm
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('wordnet')
import torch
from transformers import BartForConditionalGeneration, BartTokenizer
import IPython
from IPython.display import clear_output




[nltk_data] Downloading package punkt to /home/swaraj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/swaraj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

Summaries to DataFrame (result_df)

In [4]:
# Specify the path to the validation folder
folder_path = './annual_reports'

# Get a list of all text files in the folder
text_files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]

# Create an empty list to store DataFrames
dfs = []

# Iterate through each text file and populate the list of DataFrames
for file_name in tqdm(text_files, desc='Processing files', unit='file'):
    file_path = os.path.join(folder_path, file_name)

    # Read the contents of the text file
    with open(file_path, 'r') as file:
        file_contents = file.read()

    # Create a DataFrame for the current file
    df = pd.DataFrame({'File Name': [file_name], 'Contents': [file_contents]})

    # Append the DataFrame to the list
    dfs.append(df)

# Concatenate all DataFrames in the list into a single DataFrame
result_df = pd.concat(dfs, ignore_index=True)

# Display the resulting DataFrame
print(result_df)


Processing files: 100%|███████████████████| 363/363 [00:00<00:00, 1083.19file/s]

     File Name                                           Contents
0    32067.txt   Accrol Group Holdings plc / Annual Report and...
1    32541.txt   The A–Z of \nexperience\nAnnual Report 2017 L...
2    31604.txt   ANNUAL REPORT AND ACCOUNTS 2017\nPURPOSEFUL\n...
3    31061.txt   LoopUp Group plc | Annual Report & Accounts 2...
4    32186.txt   Norcros plc Annual report and accounts 2017\n...
..         ...                                                ...
358  31059.txt   ANNUAL REPORT AND ACCOUNTS 2017\nREGISTERED C...
359  31984.txt   Annual Report  \n& Accounts\nfor the year end...
360  31074.txt   Pillars of growth\nCircassia Pharmaceuticals ...
361  32168.txt    SHAPING A  \nsustainable future\nFlybe Group...
362  32236.txt   annual report & accounts 2017\nSysGroup Plc\n...

[363 rows x 2 columns]





Pre-processing

In [None]:
# Function to preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove links using regular expression
    text = re.sub(r'http\S+|www\S+', '', text)

    # Optionally, you can remove other special characters, numbers, etc. based on your requirements.
    # For example, removing non-alphanumeric characters:
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Remove unnecessary spaces (multiple spaces, leading, and trailing spaces)
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove numbers with length more than 4
    text = re.sub(r'\b\d{5,}\b', '', text)

    return text

# Assuming 'result_df' is your DataFrame with 'Contents' column
result_df['Preprocessed'] = result_df['Contents'].apply(preprocess_text)

# Display the resulting DataFrame
print(result_df)


     File Name                                           Contents  \
0    32067.txt   Accrol Group Holdings plc / Annual Report and...   
1    32541.txt   The A–Z of \nexperience\nAnnual Report 2017 L...   
2    31604.txt   ANNUAL REPORT AND ACCOUNTS 2017\nPURPOSEFUL\n...   
3    31061.txt   LoopUp Group plc | Annual Report & Accounts 2...   
4    32186.txt   Norcros plc Annual report and accounts 2017\n...   
..         ...                                                ...   
358  31059.txt   ANNUAL REPORT AND ACCOUNTS 2017\nREGISTERED C...   
359  31984.txt   Annual Report  \n& Accounts\nfor the year end...   
360  31074.txt   Pillars of growth\nCircassia Pharmaceuticals ...   
361  32168.txt    SHAPING A  \nsustainable future\nFlybe Group...   
362  32236.txt   annual report & accounts 2017\nSysGroup Plc\n...   

                                          Preprocessed  
0    accrol group holdings plc annual report and ac...  
1    the az of experience annual report 2017 landse...  


Save to CSV

In [None]:
result_df.to_csv('output_file.csv', index=False, escapechar='\\')


Load from CSV


In [None]:
import pandas as pd

In [None]:
result_df = pd.read_csv('output_file.csv')
result_df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/output_file.csv'

Stemming and Lemmatization (not required). We have the option to do it but it doesnt generate correct summaries

In [None]:
# Function to apply stemming and lemmatization
def apply_stemming_lemmatization(text):
    # Tokenize the text
    tokens = word_tokenize(text)

    # Apply stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]

    # Apply lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_tokens]

    # Join the tokens back into a string
    processed_text = ' '.join(lemmatized_tokens)

    return processed_text

# Assuming 'result_df' is your DataFrame with 'Preprocessed' column
result_df['Root'] = result_df['Preprocessed'].apply(apply_stemming_lemmatization)

# Display the resulting DataFrame
print(result_df)


Sliding Window Protocol Definaion


In [None]:
# Load the model and tokenizer on GPU if available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Load the model and tokenizer on the selected device
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').to(device)
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

def summarize(text, maxSummarylength=500):
    # Encode the text and summarize
    inputs = tokenizer.encode("summarize: " +
                              text,
                              return_tensors="pt",
                              max_length=1024, truncation=True).to(device)
    summary_ids = model.generate(inputs, max_length=maxSummarylength,
                                 min_length=int(maxSummarylength/5),
                                 length_penalty=10.0,
                                 num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def split_text_into_pieces(text, max_tokens=900, overlapPercent=10):
    # Tokenize the text
    tokens = tokenizer.tokenize(text)

    # Calculate the overlap in tokens
    overlap_tokens = int(max_tokens * overlapPercent / 100)

    # Split the tokens into chunks of size
    # max_tokens with overlap
    pieces = [tokens[i:i + max_tokens]
              for i in range(0, len(tokens),
                             max_tokens - overlap_tokens)]

    # Convert the token pieces back into text
    text_pieces = [tokenizer.decode(
        tokenizer.convert_tokens_to_ids(piece),
        skip_special_tokens=True) for piece in pieces]

    return text_pieces

def recursive_summarize(text, max_length=5000, recursionLevel=0):
    recursionLevel = recursionLevel + 1
    print("######### Recursion level: ",
          recursionLevel, "\n\n######### ")
    tokens = tokenizer.tokenize(text)
    expectedCountOfChunks = len(tokens) / max_length
    max_length = int(len(tokens) / expectedCountOfChunks) + 2

    # Break the text into pieces of max_length
    pieces = split_text_into_pieces(text, max_tokens=max_length)

    print("Number of pieces: ", len(pieces))
    # Summarize each piece
    summaries = []
    k = 0
    for k in range(0, len(pieces)):
        piece = pieces[k]
        print("****************************************************")
        print("Piece:", (k + 1), " out of ", len(pieces), "pieces")
        # print(piece, "\n")
        summary = summarize(piece, maxSummarylength=max_length/3*2)
        print("SUMMARY: ", summary)
        summaries.append(summary)
        print("****************************************************")

    concatenated_summary = ' '.join(summaries)

    tokens = tokenizer.tokenize(concatenated_summary)

    if len(tokens) > max_length:
        # If the concatenated_summary is too long, repeat the process
        print("############# GOING RECURSIVE ##############")
        return recursive_summarize(concatenated_summary,
                                   max_length=max_length,
                                   recursionLevel=recursionLevel)
    else:
        # Concatenate the summaries and summarize again
        final_summary = concatenated_summary
        if len(pieces) > 1:
            final_summary = summarize(concatenated_summary,
                                       maxSummarylength=max_length)

        # Save CSV after processing each row
        # result_df.to_csv('output_file.csv', index=False, escapechar='\\')

        return final_summary


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Assuming 'result_df' is your DataFrame with 'Contents' and 'Preprocessed' columns
tqdm.pandas()

# Specify the starting and ending indices for the range of rows you want to process
start_index = 6
end_index = 10

# Create an empty DataFrame to store the selected summaries
selected_summaries_df = pd.DataFrame(columns=['Selected_Summary'])

# Process each row in the specified range
for index, row in result_df.iloc[start_index:end_index + 1].iterrows():
    # Apply summarization to the 'Preprocessed' column. You can change this to 'Root' if you want to perform it on lemmatized text 
    summary = recursive_summarize(row['Preprocessed'])

    # Append the summary to the selected_summaries_df
    selected_summaries_df = selected_summaries_df.append({'Selected_Summary': summary}, ignore_index=True)

    # Copy 'Selected_Summary' values to 'Summary' column if not NaN
    result_df.loc[index, 'Summary'] = summary if not pd.isna(summary) else result_df.loc[index, 'Summary']

    # Save the CSV file after processing each row
    result_df.to_csv('output_file.csv', index=False, escapechar='\\')

    # Clear the output of the cell
    clear_output(wait=True)

# Display the resulting DataFrame
print(result_df)

# Clear the output in Jupyter Notebook
IPython.display.clear_output(wait=True)


######### Recursion level:  1 

######### 
Number of pieces:  21
****************************************************
Piece: 1  out of  21 pieces
SUMMARY:  S saga plc annual report and accounts for the year ending 31 january 2017. Financial highlights a renewed focus on our customers journey allows us to understand their needs to improve their lives day to day see what weve learnt. underlying profit before tax 1 m 56 0 40 80 120 160 200 17 16 15 1638 1774 1874 profit beforeTax 1 m 97 040 80 120160 200 1716 15 1138 1762 1933 product holding per hac 21 core products available operating cash flow m 222 0 50 100 150 200 250 17 1615 1630 1781 2176 basic earnings per share 2 pence 60 0 4 8 12 16 17 16 16 15 86 133 141 debt ratio net debt to ebitda 176 0 1 2 3 4 16 15 15 ipo 25 31 23 19 dividend per share pence 181 0 2 4 6 8 10 17 16 14 15 41 72 72 85 highaffinity customers hacs 483k 1 profit beforetax excluding derivatives and ogden impact 2 from continuing operations strategic report govern

IndexError: index out of range in self

In [None]:
result_df.head()