### Text Summarization of 10K Reports
### Step 1: PDF to Text Conversion
### Step 2: Text preprocessing and Splitting to separate sections
### Step 3: Text Summarization Models (GPT, BERT) - ROUGE score


In [None]:
!pip install pdfplumber


Collecting pdfplumber
  Downloading pdfplumber-0.11.0-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.4/56.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.28.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m89.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pypdfium2, pdfminer.six, pdfplumber
Successfully installed pdfminer.six-20231228 pdfplumber-0.11.0 pypdfium2-4.28.0


In [3]:
# Mount Google Drive at /content/drive; the data paths used later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### SPLIT SECTIONS AND STORE IN DF

In [None]:
def contains_pair(line):
    """Tell whether a line mentions a 10-K item number together with its heading keyword.

    Both halves of a pair are matched as case-insensitive substrings anywhere in
    the line, so this is a loose heuristic rather than a strict heading parser
    (for example the number "1" also matches inside "10").

    Args:
        line: One line of report text.

    Returns:
        True when at least one (item number, keyword) pair is fully contained
        in the line, otherwise False.
    """
    # (item number, keyword expected in that item's heading). Lettered items are
    # listed in both the "1a" and the "1(a)" spelling.
    item_keywords = (
        ("1", "business"), ("1a", "risk"), ("1b", "staff"), ("2", "properties"),
        ("3", "legal"), ("4", "safety"), ("5", "market"), ("6", "reserved"),
        ("7", "management"), ("7a", "disclosure"), ("8", "financial"),
        ("9", "changes"), ("9a", "control"), ("9b", "other"), ("9c", "disclosure"),
        ("10", "director"), ("11", "executive"), ("12", "security"),
        ("13", "certain"), ("14", "principal"), ("15", "exhibits"), ("16", "form"),
        ("1(a)", "risk"), ("1(b)", "staff"), ("7(a)", "disclosure"),
        ("9(a)", "control"), ("9(b)", "other"), ("9(c)", "disclosure"),
    )

    # Lowercase once; every entry in the table is already lowercase.
    lowered = line.lower()
    return any(number in lowered and keyword in lowered
               for number, keyword in item_keywords)


In [None]:
# Extract the relevant sections of 10-K reports, based on the layout patterns that are typical of reports across multiple companies.
# Reports that do not follow these typical patterns cannot be extracted as-is and would require company-specific customisation.

import os
import pdfplumber
import pandas as pd

def extract_text_and_split_sections(pdf_path):
    """Split the text of a 10-K PDF into its "Item" sections.

    The PDF is read line by line. Extraction is switched on by a two-step
    trigger: first a line starting with "part iv" must be seen, then the next
    line starting with "part i" (both case-insensitive) enables collection.
    Presumably this skips the table of contents, which lists all parts before
    the real Part I begins.

    Once enabled, a line that starts with "Item"/"ITEM" and satisfies
    ``contains_pair`` opens a new section, keyed by the text before the first
    '.' of that line. Every other line is appended to the section currently
    open. Lines starting with "part iv" are never stored.

    Note: if a heading with an already-used key shows up again, the earlier
    content stored under that key is replaced.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A DataFrame with one row per section and the columns 'Item Number'
        and 'Section Text' (section lines joined with newlines). If no section
        was found the DataFrame is empty and has no columns.
    """
    sections = {}
    current_section = None
    start_extraction = False  # True once the real "Part I" has been reached
    part_iv_found = False

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # A page without extractable text (e.g. a scanned image) may give
            # None instead of a string; treat it as empty rather than crashing.
            page_text = page.extract_text() or ""

            for line in page_text.split('\n'):
                lowered = line.lower()

                # "Part IV" arms the trigger and is itself never stored.
                if lowered.startswith('part iv'):
                    part_iv_found = True
                    continue

                # The first "Part I..." line after "Part IV" starts extraction.
                if part_iv_found and lowered.startswith('part i'):
                    start_extraction = True

                if not start_extraction:
                    continue

                is_heading = line.startswith(('Item', 'ITEM')) and contains_pair(line)
                if is_heading:
                    # New section, keyed by e.g. "Item 1A"; the heading line is kept as its first line.
                    current_section = line.split('.')[0].strip()
                    sections[current_section] = [line]
                elif current_section:
                    sections[current_section].append(line)

    # One record per section, in the order the sections were first opened.
    data = [{'Item Number': item_number, 'Section Text': '\n'.join(section_content)}
            for item_number, section_content in sections.items()]

    return pd.DataFrame(data)


In [None]:
def naming_df(pdf_path):
    """Derive a DataFrame name from a PDF path.

    The file name is taken without its directory and extension, spaces and
    periods become underscores, a leading underscore is added when the result
    starts with a digit, and the suffix "_df" is appended.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The derived name, e.g. "Walmart_Inc__20230131_df".
    """
    stem, _extension = os.path.splitext(os.path.basename(pdf_path))

    # Map both separator characters to "_" in a single pass.
    safe_stem = stem.translate(str.maketrans({" ": "_", ".": "_"}))

    # A name may not begin with a digit, so such names get a "_" in front.
    prefix = "_" if safe_stem[0].isdigit() else ""
    return f"{prefix}{safe_stem}_df"


#### Text Summarization

#### GPT2 Model

In [2]:
import os
import pandas as pd


In [3]:
# to suppress future warning from scikit-learn's KMeans clustering algorithm
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


In [4]:
!pip install bert-extractive-summarizer
from summarizer import TransformerSummarizer

Collecting bert-extractive-summarizer
  Downloading bert_extractive_summarizer-0.10.1-py3-none-any.whl (25 kB)
Installing collected packages: bert-extractive-summarizer
Successfully installed bert-extractive-summarizer-0.10.1


In [None]:
# Build the summarizer object from bert-extractive-summarizer, configured for the GPT2 transformer type with the "gpt2" model key.
gpt2_summarizer  = TransformerSummarizer(transformer_type="GPT2",transformer_model_key="gpt2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Size limit handed to split_content for each chunk of section text.
# NOTE(review): split_content is not defined in the visible cells of this
# notebook; it must be defined before gpt2_summary is called. Its exact unit
# (tokens vs. characters) should be confirmed against that definition.
GPT2_INPUT_TOKEN_LIMIT = 12288

def gpt2_summary(df, gpt2_summarizer):
    """Add a 'GPT2 Summary' column holding a summary of each report section.

    The item number is read from the first column and the section text from
    the second column (both by position). Rows whose item number contains one
    of the markers for item 9 and beyond are left without a summary. Every
    other section is cut into sub-sections with ``split_content``, each
    sub-section is summarised, and the partial summaries are joined with
    single spaces.

    Args:
        df: DataFrame with the item number in column 0 and the section text in
            column 1. It is modified in place.
        gpt2_summarizer: Callable invoked as
            ``gpt2_summarizer(text, min_length=50)`` that returns the summary
            string.

    Returns:
        The same DataFrame object, with the 'GPT2 Summary' column filled for
        the processed rows.
    """
    # Substring markers of the items that are not summarised (item 9 and beyond).
    skipped_items = ["9", "9A", "9B", "9C", "10", "11", "12", "13", "14", "15", "16"]

    for position in range(len(df)):
        item_number = df.iloc[position, 0]
        if any(item in item_number for item in skipped_items):
            continue

        content = df.iloc[position, 1]

        # Split content into sub-sections respecting the input limit.
        sub_sections = split_content(content, GPT2_INPUT_TOKEN_LIMIT)
        partial_summaries = [gpt2_summarizer(sub_section, min_length=50)
                             for sub_section in sub_sections]

        # Rows are read by position, so translate the position into the row's
        # label before writing. Writing to .loc[position] would hit the wrong
        # row (or create a new one) whenever the index is not 0..n-1.
        row_label = df.index[position]
        df.loc[row_label, 'GPT2 Summary'] = ' '.join(partial_summaries)

    return df

In [None]:
import os

# Folder on the mounted Drive whose contents are listed below
directory = "/content/drive/MyDrive/PLP Project/updated/focus"

# Names of all entries in that folder, in the order os.listdir reports them
files = os.listdir(directory)

# Print a heading followed by one entry name per line
print("\n".join(["Files in the directory:", *files]))


Files in the directory:
UnitedHealth_Group_Incorporated_20221231_df.json
Walmart_Inc__20230131_df.json
Walmart_Inc__20220131_df.json
Walmart_Inc__20210131_df.json
Verizon_Communications_Inc__20231231_df.json
Verizon_Communications_Inc__20221231_df.json
Verizon_Communications_Inc__20211231_df.json
UnitedHealth_Group_Incorporated_20231231_df.json
UnitedHealth_Group_Incorporated_20211231_df.json
United_Airlines_Holdings,_Inc__20231231_df.json
United_Airlines_Holdings,_Inc__20221231_df.json
United_Airlines_Holdings,_Inc__20211231_df.json


In [None]:
# To loop through JSON Files and generate GPT2 summary without fine-tuning

# Folder that holds one JSON file per report
json_directory = "/content/drive/MyDrive/PLP Project/updated/focus"

# Loop through all entries in the directory
for file_name in os.listdir(json_directory):
    # Construct the full path to the json file
    json_file_path = os.path.join(json_directory, file_name)

    # Only regular *.json files are reports; a sub-folder or any other stray
    # file in the directory would make pd.read_json fail.
    if not (os.path.isfile(json_file_path) and file_name.lower().endswith(".json")):
        continue

    df = pd.read_json(json_file_path)

    # gpt2_summary fills the 'GPT2 Summary' column of df in place and returns the same frame
    df_gpt2 = gpt2_summary (df, gpt2_summarizer)
    df_name = os.path.splitext(file_name)[0]

    # NOTE: the result is written back over the input file
    df_gpt2.to_json(json_file_path, orient='records')
    print(f"DataFrame '{df_name}' saved to: {json_file_path}")


DataFrame 'UnitedHealth_Group_Incorporated_20221231_df' saved to: /content/drive/MyDrive/PLP Project/updated/focus/UnitedHealth_Group_Incorporated_20221231_df.json
DataFrame 'Walmart_Inc__20230131_df' saved to: /content/drive/MyDrive/PLP Project/updated/focus/Walmart_Inc__20230131_df.json
DataFrame 'Walmart_Inc__20220131_df' saved to: /content/drive/MyDrive/PLP Project/updated/focus/Walmart_Inc__20220131_df.json
DataFrame 'Walmart_Inc__20210131_df' saved to: /content/drive/MyDrive/PLP Project/updated/focus/Walmart_Inc__20210131_df.json
DataFrame 'Verizon_Communications_Inc__20231231_df' saved to: /content/drive/MyDrive/PLP Project/updated/focus/Verizon_Communications_Inc__20231231_df.json
DataFrame 'Verizon_Communications_Inc__20221231_df' saved to: /content/drive/MyDrive/PLP Project/updated/focus/Verizon_Communications_Inc__20221231_df.json
DataFrame 'Verizon_Communications_Inc__20211231_df' saved to: /content/drive/MyDrive/PLP Project/updated/focus/Verizon_Communications_Inc__2021123

In [None]:
from transformers import GPT2Tokenizer
# Adjust report_item_col to view the different items of the report.
# Despite its name, the value is used below as the positional ROW index into df.
report_item_col = 1

# Load GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# NOTE(review): this cell relies on a `df` that already exists in the kernel, and
# on positional columns 1 (original text) and 2 (summary text) - confirm the
# column layout of the frame being inspected.
original_text = df.iloc[report_item_col, 1]
summary_text = df.iloc[report_item_col, 2]

# Tokenize both texts with the GPT-2 tokenizer
tokens_column_1 = tokenizer.tokenize(original_text)
tokens_column_2 = tokenizer.tokenize(summary_text)

# Report the token counts, then show both texts in full
print("Number of tokens in original text:", len(tokens_column_1))
print("Number of tokens in summary text:", len(tokens_column_2))
print("Original Text:", original_text, "Summary Text:", summary_text, sep="\n")

Number of tokens in original text: 12689
Number of tokens in summary text: 2207
Original Text:
Item 1A. Risk Factors
The Company’s business, reputation, results of operations and financial condition, as well as the price of the Company’s stock, can be
affected by a number of factors, whether currently known or unknown, including those described below. When any one or more of these
risks materialize from time to time, the Company’s business, reputation, results of operations and financial condition, as well as the price
of the Company’s stock, can be materially and adversely affected.
Because of the following factors, as well as other factors affecting the Company’s results of operations and financial condition, past
financial performance should not be considered to be a reliable indicator of future performance, and investors should not use historical
trends to anticipate results or trends in future periods. This discussion of risk factors contains forward-looking statements.
This secti

#### Generating Rouge Scores against Gemini Summary

In [None]:
# Installing packages
!pip install rouge-score

import os
import pandas as pd
from openpyxl import Workbook, load_workbook
from rouge_score import rouge_scorer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

import nltk
# Download the NLTK data packages 'punkt' and 'stopwords'; the ROUGE preprocessing below
# calls word_tokenize and stopwords.words('english').
nltk.download('punkt')
nltk.download('stopwords')

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=7f28e8a087540eb4b70336b0438cb3601dce19c38c77827cc4d1e557fcd38122
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Preprocessing summaries before comparison

def rouge_preprocessing(summary_text):
    """Normalise a summary before it is scored.

    The text is lowercased, every character that is neither a word character
    nor whitespace is removed, the remainder is tokenized with
    ``word_tokenize``, English stop words are dropped, and the surviving
    tokens are joined with single spaces. No stemming or lemmatization is
    applied here.

    Args:
        summary_text: The summary as a string.

    Returns:
        The normalised summary as a single space-separated string.
    """
    # Lowercase first, then strip punctuation in one substitution.
    cleaned_text = re.sub(r'[^\w\s]', '', summary_text.lower())
    tokens = word_tokenize(cleaned_text)

    # A set gives constant-time membership checks while filtering.
    english_stop_words = set(stopwords.words('english'))
    kept_tokens = (token for token in tokens if token not in english_stop_words)

    return ' '.join(kept_tokens)

In [None]:
# Generate rouge scores from summaries
from rouge_score import rouge_scorer

def generate_rouge_scores(reference_summary, candidate_summary):
    """Score a candidate summary against a reference summary with ROUGE.

    ROUGE-1, ROUGE-2 and ROUGE-L are computed with stemming enabled.

    Args:
        reference_summary: The summary treated as ground truth.
        candidate_summary: The summary being evaluated.

    Returns:
        A 9-tuple in the order
        (rouge1 precision, rouge1 recall, rouge1 fmeasure,
         rouge2 precision, rouge2 recall, rouge2 fmeasure,
         rougeL precision, rougeL recall, rougeL fmeasure).
    """
    metrics = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True)

    # RougeScorer.score takes (target, prediction). The reference has to be the
    # target and the candidate the prediction, otherwise precision and recall
    # come out swapped.
    scores = scorer.score(reference_summary, candidate_summary)

    # Flatten to precision, recall, fmeasure for each metric, in metric order.
    return tuple(
        getattr(scores[metric], field)
        for metric in metrics
        for field in ('precision', 'recall', 'fmeasure')
    )


In [None]:
def gpt2_zeroshot_rouge_scores(json_file_path):
    """Compute ROUGE scores for every scored section stored in one JSON file.

    The file is loaded into a DataFrame. For each row the item number is read
    from column 0, the reference summary from column 2 and the candidate
    summary from column 3 (all by position). Rows whose item number contains
    one of the markers for item 9 and beyond are skipped. Both summaries go
    through ``rouge_preprocessing`` before ``generate_rouge_scores`` is called.

    Args:
        json_file_path: Path of the JSON file to score.

    Returns:
        A list with one entry per scored row:
        ``[file name, item number, *rouge scores]``.
    """
    # NOTE(review): the file is decoded as latin1 - confirm this matches how the JSON files were written.
    df = pd.read_json(json_file_path, encoding='latin1')

    # The file name is the same for every row, so it is resolved once.
    file_name = os.path.basename(json_file_path)
    skipped_items = ["9", "9A", "9B", "9C", "10", "11", "12", "13", "14", "15", "16"]

    rouge_scores_list = []
    for row_position in range(len(df)):
        item_number = df.iloc[row_position, 0]

        # Skip processing for items 9 and beyond
        if any(marker in item_number for marker in skipped_items):
            continue

        raw_reference = df.iloc[row_position, 2]
        raw_candidate = df.iloc[row_position, 3]
        rouge_scores = generate_rouge_scores(
            rouge_preprocessing(raw_reference),
            rouge_preprocessing(raw_candidate),
        )
        rouge_scores_list.append([file_name, item_number, *rouge_scores])

    return rouge_scores_list

In [None]:
## Export ROUGE Scores to Excel
import datetime
from openpyxl import Workbook, load_workbook

# Function to create or load workbook and add data
def update_rouge_excel(output_directory, rouge_scores_list):
    """Append ROUGE score rows to an Excel workbook, creating it when missing.

    Args:
        output_directory: Path of the workbook WITHOUT the ".xlsx" extension;
            the extension is appended here.
        rouge_scores_list: Iterable of rows; each row is appended to the active
            worksheet as given.

    The workbook is saved even when no rows were appended.
    """
    file_path = f"{output_directory}.xlsx"

    try:
        workbook = load_workbook(file_path)
    except FileNotFoundError:
        # First use: start a fresh workbook with the header in the first row.
        workbook = Workbook()
        workbook.active.append([
            "file_name", "item_number",
            "rouge1_precision", "rouge1_recall", "rouge1_fmeasure",
            "rouge2_precision", "rouge2_recall", "rouge2_fmeasure",
            "rougeL_precision", "rougeL_recall", "rougeL_fmeasure",
        ])

    worksheet = workbook.active
    for score_row in rouge_scores_list:
        worksheet.append(score_row)

    workbook.save(file_path)


In [None]:
# To loop through JSON Files and generate rouge scores

# Specify the path to the JSON folder
json_directory = "/content/drive/MyDrive/PLP Project/updated"

# Create excel file named GPT2_zero_shot_rouge_score if it does not exist
# Headers to consist of file_name, item_number, 9 rouge scores values from generate_rouge_scores function
# (update_rouge_excel appends the ".xlsx" extension to this path itself)
output_file_name = 'GPT2_zero_shot_rouge_score'
output_directory = os.path.join(json_directory, output_file_name)

# Loop through all files in the directory and append rouge scores to excel file
for file_name in os.listdir(json_directory):
    # Construct the full path to the json file
    json_file_path = os.path.join(json_directory, file_name)

    # The results workbook is written into this same folder, so on a re-run it
    # appears in the listing. Only regular *.json files are scored; feeding the
    # workbook (or any other file) to pd.read_json would fail.
    if os.path.isfile(json_file_path) and file_name.lower().endswith('.json'):
        rouge_scores_list = gpt2_zeroshot_rouge_scores(json_file_path)
        # NOTE: rows are appended to an existing workbook, so re-running adds the same scores again
        update_rouge_excel(output_directory, rouge_scores_list)

