### Text Summarization of 10K Reports
### Step 1: PDF to Text Conversion
### Step 2: Text preprocessing and splitting into separate sections
### Step 3: Text Summarization with Gemini for Training Data

In [None]:
!pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.0-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.4/56.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.28.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m89.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pypdfium2, pdfminer.six, pdfplumber
Successfully installed pdfminer.six-20231228 pdfplumber-0.11.0 pypdfium2-4.28.0


In [3]:
# Mount Google Drive at /content/drive; the PDF and JSON paths used in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#### SPLIT SECTIONS AND STORE IN DF

In [None]:
def contains_pair(line):
    """Tell whether ``line`` looks like the heading of a known 10-K item.

    The line is lower-cased and tested against a fixed table of
    (item number, keyword) pairs, e.g. ``"1a"`` together with ``"risk"``.
    Both halves are plain substring checks, so they may match anywhere in
    the line (``"1"`` also matches inside ``"2021"``).

    Args:
        line: One line of text taken from the report.

    Returns:
        True if both halves of at least one pair occur in the line,
        otherwise False.
    """
    # (item number, keyword expected in that item's title)
    heading_markers = (
        ("1", "business"), ("1a", "risk"), ("1b", "staff"), ("2", "properties"),
        ("3", "legal"), ("4", "safety"), ("5", "market"), ("6", "reserved"),
        ("7", "management"), ("7a", "disclosure"), ("8", "financial"),
        ("9", "changes"), ("9a", "control"), ("9b", "other"), ("9c", "disclosure"),
        ("10", "director"), ("11", "executive"), ("12", "security"),
        ("13", "certain"), ("14", "principal"), ("15", "exhibits"), ("16", "form"),
        # Same sub-items written with parentheses, e.g. "Item 1(a)"
        ("1(a)", "risk"), ("1(b)", "staff"), ("7(a)", "disclosure"),
        ("9(a)", "control"), ("9(b)", "other"), ("9(c)", "disclosure"),
    )

    text = line.lower()
    return any(number in text and keyword in text for number, keyword in heading_markers)


In [None]:
# Extracts the relevant sections of a 10-K report, relying on the layout that is typical of reports across multiple companies.
# Reports that do not follow this typical layout cannot be extracted as-is; such companies require company-specific customisation.

import os
import pdfplumber
import pandas as pd

def extract_text_and_split_sections(pdf_path):
    """Split the text of a 10-K PDF into its "Item" sections.

    Lines are read page by page. Nothing is collected until a line starting
    with "Part IV" has been seen and a later line starts with "Part I"
    (presumably this skips the table of contents -- TODO confirm against the
    source reports). From then on, a line that starts with "Item"/"ITEM" and
    is accepted by ``contains_pair`` opens a new section keyed by the text in
    front of its first period; every other line is appended to the section
    that is currently open. Lines starting with "Part IV" are never stored.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A pandas DataFrame with one row per section and the columns
        'Item Number' and 'Section Text' (the section's lines joined with
        newlines). When a heading key occurs again, the lines stored under
        that key so far are replaced.
    """
    collected = {}        # section key -> list of lines
    active_key = None     # key of the section currently being filled
    seen_part_iv = False
    extracting = False

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for line in page.extract_text().split('\n'):
                lowered = line.lower()

                # A "Part IV" line arms the trigger and is itself discarded.
                if lowered.startswith('part iv'):
                    seen_part_iv = True
                    continue

                # The first "Part I..." line after that switches collection on.
                if seen_part_iv and lowered.startswith('part i'):
                    extracting = True

                if not extracting:
                    continue

                if line.startswith(('Item', 'ITEM')) and contains_pair(line):
                    # Recognised heading: open (or restart) its section.
                    active_key = line.split('.')[0].strip()
                    collected[active_key] = [line]
                elif active_key:
                    # Body text, or an unrecognised "Item" line, of the open section.
                    collected[active_key].append(line)

    rows = []
    for item_number, section_lines in collected.items():
        rows.append({'Item Number': item_number, 'Section Text': '\n'.join(section_lines)})

    return pd.DataFrame(rows)


In [None]:
def naming_df(pdf_path):
    """Derive a DataFrame name from a PDF file path.

    The file name without its extension is taken, spaces and periods are
    replaced with underscores, an underscore is prepended when the result
    starts with a digit, and ``"_df"`` is appended. Other characters (for
    example ``&`` or ``,``) are kept unchanged, so the result is not
    necessarily a valid Python identifier.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The derived name, e.g. ``'Apple_Inc__20210925_df'``. A path without a
        file name yields ``'_df'``.
    """
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    # Replace spaces and periods with underscores
    base_name = base_name.replace(" ", "_").replace(".", "_")
    # Slice instead of indexing so that an empty stem does not raise IndexError.
    if base_name[:1].isdigit():
        # Add an underscore before the name
        base_name = f"_{base_name}"
    return f"{base_name}_df"


#### Gemini Summary

In [None]:
pip install -q -U google-generativeai

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/137.4 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K     [91m━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/137.4 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.4/137.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown

def to_markdown(text):
  """Wrap ``text`` in a Markdown block quote for notebook display.

  Bullet characters ('•') are turned into Markdown list markers and every
  line, blank ones included, is prefixed with '> '.
  """
  bulleted = text.replace('•', '  *')
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
  return Markdown(quoted)

In [None]:
# Used to securely store your API key
from google.colab import userdata

In [None]:
# Read the Gemini API key from Colab's secrets manager and hand it to the google.generativeai client.
# Or use `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.
# Refer to Gemini API Quickstart with Python
# In Colab, add the key to the secrets manager under the "🔑" in the left panel. Give it the name GOOGLE_API_KEY.

GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')

genai.configure(api_key=GOOGLE_API_KEY)

In [None]:
# Setting up Gemini Model

# Generation parameters for Gemini.
generation_config = dict(
    temperature=0,
    top_p=1,
    top_k=1,
    max_output_tokens=400,
)

# One entry per harm category, each with the "BLOCK_NONE" threshold.
# NOTE(review): "HARM_CATEGORY_DANGEROUS" and "HARM_CATEGORY_DANGEROUS_CONTENT" look like two
# names for the same category -- confirm against the google-generativeai safety settings docs.
safety_settings = [
    {"category": category, "threshold": "BLOCK_NONE"}
    for category in (
        "HARM_CATEGORY_DANGEROUS",
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

In [None]:
# Gemini client used by gemini_summary(). Only the safety settings are passed in;
# the generation_config argument is commented out, so that dict is not applied here.
model = genai.GenerativeModel('gemini-pro',
                              # generation_config = generation_config,
                              safety_settings = safety_settings)

In [None]:
# Size limit handed to split_content() as max_tokens for each chunk sent to Gemini.
# NOTE(review): split_content() counts whitespace-separated words, not model tokens -- confirm
# that this budget stays within the model's real input limit.
INPUT_TOKEN_LIMIT = 12288
# OUTPUT_TOKEN_LIMIT = 4096

def split_content(content, max_tokens):
    """
    Split content into sub-sections not exceeding the maximum token limit.

    A "token" here is a whitespace-separated word, which only approximates
    the tokens counted by a language model. Runs of whitespace (including
    newlines) inside a sub-section are collapsed to single spaces.

    Args:
        content: Text to split.
        max_tokens: Maximum number of words per sub-section; must be at least 1.

    Returns:
        A list of sub-section strings in their original order. Content
        without any words yields a list holding one empty string.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1. The previous
            implementation silently produced an empty leading sub-section
            followed by one word per sub-section in that case.
    """
    words_per_chunk = int(max_tokens)
    if words_per_chunk < 1:
        raise ValueError(f"max_tokens must be at least 1, got {max_tokens!r}")

    words = content.split()
    if not words:
        return ['']

    # Slice the word list in fixed-size steps and join once per chunk rather
    # than growing a string word by word.
    return [
        ' '.join(words[start:start + words_per_chunk])
        for start in range(0, len(words), words_per_chunk)
    ]

In [None]:
import time  # Import the time module

def _leading_item_number(item_label):
    """Return the first run of ASCII digits in ``item_label`` as an int, or None if it has none."""
    digits = ''
    for char in str(item_label):
        if char in '0123456789':
            digits += char
        elif digits:
            break
    return int(digits) if digits else None


def gemini_summary (df):
    """Add a 'Gemini Summary' column holding a Gemini summary of each section.

    The first column of ``df`` is read as the item label and the second as
    the section text. Sections whose item number is 9 or higher are skipped.
    Every other section is cut into chunks with ``split_content`` (limit
    ``INPUT_TOKEN_LIMIT``), each non-blank chunk is summarized with the
    module-level ``model``, and the chunk summaries are joined with spaces.

    Args:
        df: DataFrame with the item label in column 0 and the section text in
            column 1. It is modified in place.

    Returns:
        The same DataFrame object. Rows that were skipped are not assigned a
        summary.
    """
    # Loop through the rows of the DataFrame by position
    for position in range(len(df)):

        # Skip processing for items 9 and beyond. The item number is parsed from the
        # label, so digits elsewhere in it (e.g. a year such as "2019") cannot
        # trigger the skip.
        item_label = df.iloc[position, 0]
        item_number = _leading_item_number(item_label)
        if item_number is not None and item_number >= 9:
            continue

        content = df.iloc[position, 1]

        # Split content into sub-sections respecting input token limit
        sub_sections = split_content(content, INPUT_TOKEN_LIMIT)

        chunk_summaries = []
        for sub_section in sub_sections:
            # There is nothing to summarize in a blank chunk; do not spend a request on it.
            if not sub_section.strip():
                continue

            # Construct the question for the sub-section
            question = "Please summarize the following content: " + sub_section

            # Generate the response using the Gemini model
            response = model.generate_content(question)

            # `response.text` raises ValueError when the response carries no usable
            # text (for example a blocked candidate). Report it and keep going so
            # one chunk cannot abort the whole run.
            try:
                chunk_summaries.append(response.text)
            except ValueError as error:
                print(f"No summary returned for part of '{item_label}': {error}")

        # Combine gemini summaries of sub-sections into one summary
        combined_summary = ' '.join(chunk_summaries)

        # Rows are read by position, so translate the position to its index label
        # before writing; `df.loc[position, ...]` is only right for a default RangeIndex.
        df.loc[df.index[position], 'Gemini Summary'] = combined_summary

    return df

#### Single File Extraction and Generate Gemini Summary

In [None]:
# For Single file extraction to create df and get gemini summary

# Usage - Specify file path:
pdf_path = '/content/drive/MyDrive/PLP Project/data/Merck & Co., Inc._20211231.pdf'
df = extract_text_and_split_sections(pdf_path)

# add gemini summary
df_gemini = gemini_summary (df)

# Also register the result under a name derived from the file name (e.g. Apple_Inc__20210925_df).
# Names containing characters such as '&' or ',' are only reachable through globals()[df_name].
df_name = naming_df (pdf_path)
globals()[df_name] = df_gemini
print(df_name)

# Specify the directory path
directory = '/content/drive/MyDrive/PLP Project'

# Save the sections and their summaries as a list of JSON records.
# The name was registered just above, so no existence check is needed before saving.
json_file_path = os.path.join(directory, f"{df_name}.json")
df = df_gemini  # keep `df` pointing at the summarized frame for the cells below
df.to_json(json_file_path, orient='records')
print(f"DataFrame '{df_name}' saved to: {json_file_path}")


Merck_&_Co_,_Inc__20211231_df
DataFrame 'Merck_&_Co_,_Inc__20211231_df' saved to: /content/drive/MyDrive/PLP Project/Merck_&_Co_,_Inc__20211231_df.json


In [None]:
# Show the third row's 'Gemini Summary' (column position 2) as a Markdown block quote.
to_markdown(df.iloc[2, 2])

> **Summary of Financial Statements and Supplementary Data**
> 
> **Item 8. Financial Statements and Supplementary Data**
> 
> The consolidated balance sheet of Merck & Co., Inc. and subsidiaries as of December 31, 2021 and 2020, and the related consolidated statements of income, of comprehensive income, of equity and of cash flows for each of the three years in the period ended December 31, 2021, the notes to consolidated financial statements, and the report dated February 25, 2022 of PricewaterhouseCoopers LLP, independent registered public accounting firm, are as follows:
> 
> **Condensed Consolidated Statement of Income**
> 
> Sales for 2021 were $48,704 million, $41,518 million in 2020, and $39,121 million in 2019.
> 
> **Costs, Expenses, and Other**
> 
> 2021                                   2020                                     2019
> Cost of sales                      $13,626 million           $13,618 million         $12,016 million
> Selling, general and administrative      9,634 million                 8,955 million            9,455 million
> Research and development                12,245 million           13,397 million          9,724 million
> Restructuring costs                              661 million                   575 million              626 million
> Other (income) expense, net                    (1,341) million         (890) million               129 million
> 
> **Income from Continuing Operations**
> 
> 2021                                        2020                                         2019
> Operating income before taxes           $13,879 million        $ 5,863 million         $ 7,171 million
> Income from Continuing Operations     12,358 million             4,523 million             5,606 million
> 
> **Net Income**
> 
> 2021                                            2020                                       2019
> Income from Continuing Operations        $12,345 million      $ 4,519 million        $ 5,690 million
> Income from Discontinued Operations       704 million                  2,548 million          4,153 million
> 
> Net Income                                            $13,049 million     $ 7,067 million         $ 9,843 million
> 
> **Condensed Consolidated Statement of Comprehensive Income**
> 
> 2021                                   2020                                           2019
> Net Income                                  $13,049 million    $ 7,067 million               $ 9,843 million
> Other Comprehensive Income (Loss)
> Net of Taxes:
> Net unrealized gain (loss) on
> derivatives, net of reclassifications       410 million                 (297) million         (135) million
> Net unrealized (loss) gain on
> investments, net of reclassifications       (18) million                 96 million                    —
> Benefit plan net gain (loss) and prior
> service credit (cost), net of amortization      1,769 million                (279) million        (705) million
> Cumulative translation adjustment              (423) million                153 million                96 million
> Comprehensive Income                                 $14,805 million $ 6,626 million $ 9,195 million
> 
> **Condensed Consolidated Balance Sheet**
> 
> Assets
> 
> 2021                                                              2020
> Current Assets
> Cash and cash equivalents                   $ 8,096 million     $ 8,050 million
> Accounts receivable (net of allowance for
> doubtful accounts of $62 million in 2021 and
> $67 million in 2020)                                  9,230 million        6,803 million
> Inventories (excludes inventories of $2,194
> million in 2021 and $2,070 million in 2020
> classified in Other assets - see Note 8)      5,953 million        5,554 million
> Other current assets                                          6,987 million       4,674 million
> Current assets of discontinued operations    -                        2,683 million
> 
> Total current assets                                    $30,266 million   $27,764 million
> 
> Investments                                                        370 million                785 million
> Property, Plant and Equipment                         37,471 million     35,162 million
> Goodwill                                                              21,264 million     18,882 million
> Other Intangibles, Net                                       22,933 million    14,101 million
> Other Assets                                                        11,582 million     9,881 million
> 
> Noncurrent Assets of Discontinued Operations    -                        3,175 million
> 
> Total Assets                                                          $105,694 million $91,588 million
> 
> Liabilities and Equity
> 
> Current Liabilities
> Loans payable and current portion of
> long-term debt                                                        $ 2,412 million  $ 6,431 million
> Trade accounts payable                                       4,609 million        4,327 million
> Accrued and other current liabilities             13,859 million        12,212 million
> Income taxes payable                                          1,224 million        1,597 million
> Dividends payable                                                     1,768 million        1,674 million
> 
> Current liabilities of discontinued
> operations                                                                     -                        1,086 million
> 
> Total current liabilities                                        $23,872 million $27,327 million
> 
> Long-Term Debt                                                         $30,690 million $25,360 million
> 
> Deferred Income Taxes                                      $ 3,441 million $ 1,005 million
> 
> Other Noncurrent Liabilities                                   $ 9,434 million $12,306 million
> 
> Noncurrent Liabilities of Discontinued
> Operations                                                                   -                        186 million
> 
> Noncontrolling Interests                                           $ 73 million           $ 87 million
> 
> Total Equity                                                                $38,257 million $25,404 million
> 
> Total Liabilities and Equity                                 $105,694 million $91,588 million **Alliance Revenue**
> 
> - Lynparza: $989 million
> - Koselugo: $29 million
> 
> **Total Alliance Revenue:** $1,018 million
> 
> **Cost of Sales**
> 
> - Lynparza: $167 million
> - Koselugo: $247 million
> 
> **Selling, General and Administrative Expenses**
> 
> - Lynparza: $178 million
> - Koselugo: $160 million
> 
> **Research and Development Expenses**
> 
> - Lynparza: $120 million
> - Koselugo: $133 million
> 
> **Total Alliance Expenses:** $835 million
> 
> **December 31, 2021 Receivables**
> 
> - AstraZeneca: $271 million
> - Eisai: $200 million
> - Bayer AG: $114 million
> 
> **December 31, 2021 Payables**
> 
> - AstraZeneca: $415 million
> - Eisai: $625 million
> - Bayer AG: $472 million
> 
> **December 31, 2021 Intangible Assets (Other)**
> 
> - Lynparza: $1.1 billion
> - Lenvima: $1.0 billion
> - Adempas/Verquvo: $806 million ($25 million contingent payment pending)
> 
> **December 31, 2021 Net Sales**
> 
> - Adempas: $252 million
> - Verquvo: $7 million
> 
> **December 31, 2021 Cost of Sales**
> 
> - Adempas: $424 million
> - Verquvo: $196 million
> 
> **December 31, 2021 Selling, General and Administrative Expenses**
> 
> - Adempas/Verquvo: $126 million
> 
> **December 31, 2021 Research and Development Expenses**
> 
> - Adempas/Verquvo: $53 million
> 
> **Total Adempas/Verquvo Expenses:** $852 million
> 
> **Ridgeback Biotherapeutics LP (Molnupiravir)**
> 
> - Sales: $952 million
> - Cost of Sales: $494 million
> - Selling, General and Administrative Expenses: $33 million
> - Research and Development Expenses: $60 million
> 
> **Note:** Figures have been rounded for clarity. In 2022, Merck faces ongoing lawsuits related to the alleged false claims about the mumps component of the Measles-Mumps-Rubella (MMR II) vaccine and (separately) the efficacy of the M-M-R II vaccine. Merck is not exercising its right to participate in the mumps false claim lawsuit and two putative class action lawsuits alleging fraud over the M-M-R II vaccine are pending in the Eastern District of Pennsylvania and have been consolidated. The Company has also filed numerous counterclaims and motions to dismiss, none of which have been successful, and it has incurred significant costs in defending these lawsuits. The U.S. government has not exercised its right to participate in the lawsuits. Merck continues to believe that it has valid defenses to all claims. Moreover, two generic manufacturers of the M-M-R II vaccine have filed abbreviated New Drug Applications (ANDAs) with the FDA to market generic forms of the M-M-R II and litigation has commenced over these applications. The Company intends to vigorously defend all patents covering the M-M-R II vaccine.

#### Loop through multiple pdfs

In [None]:
# Loop through the pdfs of a folder to create df, get gemini summary and write one json file per report

# Folder holding the 10-K PDFs. To process only selected reports, point `directory` at their
# folder (e.g. '/content/drive/MyDrive/PLP Project/data') and iterate over an explicit list
# such as ['Walmart Inc._20210131.pdf'] instead of os.listdir(directory).
directory = '/content/drive/MyDrive/PLP Project/data/All'

# Specify the json output directory path (the same for every file, so it is set once)
json_directory = '/content/drive/MyDrive/PLP Project/updated'
# Create the output folder up front so that a missing folder cannot fail the export
# after the summaries have already been generated.
os.makedirs(json_directory, exist_ok=True)

# Loop through all files in the directory
for file_name in os.listdir(directory):
    # os.listdir() also returns sub-folders and other file types; only PDFs are processed.
    if not file_name.lower().endswith('.pdf'):
        continue

    # Construct the full path to the PDF file
    pdf_path = os.path.join(directory, file_name)

    # Extract text and split sections from the PDF
    df = extract_text_and_split_sections(pdf_path)

    # add gemini summary
    df_gemini = gemini_summary (df)

    # Create DataFrame variable name
    df_name = naming_df (pdf_path)
    # Assign DataFrame to the created name
    globals()[df_name] = df_gemini

    # Print the DataFrame name
    print(df_name)

    # Save the sections and their summaries as a list of JSON records.
    # The name was registered just above, so no existence check is needed before saving.
    json_file_path = os.path.join(json_directory, f"{df_name}.json")
    df = df_gemini
    df.to_json(json_file_path, orient='records')
    print(f"DataFrame '{df_name}' saved to: {json_file_path}")

Verizon_Communications_Inc__20221231_df
DataFrame 'Verizon_Communications_Inc__20221231_df' saved to: /content/drive/MyDrive/PLP Project/updated/Verizon_Communications_Inc__20221231_df.json
Verizon_Communications_Inc__20211231_df
DataFrame 'Verizon_Communications_Inc__20211231_df' saved to: /content/drive/MyDrive/PLP Project/updated/Verizon_Communications_Inc__20211231_df.json
UnitedHealth_Group_Incorporated_20231231_df
DataFrame 'UnitedHealth_Group_Incorporated_20231231_df' saved to: /content/drive/MyDrive/PLP Project/updated/UnitedHealth_Group_Incorporated_20231231_df.json
T-Mobile_US,_Inc__20211231_df
DataFrame 'T-Mobile_US,_Inc__20211231_df' saved to: /content/drive/MyDrive/PLP Project/updated/T-Mobile_US,_Inc__20211231_df.json
The_Walt_Disney_Company_20230930_df
DataFrame 'The_Walt_Disney_Company_20230930_df' saved to: /content/drive/MyDrive/PLP Project/updated/The_Walt_Disney_Company_20230930_df.json
The_Walt_Disney_Company_20221001_df
DataFrame 'The_Walt_Disney_Company_20221001_