In [2]:
import pandas as pd
import re
from PyPDF2 import PdfReader
import os

def count_words_in_pdf(pdf_path: str, words: list[str]) -> pd.DataFrame:
    """
    Searches a PDF file for a list of words (case-insensitive) and returns a DataFrame
    with the total count of each word found in the entire document, along with the filename.

    Args:
        pdf_path: The path to the PDF file.
        words: A list of words to search for.

    Returns:
        A pandas DataFrame with one row and columns for each word in the
        `words` list, containing the total count of that word in the PDF,
        and a 'filename' column.
        Returns an empty DataFrame if the PDF cannot be read.
    """
    word_counts = {word.lower(): 0 for word in words}  # Initialize counts (lowercase for case-insensitive)
    filename = os.path.basename(pdf_path)  # Extract filename

    try:
        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PdfReader(pdf_file)
            for page_num in range(len(pdf_reader.pages)):
                page = pdf_reader.pages[page_num]
                text = page.extract_text()
                if text:
                    for word in words:
                        # Use regex for case-insensitive whole word counting
                        count = len(re.findall(r'\b' + re.escape(word) + r'\b', text, re.IGNORECASE))
                        word_counts[word.lower()] += count

        word_counts['filename'] = filename  # Add filename to the dictionary
        return pd.DataFrame([word_counts])

    except FileNotFoundError:
        print(f"Error: PDF file not found at '{pdf_path}'")
        return pd.DataFrame()
    except Exception as e:
        print(f"Error reading PDF file '{pdf_path}': {e}")
        return pd.DataFrame()

def process_pdfs_in_folder(folder_path: str, words: list[str]) -> pd.DataFrame:
    """
    Processes all PDF files in a folder, counts the specified words in each,
    and combines the results into a single DataFrame.

    Args:
        folder_path: The path to the folder containing the PDF files.
        words: A list of words to search for.

    Returns:
        A pandas DataFrame with the word counts for each PDF file.
        Returns an empty DataFrame if no PDF files are found or if an error occurs.
    """
    all_results = []
    for filename in os.listdir(folder_path):
        if filename.lower().endswith(".pdf"):
            pdf_path = os.path.join(folder_path, filename)
            result_df = count_words_in_pdf(pdf_path, words)
            if not result_df.empty:  # Only append if the DataFrame is not empty (no errors)
                all_results.append(result_df)

    if not all_results:
        print(f"No PDF files found in the folder: '{folder_path}' or error reading files.")
        return pd.DataFrame()  # Return an empty DataFrame

    return pd.concat(all_results, ignore_index=True)  # Combine all results into one DataFrame

if __name__ == "__main__":
    # Example usage:
    folder_path = "10K"  # Relative path to the 10K folder
    search_words = ["accrual", "accruals", "accrue", "accrued", "accrues", "accruing", "acquire", "acquired", "acquiree",
                    "acquirer", "acquirers", "acquires", "acquiring", "acquisition", "acquisitions", "affiliate",
                    "affiliated", "affiliates", "affiliation", "affiliations", "alliance", "alliances", "bankrupt",
                    "bankruptcies", "bankruptcy", "carryback", "carryforward", "collaborate", "collaborations",
                    "collaboration", "collaborative", "collaborators", "collateral", "collateralization",
                    "collateralize", "collateralized", "complex", "complexities", "complexity", "contingencies",
                    "contingency", "contingent", "contingently", "contract", "contracted", "contracting", "contracts",
                    "contractual", "contractually", "conversion", "conversions", "convertible", "copyright", "copyrights",
                    "counterparties", "counterparty", "covenant", "covenants", "derivative", "derivatives", "embedded",
                    "entities", "exercisability", "exercisable", "exercised", "floating", "foreign", "franchise",
                    "franchises", "futures", "global", "globally", "hedge", "hedged", "hedges", "hedging", "infringe",
                    "infringed", "infridgement", "infringes", "infringing", "insolvency", "insolvent", "intangible",
                    "intangibles", "interconnection", "international", "internationally", "lawsuit", "lawsuits", "lease",
                    "leaseback", "lease", "leasehold", "leases", "leasing", "lessee", "lessees", "lessor", "lessors",
                    "license", "licensed", "licensee", "licensees", "licenses", "licensing", "licensor", "licensors",
                    "lien", "liens", "liquidate", "liquidated", "liquidating", "liquidation", "liquidations", "liquidator",
                    "litigation", "merge", "merged", "merger", "mergers", "merging", "nationalization", "outsource",
                    "outsourced", "outsourcing", "partner", "partnering", "partners", "partnerships", "partnership",
                    "patent", "patentable", "patented", "patents", "reacquired", "recapitalization", "recapitalizations",
                    "reclassification", "reclassifications", "reclassified", "reclassify", "reissued", "reorganization",
                    "reorganizations", "reorganized", "repatriate", "repatriated", "repatriation", "restructure", "restructured",
                    "restructuring", "restructurings", "revaluation", "revalued", "revocable", "revocation", "revoke",
                    "revoked", "royalties", "royalty", "securitizations", "securitization", "securitized", "segment",
                    "segmented", "segments", "sovereign", "subcontract", "subcontractor", "subcontractors", "subcontracts",
                    "sublease", "subleased", "subleases", "sublet", "sublicense", "subsidiaries", "subsidiary", "subsidies",
                    "subsidy", "swap", "swaps", "takeover", "takeovers", "trademark", "trademarks", "unexercisable",
                    "unexercised", "unrecognized", "unremittted", "venture", "ventures", "warranty", "warranties", "worldwide"]

    word_count_df = process_pdfs_in_folder(folder_path, search_words)
    print(word_count_df)


    accrual  accruals  accrue  accrued  accrues  accruing  acquire  acquired  \
0         7        10       5       16        0         0        2        73   
1         0         4       2        9        0         0        6        40   
2         7         0       0        3        0         3       10         7   
3         7        11       3       14        0         0        0        53   
4         0         4       2        9        0         0        6        43   
5         6         0       0        4        0         3        9         7   
6         7        12       3       14        0         0        0        35   
7         0         4       5        9        0         0        9        51   
8         7         0       1        4        0         3       15         4   
9         7        12       3       16        0         0        0        49   
10        0         4       5        9        0         0        9        51   
11       16         8       4       12  

In [3]:
word_count_df

Unnamed: 0,accrual,accruals,accrue,accrued,accrues,accruing,acquire,acquired,acquiree,acquirer,...,unexercisable,unexercised,unrecognized,unremittted,venture,ventures,warranty,warranties,worldwide,filename
0,7,10,5,16,0,0,2,73,0,1,...,0,0,3,0,4,0,2,2,33,2012_3M.pdf
1,0,4,2,9,0,0,6,40,0,0,...,0,0,3,0,1,1,0,0,6,2012_Amazon.pdf
2,7,0,0,3,0,3,10,7,0,16,...,0,0,0,0,4,6,1,0,23,2012_AmericanExpress.pdf
3,7,11,3,14,0,0,0,53,0,1,...,0,0,3,0,4,2,2,2,32,2013_3M.pdf
4,0,4,2,9,0,0,6,43,0,0,...,0,0,2,0,1,1,0,0,6,2013_Amazon.pdf
5,6,0,0,4,0,3,9,7,0,19,...,0,0,0,0,16,8,1,0,22,2013_AmericanExpress.pdf
6,7,12,3,14,0,0,0,35,0,0,...,0,0,4,0,4,2,2,2,32,2014_3M.pdf
7,0,4,5,9,0,0,9,51,0,0,...,0,0,3,0,1,1,0,0,6,2014_Amazon.pdf
8,7,0,1,4,0,3,15,4,0,19,...,0,0,0,0,12,4,1,0,25,2014_AmericanExpress.pdf
9,7,12,3,16,0,0,0,49,0,3,...,0,0,4,0,0,3,2,2,37,2015_3M.pdf
