In [2]:
import pandas as pd
import re
from PyPDF2 import PdfReader
import os

def count_words_in_pdf(pdf_path: str, words: list[str]) -> pd.DataFrame:
    """
    Searches a PDF file for a list of words (case-insensitive) and returns a DataFrame
    with the total count of each word found in the entire document, along with the filename.

    Args:
        pdf_path: The path to the PDF file.
        words: A list of words to search for.

    Returns:
        A pandas DataFrame with one row and columns for each word in the
        `words` list, containing the total count of that word in the PDF,
        and a 'filename' column.
        Returns an empty DataFrame if the PDF cannot be read.
    """
    word_counts = {word.lower(): 0 for word in words}  # Initialize counts (lowercase for case-insensitive)
    filename = os.path.basename(pdf_path)  # Extract filename

    try:
        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PdfReader(pdf_file)
            for page_num in range(len(pdf_reader.pages)):
                page = pdf_reader.pages[page_num]
                text = page.extract_text()
                if text:
                    for word in words:
                        # Use regex for case-insensitive whole word counting
                        count = len(re.findall(r'\b' + re.escape(word) + r'\b', text, re.IGNORECASE))
                        word_counts[word.lower()] += count

        word_counts['filename'] = filename  # Add filename to the dictionary
        return pd.DataFrame([word_counts])

    except FileNotFoundError:
        print(f"Error: PDF file not found at '{pdf_path}'")
        return pd.DataFrame()
    except Exception as e:
        print(f"Error reading PDF file '{pdf_path}': {e}")
        return pd.DataFrame()

def process_pdfs_in_folder(folder_path: str, words: list[str]) -> pd.DataFrame:
    """
    Processes all PDF files in a folder, counts the specified words in each,
    and combines the results into a single DataFrame.

    Args:
        folder_path: The path to the folder containing the PDF files.
        words: A list of words to search for.

    Returns:
        A pandas DataFrame with the word counts for each PDF file.
        Returns an empty DataFrame if no PDF files are found or if an error occurs.
    """
    all_results = []
    for filename in os.listdir(folder_path):
        if filename.lower().endswith(".pdf"):
            pdf_path = os.path.join(folder_path, filename)
            result_df = count_words_in_pdf(pdf_path, words)
            if not result_df.empty:  # Only append if the DataFrame is not empty (no errors)
                all_results.append(result_df)

    if not all_results:
        print(f"No PDF files found in the folder: '{folder_path}' or error reading files.")
        return pd.DataFrame()  # Return an empty DataFrame

    return pd.concat(all_results, ignore_index=True)  # Combine all results into one DataFrame

if __name__ == "__main__":
    # Example usage:
    folder_path = "10K"  # Relative path to the 10K folder
    search_words = ["accrual", "accruals", "accrue", "accrued", "accrues", "accruing", "acquire", "acquired", "acquiree",
                    "acquirer", "acquirers", "acquires", "acquiring", "acquisition", "acquisitions", "affiliate",
                    "affiliated", "affiliates", "affiliation", "affiliations", "alliance", "alliances", "bankrupt",
                    "bankruptcies", "bankruptcy", "carryback", "carryforward", "collaborate", "collaborations",
                    "collaboration", "collaborative", "collaborators", "collateral", "collateralization",
                    "collateralize", "collateralized", "complex", "complexities", "complexity", "contingencies",
                    "contingency", "contingent", "contingently", "contract", "contracted", "contracting", "contracts",
                    "contractual", "contractually", "conversion", "conversions", "convertible", "copyright", "copyrights",
                    "counterparties", "counterparty", "covenant", "covenants", "derivative", "derivatives", "embedded",
                    "entities", "exercisability", "exercisable", "exercised", "floating", "foreign", "franchise",
                    "franchises", "futures", "global", "globally", "hedge", "hedged", "hedges", "hedging", "infringe",
                    "infringed", "infridgement", "infringes", "infringing", "insolvency", "insolvent", "intangible",
                    "intangibles", "interconnection", "international", "internationally", "lawsuit", "lawsuits", "lease",
                    "leaseback", "lease", "leasehold", "leases", "leasing", "lessee", "lessees", "lessor", "lessors",
                    "license", "licensed", "licensee", "licensees", "licenses", "licensing", "licensor", "licensors",
                    "lien", "liens", "liquidate", "liquidated", "liquidating", "liquidation", "liquidations", "liquidator",
                    "litigation", "merge", "merged", "merger", "mergers", "merging", "nationalization", "outsource",
                    "outsourced", "outsourcing", "partner", "partnering", "partners", "partnerships", "partnership",
                    "patent", "patentable", "patented", "patents", "reacquired", "recapitalization", "recapitalizations",
                    "reclassification", "reclassifications", "reclassified", "reclassify", "reissued", "reorganization",
                    "reorganizations", "reorganized", "repatriate", "repatriated", "repatriation", "restructure", "restructured",
                    "restructuring", "restructurings", "revaluation", "revalued", "revocable", "revocation", "revoke",
                    "revoked", "royalties", "royalty", "securitizations", "securitization", "securitized", "segment",
                    "segmented", "segments", "sovereign", "subcontract", "subcontractor", "subcontractors", "subcontracts",
                    "sublease", "subleased", "subleases", "sublet", "sublicense", "subsidiaries", "subsidiary", "subsidies",
                    "subsidy", "swap", "swaps", "takeover", "takeovers", "trademark", "trademarks", "unexercisable",
                    "unexercised", "unrecognized", "unremittted", "venture", "ventures", "warranty", "warranties", "worldwide"]

    word_count_df = process_pdfs_in_folder(folder_path, search_words)
    print(word_count_df)


     accrual  accruals  accrue  accrued  accrues  accruing  acquire  acquired  \
0          7        10       5       16        0         0        2        73   
1          0         4       2        9        0         0        6        40   
2          7         0       0        3        0         3       10         7   
3          1        13       6       27        0         0       17        78   
4          4         4       0       15        1         0        6        17   
..       ...       ...     ...      ...      ...       ...      ...       ...   
285        2         0       0       30        0         0        2        24   
286        0         1       1        9        1         0        9        11   
287        3         1       0       14        0         0       20        37   
288       14         2       2       28        0         0       11        22   
289        3         2       0       23        0         0        3        22   

     acquiree  acquirer  ..

In [5]:
# add name column, data column, and Sum column, then export csv - missing disney
# 2. Split 'filename' into 'Company Name' and 'Date'
def split_filename(filename):
    parts = filename.replace(".pdf", "").split("_")
    year = int(parts[0])  # Convert year to integer
    company_name = parts[1]
    return company_name, year

word_count_df[['Company Name', 'Year']] = word_count_df['filename'].apply(lambda x: pd.Series(split_filename(x)))

# 3. Company Name to Ticker Symbol Library
dow_jones_tickers = {
    "3M": "MMM",
    "AmericanExpress": "AXP",
    "Amgen": "AMGN",
    "Apple": "AAPL",
    "Boeing": "BA",
    "Caterpillar": "CAT",
    "Chevron": "CVX",
    "Cisco": "CSCO",
    "CocaCola": "KO",
    "GoldmanSachs": "GS",
    "HomeDepot": "HD",
    "Honeywell": "HON",
    "IBM": "IBM",
    "Johnson&Johnson": "JNJ",
    "JPMorgan": "JPM",
    "McDonalds": "MCD",
    "Merck": "MRK",
    "Microsoft": "MSFT",
    "Nike": "NKE",
    "P&G": "PG",
    "Sherwin": "SHW",
    "Salesforce": "CRM",
    "Travelers": "TRV",
    "UnitedHealth": "UNH",
    "Verizon": "VZ",
    "Visa": "V",
    "NVIDIA":'NVDA',
    "Walmart": "WMT",
    "Amazon": "AMZN"  #Added based on the missing info
}

# Function to get ticker, with a fallback
def get_ticker(company_name):
    return dow_jones_tickers.get(company_name, company_name)  # Returns name if not found

word_count_df['Ticker'] = word_count_df['Company Name'].apply(get_ticker)

# 4. Create the 'Sum' column
cols_to_sum = ["accrual", "accruals", "accrue", "accrued", "accrues", "accruing", "acquire", "acquired", "acquiree",
                    "acquirer", "acquirers", "acquires", "acquiring", "acquisition", "acquisitions", "affiliate",
                    "affiliated", "affiliates", "affiliation", "affiliations", "alliance", "alliances", "bankrupt",
                    "bankruptcies", "bankruptcy", "carryback", "carryforward", "collaborate", "collaborations",
                    "collaboration", "collaborative", "collaborators", "collateral", "collateralization",
                    "collateralize", "collateralized", "complex", "complexities", "complexity", "contingencies",
                    "contingency", "contingent", "contingently", "contract", "contracted", "contracting", "contracts",
                    "contractual", "contractually", "conversion", "conversions", "convertible", "copyright", "copyrights",
                    "counterparties", "counterparty", "covenant", "covenants", "derivative", "derivatives", "embedded",
                    "entities", "exercisability", "exercisable", "exercised", "floating", "foreign", "franchise",
                    "franchises", "futures", "global", "globally", "hedge", "hedged", "hedges", "hedging", "infringe",
                    "infringed", "infridgement", "infringes", "infringing", "insolvency", "insolvent", "intangible",
                    "intangibles", "interconnection", "international", "internationally", "lawsuit", "lawsuits", "lease",
                    "leaseback", "lease", "leasehold", "leases", "leasing", "lessee", "lessees", "lessor", "lessors",
                    "license", "licensed", "licensee", "licensees", "licenses", "licensing", "licensor", "licensors",
                    "lien", "liens", "liquidate", "liquidated", "liquidating", "liquidation", "liquidations", "liquidator",
                    "litigation", "merge", "merged", "merger", "mergers", "merging", "nationalization", "outsource",
                    "outsourced", "outsourcing", "partner", "partnering", "partners", "partnerships", "partnership",
                    "patent", "patentable", "patented", "patents", "reacquired", "recapitalization", "recapitalizations",
                    "reclassification", "reclassifications", "reclassified", "reclassify", "reissued", "reorganization",
                    "reorganizations", "reorganized", "repatriate", "repatriated", "repatriation", "restructure", "restructured",
                    "restructuring", "restructurings", "revaluation", "revalued", "revocable", "revocation", "revoke",
                    "revoked", "royalties", "royalty", "securitizations", "securitization", "securitized", "segment",
                    "segmented", "segments", "sovereign", "subcontract", "subcontractor", "subcontractors", "subcontracts",
                    "sublease", "subleased", "subleases", "sublet", "sublicense", "subsidiaries", "subsidiary", "subsidies",
                    "subsidy", "swap", "swaps", "takeover", "takeovers", "trademark", "trademarks", "unexercisable",
                    "unexercised", "unrecognized", "unremittted", "venture", "ventures", "warranty", "warranties", "worldwide"]
word_count_df['Sum'] = word_count_df[cols_to_sum].sum(axis=1)
word_count_df

Unnamed: 0,accrual,accruals,accrue,accrued,accrues,accruing,acquire,acquired,acquiree,acquirer,...,ventures,warranty,warranties,worldwide,filename,Company Name,Date,Ticker Symbol,Sum,Ticker
0,7,10,5,16,0,0,2,73,0,1,...,0,2,2,33,2012_3M.pdf,3M,2012,MMM,1622,MMM
1,0,4,2,9,0,0,6,40,0,0,...,1,0,0,6,2012_Amazon.pdf,Amazon,2012,AMZN,815,AMZN
2,7,0,0,3,0,3,10,7,0,16,...,6,1,0,23,2012_AmericanExpress.pdf,AmericanExpress,2012,AXP,916,AXP
3,1,13,6,27,0,0,17,78,0,0,...,2,0,0,22,2012_Amgen.pdf,Amgen,2012,AMGN,1591,AMGN
4,4,4,0,15,1,0,6,17,0,0,...,1,33,4,10,2012_Apple.pdf,Apple,2012,AAPL,836,AAPL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285,2,0,0,30,0,0,2,24,0,1,...,8,0,3,5,2021_Travelers.pdf,Travelers,2021,TRV,1404,TRV
286,0,1,1,9,1,0,9,11,0,0,...,2,0,0,0,2021_UHG.pdf,UHG,2021,UHG,591,UHG
287,3,1,0,14,0,0,20,37,0,0,...,0,1,2,0,2021_Verizon.pdf,Verizon,2021,VZ,1716,VZ
288,14,2,2,28,0,0,11,22,0,10,...,1,0,0,6,2021_Visa.pdf,Visa,2021,V,1136,V


In [6]:
# Output CSV File
word_count_df.to_csv('complexity.csv', index=False)