In [2]:
import json
import re
import spacy
import pdfplumber
import pandas as pd
import numpy as np

from collections import defaultdict

In [6]:
# Pull the table that pdfplumber's extract_table() returns for each page of the
# 10-K, clean it, keep the DataFrame in `table_data`, and write one JSON object
# per row to jsonl/extracted_data_<page index>.jsonl.
# NOTE(review): the "jsonl/" directory must already exist; open() will not create it.
try:
    table_data = []
    with pdfplumber.open("pdfs/ACTIVISIONBLIZZARD_2015_10K.pdf") as pdf:
        for i, page in enumerate(pdf.pages):
            table = page.extract_table()
            # None means no table was found; an empty list has no header row
            # to index, so skip both instead of failing on table[0].
            if not table:
                continue

            # First extracted row is used as the header, the rest as data.
            df = pd.DataFrame(table[1:], columns=table[0])
            # Not every table has a column named ""; errors="ignore" keeps
            # drop() from raising KeyError, which previously aborted the
            # processing of every remaining page.
            df = df.drop(columns=[""], errors="ignore")
            # Discard rows in which every cell is an empty placeholder.
            df = df[~df.isin({None, np.nan, "None", ""}).all(axis=1)]
            table_data.append(df)

            with open(f"jsonl/extracted_data_{i}.jsonl", "w") as jsonl_file:
                for _, row in df.iterrows():
                    jsonl_file.write(json.dumps(row.to_dict()) + "\n")

except Exception as e:
    # Best-effort notebook cell: report the failure instead of raising.
    print(e)

In [4]:
# Load the spaCy `en_core_web_sm` pipeline once at module level;
# is_strong_heading() below runs it on candidate lines to read each token's
# part-of-speech tag.
nlp = spacy.load("en_core_web_sm")


def extract_data_from_unstructured_pdf(
    pdf_path="pdfs/3M_2015_10K.pdf", output_path="pdf_summary.jsonl"
):
    """Group a PDF's text lines under detected headings and save them as JSON.

    Every page's extracted text is split into lines. Noise lines are dropped,
    lines that look like headings open a new section, and every other line is
    appended to the most recent heading (or to "Uncategorized" when no heading
    has been seen yet). Lines of a section are joined with single spaces.

    Args:
        pdf_path: Path of the PDF to read. Defaults to the originally
            hard-coded 3M 10-K file.
        output_path: Path of the file to write. Note that the output is a
            single indented JSON object, not JSON Lines, despite the default
            ".jsonl" name (kept for backward compatibility).

    Returns:
        None. The result is only written to ``output_path``.
    """
    # Compiled once per call instead of being looked up for every line.
    numbering_re = re.compile(r"^\s*[\dIVXLCDM]+(\.\d+)*\s*$")
    rule_re = re.compile(r"^\s*[-–—]+\s*$")

    def is_strong_heading(line):
        """Return True when a text line looks like a section heading.

        A line qualifies when all of the following hold:
        1. It is not blank.
        2. It is at most 50 characters long.
        3. Its first character is uppercase.
        4. If it has more than one token, none of them is tagged by spaCy
           as a verb or an adposition (such lines read like sentences).

        Font information (e.g. bold) is not available in the extracted text,
        so it is not part of the decision.
        """
        stripped = line.strip()
        if not stripped:
            return False

        if len(line) > 50:
            return False

        if not line[0].isupper():
            return False

        doc = nlp(stripped)
        if len(doc) > 1 and any(token.pos_ in ("VERB", "ADP") for token in doc):
            return False

        return True

    def is_noise(line):
        """Return True for lines that carry no section content.

        Filters out:
        - lines made only of digits / Roman-numeral letters, optionally
          followed by dotted numbers (page numbers, index entries); this
          also matches words spelled only with the letters I, V, X, L, C, D, M
        - lines made only of dashes
        - single-token lines shorter than five characters
        """
        stripped = line.strip()
        if numbering_re.match(stripped):
            return True
        if rule_re.match(stripped):
            return True
        if len(line.split()) < 2 and len(line) < 5:
            return True
        return False

    # heading -> list of body lines; joined once at the end rather than
    # growing a string with repeated "+=".
    sections = {}
    current_heading = None

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                continue

            for line in text.split('\n'):
                clean_line = line.strip()

                if not clean_line or is_noise(clean_line):
                    continue

                if is_strong_heading(clean_line):
                    current_heading = clean_line
                    # Register the heading immediately so sections without
                    # body text still appear in the output; a repeated heading
                    # keeps accumulating under its first occurrence.
                    sections.setdefault(current_heading, [])
                else:
                    target = current_heading or "Uncategorized"
                    sections.setdefault(target, []).append(clean_line)

    # Joining stripped, non-empty lines leaves no leading/trailing space for
    # any section, including "Uncategorized" (previously left unstripped).
    data = {heading: " ".join(parts) for heading, parts in sections.items()}

    with open(output_path, "w") as json_file:
        json.dump(data, json_file, indent=4)


# Run the extraction when this cell executes; the function reads its PDF and
# writes the grouped text to its output file, returning nothing.
extract_data_from_unstructured_pdf()