In [None]:
!pip install U langchain-community pypdf

In [2]:


from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("/content/nvidea_10K.pdf")
docs = loader.load()

full_text = "\n".join(doc.page_content for doc in docs)


In [3]:
import re

ITEM_PATTERN = re.compile(
    r"(ITEM\s+1A\.|ITEM\s+1\.|ITEM\s+7\.)",
    re.IGNORECASE
)

parts = ITEM_PATTERN.split(full_text)

sections = {}
current_item = None

for part in parts:
    part = part.strip()
    if re.match(ITEM_PATTERN, part):
        current_item = part.upper()
        sections[current_item] = ""
    elif current_item:
        sections[current_item] += part

print("Extracted items:", sections.keys())


Extracted items: dict_keys(['ITEM 1.', 'ITEM 1A.', 'ITEM 7.'])


In [4]:
MDNA_KEYWORDS = {
     "Results": [
        "results of operations",
        "net revenue",
        "net income"
    ],
    "Liquidity": [
        "liquidity and capital resources",
        "funding and liquidity",
        "cash flows"
    ],
    "Outlook": [
        "outlook",
        "forward-looking",
        "future expectations",
        "economic conditions"
    ]
}

def extract_mdna(mdna_text):
    mdna_text_lower = mdna_text.lower()
    extracted = {}

    for name, keys in MDNA_KEYWORDS.items():
        for k in keys:
            if k in mdna_text_lower:
                start = mdna_text_lower.find(k)
                extracted[name] = mdna_text[start:start+8000]
                break
    return extracted

mdna_sections = extract_mdna(sections.get("ITEM 7.", ""))

mdna_sections.keys()


dict_keys(['Results'])

In [5]:
def split_sentences(text):
    return re.split(r'(?<=[.!?])\s+', text)


In [6]:
RISK_KEYWORDS = [
    "risk", "may", "could", "adverse", "uncertain",
    "impact", "negatively", "disruption", "depend",
    "regulatory", "competition", "supply"
]

LIQUIDITY_KEYWORDS = [
    "cash", "liquidity", "capital", "debt",
    "credit", "financing", "obligations"
]

OUTLOOK_KEYWORDS = [
    "expect", "anticipate", "forecast",
    "trend", "outlook", "future"
]


In [7]:
def keyword_score(sentence, keywords):
    s = sentence.lower()
    return sum(1 for k in keywords if k in s)


In [8]:
def select_by_keywords(text, keywords, max_chars=10000):
    sentences = split_sentences(text)
    ranked = sorted(sentences, key=lambda s: keyword_score(s, keywords), reverse=True)

    selected = []
    total = 0

    for s in ranked:
        if total + len(s) <= max_chars:
            selected.append(s)
            total += len(s)
        if total >= max_chars:
            break

    return " ".join(selected)


In [9]:
def process_section(name, text, max_chars=10000):
    text = text.strip()

    if name == "Risk Factors":
        return select_by_keywords(text, RISK_KEYWORDS, max_chars)

    if name == "Liquidity":
        return select_by_keywords(text, LIQUIDITY_KEYWORDS, max_chars)

    if name == "Outlook":
        return select_by_keywords(text, OUTLOOK_KEYWORDS, max_chars)

    return text[:max_chars]


In [10]:
synthetic_filing = {
    "filing_id": "nvidea_10K_2025",
    "sections": {
        "Business": process_section("Business", sections.get("ITEM 1.", "")),
        "Risk Factors": process_section("Risk Factors", sections.get("ITEM 1A.", "")),
        "Results": process_section("Results", mdna_sections.get("Results", "")),
        "Liquidity": process_section("Liquidity", mdna_sections.get("Liquidity", "")),
        "Outlook": process_section("Outlook", mdna_sections.get("Outlook", ""))
    }
}


In [None]:
for k, v in synthetic_filing["sections"].items():
    print(k, "→", len(v), "characters")


In [None]:
import json

with open("nvidea.json", "w") as f:
    json.dump(synthetic_filing, f, indent=2)

Saved apple_synthetic.json


In [84]:
import json
with open("/content/nvidea.json", "r") as f:
    other_filing = json.load(f)

for k, v in other_filing["sections"].items():
    print(k, "→", len(v), "characters")


Business → 10000 characters
Risk Factors → 10032 characters
Results → 8000 characters
Liquidity → 7976 characters
Outlook → 7990 characters
