# Extract "Item 7. Management's Discussion and Analysis..." (MD&A) from 10-K files.

Notes:
- Finds Item 7 start using tolerant regex variants (handles OCR/HTML artifacts like "Management s" or “Management’s”).
- Stops at the first boundary: Item 7A or Item 8 (tolerant to variants).
- Writes UTF-8 output files, logs a summary table at the end.

In [24]:
import re
import sys
import argparse

import pandas as pd
from pathlib import Path

# ---------- Normalization helpers ----------
def normalize_text(t: str) -> str:
    """
    Return `t` with common 10-K text artifacts normalized.

    Steps, in order:
      1. Map non-breaking spaces, typographic quotes and dash variants to
         their plain ASCII counterparts.
      2. Collapse every run of spaces/tabs into a single space (newlines are
         left alone so the block structure survives).
      3. Replace opening/closing block-level HTML tags (p, div, br, tr, li,
         h1-h6) with a newline.
      4. Squeeze runs of three or more newlines down to two.
    """
    # Every key is a single character, so one translate() pass is equivalent
    # to replacing them one after another.
    char_map = str.maketrans({
        "\u00a0": " ",                                  # nbsp
        "\u2018": "'", "\u2019": "'", "\u201B": "'",    # single quotes
        "\u201C": '"', "\u201D": '"',                   # double quotes
        "\u2212": "-", "\u2013": "-", "\u2014": "-",    # dashes
    })
    cleaned = t.translate(char_map)

    # (pattern, replacement) pairs, applied strictly in this order.
    rewrites = (
        (r"[ \t]+", " "),                                   # collapse spaces/tabs
        (r"(?i)</?(p|div|br|tr|li|h[1-6])[^>]*>", "\n"),    # block tags -> newline
        (r"\n{3,}", "\n\n"),                                # squeeze blank lines
    )
    for pattern, replacement in rewrites:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# ---------- Regex patterns ----------
# Flexible token for "Item" with optional punctuation and spaces
ITEM = r"ITEM\s*"
# Allow “7” with optional punctuation and spaces
NUM7 = r"7\s*[\.\)]?\s*"

# “Management's Discussion and Analysis of Financial Condition and Results of Operations”
# Be tolerant to: missing apostrophe, curly quotes, extra spaces, line breaks, tag debris.
MDNA   = r"MANAGEMENT\s*['’]?\s*S?\s*DISCUSSION\s*AND\s*ANALYSIS\s*OF\s*FINANCIAL\s*CONDITION\s*AND\s*RESULTS\s*OF\s*OPERATIONS"

# Start anchors (several variants, OR-ed later)
START_PATTERNS = [
    re.compile(rf"^[ \t]*{ITEM}{NUM7}{MDNA}.*?$", flags=re.I | re.M),
    re.compile(rf"{ITEM}{NUM7}.*?{MDNA}",          flags=re.I | re.S),
]

# End boundaries: Item 7A OR Item 8
ITEM_7A = rf"(?im)^[ \t]*{ITEM}7\s*A\s*[\.\)]?\s*(?:QUANTITATIVE.*?MARKET\s*RISK|$)"
ITEM_8  = rf"(?im)^[ \t]*{ITEM}8\s*[\.\)]?\s*(?:FINANCIAL\s*STATEMENTS|$)"
END_BOUNDARY = re.compile(
    rf"^[ \t]*{ITEM}7\s*A\s*[\.\)]?.*$"      # Item 7A...
    rf"|^[ \t]*{ITEM}8\s*[\.\)]?.*$",        # or Item 8...
    flags=re.I | re.M
)

def find_item7_block(doc_text: str) -> str | None:
    """
    doc_text should be a fairly 'clean' text (normalized, light de-HTML).
    Returns the Item 7 block (second occurrence, to skip TOC) or None.
    """
    # Collect all plausible Item 7 start positions
    starts = []
    for pat in START_PATTERNS:
        for m in pat.finditer(doc_text):
            starts.append(m.start())
    # Sort to make sure earliest first
    starts = sorted(starts)

    if not starts:
        # Fallback: standalone "Item 7" header
        loose = list(re.finditer(r"^[ \t]*ITEM\s*7\s*[\.\)]?\s*$",
                                 doc_text, flags=re.I | re.M))
        if not loose:
            return None
        # if multiple, pick the second
        start_idx = loose[1].start() if len(loose) > 1 else loose[0].start()
    else:
        # if multiple, pick the second; else the first
        start_idx = starts[1] if len(starts) > 1 else starts[0]

    # Find first boundary after start
    tail = doc_text[start_idx:]
    m_end = END_BOUNDARY.search(tail)
    end_idx = start_idx + m_end.start() if m_end else len(doc_text)

    chunk = doc_text[start_idx:end_idx].strip()
    return chunk if len(chunk) >= 400 else None


def read_file(path: Path) -> str:
    """
    Read `path` as bytes and return its content decoded as UTF-8.

    Byte sequences that are not valid UTF-8 are silently dropped
    (errors="ignore"), so the result can be shorter than the file. With that
    error handler the decode cannot fail, which is why there is no fallback
    encoding: the former latin-1 `except` branch could never run.

    Line endings are returned exactly as stored (no newline translation).
    """
    return path.read_bytes().decode("utf-8", errors="ignore")


In [25]:
import os

# Folder with the sample 10-K filings. Built with os.path.join so the cell also
# runs on systems where "\" is not a path separator.
sample_data_path = os.path.join(".", "data", "l6", "sample10k")

# Collect one dict per successfully extracted filing and build the DataFrame
# once at the end instead of growing it row by row.
rows = []
for entry in sorted(os.listdir(sample_data_path)):
    p = Path(sample_data_path) / entry
    if not p.is_file():
        # Sub-directories (e.g. notebook checkpoint folders) are not filings.
        continue

    # File names look like "20241001_10-K_edgar_data_1020859_0001020859-24-000045.txt":
    # the last "_"-separated token is stored as `filename`, the one before it as `cik`.
    file_info = entry.split(".")[0].split("_")
    if len(file_info) < 2:
        print(f"[SKIP] {p.name}  (unexpected file name, cannot read cik)")
        continue
    cik = file_info[-2]
    file_name = file_info[-1]

    raw = read_file(p)
    text = normalize_text(raw)

    # Matching runs on a "lightly de-tagged" version: block tags were already
    # turned into newlines by normalize_text, the remaining tags become spaces.
    detag = re.sub(r"(?is)<[^>]+>", " ", text)
    detag = normalize_text(detag)

    item7 = find_item7_block(detag)
    if item7:
        rows.append({"filename": file_name, "cik": cik, "item7_text": item7})
    else:
        print(f"[MISS] {p.name}  (Item 7 not found with current patterns)")

item7_df = pd.DataFrame(rows, columns=["filename", "cik", "item7_text"])

print(f"Extracted Item 7 from {item7_df.shape[0]} files.")

[MISS] 20241001_10-K_edgar_data_1020859_0001020859-24-000045.txt  (Item 7 not found with current patterns)
[MISS] 20241001_10-K_edgar_data_1796949_0001654954-24-012526.txt  (Item 7 not found with current patterns)
[MISS] 20241001_10-K_edgar_data_1836754_0001493152-24-038926.txt  (Item 7 not found with current patterns)
[MISS] 20241001_10-K_edgar_data_69422_0001493152-24-038852.txt  (Item 7 not found with current patterns)
[MISS] 20241002_10-K_edgar_data_1624985_0001683168-24-006824.txt  (Item 7 not found with current patterns)
[MISS] 20241002_10-K_edgar_data_1885849_0001885849-24-000015.txt  (Item 7 not found with current patterns)
[MISS] 20241003_10-K_edgar_data_1297341_0001213900-24-085121.txt  (Item 7 not found with current patterns)
[MISS] 20241003_10-K_edgar_data_1592057_0001592057-24-000036.txt  (Item 7 not found with current patterns)
[MISS] 20241003_10-K_edgar_data_1916879_0001065949-24-000110.txt  (Item 7 not found with current patterns)
[MISS] 20241004_10-K_edgar_data_1377789

In [23]:
# Display the (rows, columns) shape of the table of extracted Item 7 sections.
item7_df.shape

(49, 3)

In [9]:
# NOTE(review): `chunk` is not assigned at notebook scope in any cell shown
# here; presumably left over from an interactive debugging session. Confirm
# where it comes from, or remove this cell.
chunk

"Item 7. \r\n Management's Discussion and Analysis of Financial Condition and Results of Operations \r\n 28"