In [1]:
import os
import requests
from sec_api import QueryApi, PdfGeneratorApi, XbrlApi
from datetime import datetime
import time
import json
import pandas as pd
import dotenv

# Load variables from ../.env (the path is relative to the notebook's working
# directory) so that the os.getenv("SEC_API_KEY") lookup in a later cell can
# find the key without it being written into the notebook.
dotenv.load_dotenv('../.env')

True

In [2]:
# API Configuration
# The sec-api key is read from the SEC_API_KEY environment variable; it is
# never hard-coded in this notebook. Get a key from https://sec-api.io/ and
# put it in ../.env or export it in your shell before starting the kernel.
# os.getenv returns None when the variable is not set.
SEC_API_KEY = os.getenv("SEC_API_KEY")


In [3]:
# Initialize the sec-api clients, all sharing the same key:
#   query_api     - filing search (called through get_filings in the query cell)
#   pdf_generator - filing-to-PDF conversion (called through get_pdf in the PDF cell)
#   xbrl_api      - XBRL client
# NOTE(review): the XBRL downloader cell assigns xbrl_api again, so the object
# created here is replaced once that cell has run.
query_api = QueryApi(api_key=SEC_API_KEY)
pdf_generator = PdfGeneratorApi(api_key=SEC_API_KEY)
xbrl_api = XbrlApi(SEC_API_KEY)

In [4]:

# Output locations for the downloaded filings, relative to the notebook's
# working directory: one tree for generated PDFs, one for XBRL artifacts.
# Both are created up front; directories that already exist are left alone.
raw_data_dir, xbrl_data_dir = (
    "../data/raw/MSFT/10-K/PDFs",
    "../data/raw/MSFT/10-K/XBRL",
)
for _output_dir in (raw_data_dir, xbrl_data_dir):
    os.makedirs(_output_dir, exist_ok=True)



In [5]:

# Company identifiers used to build the search query and the output file names:
# the exchange ticker and Microsoft's SEC Central Index Key (CIK), written
# with its leading zeros.
ticker, cik = "MSFT", "0000789019"


In [6]:

# Search sec-api for the company's 10-K filings and leave the result in
# `filings` (a list of filing dicts) for the cells below.
#
# The filing-date window is named once so that the progress message and the
# query string cannot drift apart (the message used to say "last 2 years"
# while the query covered three fixed calendar years).
filing_date_from = "2022-01-01"
filing_date_to = "2024-12-31"

# Always define `filings`, whichever branch runs, so later cells can rely on it.
filings = []

print(f" Searching for {ticker} 10-K filings filed between {filing_date_from} and {filing_date_to}...")
print(f" PDFs will be saved to: {os.path.abspath(raw_data_dir)}")
print(f"XBRL data will be saved to:{os.path.abspath(xbrl_data_dir)} ")
if SEC_API_KEY:
    try:
        # Query for the company's 10-K filings inside the date window, newest first
        query = {
            "query": f"ticker:{ticker} AND formType:\"10-K\" AND filedAt:[{filing_date_from} TO {filing_date_to}]",
            "from": "0",
            "size": "10",
            "sort": [{"filedAt": {"order": "desc"}}]
        }

        print("🔍 Querying SEC database...")
        response = query_api.get_filings(query)

        filings = response["filings"]
        print(f"📋 Found {len(filings)} 10-K filings for {ticker}")

        if len(filings) == 0:
            print("❌ No filings found. Try adjusting the date range or check the ticker symbol.")
        else:
            print("\n📄 Available filings:")
            for i, filing in enumerate(filings):
                filed_date = filing["filedAt"][:10]  # Extract date part
                accession_no = filing["accessionNo"]
                print(f"  {i+1}. Filed: {filed_date} | Accession: {accession_no}")

    except Exception as e:
        # Any failure (network, authentication, unexpected response shape)
        # leaves an empty result so the downstream cells skip cleanly.
        print(f"❌ Error querying filings: {e}")
        print("Make sure your API key is valid and you have internet connection.")
        filings = []
else:
    print("⏩ Skipping API calls - please configure your API key first")
    filings = []


 Searching for MSFT 10-K filings from the last 2 years...
 PDFs will be saved to: /Users/smatcha/Documents/BigData/pdf-parser/data/raw/MSFT/10-K/PDFs
XBRL data will be saved to:/Users/smatcha/Documents/BigData/pdf-parser/data/raw/MSFT/10-K/XBRL 
🔍 Querying SEC database...
📋 Found 3 10-K filings for MSFT

📄 Available filings:
  1. Filed: 2024-07-30 | Accession: 0000950170-24-087843
  2. Filed: 2023-07-27 | Accession: 0000950170-23-035122
  3. Filed: 2022-07-28 | Accession: 0001564590-22-026876


In [7]:
# Generate PDFs from the found filings using sec-api PdfGeneratorApi.
#
# Preconditions are checked first, in order of what the user must fix first:
#   1. an API key must be configured (a missing key arrives as None from
#      os.getenv, so it is tested for truthiness; the literal placeholder is
#      also treated as "not configured");
#   2. the query cell must have produced a non-empty `filings` list.
if not SEC_API_KEY or SEC_API_KEY == "YOUR_API_KEY_HERE":
    print("⚠️  Please configure your SEC API key first before generating PDFs")
    print("📋 Steps:")
    print("  1. Sign up at https://sec-api.io/")
    print("  2. Get your API key from the dashboard")
    print("  3. Set SEC_API_KEY in ../.env (or in your environment)")
    print("  4. Restart the kernel and re-run the notebook from the top")

elif 'filings' not in locals() or len(filings) == 0:
    print("❌ No filings available for PDF generation")
    print("Please run the first cell successfully to fetch filings first")

else:
    print("🔄 Converting SEC filings to PDF format...")
    print(f"📂 Saving PDFs to: {raw_data_dir}")

    # Counts every filing whose PDF is on disk after this cell, including
    # files that were already present from an earlier run.
    successfully_downloaded = 0

    for i, filing in enumerate(filings):
        try:
            # Extract filing information
            filed_date = filing["filedAt"][:10].replace("-", "")  # Format: YYYYMMDD
            accession_no = filing["accessionNo"].replace("-", "")
            # A form type such as "10-K/A" contains a path separator; replace
            # it so the value can be embedded in a file name.
            form_type = filing["formType"].replace("/", "-")

            # Create filename
            pdf_filename = f"{ticker}_{form_type}_{filed_date}_{accession_no}.pdf"
            pdf_path = os.path.join(raw_data_dir, pdf_filename)

            # Skip if file already exists
            if os.path.exists(pdf_path):
                print(f"  ⏩ Skipping {pdf_filename} (already exists)")
                successfully_downloaded += 1
                continue

            print(f"  🔄 Processing filing {i+1}/{len(filings)}: {filed_date}")

            # Get the filing URL
            filing_url = filing["linkToFilingDetails"]

            # Generate PDF using sec-api
            print(f"    📄 Generating PDF from: {filing_url}")

            # Add a small delay to respect rate limits
            if i > 0:
                time.sleep(1)  # 1 second delay between requests

            pdf_content = pdf_generator.get_pdf(filing_url)

            # Write to a temporary name and rename only after the write has
            # finished. Writing straight to pdf_path would let an interrupted
            # write leave a truncated file that every later run skips as
            # "already exists". A leftover ".part" file is harmless: it is
            # overwritten on the next attempt and ignored by the listing below.
            pdf_tmp_path = pdf_path + ".part"
            with open(pdf_tmp_path, 'wb') as pdf_file:
                pdf_file.write(pdf_content)
            os.replace(pdf_tmp_path, pdf_path)

            # Check file size
            file_size = os.path.getsize(pdf_path) / (1024 * 1024)  # Size in MB

            if file_size > 0.1:  # At least 100KB
                print(f"    ✅ Successfully saved: {pdf_filename} ({file_size:.1f} MB)")
            else:
                # Suspiciously small output is reported but still counted,
                # because a file was produced.
                print(f"    ⚠️  Warning: Small file size for {pdf_filename} ({file_size:.1f} MB)")
            successfully_downloaded += 1

        except Exception as e:
            # One failing filing must not stop the remaining ones.
            print(f"    ❌ Error processing filing {i+1}: {e}")
            continue

    print(f"\n📊 Download Summary:")
    print(f"  Total filings found: {len(filings)}")
    print(f"  PDFs successfully downloaded: {successfully_downloaded}")
    print(f"  Success rate: {(successfully_downloaded/len(filings)*100):.1f}%")

    # List final PDF files
    if os.path.exists(raw_data_dir):
        pdf_files = [f for f in os.listdir(raw_data_dir) if f.lower().endswith('.pdf')]

        if pdf_files:
            print(f"\n📋 PDF files ready for parsing in {raw_data_dir}:")
            total_size = 0
            for pdf_file in sorted(pdf_files):
                file_path = os.path.join(raw_data_dir, pdf_file)
                file_size = os.path.getsize(file_path) / (1024 * 1024)
                total_size += file_size
                print(f"  📄 {pdf_file} ({file_size:.1f} MB)")

            print(f"\n💾 Total size: {total_size:.1f} MB")
            print(f"🎯 Ready for PDF parsing with your PDF parser!")
        else:
            print(f"\n❌ No PDF files were created. Check the error messages above.")


🔄 Converting SEC filings to PDF format...
📂 Saving PDFs to: ../data/raw/MSFT/10-K/PDFs
  ⏩ Skipping MSFT_10-K_20240730_000095017024087843.pdf (already exists)
  ⏩ Skipping MSFT_10-K_20230727_000095017023035122.pdf (already exists)
  ⏩ Skipping MSFT_10-K_20220728_000156459022026876.pdf (already exists)

📊 Download Summary:
  Total filings found: 3
  PDFs successfully downloaded: 3
  Success rate: 100.0%

📋 PDF files ready for parsing in ../data/raw/MSFT/10-K/PDFs:
  📄 MSFT_10-K_20220728_000156459022026876.pdf (11.7 MB)
  📄 MSFT_10-K_20230727_000095017023035122.pdf (11.9 MB)
  📄 MSFT_10-K_20240730_000095017024087843.pdf (12.2 MB)

💾 Total size: 35.8 MB
🎯 Ready for PDF parsing with your PDF parser!


In [8]:
# === XBRL Downloader — one cell drop-in ===

import os
import re
import io
import json
import time
import zipfile
import unicodedata
from pathlib import Path
from typing import Dict, List, Optional

import requests

# Optional (only if you also want JSON/CSV)
try:
    from sec_api import XbrlApi
except Exception:
    XbrlApi = None  # sec-api not installed/available; raw XBRL download will still work


# ==============================
# Configuration (safe defaults)
# ==============================
# You can override these earlier in your notebook.
# Each value is taken from the notebook namespace when an earlier cell defined
# it; the literal on the right is only the fallback for a standalone run.
ticker = globals().get("ticker", "MSFT")
cik = globals().get("cik", "0000789019")  # Microsoft (with leading zeros)
xbrl_data_dir = globals().get("xbrl_data_dir", f"./data/xbrl/{ticker}")

# If you also want JSON/CSV from sec-api, set to True and provide your SEC_API_KEY
PRODUCE_JSON_AND_CSV = False
# The environment variable wins; a SEC_API_KEY already defined in the notebook
# is used only when the environment variable is absent.
SEC_API_KEY = os.environ.get("SEC_API_KEY", globals().get("SEC_API_KEY"))
# NOTE: this rebinds xbrl_api. With PRODUCE_JSON_AND_CSV = False (or when the
# sec-api import failed, or no key is available) it becomes None, replacing any
# client an earlier cell created under the same name.
xbrl_api = XbrlApi(SEC_API_KEY) if (PRODUCE_JSON_AND_CSV and XbrlApi and SEC_API_KEY) else None


# ==============================
# User-Agent (ASCII-only to avoid latin-1 errors)
# ==============================
def ascii_http_header(s: str) -> str:
    """
    Convert arbitrary text to a printable-ASCII string usable as an HTTP header value.

    Steps:
    - Replace en/em dashes with '-'
    - NFKD-normalize, so accented letters split into a base letter plus
      combining marks and compatibility characters (e.g. ligatures) expand
    - Drop the combining marks, keeping the base letter ("é" -> "e")
    - Replace every remaining character outside printable ASCII (space..'~')
      with '-'; this includes control characters such as CR/LF, which must
      not appear inside a header value

    Args:
        s: Text to sanitize. ``None`` or an empty string is accepted.

    Returns:
        The sanitized string (``""`` for empty/``None`` input). Pure-ASCII
        printable input is returned unchanged.
    """
    if not s:
        return ""
    s = s.replace("\u2013", "-").replace("\u2014", "-")
    decomposed = unicodedata.normalize("NFKD", s)
    cleaned = []
    for ch in decomposed:
        if unicodedata.combining(ch):
            # Accent split off by NFKD: the base letter is already kept.
            continue
        cleaned.append(ch if " " <= ch <= "~" else "-")
    return "".join(cleaned)

# Raw User-Agent text: the SEC_USER_AGENT environment variable when it is set
# (even to an empty string), otherwise the built-in default below.
UA_RAW = os.environ.get(
    "SEC_USER_AGENT",
    # Put your name/email/purpose here to comply with SEC fair access guidance.
    # NOTE(review): the default names a specific person; set SEC_USER_AGENT
    # (or edit this string) so requests identify whoever actually runs them.
    "Riyanshi Kedia riyanshi@example.com - academic XBRL research"
)
# Sanitized header value. If sanitizing leaves nothing (for example an empty
# SEC_USER_AGENT), fall back to a generic identifier so the header is never blank.
USER_AGENT = ascii_http_header(UA_RAW).strip() or "pdf-parser - contact@example.com"
# Keyword arguments shared by every requests.get call in this cell:
# the identifying header and a 30-second timeout.
REQUEST_KW = {"headers": {"User-Agent": USER_AGENT}, "timeout": 30}

print("Using User-Agent:", USER_AGENT)


# ==============================
# Helper functions
# ==============================
def _clean_cik(cik_str: str) -> str:
    """Strip leading zeros from CIK for EDGAR paths."""
    return str(int(str(cik_str).strip()))

def _clean_accession(acc: str) -> str:
    """Remove hyphens from accession number for path segments."""
    return str(acc).replace("-", "").strip()

def submission_base_url(cik_str: str, accession_no: str) -> str:
    """Build the EDGAR archive folder URL for one submission.

    The folder is addressed by the CIK without leading zeros and the
    accession number without hyphens,
    e.g. https://www.sec.gov/Archives/edgar/data/789019/000156459022026876
    """
    cik_segment = _clean_cik(cik_str)
    accession_segment = _clean_accession(accession_no)
    return f"https://www.sec.gov/Archives/edgar/data/{cik_segment}/{accession_segment}"

def xbrl_zip_url(cik_str: str, accession_no: str) -> str:
    """Build the URL of the submission's bundled XBRL archive.

    The archive lives inside the submission folder and is named after the
    accession number exactly as passed in (hyphens are NOT stripped here),
    e.g. .../000156459022026876/0001564590-22-026876-xbrl.zip
    """
    folder_url = submission_base_url(cik_str, accession_no)
    archive_name = f"{accession_no}-xbrl.zip"
    return f"{folder_url}/{archive_name}"

def index_json_url(cik_str: str, accession_no: str) -> str:
    """Build the URL of the JSON directory listing (index.json) of one submission folder."""
    folder_url = submission_base_url(cik_str, accession_no)
    return f"{folder_url}/index.json"


# Common XBRL file patterns (instance + linkbases + schema + (i)XBRL HTML)
XBRL_FILE_PATTERNS = [
    r".*?-ins\.xml$",    # instance
    r".*?-pre\.xml$",    # presentation
    r".*?-cal\.xml$",    # calculation
    r".*?-def\.xml$",    # definition
    r".*?-lab\.xml$",    # labels
    r".*?\.xsd$",        # schema
    r".*?\.(?:htm|html)$",  # (inline) XBRL HTML
]
_XBRL_REGEXES = [re.compile(p, re.IGNORECASE) for p in XBRL_FILE_PATTERNS]

def looks_like_xbrl(filename: str) -> bool:
    return any(rx.search(filename) for rx in _XBRL_REGEXES)


def download_xbrl_zip_or_files(cik_str: str, accession_no: str, dest_dir: str) -> Dict[str, List[str]]:
    """
    Download the XBRL artifacts of one EDGAR submission into ``dest_dir``.

    Strategy:
      1. Try the bundled ``<accession>-xbrl.zip``. When the response is a
         valid archive it is saved and extracted into ``dest_dir``.
      2. Otherwise read the submission's ``index.json`` and download every
         listed file whose name passes ``looks_like_xbrl``.

    Args:
        cik_str: Company CIK (leading zeros allowed).
        accession_no: Accession number in hyphenated form.
        dest_dir: Target folder; created when missing.

    Returns:
        ``{"mode": "zip" | "files", "saved": [list_of_paths]}``. In zip mode
        the list holds the extracted member paths followed by the archive path.

    Raises:
        RuntimeError: when index.json is not accessible, lists no items, or
            no listed file could be saved.
        requests.RequestException: from the fallback requests (the ZIP
            attempt reports its own errors and falls through instead).
    """
    os.makedirs(dest_dir, exist_ok=True)
    saved: List[str] = []

    # 1) Try ZIP first
    zip_url = xbrl_zip_url(cik_str, accession_no)
    try:
        r = requests.get(zip_url, **REQUEST_KW)
        if r.status_code == 200 and r.content:
            zip_path = os.path.join(dest_dir, f"{_clean_accession(accession_no)}-xbrl.zip")
            # Open the archive from memory BEFORE touching the disk: a 200
            # response that is not really a zip raises here, so no invalid
            # .zip file is left behind to make the folder look populated.
            with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
                with open(zip_path, "wb") as f:
                    f.write(r.content)
                # Extract contents for convenience
                zf.extractall(dest_dir)
                saved.extend(str(Path(dest_dir, name)) for name in zf.namelist())
            saved.append(zip_path)
            return {"mode": "zip", "saved": saved}
        # else: fall through to index.json mode
    except Exception as e:
        # Best effort: any ZIP problem is reported and the per-file fallback runs.
        print(f"    ℹ️ ZIP attempt failed: {e}")

    # 2) Fallback: enumerate directory and download files individually
    idx_url = index_json_url(cik_str, accession_no)
    r = requests.get(idx_url, **REQUEST_KW)
    if r.status_code != 200:
        raise RuntimeError(f"index.json not accessible (status {r.status_code}) at {idx_url}")

    data = r.json()
    items = (data.get("directory") or {}).get("item") or []
    if not items:
        raise RuntimeError("No items found in submission index.json")

    base = submission_base_url(cik_str, accession_no)
    for it in items:
        name = it.get("name")
        if not name:
            continue
        if looks_like_xbrl(name):
            file_url = f"{base}/{name}"
            out_path = os.path.join(dest_dir, name)
            Path(out_path).parent.mkdir(parents=True, exist_ok=True)
            rr = requests.get(file_url, **REQUEST_KW)
            if rr.status_code == 200 and rr.content:
                with open(out_path, "wb") as f:
                    f.write(rr.content)
                saved.append(out_path)
            else:
                # Make skipped files visible instead of dropping them silently.
                print(f"    ⚠️ Skipped {name} (status {rr.status_code})")
            time.sleep(0.2)  # be polite with SEC

    if not saved:
        raise RuntimeError("No XBRL-like files matched in submission directory")

    return {"mode": "files", "saved": saved}


# Optional: flatten key statements from sec-api JSON (if PRODUCE_JSON_AND_CSV=True)
def extract_financial_statements_for_csv(x: dict) -> List[dict]:
    """
    Flatten the three primary statements of an XBRL-to-JSON document into rows.

    Only the ``StatementsOfIncome``, ``BalanceSheets`` and
    ``StatementsOfCashFlows`` sections are read. Within each, every concept
    whose value is a list contributes one row per fact dict that has a
    ``"value"`` key; anything with a different shape is skipped.

    Args:
        x: Parsed JSON document. A non-dict value yields no rows.

    Returns:
        A list of dicts with the keys ``statement``, ``concept``, ``value``,
        ``unit``, ``period_instant``, ``period_start``, ``period_end`` and
        ``segment``. Missing fields are ``None``. ``unit`` is read from the
        fact's ``unitRef`` field, falling back to ``unit`` when that is absent.
    """
    rows: List[dict] = []
    if not isinstance(x, dict):
        return rows

    for stmt_key in ("StatementsOfIncome", "BalanceSheets", "StatementsOfCashFlows"):
        block = x.get(stmt_key)
        if not isinstance(block, dict):
            continue
        for concept, facts in block.items():
            if not isinstance(facts, list):
                continue
            for fact in facts:
                if not (isinstance(fact, dict) and "value" in fact):
                    continue
                # Guard against a missing or non-dict period so one odd fact
                # cannot abort the whole export.
                period = fact.get("period")
                if not isinstance(period, dict):
                    period = {}
                rows.append({
                    "statement": stmt_key,
                    "concept": concept,
                    "value": fact.get("value"),
                    "unit": fact.get("unitRef", fact.get("unit")),
                    "period_instant": period.get("instant"),
                    "period_start": period.get("startDate"),
                    "period_end": period.get("endDate"),
                    "segment": fact.get("segment"),
                })
    return rows


# ==============================
# Main routine
# ==============================
def store_xbrl_from_filings(
    filings: List[dict],
    ticker: str,
    cik_str: str,
    out_root: str,
    also_save_json_csv: bool = False,
    xbrl_api_obj: Optional["XbrlApi"] = None,
):
    """
    Download and store the raw XBRL artifacts of each filing.

    Every filing gets its own folder ``<out_root>/<ticker>_<form>_<date>_xbrl``.
    A folder that already contains something is treated as done and skipped.
    Errors on one filing are printed and do not stop the others. A summary is
    printed at the end.

    Args:
        filings: Filing dicts with the keys ``filedAt`` and ``accessionNo``
            (required) and ``formType`` (optional, defaults to "UNKNOWN").
            May be empty.
        ticker: Ticker used in folder and file names.
        cik_str: Company CIK passed on to the downloader.
        out_root: Root output folder; created when missing.
        also_save_json_csv: When True and ``xbrl_api_obj`` is given, also save
            the result of ``xbrl_to_json(accession_no=...)`` as JSON and a
            flattened CSV next to the raw files.
        xbrl_api_obj: Client providing ``xbrl_to_json``; optional.

    Returns:
        None.
    """
    print("\n📦 Downloading XBRL artifacts from filings...")
    print(f"📂 Output root: {out_root}")
    Path(out_root).mkdir(parents=True, exist_ok=True)

    total = len(filings)
    ok = 0
    for i, filing in enumerate(filings):
        try:
            filed_date = str(filing["filedAt"])[:10].replace("-", "")
            accession_no = filing["accessionNo"]
            form_type = filing.get("formType", "UNKNOWN")
            # A form type such as "10-K/A" contains a path separator; use a
            # sanitized copy for names on disk and keep the original for display.
            safe_form_type = str(form_type).replace("/", "-")
            name_stem = f"{ticker}_{safe_form_type}_{filed_date}"

            filing_dir = os.path.join(out_root, f"{name_stem}_xbrl")
            if os.path.exists(filing_dir) and any(Path(filing_dir).glob("*")):
                print(f"  ⏩ [{i+1}/{total}] {form_type} {filed_date} already present.")
                ok += 1
                continue

            print(f"  📥 [{i+1}/{total}] {form_type} {filed_date} → {filing_dir}")
            res = download_xbrl_zip_or_files(cik_str, accession_no, filing_dir)
            print(f"    ✅ Saved {len(res['saved'])} file(s) via {res['mode']}")
            ok += 1

            # Optional: JSON & CSV via sec-api
            if also_save_json_csv and xbrl_api_obj:
                try:
                    xbrl_json = xbrl_api_obj.xbrl_to_json(accession_no=accession_no)
                    json_path = os.path.join(filing_dir, f"{name_stem}_financials.json")
                    with open(json_path, "w", encoding="utf-8") as f:
                        json.dump(xbrl_json, f, indent=2)

                    rows = extract_financial_statements_for_csv(xbrl_json)
                    if rows:
                        # Imported here so the raw download works without pandas.
                        import pandas as pd
                        pd.DataFrame(rows).to_csv(
                            os.path.join(filing_dir, f"{name_stem}_financials.csv"),
                            index=False,
                        )
                    print("    🧾 JSON/CSV saved from sec-api xbrl_to_json(accession_no=...)")
                except Exception as je:
                    # The raw artifacts are already saved; this extra step is best effort.
                    print(f"    ⚠️ sec-api JSON/CSV step failed: {je}")

            time.sleep(0.5)  # SEC rate-limit courtesy

        except Exception as e:
            print(f"    ❌ Error on filing {i+1}: {e}")
            continue

    # Guard the percentage: an empty filings list must not divide by zero.
    success_rate = (ok / total * 100) if total else 0.0
    print("\n📊 XBRL Download Summary")
    print(f"  Total filings: {total}")
    print(f"  With XBRL saved: {ok}")
    print(f"  Success rate: {success_rate:.1f}%")


# ==============================
# Execute if `filings` is available
# ==============================
# Run the downloader only when an earlier cell left a non-empty list named
# `filings` in the notebook namespace; otherwise explain what is missing.
_available_filings = globals().get("filings")
if isinstance(_available_filings, list) and _available_filings:
    store_xbrl_from_filings(
        filings=filings,
        ticker=ticker,
        cik_str=cik,
        out_root=xbrl_data_dir,
        also_save_json_csv=PRODUCE_JSON_AND_CSV,
        xbrl_api_obj=xbrl_api,
    )
else:
    print("ℹ️ `filings` not found or empty. Provide a list of filing dicts with keys "
          "filedAt, accessionNo, formType (links optional), then re-run this cell.")

Using User-Agent: Riyanshi Kedia riyanshi@example.com - academic XBRL research

📦 Downloading XBRL artifacts from filings...
📂 Output root: ../data/raw/MSFT/10-K/XBRL
  📥 [1/3] 10-K 20240730 → ../data/raw/MSFT/10-K/XBRL/MSFT_10-K_20240730_xbrl
    ✅ Saved 15 file(s) via zip
  📥 [2/3] 10-K 20230727 → ../data/raw/MSFT/10-K/XBRL/MSFT_10-K_20230727_xbrl
    ✅ Saved 13 file(s) via zip
  📥 [3/3] 10-K 20220728 → ../data/raw/MSFT/10-K/XBRL/MSFT_10-K_20220728_xbrl
    ✅ Saved 13 file(s) via zip

📊 XBRL Download Summary
  Total filings: 3
  With XBRL saved: 3
  Success rate: 100.0%
