In [6]:
# ------------------------- CONFIG -------------------------
# Tunable settings for the pipeline; these are used as the default arguments
# of analyze_portfolio() and as the sentence-length filter in split_sentences().
OUTPUT_DIR = "sec_swot_output"  # where to save CSVs + JSONs
PORTFOLIO_DIR = "sec_portfolio"  # datamule Portfolio working directory
TICKERS = ["AAPL"]  # modify: list of tickers to download
FORMS = ["10-K"]  # SEC form types requested from download_submissions
DATE_RANGE = ("2023-01-01", "2024-12-31")  # (start_date, end_date) or None
MAX_SENTENCE_LENGTH = 500  # sentences longer than this many characters are dropped
MIN_SENTENCE_LENGTH = 30  # sentences shorter than this many characters are dropped

In [5]:
# ------------------------- IMPORTS -------------------------
import os
import json
import re
from pathlib import Path
from tqdm import tqdm

# datamule is a hard requirement of this notebook: fail fast with an
# actionable install hint instead of a bare import traceback.
try:
    from datamule import Portfolio
except ImportError as exc:
    # Only a genuine import failure means "not installed"; any other error
    # raised while importing datamule propagates unchanged. Chaining with
    # `from exc` keeps the underlying cause visible in the traceback.
    raise RuntimeError("datamule is required. Install with: pip install datamule") from exc

import pandas as pd

# The four SWOT categories, in the order they appear in each report.
# NOTE(review): no transformers/torch zero-shot model is imported in this
# notebook; sentences are labelled by the keyword rules in weak_label() only.
candidate_labels = ["Strength", "Weakness", "Opportunity", "Threat"]

In [3]:
# ------------------------- UTILITIES -------------------------

def ensure_dir(path):
    """Create directory ``path``, including missing parents; no-op if it already exists."""
    directory = Path(path)
    directory.mkdir(exist_ok=True, parents=True)


def clean_text(text: str) -> str:
    """Collapse all whitespace in ``text`` to single spaces and trim the ends.

    Non-string input yields an empty string.
    """
    if isinstance(text, str):
        # Carriage returns / form feeds become newlines first, then every
        # whitespace run (newlines included) is squashed to one space.
        unified = re.sub(r"[\r\x0c]+", "\n", text)
        return re.sub(r"\s+", " ", unified).strip()
    return ""


def split_sentences(text: str, min_length=None, max_length=None):
    """Split ``text`` into sentences and keep those within a length window.

    The split is a naive punctuation-based one: a boundary is any whitespace
    run that follows ``.``, ``!`` or ``?``.

    Args:
        text: Text to split. Empty or ``None`` input yields an empty list.
        min_length: Minimum sentence length in characters (inclusive).
            Defaults to the module-level ``MIN_SENTENCE_LENGTH``.
        max_length: Maximum sentence length in characters (inclusive).
            Defaults to the module-level ``MAX_SENTENCE_LENGTH``.

    Returns:
        List of stripped sentences whose length lies in the window, in
        their original order.
    """
    if not text:
        return []
    # Resolve the defaults at call time so later edits to the config cell are honoured.
    lower = MIN_SENTENCE_LENGTH if min_length is None else min_length
    upper = MAX_SENTENCE_LENGTH if max_length is None else max_length
    candidates = (piece.strip() for piece in re.split(r'(?<=[.!?])\s+', text))
    return [sent for sent in candidates if lower <= len(sent) <= upper]


def extract_text_from_contents(contents):
    """Flatten a nested structure of dicts and lists into a list of its strings.

    Strings are collected depth-first in iteration order; values that are
    not a str, dict or list contribute nothing.
    """
    if isinstance(contents, str):
        return [contents]

    if isinstance(contents, dict):
        children = contents.values()
    elif isinstance(contents, list):
        children = contents
    else:
        return []

    collected = []
    for child in children:
        collected += extract_text_from_contents(child)
    return collected


# simple weak-supervision keyword rules
# Each label maps to lowercase substrings; "opportunit" is a deliberate stem
# that matches both "opportunity" and "opportunities".
KEYWORDS = {
    "Strength": ["strong", "leading", "advantage", "growth", "robust", "increase in", "strength"],
    "Weakness": ["decline", "risk", "cost", "vulnerable", "loss", "decrease", "weak"],
    "Opportunity": ["opportunit", "potential", "emerging", "expand", "growth opportunity", "could benefit"],
    "Threat": ["competition", "regulation", "lawsuit", "uncertain", "disruptor", "threat", "risk of"]
}


def weak_label(sentence: str):
    """Assign a SWOT label to ``sentence`` using the ``KEYWORDS`` rules.

    Matching is a case-insensitive substring search. When several keywords
    match, the longest one wins, so a specific phrase such as "risk of"
    (Threat) or "growth opportunity" (Opportunity) is not shadowed by a
    shorter keyword it contains ("risk", "growth"). Ties on length are
    resolved in favour of the label listed first in ``KEYWORDS``.

    Args:
        sentence: Sentence to classify.

    Returns:
        The matching label, or ``None`` when no keyword matches or the
        input is not a string.
    """
    if not isinstance(sentence, str):
        return None
    lowered = sentence.lower()
    best_label, best_len = None, 0
    for label, kws in KEYWORDS.items():
        for kw in kws:
            # Strict '>' keeps the earliest label on equal-length matches.
            if len(kw) > best_len and kw in lowered:
                best_label, best_len = label, len(kw)
    return best_label

In [11]:
# ------------------------- MAIN PIPELINE -------------------------

# Known CIK -> ticker mappings, keyed by the CIK with leading zeros stripped.
_CIK_TO_TICKER = {"320193": "AAPL"}

# Common words excluded from key-theme extraction. Only words of 4+ letters
# are listed because the key-phrase regex never yields anything shorter.
_KEY_PHRASE_STOPWORDS = frozenset({
    'that', 'with', 'have', 'this', 'will', 'from', 'they', 'been', 'said',
    'each', 'which', 'their', 'there', 'these', 'those', 'would', 'could',
    'should', 'other', 'such', 'more', 'also', 'your', 'when', 'where', 'what',
})


def _extract_key_phrases(sentences, max_phrases=3):
    """Return the ``max_phrases`` most frequent non-stopword words (4+ letters) in ``sentences``."""
    from collections import Counter  # local import keeps this block self-contained

    text = " ".join(sentences).lower()
    words = re.findall(r'\b[a-z]{4,}\b', text)
    counts = Counter(w for w in words if w not in _KEY_PHRASE_STOPWORDS)
    return [word for word, _ in counts.most_common(max_phrases)]


def _resolve_ticker(meta, cik, tickers):
    """Pick a ticker: filing metadata, then the CIK map, then the sole requested ticker, else 'UNKNOWN'."""
    ticker = meta.get('ticker') or meta.get('symbol')
    if not ticker and cik:
        ticker = _CIK_TO_TICKER.get(str(cik).lstrip('0'))
    if not ticker and tickers and len(tickers) == 1:
        # With a single requested ticker every downloaded filing belongs to it.
        ticker = tickers[0]
    return ticker or 'UNKNOWN'


def _collect_sentences(doc_content):
    """Extract cleaned, length-filtered sentences from a parsed document's content mapping."""
    if not isinstance(doc_content, dict):
        return []

    sentences = []
    for part in doc_content.values():
        # Structured parts are dicts carrying their text under 'contents'.
        if isinstance(part, dict):
            for text in extract_text_from_contents(part.get('contents', {})):
                cleaned = clean_text(text)
                if len(cleaned) >= MIN_SENTENCE_LENGTH:
                    sentences.extend(split_sentences(cleaned))

    # Fallback: no structured text found, so use string values stored directly
    # in the mapping, cleaned the same way as the structured path.
    if not sentences:
        for part in doc_content.values():
            if isinstance(part, str):
                cleaned = clean_text(part)
                if len(cleaned) >= MIN_SENTENCE_LENGTH:
                    sentences.extend(split_sentences(cleaned))
    return sentences


def _build_report(df):
    """Summarise labelled sentences per SWOT label: count, top bullets, key themes, summary line."""
    report = {}
    for lab in candidate_labels:
        # Stable sort: scores are all equal, so ties must keep document order
        # for the "top" bullets to be deterministic.
        lab_df = df[df['label'] == lab].sort_values('score', ascending=False, kind='mergesort')
        all_sentences = lab_df['sentence'].tolist()
        report[lab] = {
            "count": len(lab_df),
            "top_bullets": all_sentences[:3],  # only top 3 sentences
            "key_themes": _extract_key_phrases(all_sentences[:10]),  # themes from top 10
            "summary": f"{len(lab_df)} {lab.lower()} indicators found",
        }
    return report


def analyze_portfolio(tickers=TICKERS, forms=FORMS, date_range=DATE_RANGE, portfolio_dir=PORTFOLIO_DIR, output_dir=OUTPUT_DIR):
    """Download filings, label their sentences with SWOT categories and write reports.

    For every parsed document of each requested form type, this writes
    ``swot_<ticker>_<accession>.csv`` (labelled sentences) and
    ``swot_report_<ticker>_<accession>.json`` (per-label summary) into
    ``output_dir``, then writes ``index.json`` listing all generated files.

    Args:
        tickers: Ticker or list of tickers passed to ``download_submissions``.
        forms: Form type or list of form types to download and analyse.
        date_range: ``(start_date, end_date)`` tuple or ``None``, passed as
            ``filing_date`` to ``download_submissions``.
        portfolio_dir: Working directory handed to datamule's ``Portfolio``.
        output_dir: Directory receiving the CSV/JSON outputs (created if missing).

    Returns:
        List of index entries (metadata plus ``csv``/``json`` paths), one per
        filing for which reports were written.
    """
    # Accept a bare string where a list is expected.
    if isinstance(tickers, str):
        tickers = [tickers]
    if isinstance(forms, str):
        forms = [forms]

    ensure_dir(output_dir)
    # create or reuse portfolio
    print("Initializing Portfolio in:", portfolio_dir)
    port = Portfolio(portfolio_dir)

    # download submissions for tickers
    print("Downloading filings (this can take a while)...")
    try:
        port.download_submissions(filing_date=date_range, submission_type=forms, ticker=tickers)
    except Exception as e:
        # Best effort: continue, the files may already be present locally.
        print("Warning: download_submissions raised:", e)

    # process local submissions
    try:
        port.process_submissions(lambda s: None)
    except Exception as e:
        # Still best effort, but report the failure instead of hiding it.
        print("Warning: process_submissions raised:", e)
    print("Using weak supervision keyword-based classification.")

    # gather documents of every requested form type
    docs = []
    for form in forms:
        form_docs = list(port.document_type(form))
        print(f"Found {len(form_docs)} documents of type {form} in portfolio.")
        docs.extend(form_docs)

    summary_index = []

    for doc in tqdm(docs, desc="Processing filings"):
        try:
            doc.parse()
        except Exception as e:
            # parse may fail for some docs - skip
            print("Warning: parse failed for a document; skipping:", e)
            continue

        data = doc.data if isinstance(doc.data, dict) else {}
        meta = data.get('metadata') or {}
        if not isinstance(meta, dict):
            meta = {}
        doc_content = data.get('document') or {}

        accession = (meta.get('accession_number') or
                     meta.get('accession') or
                     doc.__dict__.get('accession') or
                     getattr(doc, 'accession_number', None) or
                     'unknown')

        cik = (meta.get('cik') or
               getattr(doc, 'cik', None) or
               doc.__dict__.get('cik'))

        ticker = _resolve_ticker(meta, cik, tickers)

        filing_date = (meta.get('filing_date') or
                       meta.get('filed_as_of_date') or
                       getattr(doc, 'filing_date', None))

        sentences = _collect_sentences(doc_content)
        print(f"Extracted {len(sentences)} sentences from accession {accession}")

        if not sentences:
            print("No textual sentences found for this filing - skipping output generation.")
            continue

        # classify sentences; keyword rules carry no confidence, so score is a constant 1.0
        records = []
        for sent in tqdm(sentences, desc="Classifying sentences", leave=False):
            lab = weak_label(sent)
            if lab:
                records.append({"sentence": sent, "label": lab, "score": 1.0})

        df = pd.DataFrame(records)
        if df.empty:
            print("No records after labeling - skipping.")
            continue

        # save per-filing CSV
        out_csv = Path(output_dir) / f"swot_{ticker}_{accession}.csv"
        df.to_csv(out_csv, index=False)

        # compact JSON report: top bullets and key themes per label
        report = _build_report(df)
        report_meta = {"ticker": ticker, "cik": cik, "accession": accession, "filing_date": filing_date}
        out_json = Path(output_dir) / f"swot_report_{ticker}_{accession}.json"
        with open(out_json, 'w', encoding='utf-8') as fh:
            # default=str keeps non-JSON metadata values (e.g. dates) from aborting the write
            json.dump({"meta": report_meta, "report": report}, fh, indent=2, ensure_ascii=False, default=str)

        summary_index.append({**report_meta, "csv": str(out_csv), "json": str(out_json)})

    # master index
    with open(Path(output_dir) / "index.json", 'w', encoding='utf-8') as fh:
        json.dump(summary_index, fh, indent=2, ensure_ascii=False, default=str)

    print("All done. Reports saved to", output_dir)
    return summary_index


# Entry point: run the pipeline with the configured defaults.
if __name__ == '__main__':
    # Create the configured output directory up front; analyze_portfolio()
    # also ensures the directory it is given exists.
    ensure_dir(OUTPUT_DIR)
    analyze_portfolio()

Initializing Portfolio in: sec_portfolio
Loading submissions


Loading regular submissions:   0%|          | 0/2 [00:00<?, ?it/s]

Loading regular submissions: 100%|██████████| 2/2 [00:00<00:00, 106.31it/s]


Successfully loaded 2 submissions
Downloading filings (this can take a while)...



[A


--- Starting query planning phase ---
Analyzing request and splitting into manageable chunks...
Fetching https://efts.sec.gov/LATEST/search-index?ciks=0000320193&forms=10-K%2C-10-K%2FA&startdt=2023-01-01&enddt=2024-12-31&from=0&size=1...
Found 2 total documents to retrieve.
Fetching https://efts.sec.gov/LATEST/search-index?ciks=0000320193&forms=10-K%2C-10-K%2FA&startdt=2023-01-01&enddt=2024-12-31&from=0&size=1...
Planning: Analyzing query: cik=0000320193, forms=10-K,-10-K/A, dates=2023-01-01 to 2024-12-31 [2 hits]
No additional forms to process with negation

--- Starting query phase ---


Querying documents [Rate: 0/s | 0 MB/s]:   0%|          | 0/2 [00:00<?, ?it/s]

Fetching https://efts.sec.gov/LATEST/search-index?ciks=0000320193&forms=10-K%2C-10-K%2FA&startdt=2023-01-01&enddt=2024-12-31&from=0&size=100...


Querying documents [Rate: 3.0/s | 0.01 MB/s]: 100%|██████████| 2/2 [00:00<00:00,  9.38it/s]



--- Query complete: 2 submissions retrieved ---

--- Streaming complete: 2 EFTS results processed ---
Loading submissions


Loading regular submissions: 100%|██████████| 2/2 [00:00<00:00, 102.57it/s]


Successfully loaded 2 submissions


Processing submissions: 100%|██████████| 2/2 [00:00<00:00, 13025.79it/s]


Using weak supervision keyword-based classification.
Found 2 documents of type 10-K in portfolio.


Writing: 0 submissions [00:01, ? submissions/s]<?, ?it/s]
Streaming submissions: 0it [00:00, ?it/s]


Extracted 1184 sentences from accession 000032019323000106


Classifying sentences: 100%|██████████| 1184/1184 [00:00<00:00, 161966.54it/s]
Processing filings:  50%|█████     | 1/2 [00:00<00:00,  1.79it/s]

Extracted 1182 sentences from accession 000032019324000123


Classifying sentences: 100%|██████████| 1182/1182 [00:00<00:00, 149179.05it/s]
Processing filings: 100%|██████████| 2/2 [00:00<00:00,  2.00it/s]

All done. Reports saved to sec_swot_output



