In [1]:
import itertools

from theia.data.client import DataPipelineClient
client = DataPipelineClient(storage_type="local_read_through_to_s3", local_dir="output")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
sp_all_filings_linked_company_ids = client.load_tabular_dataset("sp_all_filings_linked_company_ids").get_dataframe()

In [3]:
sp_bmi_company_ids_all_years = client.load_tabular_dataset("sp_bmi_company_ids_all_years").get_dataframe()

In [4]:
bmi_company_ids = set(sp_bmi_company_ids_all_years["company_id"].unique())

In [5]:
# Map IDs to company names for lookup
id_to_name = dict(zip(sp_bmi_company_ids_all_years["company_id"], sp_bmi_company_ids_all_years["company_name"]))

In [6]:
bmi_sample_df = sp_all_filings_linked_company_ids[
    sp_all_filings_linked_company_ids["company_ids"].isin(bmi_company_ids)
]

In [7]:
bmi_sample_df["texts"] = bmi_sample_df["doc"].apply(lambda x: x.read_text())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bmi_sample_df["texts"] = bmi_sample_df["doc"].apply(lambda x: x.read_text())


In [8]:
import pandas as pd
import json
from llama_index.core import Document

In [9]:
def parse_df_to_documents_with_company(df, id_map):
    documents = []
    for _, row in df.iterrows():
        raw_ids = row.get("company_ids", [])
        c_ids = [str(i) for i in raw_ids] if isinstance(raw_ids, list) else [str(raw_ids)]

        primary_id = c_ids[0] if c_ids else "Unknown"

        # Metadata from the dataframe columns
        base_metadata = {
            "company_id": primary_id,
            "all_company_ids": ",".join(c_ids),
            "company_name": id_map.get(int(primary_id), "Unknown") if primary_id.isdigit() else "Unknown",
            "filing_id": row.get("filing_id"),
            "filing_date": str(row.get("filing_date")),
            "filing_type": row.get("filing_type"),
            "accession_number": row.get("accession_number"),
        }

        # if isinstance(c_ids, list):
        #     names = [id_map.get(i, "Unknown") for i in c_ids]
        #     base_metadata["company_id"] = ",".join(map(str, c_ids))
        #     base_metadata["company_name"] = ",".join(names)
        # else:
        #     base_metadata["company_id"] = str(c_ids)
        #     base_metadata["company_name"] = id_map.get(c_ids, "Unknown")
            
        json_content = row.get("texts", "")
        if not json_content:
            continue

        for line in json_content.strip().splitlines():
            try:
                section_data = json.loads(line)

                text_content = section_data.get("section_text")
                if not text_content or len(text_content.strip()) < 10:
                    continue

                # Combine base metadata with section-specific metadata
                full_metadata = {
                    **base_metadata,
                    "heading": section_data.get("heading"),
                    "standardized_heading": section_data.get("standardized_heading"),
                    "section_id": section_data.get("heading_id")
                }

                doc = Document(
                    text=text_content,
                    metadata=full_metadata,
                    excluded_llm_metadata_keys=["filing_id", "accession_number"] # Hide internal IDs from LLM
                )
                documents.append(doc)
            except:
                continue
    
    return documents

In [10]:
all_docs = parse_df_to_documents_with_company(bmi_sample_df, id_to_name)

In [11]:
from llama_index.core import VectorStoreIndex

In [12]:
import random

In [13]:
# Use an arbitrary random sample for now
sample = random.sample(all_docs, 100)

In [17]:
vector_index = VectorStoreIndex.from_documents(sample, show_progress=True)

Parsing nodes: 100%|██████████| 100/100 [00:00<00:00, 1315.50it/s]
Generating embeddings: 100%|██████████| 170/170 [00:03<00:00, 43.26it/s]


In [18]:
from llama_index.core.workflow import Event, Workflow, StartEvent, StopEvent, step
from pydantic import BaseModel, Field

In [19]:
class GateDecision(BaseModel):
    is_business_activity: bool
    confidence: float = Field(description="Confidence score between 0 and 1")
    reasoning: str

In [20]:
class VectorPassEvent(Event):
    """Signals a section passed the vector threshold."""
    doc: any



In [21]:
class LLMPassEvent(Event):
    """Signals a section passed the LLM reasoning gate."""
    doc: any

In [22]:
from llama_index.core.program import LLMTextCompletionProgram
from llama_index.llms.openai import OpenAI

In [None]:
class BusinessActivityGate(Workflow):
    def __init__(self, index, vector_threshold=0.7, llm_threshold=0.6, **kwargs):
        super().__init__(**kwargs)
        self.index = index
        self.vector_threshold = vector_threshold
        self.llm_threshold = llm_threshold
        self.llm = OpenAI(model="gpt-4o-mini")

        # Conceptual anchors to avoid literal keyword matching
        self.anchors = [
            "Revenue-generating products, commercial services, and market offerings.",
            "Proprietary technology, R&D initiatives, and intellectual property assets.",
            "Operational infrastructure, manufacturing processes, and supply chain details."
        ]
    
    @step
    async def vector_pre_filter(self, ev: StartEvent) -> VectorPassEvent | StopEvent:
        doc = ev.doc 
        retriever = self.index.as_retriever(similarity_top_k=2)

        # Check semantic similarity to conceptual anchors
        scores = []
        for anchor in self.anchors:
            results = await retriever.aretrieve(f"{anchor} Context: {doc.text[:300]}")
            if results:
                scores.append(results[0].score)
        
        max_score = max(scores) if scores else 0

        if max_score >= self.vector_threshold:
            return VectorPassEvent(doc=doc)
        
        return StopEvent(result=None)
    
    @step
    async def llm_reasoning_gate(self, ev: VectorPassEvent) -> StopEvent:
        program = LLMTextCompletionProgram.from_defaults(
            output_cls=GateDecision,
            llm=self.llm,
            prompt_template_str="Does this describe actual business activities (not legal boilerplate or generic terms like dividends, taxes and revenue)? Context: {context_str}"
        )

        decision = await program.acall(context_str=ev.doc.text)

        if decision.is_business_activity and decision.confidence >= self.llm_threshold:
            ev.doc.metadata["gate_reasoning"] = decision.reasoning
            return StopEvent(result=ev.doc)
        
        return StopEvent(result=None)
            

In [24]:
workflow = BusinessActivityGate(index=vector_index, timeout=60)

gated_docs = []
for doc in sample:
    result = await workflow.run(doc=doc)
    if result:
        gated_docs.append(result)

In [25]:
print(f"Passed Gate: {len(gated_docs)} / {len(sample)}")

Passed Gate: 68 / 100


In [26]:
from typing import List, Optional

In [27]:
class BusinessInformation(BaseModel):
    products_offered: List[str] = Field(description="Products or services the company offers, builds or operates.")
    tech_owned: List[str] = Field(description="Technologies, services or IP the company develops or owns.")
    tech_used: List[str] = Field(description="Technologies used or relied upon but NOT owned.")
    discontinued_items: List[str] = Field(description="Exact snippets from the text supporting these findings.")

In [28]:
from llama_index.core.indices.property_graph import SchemaLLMPathExtractor, ImplicitPathExtractor
from llama_index.core import PropertyGraphIndex
from llama_index.llms.openai import OpenAI

In [29]:
BaseModel.model_config["arbitrary_types_allowed"] = True

In [30]:
# Ontology definition (minimal)
entities = ["COMPANY", "PRODUCT", "TECHNOLOGY", "IP", "MARKET"]
relations = ["OFFERS", "OWNS", "DEPENDS_ON", "SUNSETS", "DEVELOPED_BY"]
validation_schema = [
    ("COMPANY", "OFFERS", "PRODUCT"),
    ("COMPANY", "OWNS", "TECHNOLOGY"),
    ("COMPANY", "DEPENDS_ON", "TECHNOLOGY"),
    ("COMPANY", "SUNSETS", "PRODUCT")
]

In [33]:
from llama_index.core.prompts import PromptTemplate

In [35]:
# Update ontology prompt with negative constraints
extraction_template = (
    "Extract business activities and their relationships from the following text: {text}\n"
    "CRITICAL: Do not extract generic financial line items, accounting terms, or boilerplate.\n"
    "IGNORE: dividends, tax, interest rates, shares, cash, revenue, profit, or legal proceedings.\n"
    "Focus only on tangible products, specific software/hardware, and proprietary technology."
)
extract_prompt = PromptTemplate(extraction_template)

In [43]:
kg_extractor = SchemaLLMPathExtractor(
    possible_entities=entities,
    possible_relations=relations,
    kg_validation_schema=validation_schema,
    llm=OpenAI(model="gpt-4o"),
    extract_prompt=extract_prompt,
    strict=False
)

In [47]:
import nest_asyncio
nest_asyncio.apply()

In [48]:
index = PropertyGraphIndex(
    [],
    kg_extractors=[kg_extractor],
    llm=OpenAI(model="gpt-4o-mini")
)

In [41]:
import time

In [50]:
# Add documents in small batches of 5
batch_size = 5
for i in range(0, len(gated_docs), batch_size):
    batch = gated_docs[i : i + batch_size]
    print(f"Processing batch {i//batch_size + 1}...")

    # Insert nodes batch by batch
    for doc in batch:
        index.insert(doc)
    
    # Sleep for 10-20 seconds to let TPM reset
    print("Sleeping to avoid rate limits...")
    time.sleep(20)

index.storage_context.persist(persist_dir="./storage_graph")

Processing batch 1...
Sleeping to avoid rate limits...
Processing batch 2...
Sleeping to avoid rate limits...
Processing batch 3...
Sleeping to avoid rate limits...
Processing batch 4...
Sleeping to avoid rate limits...
Processing batch 5...
Sleeping to avoid rate limits...
Processing batch 6...
Sleeping to avoid rate limits...
Processing batch 7...


Retrying llama_index.llms.openai.base.OpenAI._achat in 1.0 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-obaYMkTa6nX9qsK3ab81ttFU on tokens per min (TPM): Limit 30000, Used 29198, Requested 1237. Please try again in 870ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}.
Retrying llama_index.llms.openai.base.OpenAI._achat in 1.0 seconds as it raised RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o in organization org-obaYMkTa6nX9qsK3ab81ttFU on tokens per min (TPM): Limit 30000, Used 29904, Requested 1190. Please try again in 2.188s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}.
Retrying llama_index.llms.openai.base.OpenAI._achat in 1.0 seconds as it raised RateLimitError: Error code: 429 - {

Sleeping to avoid rate limits...
Processing batch 8...
Sleeping to avoid rate limits...
Processing batch 9...
Sleeping to avoid rate limits...
Processing batch 10...
Sleeping to avoid rate limits...
Processing batch 11...
Sleeping to avoid rate limits...
Processing batch 12...
Sleeping to avoid rate limits...
Processing batch 13...
Sleeping to avoid rate limits...
Processing batch 14...
Sleeping to avoid rate limits...


In [71]:
from llama_index.core import StorageContext, load_index_from_storage

storage_context = StorageContext.from_defaults(persist_dir="storage_graph/")
index = load_index_from_storage(storage_context)

In [72]:
unique_ids = set([doc.metadata["company_id"] for doc in gated_docs])

In [105]:
from collections import defaultdict
from llama_index.core.graph_stores.types import Relation, EntityNode

In [106]:
def prune_global_nodes(index, threshold=0.3):

    # Get all the nodes from the graph store
    all_nodes = index.property_graph_store.get()

    # Map objects to the companies the mention them
    node_to_companies = defaultdict(set)

    # Identify relations and track their targets
    for item in all_nodes:
        if isinstance(item, Relation):
            subj = item.source_id
            obj = item.target_id

            node_to_companies[obj].add(subj)

    total_companies = len(unique_ids)
    nodes_to_remove = []

    print(f"Analyzing graph with {total_companies} unique companies...")

    # Identify the "global noise"
    for node_name, company_set in node_to_companies.items():
        freq = len(company_set) / total_companies
        if freq > threshold:
            print(f"Flagging '{node_name}' - found in {len(company_set)} companies ({freq:.1%})")
            nodes_to_remove.append(node_name)
    
    # Prune the nodes
    if nodes_to_remove:
        index.property_graph_store.delete_nodes(nodes_to_remove)
        print(f"Successfully pruned {len(nodes_to_remove)} nodes.")
    else:
        print("No nodes exceeded the noise threshold.")

In [79]:
prune_global_nodes(index)

Analyzing graph with 3 unique companies...
No nodes exceeded the noise threshold.


In [82]:
from llama_index.core.vector_stores import MetadataFilter, MetadataFilters
from llama_index.core.indices.property_graph import VectorContextRetriever

In [96]:
def get_engine_for_company(index, company_id):
    filters = MetadataFilters(filters=[
        MetadataFilter(key="company_id", value=str(company_id))
    ])

    retriever = VectorContextRetriever(
        index.property_graph_store,
        vector_store=index.vector_store,
        filters=filters,
        similarity_top_k=2
    )

    return index.as_query_engine(sub_retrievers=[retriever], include_text=True, similarity_top_k=3)

In [84]:
# Core questions (pink)
analyst_questions = [
    "What products or services does this company offer, build, or operate?",
    "What technologies, services, and intellectual property does the company develop or own?",
    "What technologies does the company use / rely on but not develop or own?",
    "Are any products or services being sunset, discontinued, or deprioritised?"
]

In [89]:
from llama_index.core.evaluation import FaithfulnessEvaluator, RelevancyEvaluator

In [97]:
faithfulness_tmpl = (
    "You are an expert financial analyst auditor. Your task is to evaluate if a given answer "
    "is supported by the provided context.\n"
    "Context: {context_str}\n"
    "Query: {query_str}\n"
    "Answer: {generated_answer}\n\n"
    "Step 1: Breakdown the answer into individual claims.\n"
    "Step 2: For each claim, identify the specific line in the context that supports or contradicts it.\n"
    "Step 3: Provide a 'Feedback' section with this analysis.\n"
    "Step 4: End with [RESULT] followed by YES or NO.\n\n"
    "Feedback: "
)

In [98]:
faithfulness_evaluator = FaithfulnessEvaluator(llm=OpenAI(model="gpt-4o"), eval_template=PromptTemplate(faithfulness_tmpl))
relevancy_evaluator = RelevancyEvaluator(llm=OpenAI(model="gpt-4"))

In [99]:
results_report = []

for c_id in unique_ids:
    print(f"Analysing Company ID: {c_id}...")

    engine = get_engine_for_company(index, c_id)

    company_data = {"company_id": c_id}

    for q in analyst_questions:
        response = engine.query(q)

        faithfulness = faithfulness_evaluator.evaluate_response(query=q, response=response)
        relevancy = relevancy_evaluator.evaluate_response(query=q, response=response)

        # Capture the context used for the evaluation
        retrieved_sources = [node.node.get_content() for node in response.source_nodes]
        
        # Store response, faithfulness and relevancy
        company_data[q] = {
            "answer": str(response),
            "faithfulness": {
                "passing": faithfulness.passing,
                "score": faithfulness.score,
                "reasoning": faithfulness.feedback,
                "query_tracked": q
            },
            "sources_used": retrieved_sources[:3]  # Top 3 sources used
        }
    
    results_report.append(company_data)

Analysing Company ID: 317486...
Analysing Company ID: 874790...
Analysing Company ID: 262144...


In [100]:
results_report

[{'company_id': '317486',
  'What products or services does this company offer, build, or operate?': {'answer': 'The company offers scalable and cost-effective information technology infrastructure and owns a corporate office and distribution center facility.',
   'faithfulness': {'passing': True,
    'score': 1.0,
    'reasoning': 'Step 1: Breakdown the answer into individual claims.\n- Claim 1: The company offers scalable and cost-effective information technology infrastructure.\n- Claim 2: The company owns a corporate office and distribution center facility.\n\nStep 2: For each claim, identify the specific line in the context that supports or contradicts it.\n- Claim 1: Supported by the line "Christopher & Banks Corporation -> DEPENDS_ON -> Scalable and Cost Effective Information Technology Infrastructure."\n- Claim 2: Supported by the line "Christopher & Banks Corporation -> OWNS -> Corporate Office and Distribution Center Facility."\n\nStep 3: Provide a \'Feedback\' section with t