In [94]:
import itertools

from theia.data.client import DataPipelineClient
client = DataPipelineClient(storage_type="local_read_through_to_s3", local_dir="output")

In [195]:
sp_all_filings_linked_company_ids = client.load_tabular_dataset("sp_all_filings_linked_company_ids").get_dataframe()

In [196]:
sp_bmi_company_ids_all_years = client.load_tabular_dataset("sp_bmi_company_ids_all_years").get_dataframe()

In [197]:
bmi_company_ids = set(sp_bmi_company_ids_all_years["company_id"].unique())

In [None]:
# Map IDs to company names for lookup
id_to_name = dict(zip(sp_bmi_company_ids_all_years["company_id"], sp_bmi_company_ids_all_years["company_name"]))

In [200]:
bmi_sample_df = sp_all_filings_linked_company_ids[
    sp_all_filings_linked_company_ids["company_ids"].isin(bmi_company_ids)
]

In [202]:
bmi_sample_df["texts"] = bmi_sample_df["doc"].apply(lambda x: x.read_text())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bmi_sample_df["texts"] = bmi_sample_df["doc"].apply(lambda x: x.read_text())


In [None]:
import pandas as pd
import json
from llama_index.core import Document

In [213]:
def parse_df_to_documents_with_company(df, id_map):
    documents = []
    for _, row in df.iterrows():
        raw_ids = row.get("company_ids", [])
        c_ids = [str(i) for i in raw_ids] if isinstance(raw_ids, list) else [str(raw_ids)]

        primary_id = c_ids[0] if c_ids else "Unknown"

        # Metadata from the dataframe columns
        base_metadata = {
            "company_id": primary_id,
            "all_company_ids": ",".join(c_ids),
            "company_name": id_map.get(int(primary_id), "Unknown") if primary_id.isdigit() else "Unknown",
            "filing_id": row.get("filing_id"),
            "filing_date": str(row.get("filing_date")),
            "filing_type": row.get("filing_type"),
            "accession_number": row.get("accession_number"),
        }

        # if isinstance(c_ids, list):
        #     names = [id_map.get(i, "Unknown") for i in c_ids]
        #     base_metadata["company_id"] = ",".join(map(str, c_ids))
        #     base_metadata["company_name"] = ",".join(names)
        # else:
        #     base_metadata["company_id"] = str(c_ids)
        #     base_metadata["company_name"] = id_map.get(c_ids, "Unknown")
            
        json_content = row.get("texts", "")
        if not json_content:
            continue

        for line in json_content.strip().splitlines():
            try:
                section_data = json.loads(line)

                text_content = section_data.get("section_text")
                if not text_content or len(text_content.strip()) < 10:
                    continue

                # Combine base metadata with section-specific metadata
                full_metadata = {
                    **base_metadata,
                    "heading": section_data.get("heading"),
                    "standardized_heading": section_data.get("standardized_heading"),
                    "section_id": section_data.get("heading_id")
                }

                doc = Document(
                    text=text_content,
                    metadata=full_metadata,
                    excluded_llm_metadata_keys=["filing_id", "accession_number"] # Hide internal IDs from LLM
                )
                documents.append(doc)
            except:
                continue
    
    return documents

In [214]:
all_docs = parse_df_to_documents_with_company(bmi_sample_df, id_to_name)

In [220]:
from llama_index.core import VectorStoreIndex

In [222]:
import random

In [None]:
# Use an arbitrary random sample for now
sample = random.sample(all_docs, 100)

In [224]:
vector_index = VectorStoreIndex.from_documents(sample, show_progress=True)

Parsing nodes: 100%|██████████| 100/100 [00:00<00:00, 2087.28it/s]
Generating embeddings: 100%|██████████| 128/128 [00:02<00:00, 63.83it/s]


In [119]:
from llama_index.core.workflow import Event, Workflow, StartEvent, StopEvent, step
from pydantic import BaseModel, Field

In [228]:
class GateDecision(BaseModel):
    is_business_activity: bool
    confidence: float = Field(description="Confidence score between 0 and 1")
    reasoning: str

In [229]:
class VectorPassEvent(Event):
    """Signals a section passed the vector threshold."""
    doc: any



In [122]:
class LLMPassEvent(Event):
    """Signals a section passed the LLM reasoning gate."""
    doc: any



In [123]:
from llama_index.core.program import LLMTextCompletionProgram
from llama_index.llms.openai import OpenAI

In [230]:
class BusinessActivityGate(Workflow):
    def __init__(self, index, vector_threshold=0.7, llm_threshold=0.6, **kwargs):
        super().__init__(**kwargs)
        self.index = index
        self.vector_threshold = vector_threshold
        self.llm_threshold = llm_threshold
        self.llm = OpenAI(model="gpt-4o-mini")
    
    @step
    async def vector_pre_filter(self, ev: StartEvent) -> VectorPassEvent | StopEvent:
        doc = ev.doc 
        retriever = self.index.as_retriever(similarity_top_k=1)

        # Check semantic similarity to 'business activity' concept
        results = await retriever.aretrieve(f"Actual business activities and revenue: {doc.text[:300]}")

        if results and results[0].score >= self.vector_threshold:
            return VectorPassEvent(doc=doc)
    
        return StopEvent(result=None)
    
    @step
    async def llm_reasoning_gate(self, ev: VectorPassEvent) -> StopEvent:
        program = LLMTextCompletionProgram.from_defaults(
            output_cls=GateDecision,
            llm=self.llm,
            prompt_template_str="Does this describe actual business activities (not boilerplate)? Context: {context_str}"
        )

        decision = await program.acall(context_str=ev.doc.text)

        if decision.is_business_activity and decision.confidence >= self.llm_threshold:
            ev.doc.metadata["gate_reasoning"] = decision.reasoning
            return StopEvent(result=ev.doc)
        
        return StopEvent(result=None)
            

In [234]:
workflow = BusinessActivityGate(index=vector_index, timeout=60)

gated_docs = []
for doc in sample:
    result = await workflow.run(doc=doc)
    if result:
        gated_docs.append(result)

In [235]:
print(f"Passed Gate: {len(gated_docs)} / {len(sample)}")

Passed Gate: 73 / 100


In [155]:
from typing import List, Optional

In [156]:
class BusinessInformation(BaseModel):
    products_offered: List[str] = Field(description="Products or services the company offers, builds or operates.")
    tech_owned: List[str] = Field(description="Technologies, services or IP the company develops or owns.")
    tech_used: List[str] = Field(description="Technologies used or relied upon but NOT owned.")
    discontinued_items: List[str] = Field(description="Exact snippets from the text supporting these findings.")

In [236]:
from llama_index.core.indices.property_graph import SchemaLLMPathExtractor, ImplicitPathExtractor
from llama_index.core import PropertyGraphIndex
from llama_index.llms.openai import OpenAI

In [238]:
BaseModel.model_config["arbitrary_types_allowed"] = True

In [None]:
# Ontology definition (minimal)
entities = ["COMPANY", "PRODUCT", "TECHNOLOGY", "IP", "MARKET"]
relations = ["OFFERS", "OWNS", "DEPENDS_ON", "SUNSETS", "DEVELOPED_BY"]
validation_schema = [
    ("COMPANY", "OFFERS", "PRODUCT"),
    ("COMPANY", "OWNS", "TECHNOLOGY"),
    ("COMPANY", "DEPENDS_ON", "TECHNOLOGY"),
    ("COMPANY", "SUNSETS", "PRODUCT")
]

In [240]:
kg_extractor = SchemaLLMPathExtractor(
    possible_entities=entities,
    possible_relations=relations,
    kg_validation_schema=validation_schema,
    llm=OpenAI(model="gpt-4o"),
    strict=False
)

In [241]:
import nest_asyncio
nest_asyncio.apply()

In [243]:
index = PropertyGraphIndex(
    [],
    kg_extractors=[kg_extractor],
    llm=OpenAI(model="gpt-4o-mini")
)

In [244]:
import time

In [250]:
# Add documents in small batches of 5
batch_size = 5
for i in range(0, len(gated_docs), batch_size):
    batch = gated_docs[i : i + batch_size]
    print(f"Processing batch {i//batch_size + 1}...")

    # Insert nodes batch by batch
    for doc in batch:
        index.insert(doc)
    
    # Sleep for 10-20 seconds to let TPM reset
    print("Sleeping to avoid rate limits...")
    time.sleep(15)

index.storage_context.persist(persist_dir="./storage_graph")

Processing batch 1...
Sleeping to avoid rate limits...
Processing batch 2...
Sleeping to avoid rate limits...
Processing batch 3...
Sleeping to avoid rate limits...
Processing batch 4...
Sleeping to avoid rate limits...
Processing batch 5...
Sleeping to avoid rate limits...
Processing batch 6...
Sleeping to avoid rate limits...
Processing batch 7...
Sleeping to avoid rate limits...
Processing batch 8...
Sleeping to avoid rate limits...
Processing batch 9...
Sleeping to avoid rate limits...
Processing batch 10...
Sleeping to avoid rate limits...
Processing batch 11...
Sleeping to avoid rate limits...
Processing batch 12...
Sleeping to avoid rate limits...
Processing batch 13...
Sleeping to avoid rate limits...
Processing batch 14...
Sleeping to avoid rate limits...
Processing batch 15...
Sleeping to avoid rate limits...


In [None]:
# With no company filter
query_engine = index.as_query_engine(
    include_text=True,
    similarity_top_k=3
)

In [253]:
from llama_index.core import StorageContext, load_index_from_storage

storage_context = StorageContext.from_defaults(persist_dir="storage_graph/")
index = load_index_from_storage(storage_context)

In [None]:
# Built-in visual representation of KG to check it is being generated as expected
index.property_graph_store.save_networkx_graph(name="./kg_visual.html")

In [260]:
from llama_index.core.vector_stores import MetadataFilter, MetadataFilters
from llama_index.core.indices.property_graph import VectorContextRetriever

In [261]:
def get_engine_for_company(index, company_id):
    filters = MetadataFilters(filters=[
        MetadataFilter(key="company_id", value=str(company_id))
    ])

    retriever = VectorContextRetriever(
        index.property_graph_store,
        vector_store=index.vector_store,
        filters=filters
    )

    return index.as_query_engine(sub_retrievers=[retriever])

In [None]:
# Core questions (pink)
analyst_questions = [
    "What products or services does this company offer, build, or operate?",
    "What technologies, services, and intellectual property does the company develop or own?",
    "What technologies does the company use / rely on but not develop or own?",
    "Are any products or services being sunset, discontinued, or deprioritised?"
]

In [263]:
unique_ids = set([doc.metadata["company_id"] for doc in gated_docs])

In [292]:
from llama_index.core.evaluation import FaithfulnessEvaluator

In [293]:
faithfulness_evaluator = FaithfulnessEvaluator(llm=OpenAI(model="gpt-4o"))

In [294]:
results_report = []

for c_id in unique_ids:
    print(f"Analysing Company ID: {c_id}...")

    engine = get_engine_for_company(index, c_id)

    company_data = {"company_id": c_id}

    for q in analyst_questions:
        response = engine.query(q)

        faithfulness = faithfulness_evaluator.evaluate_response(response=response)
        
        # Store response and faithfulness
        company_data[q] = str(response)
        company_data["faithfulness_score"] = faithfulness
    
    results_report.append(company_data)

Analysing Company ID: 317486...
Analysing Company ID: 262144...
Analysing Company ID: 874790...


In [295]:
results_report

[{'company_id': '317486',
  'What products or services does this company offer, build, or operate?': 'The company offers business operations, develops information technology infrastructure, and depends on business operations.',
  'faithfulness_score': EvaluationResult(query=None, contexts=['Here are some facts extracted from the provided text:\n\nChristopher & Banks Corporation -> SUNSETS -> 37 store closures in fiscal 2011\nChristopher & Banks Corporation -> SUNSETS -> stores closed in fiscal 2012\nChristopher & Banks Corporation -> SUNSETS -> stores closed in fourth quarter\nChristopher & Banks Corporation -> DEPENDS_ON -> consumer spending on durable goods\n\nThe Company is subject, from time to time, to various claims, lawsuits or actions that arise in the ordinary course of business. Although the amount of any liability that could arise with respect to any current proceedings cannot, in our opinion, be accurately predicted, any such liability is not expected to have a material adv

In [268]:
from llama_index.core.prompts import PromptTemplate

In [278]:
# Take a sample of documents that were gated (accepted)
eval_sample = gated_docs[:10]

llm_judge = OpenAI(model="gpt-4o")

In [None]:
gate_results = []
for doc in eval_sample:
    text = doc.text[:500]
    judge_response = llm_judge.complete(
        
        "Analyze the following text from an S&P filing. ",
        "Does this text contain specific information about business products, services, ",
        f"or operational technologies? Answer only 'YES' or 'NO'.\n\nText: {text}"
        
    ).text.strip()
    gate_results.append({
        "doc_id": doc.id_,
        "workflow_decision": "GATED",
        "judge_decision": judge_response
    })

In [None]:
# text = gated_docs[3].text

In [None]:
# llm_judge.complete(
#     f"""Analyze the following text from an S&P filing. 
#         Does this describe actual business activities?
#         Ignore generic boilerplate and T&Cs. Answer only 'YES' or 'NO'.\n\nText: {text}"""
# )

CompletionResponse(text='NO', additional_kwargs={'prompt_tokens': 49, 'completion_tokens': 1, 'total_tokens': 50}, raw=ChatCompletion(id='chatcmpl-Cr1h4AvsCXu7BPK2OnY3MvoglWnhM', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='NO', refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None))], created=1766753930, model='gpt-4o-2024-08-06', object='chat.completion', service_tier='default', system_fingerprint='fp_deacdd5f6f', usage=CompletionUsage(completion_tokens=1, prompt_tokens=49, total_tokens=50, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0))), logprobs=None, delta=None)