# Download raw data

In [None]:
%load_ext autoreload
%autoreload 2

# Import necessary modules
import sys
from pathlib import Path

# Add the project root to Python path so we can import our modules
project_root = Path.cwd().parent
sys.path.append(str(project_root))


from knowledge_base.src.ingestion.sec_downloader import SECDownloader

# Create our downloader instance
downloader = SECDownloader()

# Test the connection by getting basic company info
company_info = downloader.get_company_info('MSTR')
print("Company Info for MSTR:")
print(company_info)

In [None]:
# Import required modules
import json
from pathlib import Path

# Download and process filings
filings = downloader.download_company_filings(
    ticker='RDDT',
    filing_types=['10-K'],  # Just annual reports
    num_filings=2  # Get the most recent filings
)

print("\nDownloaded Filing Metadata:")
for filing in filings:
    print(f"\nFiling Type: {filing.get('type')}")
    print(f"Filing Date: {filing.get('period_of_report', 'N/A')}")
    print(f"Accession Number: {filing.get('accession_number', 'N/A')}")
    print(f"File Path: {filing.get('file_path')}")
    
    # Get metadata file path
    doc_dir = Path(filing['file_path']).parent
    metadata_path = doc_dir / "metadata.json"
    
    # Read and display the saved metadata
    if metadata_path.exists():
        with open(metadata_path, 'r') as f:
            metadata = json.load(f)
            print("\nStored Metadata:")
            print(json.dumps(metadata, indent=2))


# SQL Database

## Processing for SQL Database data

In [25]:

TEST_TICKER = 'RDDT'

%load_ext autoreload
%autoreload 2

from datetime import datetime
from pathlib import Path
import sys

# Add the project root to Python path
project_root = Path.cwd().parent
sys.path.append(str(project_root))

from knowledge_base.src.ingestion.sec_downloader import SECDownloader
from knowledge_base.src.ingestion.sec_sql_extractor import SECDataExtractor
from knowledge_base.src.storage.sql_manager import FinancialMetricsManager


# Force reload the updated module
import importlib
import knowledge_base.src.ingestion.sql_extractor
importlib.reload(knowledge_base.src.ingestion.sec_downloader)
importlib.reload(knowledge_base.src.ingestion.sql_extractor)
importlib.reload(knowledge_base.src.storage.sql_manager)

import os
print(f"Current working directory: {os.getcwd()}")
print(f"Project root: {Path.cwd().parent}")

# Then import the extractor
from knowledge_base.src.ingestion.sql_extractor import SECDataExtractor

# Initialize components
downloader = SECDownloader()
extractor = SECDataExtractor()
sql_manager = FinancialMetricsManager()

# Step 1: Download SEC filing
filings = downloader.download_company_filings(
    ticker=TEST_TICKER,
    filing_types=['10-K'],
    num_filings=5
)

print("len(filings)", len(filings))

# Step 2: Process the filing
if filings:

    print("if filings called")
    
    filing = filings[0]
    print("filing", filing)
    file_dir = Path(filing['file_path']).parent
    
    # Look for XBRL or HTML version
    xbrl_file = next(file_dir.glob("*.xml"), None)
    html_file = next(file_dir.glob("*.htm*"), None)


    # Add client to the database before adding the document
    client_data = {
        "id": filing['ticker'],
        "company_name": filing.get('company_name', ''),
        "cik": filing.get('cik', ''),
        "industry": "",  # Fill if available
        "sector": "",    # Fill if available
        "market_cap": None  # Fill if available
    }
    sql_manager.sql_store.add_client(client_data)
    
    # Create document in SQL store first
    doc_data = {
        "document_id": filing['accession_number'],
        "client_id": filing['ticker'],
        "filing_type": filing['type'],
        "filing_date": filing['period_of_report'],
        "file_path": filing['file_path'],
        "file_size": filing['file_size'],
        "download_date": datetime.fromisoformat(filing['downloaded_at']),
        "has_revenue_data": filing['has_revenue_data'],
        "has_profit_data": filing['has_profit_data'],
        "has_balance_sheet": filing['has_balance_sheet'],
        "has_cash_flow": filing['has_cash_flow']
    }
    
    # Add document to get SQL document_id
    document_id = sql_manager.sql_store.add_document(doc_data)
    
    if document_id:

        print("IF document_id CALLED")

        # Try XBRL first, then HTML, then full submission
        if xbrl_file:
            print(f"Processing XBRL file: {xbrl_file}")
            metrics = extractor.process_document(str(xbrl_file), TEST_TICKER)
        elif html_file:
            print(f"Processing HTML file: {html_file}")
            metrics = extractor.process_document(str(html_file), TEST_TICKER)
        else:
            print(f"Processing full submission: {filing['file_path']}")
            metrics = extractor.process_document(filing['file_path'], TEST_TICKER)
            
        print(f"Extracted {len(metrics)} metrics")
        if metrics:
            sql_manager.save_extracted_metrics(metrics, document_id)
            print("Metrics saved to database")
            print("metrics", metrics)

        # Validate the extractions
        validation = sql_manager.validate_client_metrics(TEST_TICKER, 2024)
        print("\nValidation results:", validation)

        # Get comparative metrics
        comparative = sql_manager.get_comparative_metrics(
            ["AAPL", "MSFT", "GOOGL"],
            ["revenue", "net_income"],
            2024
        )
        print("\nComparative metrics:", comparative)


2025-08-06 16:29:27,417 - pyrate_limiter - INFO - Initializing default bucket(InMemoryBucket) with rates: [limit=10/1000]


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Current working directory: /Users/daylight/Desktop/Financial Insight AI/notebooks
Project root: /Users/daylight/Desktop/Financial Insight AI


2025-08-06 16:29:30,101 - knowledge_base.src.storage.sql_store - INFO - Created SQL engine: sqlite:////Users/daylight/Desktop/Financial Insight AI/knowledge_base/data/financial_kb.db
2025-08-06 16:29:30,104 - knowledge_base.src.storage.sql_store - INFO - Created SQL tables
2025-08-06 16:29:30,280 - knowledge_base.src.ingestion.sec_downloader - INFO - Downloading 10-K filings for RDDT


RDDT ['10-K'] 5


2025-08-06 16:29:30,600 - knowledge_base.src.ingestion.sec_downloader - INFO - Company fiscal year end: 20XX-12-31 (from code: 1231)
2025-08-06 16:29:30,602 - knowledge_base.src.ingestion.sec_downloader - INFO - Available fields in filings: ['accessionNumber', 'filingDate', 'reportDate', 'acceptanceDateTime', 'act', 'form', 'fileNumber', 'filmNumber', 'items', 'core_type', 'size', 'isXBRL', 'isInlineXBRL', 'primaryDocument', 'primaryDocDescription']
2025-08-06 16:29:30,603 - knowledge_base.src.ingestion.sec_downloader - INFO - form (first 3 entries): ['SCHEDULE 13G/A', '144', '10-Q']
2025-08-06 16:29:30,603 - knowledge_base.src.ingestion.sec_downloader - INFO - reportDate (first 3 entries): ['', '', '2025-06-30']
2025-08-06 16:29:30,604 - knowledge_base.src.ingestion.sec_downloader - INFO - accessionNumber (first 3 entries): ['0000315066-25-001946', '0001950047-25-005438', '0001713445-25-000196']


KeyboardInterrupt: 

## Quick test for SQL DB

In [None]:
%load_ext autoreload
%autoreload 2

# Simple SQL Database Test
import sqlite3
import pandas as pd
from pathlib import Path

# Import necessary modules
import sys
from pathlib import Path

# # Add the project root to Python path so we can import our modules
project_root = Path.cwd().parent
sys.path.append(str(project_root))

# Force reload the updated module
import importlib
import knowledge_base.src.ingestion.sql_extractor
importlib.reload(knowledge_base.src.ingestion.sec_downloader)
importlib.reload(knowledge_base.src.ingestion.sql_extractor)
importlib.reload(knowledge_base.src.storage.sql_manager)

# # Database path
db_path = Path("../knowledge_base/data/financial_kb.db")

print("\n" + "="*60)
print("DETAILED DATABASE ANALYSIS")
print("="*60)

# Connect to database for detailed analysis
conn = sqlite3.connect(db_path)

try:
    # Get all tables
    tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table'", conn)
    print(f"Tables found: {tables['name'].tolist()}")
    
    # Analyze each table
    for table_name in tables['name']:
        print(f"\n--- {table_name.upper()} TABLE ---")
        
        # Get table schema
        schema = pd.read_sql_query(f"PRAGMA table_info({table_name})", conn)
        print(f"Columns: {schema['name'].tolist()}")
        
        # Get record count
        count = pd.read_sql_query(f"SELECT COUNT(*) as count FROM {table_name}", conn)
        record_count = count['count'].iloc[0]
        print(f"Records: {record_count}")
        
        # Show sample data if records exist
        if record_count > 0:
            sample = pd.read_sql_query(f"SELECT * FROM {table_name} LIMIT 200", conn)
            print("Sample data:")
            print(sample.to_string(index=False))
        else:
            print("(No data)")
    
    # Additional analysis
    print(f"\n=== ADDITIONAL ANALYSIS ===")
    
    # Check for any financial metrics
    metrics_count = pd.read_sql_query("SELECT COUNT(*) as count FROM financial_metrics", conn)['count'].iloc[0]
    print(f"Financial metrics extracted: {metrics_count}")
    
    # Check for any clients
    clients_count = pd.read_sql_query("SELECT COUNT(*) as count FROM clients", conn)['count'].iloc[0]
    print(f"Clients registered: {clients_count}")
    
    # Check document processing status
    docs = pd.read_sql_query("""
        SELECT 
            client_id,
            filing_type,
            filing_date,
            has_revenue_data,
            has_profit_data,
            has_balance_sheet,
            has_cash_flow,
            financial_density
        FROM documents
    """, conn)
    
    if not docs.empty:
        print(f"\nDocument processing status:")
        print(docs.to_string(index=False))
    
    # Check for any chunks
    chunks_count = pd.read_sql_query("SELECT COUNT(*) as count FROM document_chunks", conn)['count'].iloc[0]
    print(f"Document chunks created: {chunks_count}")
    
    print(f"\n✅ Database analysis complete!")
    
except Exception as e:
    print(f"❌ Error analyzing database: {e}")

finally:
    conn.close()

# Vector Database

## Create chunks for embeddings

In [None]:
# # This was using document_processor.py

# from datetime import datetime
# from pathlib import Path
# from knowledge_base.src.ingestion.document_processor import FinancialDocumentProcessor
# from knowledge_base.src.ingestion.sec_downloader import SECDownloader
# from knowledge_base.config.settings import get_settings

# # Initialize settings and components
# settings = get_settings()
# processor = FinancialDocumentProcessor()
# downloader = SECDownloader()

# # Define test parameters
# TEST_TICKER = "NVDA"  # Example: NVIDIA
# TEST_FILING_TYPES = ["10-K"]  # Or include "8-K", "10-Q"
# TEST_OUTPUT_DIR = Path(settings.data.processed_data_path)
# TEST_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# # Step 1: Download filings
# downloaded_filings = downloader.download_company_filings(
#     ticker=TEST_TICKER,
#     filing_types=TEST_FILING_TYPES,
#     num_filings=1  # Just get the most recent one
# )
# print(f"\nDownloaded {len(downloaded_filings)} filings")

# # Step 2: Process each filing
# all_processed_chunks = []
# for filing in downloaded_filings:
#     print(f"\nProcessing {filing['type']} filing from {filing['file_path']}")
    
#     # Add processing metadata
#     filing['source'] = 'SEC Edgar'
#     filing['processed_at'] = datetime.now().isoformat()
    
#     # Process the filing
#     chunks = processor.process_sec_filing(
#         file_path=filing['file_path'],
#         metadata=filing
#     )
#     all_processed_chunks.extend(chunks)
    
#     # Save processed chunks
#     output_file = f"{TEST_TICKER}_{filing['type']}_{filing['date']}_processed.json"
#     processor.save_processed_chunks(chunks, output_file)
#     print(f"Saved processed chunks to: {output_file}")

# # Step 3: Inspect results
# print(f"\nTotal chunks generated: {len(all_processed_chunks)}")
# if all_processed_chunks:
#     print("\nSample chunk content:")
#     print(all_processed_chunks[0].page_content[:500])
#     print("\nSample chunk metadata:")
#     print(all_processed_chunks[0].metadata)

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from pathlib import Path
import json

# Import necessary modules
import sys
from pathlib import Path
import os
print(f"Current working directory: {os.getcwd()}")
print(f"Project root: {Path.cwd().parent}")

# Add the project root to Python path so we can import our modules
project_root = Path.cwd().parent
sys.path.append(str(project_root))

# Import the knowledge_base module first
import knowledge_base.src.ingestion.vector_db_processor

# Force reload the updated module
import importlib
importlib.reload(knowledge_base.src.ingestion.vector_db_processor)
from knowledge_base.src.ingestion.vector_db_processor import VectorProcessor

doc_file_path = "../knowledge_base/data/raw/RDDT/10-Q/000171344525000196/"
doc_file_path = "../knowledge_base/data/raw/TEAM/10-K/000165037224000036/"


with open(f"{doc_file_path}metadata.json", "r") as f:
    metadata_json = json.load(f)

print(metadata_json)
print(metadata_json['ticker'])


# Initialize the processor
processor = VectorProcessor(
    # vector_db_path="./chroma_db",
    # embedding_model="all-MiniLM-L6-v2",
    chunk_size=500,
    chunk_overlap=50
)

# Example: Process a document
print("Process a downloaded filing")
chunks = processor.process_document(
    file_path=f"{doc_file_path}/full-submission.txt",
    company_id=metadata_json['ticker'],
    doc_type=metadata_json['type'],
    period=metadata_json['period_of_report']
)
print(f"Processed {len(chunks)} chunks.")

# Store the chunks in the vector database
stored_count = processor.store_chunks(chunks)
print(f"Stored {stored_count} chunks.")

# Example: Search for revenue-related chunks
print(" Search for revenue-related chunks")
results = processor.search_chunks(
    company_id=metadata_json['ticker'],
    query="revenue growth",
    n_results=5
)
for result in results:
    print(f"- {result['text'][:100]}... (score: {result['distance']:.3f})")

# Check collection stats
stats = processor.get_collection_stats(metadata_json['ticker'])
print(f"Collection stats: {stats}")

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Current working directory: /Users/daylight/Desktop/Financial Insight AI/notebooks
Project root: /Users/daylight/Desktop/Financial Insight AI
{'ticker': 'TEAM', 'type': '10-K', 'file_path': '/Users/daylight/Desktop/Financial Insight AI/knowledge_base/data/raw/TEAM/10-K/000165037224000036/full-submission.txt', 'downloaded_at': '2025-08-03T14:51:30.940033', 'file_size': 14383319, 'company_name': 'Atlassian Corp', 'cik': '0001650372', 'business_address': '', 'has_revenue_data': True, 'has_profit_data': True, 'has_balance_sheet': True, 'has_cash_flow': True, 'financial_keywords_count': 0, 'metadata_path': '/Users/daylight/Desktop/Financial Insight AI/knowledge_base/data/raw/TEAM/10-K/000165037224000036/metadata.json', 'date': '2024-06-30', 'period_of_report': '2024-06-30', 'accession_number': '0001650372-24-000036', 'fiscal_year_end': '2023-09-30'}
TEAM
Loading embedding model: all-MiniLM-L6-v2
Process a

## Vector DB: Get Embeddings

In [None]:
from dotenv import load_dotenv
import os
from openai import OpenAI

# Load the .env file from the root directory
load_dotenv()

# Access the API key
api_key = os.getenv("OPENAI_API_KEY") 

# Initialize OpenAI client
openai_client = OpenAI(api_key=api_key)

text = "This is a test text string!"

def get_embedding(text: str) -> list[float]:
    response = openai_client.embeddings.create(
        input=text,
        model="text-embedding-3-small",
    )
    return response.data[0].embedding


embedding = get_embedding(text)
print(len(embedding), embedding)

[-0.009071986190974712, -0.009768158197402954, -0.009717395529150963, -0.031762830913066864, -0.04870300740003586, -0.046527471393346786, 0.03341623768210411, 0.008209023624658585, 0.02483012154698372, -0.009673885069787502, 0.008375815115869045, -0.007143010850995779, -0.030254459008574486, -0.006624508183449507, 0.028818603605031967, 0.02748427540063858, -0.04768775776028633, 0.0026342119090259075, -0.05113960802555084, 0.04223441332578659, 0.01955081894993782, -0.00924602895975113, 0.015127229504287243, 0.033532265573740005, 0.04388782009482384, -0.04713661968708038, -0.028339985758066177, 0.03173382207751274, 0.047078605741262436, -0.03631695359945297, 0.026889629662036896, -0.04406186193227768, -0.005475099664181471, -0.03750624507665634, -0.005018237046897411, 0.05151670053601265, -0.004822439048439264, 0.018376030027866364, -0.03451851010322571, -0.024699589237570763, -0.05743416026234627, -0.038666531443595886, 0.015446308068931103, -0.0023296368308365345, -0.006736910901963711

In [18]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

from datetime import datetime
from pathlib import Path
import sys

# Add the project root to Python path
project_root = Path.cwd().parent
sys.path.append(str(project_root))

from knowledge_base.src.utils.embeddings import EmbeddingClient

embedding_client = EmbeddingClient()


test_text = "Hello World!"
embedding = embedding_client.get_embedding(test_text)
print(embedding)


test_text_list = ["Hello World!", "Why hello!"]
embeddings_list = embedding_client.get_embeddings_batch(test_text_list)
print(embeddings_list)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


2025-08-06 16:25:54,214 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[-0.00303721334785223, -0.056747838854789734, 0.029505394399166107, 0.0429670475423336, -0.04079113528132439, -0.025182578712701797, -0.01281613390892744, 0.03519178181886673, -0.031536247581243515, -0.0111116673797369, -0.015869665890932083, -0.031101064756512642, -0.020279519259929657, -0.024703877046704292, 0.029606936499476433, 0.035859063267707825, -0.03820905089378357, 0.017813483253121376, 0.011416295543313026, 0.040762122720479965, 0.047550976276397705, 0.002547632670029998, -0.0063645485788583755, -0.013896837830543518, 0.03478561341762543, -0.012214130721986294, -0.04438864812254906, 0.018524281680583954, 0.023673944175243378, -0.043373219668865204, 0.044939879328012466, -0.036236222833395004, -0.010205036960542202, 0.005954751279205084, 0.006237620022147894, 0.0006160556804388762, -0.0016319354763254523, 0.006502356380224228, -0.001445169560611248, -0.024007584899663925, 0.01213434711098671, -0.027779169380664825, 0.01024855487048626, 0.04032694175839424, -0.0520478636026382

2025-08-06 16:25:54,441 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[Embedding(embedding=[-0.003015623427927494, -0.05678003653883934, 0.029449021443724632, 0.04302748665213585, -0.04079342260956764, -0.025169484317302704, -0.01282410603016615, 0.03516474366188049, -0.031538017094135284, -0.011105037294328213, -0.015870556235313416, -0.03110281005501747, -0.020280657336115837, -0.02473427727818489, 0.029623104259371758, 0.035890087485313416, -0.038211192935705185, 0.017785469070076942, 0.011395175941288471, 0.04076441004872322, 0.04761166870594025, 0.0025350821670144796, -0.00633951835334301, -0.013933884911239147, 0.034845590591430664, -0.012258336879312992, -0.044275082647800446, 0.018510812893509865, 0.02366076596081257, -0.04340466856956482, 0.04494239762425423, -0.036238253116607666, -0.01019835565239191, 0.005947832018136978, 0.006252476945519447, 0.000601129955612123, -0.0016773612005636096, 0.006473707500845194, -0.0014978381805121899, -0.02403794601559639, 0.012142281047999859, -0.027766220271587372, 0.010249130427837372, 0.04032920300960541, 

## Test to calculate distances of similar and different embeddings

In [24]:
import numpy as np
from knowledge_base.src.utils.embeddings import EmbeddingClient

client = EmbeddingClient()
texts = [
    "The stock market reached a record high.",  # Similar to next line
    "Penguins habitate Antarctica.",     # Similar
    "Penguins live in Antarctica.",             # Different
    "Penguins live in Antarctica!"           
]
embeddings = [client.get_embedding(text) for text in texts]


# Cosine similarity (using dot product since OpenAI embeddings are normalized)
def cosine_similarity(a, b):
    return np.dot(a, b)  # Equivalent to cosine if vectors are normalized

# Euclidean distance
def euclidean_distance(a, b):
    return np.linalg.norm(np.array(a) - np.array(b))

# Compare all pairs
for i in range(len(texts)):
    for j in range(i+1, len(texts)):
        sim = cosine_similarity(embeddings[i], embeddings[j])
        dist = euclidean_distance(embeddings[i], embeddings[j])
        print(f"Text {i+1} & {j+1}: Cosine={sim:.3f}, Euclidean={dist:.3f}")
        print(f"  Text {i+1}: {texts[i]}")
        print(f"  Text {j+1}: {texts[j]}\n")

2025-08-06 16:28:15,098 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-08-06 16:28:15,282 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-08-06 16:28:16,113 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-08-06 16:28:16,417 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Text 1 & 2: Cosine=0.003, Euclidean=1.412
  Text 1: The stock market reached a record high.
  Text 2: Penguins habitate Antarctica.

Text 1 & 3: Cosine=0.000, Euclidean=1.414
  Text 1: The stock market reached a record high.
  Text 3: Penguins live in Antarctica.

Text 1 & 4: Cosine=0.039, Euclidean=1.387
  Text 1: The stock market reached a record high.
  Text 4: Penguins live in Antarctica!

Text 2 & 3: Cosine=0.893, Euclidean=0.464
  Text 2: Penguins habitate Antarctica.
  Text 3: Penguins live in Antarctica.

Text 2 & 4: Cosine=0.842, Euclidean=0.562
  Text 2: Penguins habitate Antarctica.
  Text 4: Penguins live in Antarctica!

Text 3 & 4: Cosine=0.938, Euclidean=0.351
  Text 3: Penguins live in Antarctica.
  Text 4: Penguins live in Antarctica!



## Qdrant DB

In [4]:
!docker run -p 6333:6333 -p 6334:6334 \
    -v $(pwd)/qdrant_storage:/qdrant/storage \
    qdrant/qdrant

docker: invalid reference format: repository name (library/Insight) must be lowercase

Run 'docker run --help' for more information


In [9]:
import os
import docker
from qdrant_client import QdrantClient

def start_qdrant_container():
    client = docker.from_env()
    qdrant_storage = os.path.abspath("knowledge_base/data/qdrant_storage")
    
    # Check if Qdrant container is already running
    containers = client.containers.list(filters={"name": "qdrant"})
    if containers:
        print("Qdrant container is already running.")
        return containers[0]
    
    # Start Qdrant container if not running
    container = client.containers.run(
        image="qdrant/qdrant",
        name="qdrant",
        ports={"6333/tcp": 6333, "6334/tcp": 6334},
        volumes={qdrant_storage: {"bind": "/qdrant/storage", "mode": "rw"}},
        detach=True,
        remove=True,  # Auto-remove container when stopped
    )
    print("Qdrant container started successfully.")
    return container

# Start Qdrant and get a Python client
container = start_qdrant_container()
qdrant_client = QdrantClient(host="localhost", port=6333)

Qdrant container is already running.


In [11]:
print(qdrant_client.get_collections())

collections=[]
