# Download raw data

In [None]:
%load_ext autoreload
%autoreload 2

# Import necessary modules
import sys
from pathlib import Path

# Add the project root to Python path so we can import our modules
project_root = Path.cwd().parent
sys.path.append(str(project_root))


from knowledge_base.src.ingestion.sec_downloader import SECDownloader

# Create our downloader instance
downloader = SECDownloader()

# Test the connection by getting basic company info
company_info = downloader.get_company_info('MSTR')
print("Company Info for MSTR:")
print(company_info)

In [None]:
# Import required modules
import json
from pathlib import Path

# Download and process filings: two 10-K (annual report) filings for RDDT
filings = downloader.download_company_filings(
    ticker='RDDT',
    filing_types=['10-K'],  # Just annual reports
    num_filings=2,
)

print("\nDownloaded Filing Metadata:")
for filing in filings:
    # Read the path once with .get() so a filing without one is reported
    # as "None" and skipped instead of raising KeyError halfway through.
    file_path = filing.get('file_path')

    print(f"\nFiling Type: {filing.get('type')}")
    print(f"Filing Date: {filing.get('period_of_report', 'N/A')}")
    print(f"Accession Number: {filing.get('accession_number', 'N/A')}")
    print(f"File Path: {file_path}")

    if not file_path:
        continue

    # The metadata sidecar is looked up next to the downloaded file
    metadata_path = Path(file_path).parent / "metadata.json"

    # Read and display the saved metadata, if present
    if metadata_path.is_file():
        # Explicit UTF-8: the default text encoding is platform dependent.
        # NOTE(review): assumes the downloader writes UTF-8/ASCII JSON - confirm.
        with open(metadata_path, 'r', encoding='utf-8') as f:
            metadata = json.load(f)
        print("\nStored Metadata:")
        print(json.dumps(metadata, indent=2))


# Process Documents

## Processing data for the SQL database

In [None]:

TEST_TICKER = 'RDDT'

%load_ext autoreload
%autoreload 2

from datetime import datetime
from pathlib import Path
import sys

# Add the project root to Python path
project_root = Path.cwd().parent
sys.path.append(str(project_root))

from knowledge_base.src.ingestion.sec_downloader import SECDownloader
from knowledge_base.src.ingestion.sec_sql_extractor import SECDataExtractor
from knowledge_base.src.storage.sql_manager import FinancialMetricsManager


# Force reload the updated module
import importlib
import knowledge_base.src.ingestion.sql_extractor
importlib.reload(knowledge_base.src.ingestion.sec_downloader)
importlib.reload(knowledge_base.src.ingestion.sql_extractor)
importlib.reload(knowledge_base.src.storage.sql_manager)

import os
print(f"Current working directory: {os.getcwd()}")
print(f"Project root: {Path.cwd().parent}")

# Then import the extractor
from knowledge_base.src.ingestion.sql_extractor import SECDataExtractor

# Initialize components
downloader = SECDownloader()
extractor = SECDataExtractor()
sql_manager = FinancialMetricsManager()

# Year passed to the validation and comparison queries at the end of the cell
TEST_YEAR = 2024

# Step 1: Download SEC filings (requests five 10-K filings for TEST_TICKER)
filings = downloader.download_company_filings(
    ticker=TEST_TICKER,
    filing_types=['10-K'],
    num_filings=5
)

print("len(filings)", len(filings))

# Step 2: Process the filing (only the first returned filing is processed)
if filings:

    print("if filings called")

    filing = filings[0]
    print("filing", filing)
    file_dir = Path(filing['file_path']).parent

    # Look for an XBRL or HTML version stored next to the downloaded file
    xbrl_file = next(file_dir.glob("*.xml"), None)
    html_file = next(file_dir.glob("*.htm*"), None)

    # Add client to the database before adding the document
    client_data = {
        "id": filing['ticker'],
        "company_name": filing.get('company_name', ''),
        "cik": filing.get('cik', ''),
        "industry": "",  # Fill if available
        "sector": "",    # Fill if available
        "market_cap": None  # Fill if available
    }
    sql_manager.sql_store.add_client(client_data)

    # Create document in SQL store first
    doc_data = {
        "document_id": filing['accession_number'],
        "client_id": filing['ticker'],
        "filing_type": filing['type'],
        "filing_date": filing['period_of_report'],
        "file_path": filing['file_path'],
        "file_size": filing['file_size'],
        "download_date": datetime.fromisoformat(filing['downloaded_at']),
        "has_revenue_data": filing['has_revenue_data'],
        "has_profit_data": filing['has_profit_data'],
        "has_balance_sheet": filing['has_balance_sheet'],
        "has_cash_flow": filing['has_cash_flow']
    }

    # Add document to get SQL document_id
    document_id = sql_manager.sql_store.add_document(doc_data)

    if document_id:

        print("IF document_id CALLED")

        # Try XBRL first, then HTML, then full submission. Pick the source
        # once so the extractor is called from a single place.
        if xbrl_file:
            source_kind, source_path = "XBRL file", str(xbrl_file)
        elif html_file:
            source_kind, source_path = "HTML file", str(html_file)
        else:
            source_kind, source_path = "full submission", filing['file_path']

        print(f"Processing {source_kind}: {source_path}")
        # `or []` keeps the len() below from raising if the extractor
        # returns None; the truthiness check afterwards is unchanged.
        metrics = extractor.process_document(source_path, TEST_TICKER) or []

        print(f"Extracted {len(metrics)} metrics")
        if metrics:
            sql_manager.save_extracted_metrics(metrics, document_id)
            print("Metrics saved to database")
            print("metrics", metrics)

        # Validate the extractions
        validation = sql_manager.validate_client_metrics(TEST_TICKER, TEST_YEAR)
        print("\nValidation results:", validation)

        # Get comparative metrics
        # NOTE(review): these tickers differ from TEST_TICKER; the query only
        # returns data if they were loaded into the database separately.
        comparative = sql_manager.get_comparative_metrics(
            ["AAPL", "MSFT", "GOOGL"],
            ["revenue", "net_income"],
            TEST_YEAR
        )
        print("\nComparative metrics:", comparative)
    else:
        # Previously a falsy document_id ended the cell without any output.
        print(
            f"Document {filing['accession_number']} was not stored; "
            "skipping metric extraction"
        )


## Quick test for SQL DB

In [None]:
%load_ext autoreload
%autoreload 2

# Simple SQL Database Test
import sqlite3
import pandas as pd
from pathlib import Path

# Import necessary modules
import sys
from pathlib import Path

# # Add the project root to Python path so we can import our modules
project_root = Path.cwd().parent
sys.path.append(str(project_root))

# Force reload the updated module
import importlib
import knowledge_base.src.ingestion.sql_extractor
importlib.reload(knowledge_base.src.ingestion.sec_downloader)
importlib.reload(knowledge_base.src.ingestion.sql_extractor)
importlib.reload(knowledge_base.src.storage.sql_manager)

# Database path (relative to the current working directory)
db_path = Path("../knowledge_base/data/financial_kb.db")


def quote_identifier(name: str) -> str:
    """Return *name* as a double-quoted SQLite identifier.

    Embedded double quotes are doubled, so table names containing spaces,
    quotes or reserved words can be interpolated into a statement safely.
    """
    return '"' + str(name).replace('"', '""') + '"'


print("\n" + "="*60)
print("DETAILED DATABASE ANALYSIS")
print("="*60)

# sqlite3.connect() creates a new, empty database file when the path does not
# exist, which would leave a stray file behind and report misleading
# "no such table" errors. Check for the file before connecting.
if not db_path.exists():
    print(f"❌ Database file not found: {db_path.resolve()}")
else:
    # Connect to database for detailed analysis
    conn = sqlite3.connect(db_path)

    try:
        # Get all tables
        tables = pd.read_sql_query(
            "SELECT name FROM sqlite_master WHERE type='table'", conn
        )
        print(f"Tables found: {tables['name'].tolist()}")

        # Analyze each table
        for table_name in tables['name']:
            print(f"\n--- {table_name.upper()} TABLE ---")
            quoted_name = quote_identifier(table_name)

            # Get table schema
            schema = pd.read_sql_query(f"PRAGMA table_info({quoted_name})", conn)
            print(f"Columns: {schema['name'].tolist()}")

            # Get record count
            count = pd.read_sql_query(
                f"SELECT COUNT(*) as count FROM {quoted_name}", conn
            )
            record_count = count['count'].iloc[0]
            print(f"Records: {record_count}")

            # Show sample data (at most 200 rows) if records exist
            if record_count > 0:
                sample = pd.read_sql_query(
                    f"SELECT * FROM {quoted_name} LIMIT 200", conn
                )
                print("Sample data:")
                print(sample.to_string(index=False))
            else:
                print("(No data)")

        # Additional analysis
        print("\n=== ADDITIONAL ANALYSIS ===")

        # Check for any financial metrics
        metrics_count = pd.read_sql_query(
            "SELECT COUNT(*) as count FROM financial_metrics", conn
        )['count'].iloc[0]
        print(f"Financial metrics extracted: {metrics_count}")

        # Check for any clients
        clients_count = pd.read_sql_query(
            "SELECT COUNT(*) as count FROM clients", conn
        )['count'].iloc[0]
        print(f"Clients registered: {clients_count}")

        # Check document processing status
        docs = pd.read_sql_query("""
            SELECT
                client_id,
                filing_type,
                filing_date,
                has_revenue_data,
                has_profit_data,
                has_balance_sheet,
                has_cash_flow,
                financial_density
            FROM documents
        """, conn)

        if not docs.empty:
            print("\nDocument processing status:")
            print(docs.to_string(index=False))

        # Check for any chunks
        chunks_count = pd.read_sql_query(
            "SELECT COUNT(*) as count FROM document_chunks", conn
        )['count'].iloc[0]
        print(f"Document chunks created: {chunks_count}")

        print("\n✅ Database analysis complete!")

    except Exception as e:
        # Top-level notebook boundary: report the failure (for example a
        # missing table) instead of aborting the cell with a traceback.
        print(f"❌ Error analyzing database: {e}")

    finally:
        conn.close()

## Processing for the vector database

In [None]:
# Test for processing document chunks for Vector DB


from datetime import datetime
from pathlib import Path
from knowledge_base.src.ingestion.document_processor import FinancialDocumentProcessor
from knowledge_base.src.ingestion.sec_downloader import SECDownloader
from knowledge_base.config.settings import get_settings

# Initialize settings and components
settings = get_settings()
processor = FinancialDocumentProcessor()
downloader = SECDownloader()

# Define test parameters
TEST_TICKER = "NVDA"  # Example: NVIDIA
TEST_FILING_TYPES = ["10-K"]  # Or include "8-K", "10-Q"
TEST_OUTPUT_DIR = Path(settings.data.processed_data_path)
TEST_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Step 1: Download filings
downloaded_filings = downloader.download_company_filings(
    ticker=TEST_TICKER,
    filing_types=TEST_FILING_TYPES,
    num_filings=1  # Request a single filing
)
print(f"\nDownloaded {len(downloaded_filings)} filings")

# Step 2: Process each filing
all_processed_chunks = []
for filing in downloaded_filings:
    print(f"\nProcessing {filing['type']} filing from {filing['file_path']}")

    # Add processing metadata (this mutates the downloaded filing dict)
    filing['source'] = 'SEC Edgar'
    filing['processed_at'] = datetime.now().isoformat()

    # Process the filing
    chunks = processor.process_sec_filing(
        file_path=filing['file_path'],
        metadata=filing
    )
    all_processed_chunks.extend(chunks)

    # The other cells of this notebook read the filing date from
    # 'period_of_report'. Prefer 'date' when the downloader provides it and
    # fall back to 'period_of_report' so a missing 'date' key no longer
    # raises KeyError.
    filing_date = filing.get('date') or filing.get('period_of_report', 'unknown')

    # Save processed chunks
    output_file = f"{TEST_TICKER}_{filing['type']}_{filing_date}_processed.json"
    processor.save_processed_chunks(chunks, output_file)
    print(f"Saved processed chunks to: {output_file}")

# Step 3: Inspect results
print(f"\nTotal chunks generated: {len(all_processed_chunks)}")
if all_processed_chunks:
    first_chunk = all_processed_chunks[0]
    print("\nSample chunk content:")
    print(first_chunk.page_content[:500])
    print("\nSample chunk metadata:")
    print(first_chunk.metadata)