# FinSight Data Collector Testing Notebook

This notebook tests all components of the enhanced data collector including:
- Basic stock data collection
- HuggingFace sentiment analysis
- Actual SEC filing downloads
- Earnings call data collection
- Competitor SEC filings analysis

The system works with defaults. For enhanced features, optionally set API keys as environment variables or in a .env file.


In [1]:
import sys
import os
import json
import pandas as pd
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Add the project root to Python path
sys.path.append('.')

# Import our enhanced data collector
from agents.data_collector import DataCollectorTools, DataRequest
from config.config import FinSightConfig
from shared_memory.memory_manager import SharedMemoryManager

print("✅ Imports successful")


✅ Imports successful


## 1. Initialize Data Collector Tools


In [2]:
# Initialize configuration
config = FinSightConfig()
print(f"📁 Charts output path: {config.charts_output_path}")
print(f"🔑 Finnhub API key configured: {'Yes' if config.finnhub_api_key else 'No'}")
print(f"🏢 Edgar user agent: {config.edgar_company_name} {config.edgar_email}")

# Initialize data collector tools
tools = DataCollectorTools(config)
print("\nData Collector Tools initialized")
print(f"Sentiment model loaded: {'Yes' if tools.sentiment_model else 'No (using fallback)'}")


📁 Charts output path: ./outputs/charts
🔑 Finnhub API key configured: Yes
🏢 Edgar user agent: FinSight user@finsight.ai


Device set to use cpu



Data Collector Tools initialized
Sentiment model loaded: Yes


## 2. Test Basic Stock Data Collection


In [4]:
# Test stock data collection
symbol = "AAPL"  # You can change this to any stock symbol

print(f"📈 Testing stock data collection for {symbol}...")
stock_result = tools.get_stock_data(symbol)

if stock_result["success"]:
    data = stock_result["data"]
    print(f"\n✅ Stock data collected successfully!")
    print(f"📊 Current Price: ${data['current_price']:.2f}")
    print(f"📊 Market Cap: {data['market_cap']:,}" if data['market_cap'] else "📊 Market Cap: N/A")
    print(f"📊 P/E Ratio: {data['pe_ratio']:.2f}" if data['pe_ratio'] else "📊 P/E Ratio: N/A")
    print(f"📊 52-Week High: ${data['52_week_high']:.2f}")
    print(f"📊 52-Week Low: ${data['52_week_low']:.2f}")
    print(f"📊 Beta: {data['beta']:.2f}" if data['beta'] else "📊 Beta: N/A")
    print(f"📊 Historical data points: {len(data['historical_data']['dates'])}")
else:
    print(f"❌ Error: {stock_result['error']}")


📈 Testing stock data collection for AAPL...

✅ Stock data collected successfully!
📊 Current Price: $195.64
📊 Market Cap: 2,922,039,738,368
📊 P/E Ratio: 30.43
📊 52-Week High: $260.10
📊 52-Week Low: $169.21
📊 Beta: 1.21
📊 Historical data points: 501


## 3. Test Financial Statements Collection


In [5]:
print(f"💰 Testing financial statements collection for {symbol}...")
financials_result = tools.get_company_financials(symbol)

if financials_result["success"]:
    data = financials_result["data"]
    print(f"\n✅ Financial statements collected successfully!")
    
    # Show key metrics
    metrics = data['key_metrics']
    print(f"\n📊 Key Financial Metrics:")
    print(f"   • Revenue Growth: {metrics.get('revenue_growth', 'N/A')}")
    print(f"   • Profit Margin: {metrics.get('profit_margin', 'N/A')}")
    print(f"   • Operating Margin: {metrics.get('operating_margin', 'N/A')}")
    print(f"   • Return on Equity: {metrics.get('return_on_equity', 'N/A')}")
    print(f"   • Current Ratio: {metrics.get('current_ratio', 'N/A')}")
    print(f"   • Debt to Equity: {metrics.get('debt_to_equity', 'N/A')}")
    
    # Show available statement periods
    print(f"\n📄 Financial Statements Available:")
    print(f"   • Income Statement periods: {len(data['income_statement'])}")
    print(f"   • Balance Sheet periods: {len(data['balance_sheet'])}")
    print(f"   • Cash Flow periods: {len(data['cash_flow'])}")
else:
    print(f"❌ Error: {financials_result['error']}")


💰 Testing financial statements collection for AAPL...

✅ Financial statements collected successfully!

📊 Key Financial Metrics:
   • Revenue Growth: 0.051
   • Profit Margin: 0.24301
   • Operating Margin: 0.31028998
   • Return on Equity: 1.38015
   • Current Ratio: 0.821
   • Debt to Equity: 146.994

📄 Financial Statements Available:
   • Income Statement periods: 5
   • Balance Sheet periods: 5
   • Cash Flow periods: 5


## 4. Test Enhanced News Collection with HuggingFace Sentiment Analysis


In [6]:
print(f"📰 Testing news collection with sentiment analysis for {symbol}...")
news_result = tools.get_company_news(symbol, days_back=7)

if news_result["success"]:
    data = news_result["data"]
    articles = data['articles']
    print(f"\n✅ News collection successful!")
    print(f"📰 Found {len(articles)} articles from the last 7 days")
    
    if articles:
        print(f"\n📊 Sentiment Analysis Results:")
        sentiment_counts = {'positive': 0, 'negative': 0, 'neutral': 0}
        
        for i, article in enumerate(articles[:5]):
            sentiment = article['sentiment']
            sentiment_counts[sentiment] += 1
            score = article['sentiment_score']
            model = article['sentiment_model']
            
            print(f"\n📄 Article {i+1}:")
            print(f"   📰 Headline: {article['headline'][:100]}...")
            print(f"   🎯 Sentiment: {sentiment.upper()} (score: {score:.3f})")
            print(f"   🤖 Model used: {model}")
            print(f"   🔗 Source: {article['source']}")
        
        print(f"\n📈 Overall Sentiment Distribution:")
        total = len(articles)
        for sentiment, count in sentiment_counts.items():
            percentage = (count / total) * 100
            print(f"   • {sentiment.capitalize()}: {count} articles ({percentage:.1f}%)")
    else:
        print("📰 No recent articles found")
else:
    print(f"❌ Error: {news_result['error']}")


📰 Testing news collection with sentiment analysis for AAPL...

✅ News collection successful!
📰 Found 20 articles from the last 7 days

📊 Sentiment Analysis Results:

📄 Article 1:
   📰 Headline: Streaming Finally Outpaces Traditional TV. The Leading Platform Might Surprise You....
   🎯 Sentiment: NEUTRAL (score: 0.508)
   🤖 Model used: huggingface
   🔗 Source: Yahoo

📄 Article 2:
   📰 Headline: BABA Down 8% in a Month: Will Partnership With Apple Aid Recovery?...
   🎯 Sentiment: NEGATIVE (score: 0.971)
   🤖 Model used: huggingface
   🔗 Source: Yahoo

📄 Article 3:
   📰 Headline: Apple supplier Jabil plans $500M venture to build AI data centers...
   🎯 Sentiment: NEUTRAL (score: 0.709)
   🤖 Model used: huggingface
   🔗 Source: Yahoo

📄 Article 4:
   📰 Headline: Motley Fool Analysts Check In on Chime Financial, RH, Adobe, and More...
   🎯 Sentiment: NEUTRAL (score: 0.937)
   🤖 Model used: huggingface
   🔗 Source: Yahoo

📄 Article 5:
   📰 Headline: Trump says he'll extend TikTok sale deadli

## 5. Test SEC Filings Download (Actual Files)


In [5]:
symbol = "AAPL"  # You can change this to any stock symbol

print(f"📋 Testing SEC filings download for {symbol}...")
print("⚠️  This may take a few minutes as we download actual SEC filings")
print("🔄 System will try SEC-API.IO first (for PDFs), then fallback to direct EDGAR download")

sec_result = tools.get_sec_filings(symbol, filing_types=["10-K", "10-Q"], max_filings=2)

if sec_result["success"]:
    data = sec_result["data"]
    print(f"\n✅ SEC filings download successful!")
    print(f"📁 Download path: {data['download_path']}")
    print(f"📋 Total filings downloaded: {len(data['filings_downloaded'])}")
    print(f"🔄 Download method: {data.get('source', 'Unknown')}")
    
    # Show API usage statistics if available
    if 'api_calls_made' in data:
        print(f"📊 SEC-API.IO calls made: {data['api_calls_made']}")
        print(f"♻️  Files reused: {data['files_reused']}")
    
    # Show filing summaries
    if data['filing_summaries']:
        print(f"\n📄 Filing Details:")
        for i, filing in enumerate(data['filing_summaries']):
            if 'error' not in filing:
                print(f"\n📋 Filing {i+1}:")
                print(f"   📝 Type: {filing['filing_type']}")
                print(f"   📅 Date: {filing.get('date', 'Unknown')}")
                print(f"   📄 File: {filing.get('file_path', 'N/A')}")
                print(f"   🔄 Method: {filing.get('download_method', 'Unknown')}")
                
                # Show accession number if available
                if 'accession_number' in filing:
                    print(f"   🆔 Accession: {filing['accession_number']}")
                
                # Show text preview for non-PDF files
                text_summary = filing.get('text_summary', '')
                if text_summary:
                    print(f"   📝 Preview: {text_summary[:200]}...")
            else:
                print(f"\n❌ {filing['filing_type']}: {filing['error']}")
    
    # Show actual downloaded files structure
    if data['filings_downloaded']:
        print(f"\n📁 Downloaded File Structure:")
        for filing in data['filings_downloaded'][:2]:
            print(f"   📋 {filing['filing_type']} - {filing.get('filing_date', 'Unknown date')}")
            
            # Handle both PDF and HTML/TXT files
            file_path = filing.get('filing_path') or filing.get('file_path')
            if file_path:
                print(f"       📄 File path: {file_path}")
                
                # Check if file actually exists and show details
                if os.path.exists(file_path):
                    file_size = os.path.getsize(file_path)
                    file_ext = os.path.splitext(file_path)[1].lower()
                    
                    if file_ext == '.pdf':
                        print(f"       ✅ PDF file exists ({file_size:,} bytes) - Ready for multimodal analysis")
                    else:
                        print(f"       ✅ File exists ({file_size:,} bytes)")
                        print(f"       📊 Financial tables: {len(filing.get('tables_found', []))}") 
                        print(f"       📝 Text length: {len(filing.get('text_summary', ''))} characters")
                else:
                    print(f"       ❌ File not found")
            else:
                print(f"       ❌ No file path available")
                
            # Show download method
            method = filing.get('download_method', 'Unknown')
            if method == 'sec-api.io':
                print(f"       🚀 Downloaded as optimized PDF via SEC-API.IO")
            elif method == 'reused_existing':
                print(f"       ♻️  Reused existing file (saved API call)")
            elif method == 'edgar_direct':
                print(f"       📋 Downloaded via direct EDGAR access")
else:
    print(f"❌ Error: {sec_result['error']}")
    print("💡 Tip: If SEC-API.IO failed, check if SEC_API_KEY is set in your .env file")

📋 Testing SEC filings download for AAPL...
⚠️  This may take a few minutes as we download actual SEC filings
🔄 System will try SEC-API.IO first (for PDFs), then fallback to direct EDGAR download

✅ SEC filings download successful!
📁 Download path: outputs/sec_filings\AAPL
📋 Total filings downloaded: 0
🔄 Download method: sec-api.io
📊 SEC-API.IO calls made: 0
♻️  Files reused: 0


## 6. Test Earnings Call Data Collection


In [6]:
print(f"💼 Testing earnings call data collection for {symbol}...")
earnings_result = tools.get_earnings_call_data(symbol, quarters=4)

if earnings_result["success"]:
    data = earnings_result["data"]
    print(f"\n✅ Earnings data collection successful!")
    
    # Earnings calendar
    calendar = data['earnings_calendar']
    print(f"📅 Upcoming earnings events: {len(calendar)}")
    if calendar:
        for event in calendar[:3]:
            print(f"   📅 {event.get('date', 'Unknown')}: {event.get('description', 'Earnings call')}")
    
    # Earnings surprises
    surprises = data['earnings_surprises']
    print(f"\n📊 Historical earnings surprises: {len(surprises) if surprises else 0}")
    if surprises and len(surprises) > 0:
        for surprise in surprises[:3]:
            print(f"   📊 {surprise.get('period', 'Unknown')}: Actual {surprise.get('actual', 'N/A')} vs Estimate {surprise.get('estimate', 'N/A')}")
    
    # Earnings calls
    calls = data['earnings_calls']
    print(f"\n📞 Earnings calls found: {len(calls)}")
    if calls:
        for i, call in enumerate(calls[:3]):
            print(f"\n📞 Call {i+1}:")
            print(f"   📅 Date: {call.get('date', 'Unknown')}")
            print(f"   📊 Quarter: {call.get('quarter', 'Unknown')} {call.get('year', '')}")
            print(f"   💰 Revenue: ${call.get('revenue', 'N/A')}")
            print(f"   💰 Earnings: ${call.get('earnings', 'N/A')}")
            print(f"   📝 Source: {call.get('source', 'Unknown')}")
            
            # Show transcript info if available
            transcript = call.get('transcript', '')
            if transcript and len(transcript) > 50:
                print(f"   📝 Transcript: {transcript[:100]}...")
            else:
                print(f"   📝 Transcript: {transcript or 'Not available'}")
else:
    print(f"❌ Error: {earnings_result['error']}")


💼 Testing earnings call data collection for AAPL...


Could not get earnings surprises: 'Client' object has no attribute 'earnings_surprise'



✅ Earnings data collection successful!
📅 Upcoming earnings events: 1
   📅 2025-07-30: Earnings call

📊 Historical earnings surprises: 0

📞 Earnings calls found: 0




In [None]:
## 7. Test Competitor SEC Filings Analysis


In [None]:
print(f"🏢 Testing competitor SEC filings analysis for {symbol}...")
print("⚠️  This may take several minutes as we download competitor filings")

competitor_result = tools.get_competitor_sec_filings(symbol, filing_types=["10-K"], max_filings=1)

if competitor_result["success"]:
    data = competitor_result["data"]
    print(f"\n✅ Competitor analysis successful!")
    print(f"🏢 Primary symbol: {data['primary_symbol']}")
    print(f"🏢 Competitors analyzed: {', '.join(data['competitors_analyzed'])}")
    
    # Show competitor filing results
    competitor_filings = data['competitor_filings']
    print(f"\n📋 Competitor Filing Results:")
    
    for competitor, filing_data in competitor_filings.items():
        print(f"\n🏢 {competitor}:")
        if 'error' in filing_data:
            print(f"   ❌ Error: {filing_data['error']}")
        else:
            filings_downloaded = filing_data.get('filings_downloaded', [])
            print(f"   📋 Filings downloaded: {len(filings_downloaded)}")
            
            if filings_downloaded:
                for filing in filings_downloaded[:1]:
                    print(f"   📄 {filing['filing_type']} - {filing.get('filing_date', 'Unknown')}")
                    print(f"       📊 Tables found: {len(filing.get('tables_found', []))}")
                    print(f"       📝 Text preview: {filing.get('text_summary', '')[:100]}...")
else:
    print(f"❌ Error: {competitor_result['error']}")


In [None]:
## 8. Test Sentiment Analysis Independently


In [None]:
print("🤖 Testing sentiment analysis with sample texts...")

# Test cases with different sentiments
test_texts = [
    "Apple reports record quarterly revenue, beating analyst expectations by 15%",
    "Company faces significant challenges due to supply chain disruptions and declining margins",
    "Stock price remains stable with mixed analyst recommendations",
    "Strong growth in cloud services division drives revenue increase and profitability",
    "Regulatory concerns and market volatility create uncertainty for investors"
]

print(f"\n🧪 Testing {len(test_texts)} sample texts:")

for i, text in enumerate(test_texts, 1):
    sentiment_result = tools._analyze_sentiment(text)
    
    print(f"\n📝 Test {i}:")
    print(f"   Text: {text}")
    print(f"   🎯 Sentiment: {sentiment_result['label'].upper()}")
    print(f"   📊 Score: {sentiment_result['score']:.3f}")
    print(f"   🔍 Confidence: {sentiment_result['confidence']:.3f}")
    print(f"   🤖 Model: {sentiment_result['model']}")

print(f"\n✅ Sentiment analysis testing complete!")


In [None]:
## 9. Test ESG and Market Trends Data


In [None]:
# Test ESG data collection
print(f"🌱 Testing ESG data collection for {symbol}...")
esg_result = tools.get_esg_data(symbol)

if esg_result["success"]:
    data = esg_result["data"]
    print(f"\n✅ ESG data collection successful!")
    print(f"🌱 Employee count: {data.get('employee_count', 'N/A'):,}" if data.get('employee_count') else "🌱 Employee count: N/A")
    print(f"🌱 Sustainability score: {data.get('sustainability_score', 'N/A')}")
    print(f"🌱 ESG risk rating: {data.get('esg_risk_rating', 'N/A')}")
    print(f"ℹ️  Note: {esg_result.get('note', '')}")
else:
    print(f"❌ ESG Error: {esg_result['error']}")

# Test market trends collection
print(f"\n📈 Testing market trends collection for {symbol}...")
trends_result = tools.get_market_trends(symbol)

if trends_result["success"]:
    data = trends_result["data"]
    print(f"\n✅ Market trends collection successful!")
    
    # Show market indicators
    indicators = data.get('market_indicators', {})
    print(f"📊 Market Indicators:")
    print(f"   📊 Short Ratio: {indicators.get('short_ratio', 'N/A')}")
    print(f"   📊 Shares Outstanding: {indicators.get('shares_outstanding', 'N/A'):,}" if indicators.get('shares_outstanding') else "   📊 Shares Outstanding: N/A")
    print(f"   📊 Institutional Ownership: {indicators.get('institutional_ownership_pct', 'N/A')}")
    print(f"   📊 Insider Ownership: {indicators.get('insider_ownership_pct', 'N/A')}")
    
    # Show analyst recommendations if available
    recommendations = data.get('analyst_recommendations', {}).get('recent_recommendations', [])
    if recommendations:
        print(f"\n👥 Recent Analyst Recommendations: {len(recommendations)}")
        for rec in recommendations[:3]:
            date = rec.get('Date', 'Unknown')
            firm = rec.get('Firm', 'Unknown')
            rating = rec.get('To Grade', 'Unknown')
            print(f"   👥 {date}: {firm} - {rating}")
else:
    print(f"❌ Trends Error: {trends_result['error']}")


In [None]:
## 10. Summary and Data Structure Overview


In [None]:
print("📋 ENHANCED DATA COLLECTOR TESTING SUMMARY")
print("=" * 50)

# Test all components one more time and show structure
test_symbol = "AAPL"
print(f"🔍 Testing all components for {test_symbol}:")

components = [
    ("Stock Data", lambda: tools.get_stock_data(test_symbol)),
    ("Financial Statements", lambda: tools.get_company_financials(test_symbol)),
    ("News with Sentiment", lambda: tools.get_company_news(test_symbol, 3)),
    ("ESG Data", lambda: tools.get_esg_data(test_symbol)),
    ("Market Trends", lambda: tools.get_market_trends(test_symbol))
]

results_summary = {}

for component_name, test_func in components:
    try:
        result = test_func()
        status = "✅ Success" if result.get("success") else "❌ Failed"
        results_summary[component_name] = status
        print(f"   {status}: {component_name}")
    except Exception as e:
        results_summary[component_name] = f"❌ Error: {str(e)[:50]}..."
        print(f"   ❌ Error: {component_name} - {str(e)[:50]}...")

print(f"\n📊 COMPONENT STATUS SUMMARY:")
for component, status in results_summary.items():
    print(f"   {component}: {status}")

print(f"\n🎯 KEY ENHANCEMENTS IMPLEMENTED:")
print(f"   🤖 HuggingFace FinBERT sentiment analysis")
print(f"   📋 Actual SEC filing downloads and parsing")
print(f"   💼 Earnings call data collection")
print(f"   🏢 Competitor SEC filings analysis")
print(f"   📊 Enhanced data structures for multimodal analysis")

print(f"\n📁 File Storage Locations:")
print(f"   📋 SEC Filings: {config.charts_output_path}/sec_filings/")
print(f"   📊 Charts: {config.charts_output_path}/")
print(f"   💾 Vector DB: {config.vector_db_path}")
print(f"   🗄️  Shared Memory: {config.shared_memory_path}")

print(f"\n✅ TESTING COMPLETE!")
print(f"📝 All components of the enhanced data collector have been tested.")
print(f"🚀 The system is ready for comprehensive financial research!")
