# Data Validation: Fills vs Summary
Validate consistency between the `fills` table and the `account_daily_summary` table.

In [ ]:
import sqlite3
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Locate the project root so the `src` package becomes importable: two levels
# above this file when run as a script, otherwise the parent of the current
# working directory (e.g. an interactive notebook session).
if '__file__' in globals():
    parent_dir = Path(__file__).parent.parent
else:
    parent_dir = Path.cwd().parent
sys.path.insert(0, str(parent_dir))

# When started from inside the notebooks folder, switch to the project root
if Path.cwd().name == 'notebooks':
    os.chdir(parent_dir)
    print(f"Changed working directory to: {Path.cwd()}")

# Project-local helpers, resolved through the sys.path entry added above
from src.data.database_manager import DatabaseManager
from src.data.data_validator import validate_data_fast, validate_data_full

# Database handle reused by the cells below
db = DatabaseManager()

print("🔍 Data Validation Notebook")
print("=" * 50)
print(f"Working directory: {Path.cwd()}")
print(f"Database path: {db.db_path}")

# Basic database stats; a failure is reported with a setup hint instead of
# aborting the cell
try:
    stats = db.get_database_stats()
    print("\nDatabase Statistics:")
    for stat_name, stat_value in stats.items():
        print(f"  {stat_name}: {stat_value}")
except Exception as err:
    print(f"⚠️ Error reading database: {err}")
    print("Make sure you've run 'python scripts/setup_database.py' first!")

In [None]:
# Pull the accounts, daily-summary and fills tables through the database manager
accounts = db.get_accounts()
summary_data = db.get_summary_data()
fills_data = db.get_fills_data()

# Row counts per table
print("📊 Data Overview:")
print(f"Accounts: {len(accounts)}")
print(f"Summary records: {len(summary_data)}")
print(f"Fills records: {len(fills_data)}")

# First/last timestamp of each table; skipped for an empty table
print("\n📅 Date Ranges:")
if not summary_data.empty:
    summary_dates = summary_data['date']
    print(f"Summary: {summary_dates.min()} to {summary_dates.max()}")
if not fills_data.empty:
    fill_times = fills_data['datetime']
    print(f"Fills: {fill_times.min()} to {fill_times.max()}")

# Identifying columns of every account
print("\n👥 Accounts:")
account_columns = ['account_id', 'account_name', 'account_type']
print(accounts[account_columns])

In [None]:
# 🚀 Run Fast Validation (Aggregate Comparison)
print("🚀 Running Fast Validation...")
print("=" * 50)

# validate_data_fast() returns a dict; this cell reads its 'stats' and
# 'warnings' entries.
fast_validation = validate_data_fast()
fast_stats = fast_validation['stats']

print(f"✅ Total accounts: {fast_stats['total_accounts']}")
print(f"✅ Accounts with data: {fast_stats['accounts_with_data']}")
print(f"✅ Failed validation: {fast_stats['failed_validation']}")

# Portfolio-level totals are optional in the stats dict
if 'portfolio' in fast_stats:
    ps = fast_stats['portfolio']
    print(f"\n💰 Portfolio Overview:")
    print(f"  Total P&L: ${ps['total_pnl']:,.2f}")
    print(f"  Total Volume: {ps['total_volume']:,.0f} shares")
    print(f"  Total Trades: {ps['total_trades']:,}")
    # Win rate = profitable days / (profitable + losing days). Guard the
    # denominator so a portfolio with no decided days does not raise
    # ZeroDivisionError.
    decided_days = ps['total_profitable_days'] + ps['total_losing_days']
    if decided_days > 0:
        win_rate = ps['total_profitable_days'] / decided_days * 100
        print(f"  Win Rate: {win_rate:.1f}%")
    else:
        print("  Win Rate: N/A (no profitable or losing days)")

# Show any warnings
if fast_validation['warnings']:
    print(f"\n⚠️  Validation Warnings:")
    for warning in fast_validation['warnings']:
        print(f"  {warning}")

In [None]:
# 📈 Account Performance Analysis
print("📈 Account Performance Analysis")
print("=" * 50)

# Fixed column order for the per-account table. Passing it to the DataFrame
# constructor also gives an empty table its schema, so column lookups on
# perf_df do not raise KeyError when no account has trading stats.
perf_columns = [
    'Account ID',
    'Total P&L',
    'Win Rate (%)',
    'Avg Daily P&L',
    'Max Daily Gain',
    'Max Daily Loss',
    'Total Trades',
    'Total Volume',
    'Trading Days',
]

account_performance = []

for acc_id, acc_stats in fast_stats['by_account'].items():
    ts = acc_stats.get('trading_stats')
    if not ts:
        continue  # account has no trading statistics to report

    # Win rate = profitable days / (profitable + losing days); NaN instead of
    # ZeroDivisionError when the account has neither.
    decided_days = ts['profitable_days'] + ts['losing_days']
    if decided_days > 0:
        account_win_rate = ts['profitable_days'] / decided_days * 100
    else:
        account_win_rate = float('nan')

    account_performance.append({
        'Account ID': acc_id,
        'Total P&L': ts['total_pnl'],
        'Win Rate (%)': account_win_rate,
        'Avg Daily P&L': ts['avg_daily_pnl'],
        'Max Daily Gain': ts['max_daily_pnl'],
        'Max Daily Loss': ts['min_daily_pnl'],
        'Total Trades': ts['total_trades'],
        'Total Volume': ts['total_volume'],
        'Trading Days': ts['trading_days'],
    })

# Create DataFrame for easier analysis
perf_df = pd.DataFrame(account_performance, columns=perf_columns)

if perf_df.empty:
    # nlargest/idxmax below cannot run on an empty table
    print("⚠️  No accounts with trading statistics; nothing to rank.")
else:
    # Display top performers
    print("🏆 Top Performers by Total P&L:")
    top_performers = perf_df.nlargest(5, 'Total P&L')[['Account ID', 'Total P&L', 'Win Rate (%)', 'Trading Days']]
    print(top_performers.to_string(index=False))

    print("\n📊 Performance Summary:")
    print(f"Best performing account: {perf_df.loc[perf_df['Total P&L'].idxmax(), 'Account ID']} (${perf_df['Total P&L'].max():,.2f})")
    print(f"Worst performing account: {perf_df.loc[perf_df['Total P&L'].idxmin(), 'Account ID']} (${perf_df['Total P&L'].min():,.2f})")
    # Series.mean() skips NaN, so accounts without a defined win rate are ignored
    print(f"Average win rate: {perf_df['Win Rate (%)'].mean():.1f}%")
    print(f"Best single day: ${perf_df['Max Daily Gain'].max():,.2f}")
    print(f"Worst single day: ${perf_df['Max Daily Loss'].min():,.2f}")

In [None]:
# 📊 Performance Visualization
# Draws a 2x2 dashboard from perf_df: P&L bars, win-rate histogram,
# volume-vs-P&L scatter and a max-loss-vs-max-gain scatter.
print("📊 Creating Performance Visualizations...")

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Trading Performance Analysis', fontsize=16, fontweight='bold')

# 1. Total P&L by Account (green for a positive total, red otherwise)
ax1 = axes[0, 0]
colors = ['green' if x > 0 else 'red' for x in perf_df['Total P&L']]
bars = ax1.bar(range(len(perf_df)), perf_df['Total P&L'], color=colors, alpha=0.7)
ax1.set_xlabel('Account Index')
ax1.set_ylabel('Total P&L ($)')
ax1.set_title('Total P&L by Account')
ax1.axhline(y=0, color='black', linestyle='-', alpha=0.3)
ax1.grid(True, alpha=0.3)

# Add value labels on bars. The padding is 1% of the largest absolute P&L so
# it is never negative, and labels of negative bars are placed below the bar
# tip instead of overlapping the bar.
label_pad = perf_df['Total P&L'].abs().max() * 0.01
for bar, value in zip(bars, perf_df['Total P&L']):
    label_above = value >= 0
    ax1.text(bar.get_x() + bar.get_width() / 2,
             bar.get_height() + (label_pad if label_above else -label_pad),
             f'${value:,.0f}', ha='center',
             va='bottom' if label_above else 'top', fontsize=8, rotation=45)

# 2. Win Rate Distribution
ax2 = axes[0, 1]
ax2.hist(perf_df['Win Rate (%)'], bins=10, alpha=0.7, color='blue', edgecolor='black')
ax2.axvline(x=50, color='red', linestyle='--', label='Break-even (50%)')
ax2.set_xlabel('Win Rate (%)')
ax2.set_ylabel('Number of Accounts')
ax2.set_title('Win Rate Distribution')
ax2.legend()
ax2.grid(True, alpha=0.3)

# 3. Trading Volume vs P&L (color = win rate, marker size = trading days)
ax3 = axes[1, 0]
scatter = ax3.scatter(perf_df['Total Volume'], perf_df['Total P&L'],
                      c=perf_df['Win Rate (%)'], cmap='RdYlGn',
                      s=perf_df['Trading Days'] * 2, alpha=0.7)
ax3.set_xlabel('Total Volume (Shares)')
ax3.set_ylabel('Total P&L ($)')
ax3.set_title('Volume vs P&L (Size = Trading Days)')
ax3.grid(True, alpha=0.3)
plt.colorbar(scatter, ax=ax3, label='Win Rate (%)')

# 4. Risk-Return Profile (Max Loss vs Max Gain); losses are plotted as
# magnitudes so both axes are positive
ax4 = axes[1, 1]
ax4.scatter(abs(perf_df['Max Daily Loss']), perf_df['Max Daily Gain'],
            c=perf_df['Total P&L'], cmap='RdYlGn', s=100, alpha=0.7)
ax4.set_xlabel('Max Daily Loss ($)')
ax4.set_ylabel('Max Daily Gain ($)')
ax4.set_title('Risk-Return Profile')
ax4.grid(True, alpha=0.3)

# Add diagonal line (equal risk-reward), spanning the larger of the two axes
max_val = max(ax4.get_xlim()[1], ax4.get_ylim()[1])
ax4.plot([0, max_val], [0, max_val], 'r--', alpha=0.5, label='Equal Risk-Reward')
ax4.legend()

plt.tight_layout()
plt.show()

In [None]:
# 🔬 Deep Dive: Data Quality Analysis
print("🔬 Deep Dive: Data Quality Analysis")
print("=" * 50)

# Check data quality for each account
print("🔍 Data Quality by Account:")
for acc_id, acc_stats in fast_stats['by_account'].items():
    print(f"\n📋 Account {acc_id}:")
    print(f"  Summary records: {acc_stats['summary_records']}")
    print(f"  Fills records: {acc_stats['fills_records']}")

    if acc_stats.get('date_range'):
        dr = acc_stats['date_range']
        print(f"  Date range: {dr['start']} to {dr['end']} ({dr['days']} days)")

    # Check for fast validation results
    if acc_stats.get('fast_validation'):
        fv = acc_stats['fast_validation']
        if fv.get('discrepancies'):
            print(f"  🔍 Aggregate Validation:")
            for metric, disc in fv['discrepancies'].items():
                diff_pct = disc['diff_pct']
                if not np.isfinite(diff_pct):
                    # An infinite/NaN difference is not a match; report it
                    # instead of falling through to "Good match".
                    # NOTE(review): presumably produced when one side is zero
                    # — confirm in data_validator.
                    print(f"    ⚠️  {metric}: difference undefined ({diff_pct})")
                elif abs(diff_pct) > 0.02:  # > 2% difference
                    # abs() keeps the check valid should diff_pct be signed
                    print(f"    ⚠️  {metric}: {diff_pct:.1%} difference")
                else:
                    print(f"    ✅ {metric}: Good match")

# Data coverage analysis
print(f"\n📅 Data Coverage Analysis:")
if summary_data.empty:
    # min()/max() of an empty column cannot define a date range
    print("No summary data available; coverage cannot be computed.")
else:
    first_day = summary_data['date'].min()
    last_day = summary_data['date'].max()
    total_expected_days = (last_day - first_day).days + 1
    weekdays_expected = pd.bdate_range(first_day, last_day)
    # summary_data holds the rows of every account, so coverage is measured
    # on distinct dates rather than on the raw row count.
    covered_days = summary_data['date'].nunique()
    print(f"Total period: {total_expected_days} days")
    print(f"Trading days expected: {len(weekdays_expected)} days")
    print(f"Actual summary records: {len(summary_data)} records on {covered_days} distinct days")
    if len(weekdays_expected) > 0:
        coverage_pct = covered_days / len(weekdays_expected) * 100
        print(f"Coverage rate: {coverage_pct:.1f}%")
    else:
        print("Coverage rate: N/A (no business days in range)")

In [None]:
# 🐌 Full Validation (Day-by-Day Analysis)
print("🐌 Running Full Validation (Day-by-Day)...")
print("=" * 50)
print("⏳ This may take a moment...")

# Day-by-day validation; the cell reads the 'stats' entry of the result
full_validation = validate_data_full()
full_stats = full_validation['stats']

print(f"✅ Full validation complete!")
print(f"📊 Results:")
print(f"  Failed validation: {full_stats['failed_validation']}")
print(f"  Total discrepancy days: {full_stats['total_discrepancy_days']}")

# Report every account whose slow validation recorded daily discrepancies
accounts_with_daily_issues = 0
for acc_id, acc_stats in full_stats['by_account'].items():
    sv = acc_stats.get('slow_validation')
    if not sv:
        continue
    daily_issues = sv.get('daily_discrepancies')
    if not daily_issues:
        continue

    accounts_with_daily_issues += 1
    day_stats = sv['stats']
    print(f"\n📋 Account {acc_id} - Daily Validation:")
    print(f"  Total compared days: {day_stats['total_compared_days']}")
    print(f"  Discrepancy days: {day_stats['discrepancy_days']}")
    print(f"  Discrepancy rate: {day_stats['discrepancy_rate']:.1%}")

    # First three discrepancies as examples, then a count of the rest
    if len(daily_issues) > 0:
        print(f"  Sample discrepancies:")
        for disc in daily_issues[:3]:
            print(f"    {disc['date']}: {', '.join(disc['issues'])}")
        remaining = len(daily_issues) - 3
        if remaining > 0:
            print(f"    ... and {remaining} more")

print(f"\n📈 Summary:")
print(f"Accounts with daily-level discrepancies: {accounts_with_daily_issues}")

In [None]:
# 📊 Time Series Analysis: Daily P&L Trends
print("📊 Time Series Analysis: Daily P&L Trends")
print("=" * 50)

# 2x2 canvas for the daily P&L views
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Daily P&L Analysis', fontsize=16, fontweight='bold')

# Collect (account_id, date, net) rows for every account that has summary data
daily_pnl_data = []
for acc_id in accounts['account_id']:
    acc_summary = db.get_summary_data(account_id=acc_id)
    if acc_summary.empty:
        continue
    acc_summary['account_id'] = acc_id
    daily_pnl_data.append(acc_summary[['account_id', 'date', 'net']])

if daily_pnl_data:
    all_daily_pnl = pd.concat(daily_pnl_data, ignore_index=True)

    # 1. Histogram of daily P&L across all accounts
    ax1 = axes[0, 0]
    ax1.hist(all_daily_pnl['net'], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
    ax1.axvline(x=0, color='red', linestyle='--', label='Break-even')
    ax1.set_xlabel('Daily P&L ($)')
    ax1.set_ylabel('Frequency')
    ax1.set_title('Daily P&L Distribution (All Accounts)')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # 2. Running total of the portfolio's net P&L, summed per date
    ax2 = axes[0, 1]
    net_by_date = all_daily_pnl.groupby('date')['net'].sum()
    portfolio_daily = net_by_date.reset_index()
    portfolio_daily['cumulative_pnl'] = portfolio_daily['net'].cumsum()
    ax2.plot(portfolio_daily['date'], portfolio_daily['cumulative_pnl'], linewidth=2, color='green')
    ax2.set_xlabel('Date')
    ax2.set_ylabel('Cumulative P&L ($)')
    ax2.set_title('Portfolio Cumulative P&L')
    ax2.grid(True, alpha=0.3)
    ax2.tick_params(axis='x', rotation=45)

    # 3. Net P&L per account and calendar month
    ax3 = axes[1, 0]
    all_daily_pnl['year_month'] = all_daily_pnl['date'].dt.to_period('M')
    net_by_account_month = all_daily_pnl.groupby(['account_id', 'year_month'])['net'].sum()
    monthly_pnl = net_by_account_month.reset_index()

    # Accounts as rows, months as columns; colors centered on zero
    if len(monthly_pnl) > 0:
        heatmap_data = monthly_pnl.pivot(index='account_id', columns='year_month', values='net')
        sns.heatmap(heatmap_data, annot=False, cmap='RdYlGn', center=0, ax=ax3)
        ax3.set_title('Monthly P&L by Account')
        ax3.set_xlabel('Month')
        ax3.set_ylabel('Account ID')

    # 4. 20-row rolling standard deviation of the portfolio's daily net P&L
    ax4 = axes[1, 1]
    rolling_window = portfolio_daily['net'].rolling(window=20)
    portfolio_daily['rolling_std'] = rolling_window.std()
    ax4.plot(portfolio_daily['date'], portfolio_daily['rolling_std'], color='orange', linewidth=2)
    ax4.set_xlabel('Date')
    ax4.set_ylabel('20-Day Rolling Std ($)')
    ax4.set_title('Portfolio Volatility (Rolling Std)')
    ax4.grid(True, alpha=0.3)
    ax4.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# 📋 Final Validation Report
# Consolidates the fast and full validation results, the table sizes and the
# per-account performance table into one printed report.
print("📋 Final Validation Report")
print("=" * 80)

# Combine all validation results
validation_summary = {
    'timestamp': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'total_accounts': fast_stats['total_accounts'],
    'accounts_with_data': fast_stats['accounts_with_data'],
    'total_summary_records': len(summary_data),
    'total_fills_records': len(fills_data),
    'data_date_range': {
        'start': summary_data['date'].min().strftime('%Y-%m-%d') if not summary_data.empty else 'N/A',
        'end': summary_data['date'].max().strftime('%Y-%m-%d') if not summary_data.empty else 'N/A'
    },
    'portfolio_performance': fast_stats.get('portfolio', {}),
    'validation_status': {
        'fast_validation_passed': fast_stats['failed_validation'] == 0,
        'full_validation_passed': full_stats['failed_validation'] == 0,
        'total_warnings': len(fast_validation['warnings']) + len(full_validation['warnings'])
    }
}

print("🎯 VALIDATION SUMMARY")
print("=" * 50)
print(f"✅ Validation completed at: {validation_summary['timestamp']}")
print(f"📊 Data Coverage:")
print(f"  - Accounts: {validation_summary['total_accounts']}")
print(f"  - Summary records: {validation_summary['total_summary_records']:,}")
print(f"  - Fills records: {validation_summary['total_fills_records']:,}")
print(f"  - Date range: {validation_summary['data_date_range']['start']} to {validation_summary['data_date_range']['end']}")

print(f"\n💰 Portfolio Performance:")
if validation_summary['portfolio_performance']:
    pp = validation_summary['portfolio_performance']
    print(f"  - Total P&L: ${pp['total_pnl']:,.2f}")
    print(f"  - Total Volume: {pp['total_volume']:,.0f} shares")
    print(f"  - Total Trades: {pp['total_trades']:,}")
    # Guard the denominator: no profitable or losing days must not raise
    # ZeroDivisionError in the middle of the report.
    decided_days = pp['total_profitable_days'] + pp['total_losing_days']
    if decided_days > 0:
        win_rate = pp['total_profitable_days'] / decided_days * 100
        print(f"  - Win Rate: {win_rate:.1f}%")
    else:
        print("  - Win Rate: N/A (no profitable or losing days)")

print(f"\n🔍 Validation Status:")
vs = validation_summary['validation_status']
print(f"  - Fast validation: {'✅ PASSED' if vs['fast_validation_passed'] else '❌ FAILED'}")
print(f"  - Full validation: {'✅ PASSED' if vs['full_validation_passed'] else '❌ FAILED'}")
print(f"  - Total warnings: {vs['total_warnings']}")

print(f"\n🏆 Top Performing Accounts:")
if not perf_df.empty:
    top_3 = perf_df.nlargest(3, 'Total P&L')
    for i, (_, account) in enumerate(top_3.iterrows(), 1):
        print(f"  {i}. Account {account['Account ID']}: ${account['Total P&L']:,.2f} ({account['Win Rate (%)']:.1f}% win rate)")

print(f"\n⚠️  Accounts Needing Attention:")
if not perf_df.empty:
    problem_accounts = perf_df[perf_df['Total P&L'] < 0]
    if len(problem_accounts) > 0:
        for _, account in problem_accounts.iterrows():
            print(f"  - Account {account['Account ID']}: ${account['Total P&L']:,.2f} loss")
    else:
        print("  - No accounts with losses! 🎉")

print("\n" + "=" * 80)
# The closing banner reflects the actual outcome instead of always claiming
# success.
if vs['fast_validation_passed'] and vs['full_validation_passed']:
    print("✅ Data validation complete! All systems operational.")
    print("📈 Ready for risk analysis and model development.")
else:
    print("❌ Data validation complete, but one or more validations FAILED.")
    print("🔧 Resolve the failures above before risk analysis and model development.")

In [ ]:
# 📊 Visual Analysis: Trading Activity Patterns
# Part 1 plots monthly activity and a weekday/week heatmap for accounts 3957
# and 3978; part 2 prints a fills-completeness table for every account.
print("📊 Visual Analysis: Trading Activity Patterns")
print("=" * 50)

# Weekday names indexed by pandas' dayofweek value (Monday=0 ... Sunday=6)
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# Create visualizations for accounts with missing fills data
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Trading Activity Analysis for Accounts 3957 and 3978', fontsize=16, fontweight='bold')

for i, acc_id in enumerate(['3957', '3978']):
    # Get account data
    acc_summary = db.get_summary_data(account_id=acc_id)

    if not acc_summary.empty:
        # Calculate monthly activity. Active days are aggregated in the same
        # groupby as the other columns: assigning a separately grouped Series
        # (indexed by period) onto the RangeIndex frame would align on the
        # index and yield NaN for every month.
        acc_summary['year_month'] = acc_summary['date'].dt.to_period('M')
        acc_summary['is_active'] = acc_summary['fills'] > 0
        monthly_activity = acc_summary.groupby('year_month').agg({
            'fills': 'sum',
            'net': 'sum',
            'date': 'count',
            'is_active': 'sum'
        }).reset_index()
        monthly_activity.columns = ['year_month', 'fills', 'net_pnl', 'days', 'active_days']

        # Plot 1 & 2: Monthly fills activity
        ax = axes[0, i]
        x = range(len(monthly_activity))
        ax.bar(x, monthly_activity['fills'], alpha=0.7, color='blue', label='Total Fills')
        ax.set_xlabel('Month')
        ax.set_ylabel('Number of Fills')
        ax.set_title(f'Account {acc_id}: Monthly Trading Activity')
        ax.set_xticks(x[::3])  # Show every 3rd month
        ax.set_xticklabels(monthly_activity['year_month'].astype(str).iloc[::3], rotation=45)
        ax.grid(True, alpha=0.3)

        # Add activity rate (share of recorded days with fills) on secondary axis
        ax2 = ax.twinx()
        activity_rate = (monthly_activity['active_days'] / monthly_activity['days'] * 100)
        ax2.plot(x, activity_rate, color='red', marker='o', linestyle='-', linewidth=2, label='Activity Rate %')
        ax2.set_ylabel('Activity Rate (%)', color='red')
        ax2.tick_params(axis='y', labelcolor='red')

        # Plot 3 & 4: Trading intensity heatmap
        ax = axes[1, i]

        # Create a calendar-like view of trading activity
        acc_summary['day_of_week'] = acc_summary['date'].dt.dayofweek
        acc_summary['week_of_year'] = acc_summary['date'].dt.isocalendar().week
        acc_summary['year'] = acc_summary['date'].dt.year

        # Focus on most recent year
        recent_year = acc_summary['year'].max()
        recent_data = acc_summary[acc_summary['year'] == recent_year].copy()

        # Create pivot table for heatmap
        if len(recent_data) > 0:
            heatmap_data = recent_data.pivot_table(
                values='fills',
                index='day_of_week',
                columns='week_of_year',
                aggfunc='sum',
                fill_value=0
            )

            # Label only the weekdays that actually occur: the pivot has one
            # row per weekday present in the data (often just Mon-Fri), so a
            # fixed list of seven labels would not match the rows.
            weekday_labels = [day_names[int(day)] for day in heatmap_data.index]

            # Plot heatmap
            sns.heatmap(heatmap_data, cmap='YlOrRd', ax=ax, cbar_kws={'label': 'Fills'},
                        yticklabels=weekday_labels)
            ax.set_title(f'Account {acc_id}: {recent_year} Trading Intensity')
            ax.set_xlabel('Week of Year')
            ax.set_ylabel('Day of Week')

plt.tight_layout()
plt.show()

# Additional analysis: Create a summary table
print("\n📊 Summary Table: Fills Data Completeness")
print("-" * 80)

summary_table = []
for acc_id in accounts['account_id']:
    acc_summary = db.get_summary_data(account_id=acc_id)
    acc_fills = db.get_fills_data(account_id=acc_id)

    if not acc_summary.empty:
        # Fills reported by the daily summary vs. rows present in the fills table
        expected_fills = acc_summary['fills'].sum()
        actual_fills = len(acc_fills)
        capture_rate = actual_fills / expected_fills * 100 if expected_fills > 0 else 0

        # Get trading activity info
        trading_days = (acc_summary['fills'] > 0).sum()
        total_days = len(acc_summary)
        activity_rate = trading_days / total_days * 100

        # Last trading info
        last_trade_mask = acc_summary['fills'] > 0
        if last_trade_mask.any():
            last_trade_date = acc_summary.loc[last_trade_mask, 'date'].max()
            days_inactive = (pd.Timestamp.now() - last_trade_date).days
        else:
            last_trade_date = 'Never'
            days_inactive = 'N/A'

        summary_table.append({
            'Account': acc_id,
            'Expected Fills': expected_fills,
            'Actual Fills': actual_fills,
            'Capture Rate (%)': round(capture_rate, 1),
            'Trading Days': trading_days,
            'Total Days': total_days,
            'Activity Rate (%)': round(activity_rate, 1),
            'Last Trade': last_trade_date,
            'Days Inactive': days_inactive
        })

# Convert to DataFrame and display
summary_df = pd.DataFrame(summary_table)

if summary_df.empty:
    # Without rows the frame has no columns, so sorting/filtering would raise
    print("No accounts with summary data found.")
else:
    summary_df = summary_df.sort_values('Capture Rate (%)')

    print(summary_df.to_string(index=False))

    # Highlight accounts with issues (less than 90% of expected fills present)
    print("\n⚠️  Accounts Requiring Attention:")
    problem_accounts = summary_df[summary_df['Capture Rate (%)'] < 90]
    if len(problem_accounts) > 0:
        for _, acc in problem_accounts.iterrows():
            print(f"  - Account {acc['Account']}: Only {acc['Capture Rate (%)']}% fills captured")
            inactive_for = acc['Days Inactive']
            # 'N/A' marks accounts that never traded; NumPy scalars are
            # accepted alongside plain int/float
            if isinstance(inactive_for, (int, float, np.integer, np.floating)) and inactive_for > 30:
                print(f"    → Inactive for {inactive_for} days")
    else:
        print("  ✅ All accounts have good fill data capture rates!")

In [ ]:
# 🔍 Special Analysis: Accounts with Missing Fills Data
# Part 1 flags accounts whose summary fill counts disagree with the fills
# table; part 2 prints a detailed breakdown for accounts 3957 and 3978.
print("🔍 Special Analysis: Accounts with Missing Fills Data")
print("=" * 50)

# Analyze accounts that have summary data but missing or incomplete fills data
accounts_with_issues = []

for acc_id in accounts['account_id']:
    acc_summary = db.get_summary_data(account_id=acc_id)
    acc_fills = db.get_fills_data(account_id=acc_id)

    if acc_summary.empty:
        continue

    # Calculate expected vs actual fills
    summary_fills_count = acc_summary['fills'].sum()
    actual_fills_count = len(acc_fills)

    # Classify the discrepancy, if any
    if summary_fills_count > 0 and actual_fills_count == 0:
        # Account has fills in summary but no fills data
        issue_type = 'NO_FILLS_DATA'
    elif abs(summary_fills_count - actual_fills_count) > summary_fills_count * 0.1:  # >10% difference
        issue_type = 'FILLS_MISMATCH'
    else:
        continue

    accounts_with_issues.append({
        'account_id': acc_id,
        'issue': issue_type,
        'summary_fills': summary_fills_count,
        'actual_fills': actual_fills_count,
        'trading_days': (acc_summary['fills'] > 0).sum(),
        'date_range': f"{acc_summary['date'].min()} to {acc_summary['date'].max()}"
    })

# Display findings
if accounts_with_issues:
    print(f"⚠️  Found {len(accounts_with_issues)} accounts with fills data issues:\n")

    for issue in accounts_with_issues:
        print(f"📋 Account {issue['account_id']}:")
        print(f"   Issue: {issue['issue']}")
        print(f"   Summary shows: {issue['summary_fills']:,} fills")
        print(f"   Database has: {issue['actual_fills']:,} fills")
        print(f"   Trading days: {issue['trading_days']}")
        print(f"   Date range: {issue['date_range']}")
        print()
else:
    print("✅ No accounts with fills data issues found.")

# Deep dive into accounts 3957 and 3978
print("\n📊 Deep Analysis: Accounts 3957 and 3978")
print("-" * 50)

for acc_id in ['3957', '3978']:
    print(f"\n🔎 Account {acc_id}:")

    # Get summary data
    acc_summary = db.get_summary_data(account_id=acc_id)
    acc_fills = db.get_fills_data(account_id=acc_id)

    if not acc_summary.empty:
        # Analyze trading patterns
        trading_days = acc_summary[acc_summary['fills'] > 0]
        non_trading_days = acc_summary[acc_summary['fills'] == 0]

        print(f"  Total days: {len(acc_summary)}")
        print(f"  Trading days: {len(trading_days)} ({len(trading_days)/len(acc_summary)*100:.1f}%)")
        print(f"  Non-trading days: {len(non_trading_days)} ({len(non_trading_days)/len(acc_summary)*100:.1f}%)")

        # Find periods of inactivity: maximal runs of consecutive summary rows
        # with zero fills. Rows are walked in date order so a run is always
        # chronological; 'days' counts summary rows, not calendar days.
        inactive_periods = []
        current_period = None

        for _, row in acc_summary.sort_values('date').iterrows():
            if row['fills'] == 0:
                if current_period is None:
                    current_period = {'start': row['date'], 'end': row['date'], 'days': 0}
                current_period['end'] = row['date']
                current_period['days'] += 1
            elif current_period is not None:
                # A trading day closes the open inactive run
                inactive_periods.append(current_period)
                current_period = None

        # Don't forget the last period if it ends with inactivity
        if current_period is not None:
            inactive_periods.append(current_period)

        # Sort by duration
        inactive_periods.sort(key=lambda x: x['days'], reverse=True)

        print(f"\n  🚫 Longest inactive periods:")
        for i, period in enumerate(inactive_periods[:5]):  # Top 5 longest
            print(f"    {i+1}. {period['start']} to {period['end']} ({period['days']} days)")

        # Check if fills data exists for active trading periods
        if len(trading_days) > 0:
            sample_dates = trading_days.head(10)
            print(f"\n  📅 Checking fills data for sample trading days:")

            for _, day in sample_dates.iterrows():
                # Fills whose timestamp falls on the sampled calendar date
                day_fills = acc_fills[acc_fills['datetime'].dt.date == day['date'].date()] if not acc_fills.empty else pd.DataFrame()
                expected = int(day['fills'])
                actual = len(day_fills)
                status = "✅" if actual > 0 else "❌"
                print(f"    {day['date'].strftime('%Y-%m-%d')}: Expected {expected} fills, Found {actual} {status}")

        # Summary statistics
        total_expected_fills = acc_summary['fills'].sum()
        print(f"\n  📈 Summary Statistics:")
        print(f"    Total expected fills: {total_expected_fills:,}")
        print(f"    Total actual fills: {len(acc_fills):,}")
        # Keep the label in both branches; the rate is undefined without
        # expected fills
        if total_expected_fills > 0:
            print(f"    Fill capture rate: {len(acc_fills)/total_expected_fills*100:.1f}%")
        else:
            print("    Fill capture rate: N/A")
        print(f"    Last trading day: {trading_days['date'].max() if len(trading_days) > 0 else 'Never'}")
        print(f"    Days since last trade: {(pd.Timestamp.now() - trading_days['date'].max()).days if len(trading_days) > 0 else 'N/A'}")

print("\n💡 Insights:")
print("- Accounts may have summary data without fills during inactive periods")
print("- Fill data might be missing for certain date ranges")
print("- Some accounts may have stopped trading but still have daily summaries")
print("- Data download issues may have caused incomplete fills data")