# Risk Management MVP - Sanity Check Notebook

This notebook performs comprehensive sanity checks on all essential components of the risk management system.

## Checks Performed:
1. **Environment & Dependencies**
2. **Database Connectivity & Integrity**
3. **Data Downloader Functionality**
4. **Feature Engineering Pipeline**
5. **Model Training & Loading**
6. **Prediction Generation**
7. **Email Service Configuration**
8. **Common Issues & Edge Cases**

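Run this notebook from the `notebooks/` directory (all paths are resolved relative to the parent of the current working directory) whenever you make changes, to ensure everything is working correctly.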

In [None]:
# 1. ENVIRONMENT & DEPENDENCIES CHECK
# Verifies the interpreter version and that every module the pipeline relies on
# can actually be imported in this kernel.
print("🔍 CHECKING ENVIRONMENT & DEPENDENCIES...\n")

import importlib
import sys
import os
from pathlib import Path
import warnings

# Keep the notebook output readable; library warnings are not part of this check.
warnings.filterwarnings("ignore")


# Check Python version
print(f"✓ Python version: {sys.version.split()[0]}")
assert sys.version_info >= (3, 9), "Python 3.9+ required"

# Check required packages.
# Maps the importable module name to the distribution name to install when the
# module is missing. Standard-library modules map to None: they ship with
# Python and cannot be installed separately.
required_packages = {
    "pandas": "pandas",
    "numpy": "numpy",
    "sklearn": "scikit-learn",
    "lightgbm": "lightgbm",
    "yaml": "pyyaml",
    "dotenv": "python-dotenv",
    "smtplib": None,
    "sqlite3": None,
}

missing_packages = []
for package, install_name in required_packages.items():
    try:
        # Import by the real module name. Importing the conventional alias
        # ("pd", "np", "lgb") fails because no module has that name.
        importlib.import_module(package)
        print(f"✓ {package} is installed")
    except ImportError:
        print(f"✗ {package} is NOT installed")
        missing_packages.append(package)

if missing_packages:
    print(f"\n⚠️  Missing packages: {', '.join(missing_packages)}")

    # Only third-party distributions can be installed; report stdlib gaps separately.
    installable = [
        required_packages[name] for name in missing_packages if required_packages[name]
    ]
    stdlib_missing = [
        name for name in missing_packages if required_packages[name] is None
    ]
    if installable:
        print("Run: conda install " + " ".join(installable))
    if stdlib_missing:
        print(
            "Standard-library modules unavailable (check your Python installation): "
            + ", ".join(stdlib_missing)
        )
else:
    print("\n✅ All required packages are installed")

In [None]:
# Check directory structure
# Each expected folder is resolved against the parent of the current working
# directory; folders that are absent are reported and then created.
print("\n🔍 CHECKING DIRECTORY STRUCTURE...\n")

required_dirs = ["data", "data/models", "logs", "config", "src", "scripts", "notebooks"]
project_root = Path.cwd().parent
missing_dirs = []

# First pass: report the state of every folder and remember the absent ones
for rel_dir in required_dirs:
    if not (project_root / rel_dir).exists():
        print(f"✗ {rel_dir}/ is MISSING")
        missing_dirs.append(rel_dir)
        continue
    print(f"✓ {rel_dir}/ exists")

# Second pass: create whatever was absent (no-op when nothing is missing)
if missing_dirs:
    print("\n⚠️  Creating missing directories...")
for rel_dir in missing_dirs:
    (project_root / rel_dir).mkdir(parents=True, exist_ok=True)
    print(f"  Created {rel_dir}/")

print("\n✅ Directory structure is complete")

In [None]:
# Check configuration files
# Confirms the expected configuration files exist next to the notebooks folder
# and that the .env file defines every variable the pipeline reads.
print("\n🔍 CHECKING CONFIGURATION FILES...\n")

project_root = Path.cwd().parent

config_files = {
    "config/traders.yaml": "Trader configuration",
    "config/config.yaml": "Main configuration",
    ".env": "Environment variables",
}

missing_configs = []
for file_path, description in config_files.items():
    if (project_root / file_path).exists():
        print(f"✓ {file_path} ({description})")
    else:
        print(f"✗ {file_path} ({description}) is MISSING")
        missing_configs.append(file_path)

# Summarise the missing files so they are not lost in the per-file listing
if missing_configs:
    print(f"\n⚠️  Missing configuration files: {', '.join(missing_configs)}")

# Check .env variables
env_path = project_root / ".env"
if env_path.exists():
    from dotenv import load_dotenv

    # Load exactly the file whose existence was just verified rather than
    # relying on python-dotenv's automatic search.
    load_dotenv(dotenv_path=env_path)

    required_env_vars = ["API_TOKEN", "EMAIL_FROM", "EMAIL_PASSWORD", "EMAIL_TO"]
    missing_env_vars = []

    print("\n📋 Checking environment variables:")
    for var in required_env_vars:
        # An empty value is treated the same as an unset variable
        if os.getenv(var):
            print(f"✓ {var} is set")
        else:
            print(f"✗ {var} is NOT set")
            missing_env_vars.append(var)

    if missing_env_vars:
        print(f"\n⚠️  Missing environment variables: {', '.join(missing_env_vars)}")
        print("Update your .env file with the missing variables")
else:
    print("\n⚠️  .env file not found. Copy .env.template to .env and configure it.")

In [None]:
# 2. DATABASE CONNECTIVITY & INTEGRITY CHECK
# Opens the project database, prints the statistics it reports and runs two
# read queries. Any exception raised along the way is reported, not propagated.
print("\n🔍 CHECKING DATABASE...\n")

try:
    from src.database import Database

    db = Database()
    print("✓ Database module imported successfully")

    # Dump every statistic the database reports about itself
    stats = db.get_database_stats()
    print("\n📊 Database Statistics:")
    for stat_name, stat_value in stats.items():
        print(f"  {stat_name}: {stat_value}")

    # An absent or zero trader count means nothing has been loaded yet
    if stats.get("traders_count", 0) != 0:
        print(f"\n✅ Database is populated with {stats['traders_count']} traders")
    else:
        print(
            "\n⚠️  Database is empty. Run 'python scripts/setup_database.py' to populate it."
        )

    # Exercise the read queries
    print("\n🔍 Testing database queries...")
    traders_df = db.get_all_traders()
    trader_rows = len(traders_df)
    print(f"✓ get_all_traders() returned {trader_rows} traders")

    if trader_rows > 0:
        # Fetch the data of the first listed trader as a smoke test
        test_trader = traders_df.iloc[0]
        account_id = str(test_trader["account_id"])
        totals_df, fills_df = db.get_trader_data(account_id)
        print(
            f"✓ get_trader_data() returned {len(totals_df)} totals, "
            f"{len(fills_df)} fills for {test_trader['trader_name']}"
        )

except Exception as err:
    print(f"\n❌ Database check failed: {err!s}")
    print("Check if database file exists at data/trading.db")

In [None]:
# 3. DATA DOWNLOADER CHECK
# Confirms the downloader can be constructed and sees the configured traders.
# No data is downloaded: the actual API call below stays commented out.
print("\n🔍 CHECKING DATA DOWNLOADER...\n")

try:
    from src.data_downloader import DataDownloader

    # The downloader is only constructed when an API token is present
    if os.getenv("API_TOKEN"):
        downloader = DataDownloader()
        print("✓ DataDownloader initialized successfully")
        configured_count = len(downloader.traders)
        print(f"✓ Found {configured_count} traders in config")

        print("\n🔍 Testing API connectivity...")
        from datetime import date, timedelta

        if configured_count > 0:
            # Dry run: pick the first trader and yesterday's date, but do not
            # call the API unless the line below is uncommented.
            test_trader = downloader.traders[0]
            test_date = date.today() - timedelta(days=1)

            print(f"Testing download for {test_trader['name']} on {test_date}...")
            # success = downloader.download_totals(str(test_trader['account_id']), test_date, test_date)
            print("✓ API connection test skipped (uncomment to test actual download)")
    else:
        print("⚠️  API_TOKEN not set. Data download will fail.")

except Exception as err:
    print(f"\n❌ Data downloader check failed: {err!s}")
    print("Check your API_TOKEN in .env file")

In [None]:
# 4. FEATURE ENGINEERING CHECK
# Instantiates the feature engineer, summarises the features it declares and,
# when trader data is available from the database check, builds features for
# the first trader and looks for NaN values.
print("\n🔍 CHECKING FEATURE ENGINEERING...\n")

try:
    from src.feature_engineer import FeatureEngineer

    feature_engineer = FeatureEngineer()
    print("✓ FeatureEngineer initialized successfully")

    # Get feature list
    feature_columns = feature_engineer.get_feature_columns()
    print(f"✓ Total features defined: {len(feature_columns)}")

    # Check feature categories (classified purely by column name)
    print("\n📋 Feature Categories:")
    basic_features = [
        f
        for f in feature_columns
        if f in ["orders_count", "fills_count", "quantity", "gross_pnl", "net_pnl"]
    ]
    # Substring match on the window label, e.g. a name containing "5d"
    rolling_features = [
        f
        for f in feature_columns
        if any(window in f for window in ["3d", "5d", "10d", "20d"])
    ]
    behavioral_features = [
        f
        for f in feature_columns
        if f in ["trading_hours", "symbols_traded", "avg_trade_size", "trade_frequency"]
    ]

    print(f"  Basic metrics: {len(basic_features)}")
    print(f"  Rolling statistics: {len(rolling_features)}")
    print(f"  Behavioral features: {len(behavioral_features)}")

    # Test feature creation with sample data.
    # Both names come from the database check. Either can be missing if that
    # cell was skipped or failed part-way, so test for both before using them;
    # otherwise a database problem is reported as a feature engineering failure.
    if "db" in locals() and "traders_df" in locals() and len(traders_df) > 0:
        print("\n🔍 Testing feature creation...")
        test_trader = traders_df.iloc[0]
        account_id = str(test_trader["account_id"])

        totals_df, fills_df = db.get_trader_data(account_id)
        if len(totals_df) >= 30:  # Need minimum data for features
            features_df = feature_engineer.create_features(totals_df, fills_df)
            print(
                f"✓ Created features: {features_df.shape[0]} rows x {features_df.shape[1]} columns"
            )

            # Check for NaN values in the declared feature columns
            nan_counts = features_df[feature_columns].isna().sum()
            nan_features = nan_counts[nan_counts > 0]
            if len(nan_features) > 0:
                print(f"\n⚠️  Features with NaN values: {len(nan_features)}")
                print(nan_features.head())
            else:
                print("✓ No NaN values in features")
        else:
            print(
                f"⚠️  Insufficient data for {test_trader['trader_name']} ({len(totals_df)} days)"
            )
    else:
        print(
            "\n⚠️  Skipping feature creation test (no trader data available from the database check)"
        )

except Exception as e:
    print(f"\n❌ Feature engineering check failed: {str(e)}")
    import traceback

    traceback.print_exc()

In [None]:
# 5. MODEL TRAINING & LOADING CHECK
# Instantiates the model trainer, loads any saved models, prints their
# validation metrics and reports the hyperparameter search configuration.
print("\n🔍 CHECKING MODEL TRAINING & LOADING...\n")

try:
    # ParameterGrid is needed to count the grid combinations below; without
    # this import that section raises NameError and the whole check fails.
    from sklearn.model_selection import ParameterGrid

    from src.model_trainer import ModelTrainer

    model_trainer = ModelTrainer()
    print("✓ ModelTrainer initialized successfully")

    # Check models directory
    models_path = Path.cwd().parent / "data" / "models"
    if models_path.exists():
        model_files = list(models_path.glob("model_*.pkl"))
        print(f"✓ Found {len(model_files)} saved models")

        if len(model_files) > 0:
            # Test loading models
            all_models = model_trainer.get_all_models()
            print(f"✓ Successfully loaded {len(all_models)} models")

            # Get model summary
            summary_df = model_trainer.get_model_summary()
            if not summary_df.empty:
                print("\n📊 Model Performance Summary:")
                print(f"  Average RMSE: {summary_df['val_rmse'].mean():.2f}")
                print(f"  Average MAE: {summary_df['val_mae'].mean():.2f}")

                # Show best performing models (lowest validation RMSE first)
                best_models = summary_df.nsmallest(3, "val_rmse")[
                    ["account_id", "val_rmse", "test_rmse"]
                ]
                print("\n🏆 Top 3 Models by RMSE:")
                print(best_models.to_string(index=False))
        else:
            print(
                "\n⚠️  No models found. Run 'python scripts/train_models.py' to train models."
            )
    else:
        print("\n⚠️  Models directory not found.")

    # Check hyperparameter search configuration
    print("\n📋 Hyperparameter Search Configuration:")
    print(
        f"  Parameter grid combinations: {len(list(ParameterGrid(model_trainer.param_grid)))}"
    )
    print(f"  Max trials per model: {model_trainer.max_trials}")
    print(f"  Min training days required: {model_trainer.min_training_days}")

except Exception as e:
    print(f"\n❌ Model check failed: {str(e)}")
    import traceback

    traceback.print_exc()

In [None]:
# 6. PREDICTION GENERATION CHECK
# Runs one live prediction for the first account that has a model, then
# summarises the most recent predictions stored in the database.
print("\n🔍 CHECKING PREDICTION GENERATION...\n")

try:
    from src.predictor import RiskPredictor

    if "model_trainer" in locals() and len(model_trainer.get_all_models()) > 0:
        predictor = RiskPredictor()
        print("✓ RiskPredictor initialized successfully")

        # Test prediction for single trader
        test_models = list(predictor.models.keys())
        if test_models:
            test_account = test_models[0]
            print(f"\n🔍 Testing prediction for account {test_account}...")

            prediction = predictor.predict_trader(test_account)
            if prediction:
                print("✓ Prediction generated successfully:")
                print(f"  Risk Level: {prediction['risk_level']}")
                print(f"  Predicted P&L: ${prediction['predicted_pnl']:.2f}")
                print(f"  Risk Score: {prediction['risk_score']:.2f}")
                print(f"  Recommendation: {prediction['recommendation']}")
            else:
                print("⚠️  Prediction failed (insufficient data?)")

        # Check latest predictions in database.
        # `db` is created by the database check; guard against that cell having
        # been skipped or having failed so the problem is not misreported as a
        # prediction failure.
        if "db" in locals():
            latest_predictions = db.get_latest_predictions()
            if not latest_predictions.empty:
                print(
                    f"\n📊 Latest predictions in database: {len(latest_predictions)}"
                )
                risk_dist = latest_predictions["risk_score"].describe()
                print(
                    f"  Risk score range: {risk_dist['min']:.2f} - {risk_dist['max']:.2f}"
                )
                print(f"  Average risk score: {risk_dist['mean']:.2f}")
        else:
            print("\n⚠️  Database not available; skipping stored predictions check.")
    else:
        print("⚠️  No models available for predictions. Train models first.")

except Exception as e:
    print(f"\n❌ Prediction check failed: {str(e)}")
    import traceback

    traceback.print_exc()

In [None]:
# 7. EMAIL SERVICE CHECK
# Verifies that sender credentials are present and that both report formats can
# be rendered from a small hand-written sample. No email is sent.
print("\n🔍 CHECKING EMAIL SERVICE...\n")

try:
    from src.email_service import EmailService

    email_service = EmailService()

    # Both the sender address and its password must be non-empty
    credentials_ready = email_service.from_email and email_service.password
    if not credentials_ready:
        print("⚠️  Email credentials not configured")
        print("Set EMAIL_FROM, EMAIL_PASSWORD, and EMAIL_TO in .env file")
    else:
        print("✓ Email credentials configured")
        print(f"✓ From: {email_service.from_email}")
        recipients = ", ".join(email_service.to_emails)
        print(f"✓ To: {recipients}")

        # Sample payloads used only to exercise the report templates
        test_predictions = [
            dict(
                trader_name="Test Trader",
                risk_level="High",
                risk_score=0.9,
                predicted_pnl=-1000,
                recent_pnl_5d=-500,
                confidence="High",
                recommendation="Test recommendation",
            )
        ]
        test_summary = dict(
            total_traders=1,
            high_risk_count=1,
            medium_risk_count=0,
            low_risk_count=0,
            total_predicted_pnl=-1000,
            total_recent_pnl=-500,
            models_available=1,
        )

        # Render the HTML variant
        html_report = email_service.create_html_report(test_predictions, test_summary)
        print("✓ HTML report generated successfully")
        html_size = len(html_report)
        print(f"  Report size: {html_size} characters")

        # Render the plain-text variant
        text_report = email_service.create_text_report(test_predictions, test_summary)
        print("✓ Text report generated successfully")
        text_size = len(text_report)
        print(f"  Report size: {text_size} characters")

        print("\n💡 To test actual email sending, run:")
        print("   email_service.send_test_email()")

except Exception as err:
    print(f"\n❌ Email service check failed: {err!s}")
    import traceback

    traceback.print_exc()

In [None]:
# 8. COMMON ISSUES & EDGE CASES CHECK
# Runs four independent checks and records a short label in `issues_found` for
# each one that trips. Every check is skipped cleanly when the objects it needs
# were not created by the earlier cells.
print("\n🔍 CHECKING FOR COMMON ISSUES...\n")

issues_found = []

# `db` and `traders_df` both come from the database check. `traders_df` can be
# missing even when `db` exists (the cell failed part-way), so require both.
have_trader_data = "db" in locals() and "traders_df" in locals()

# Check 1: Traders with insufficient data
if have_trader_data:
    print("📋 Checking trader data sufficiency...")
    insufficient_data = []

    for _, trader in traders_df.iterrows():
        if trader["trading_days"] < 30:
            insufficient_data.append((trader["trader_name"], trader["trading_days"]))

    if insufficient_data:
        print(f"⚠️  {len(insufficient_data)} traders have < 30 days of data:")
        for name, days in insufficient_data[:5]:  # Show first 5
            print(f"    {name}: {days} days")
        issues_found.append("Traders with insufficient data")
    else:
        print("✓ All traders have sufficient data")

# Check 2: Date gaps in data
if have_trader_data and len(traders_df) > 0:
    print("\n📋 Checking for data gaps...")
    import pandas as pd

    # Check first trader for gaps
    test_trader = traders_df.iloc[0]
    totals_df, _ = db.get_trader_data(str(test_trader["account_id"]))

    if not totals_df.empty:
        dates = pd.to_datetime(totals_df["date"]).sort_values()
        # NOTE(review): differences are in calendar days, so a Friday-to-Monday
        # step (3 days) counts as a gap. Confirm whether that is intended.
        date_diffs = dates.diff().dt.days
        gaps = date_diffs[date_diffs > 1]

        if len(gaps) > 0:
            print(f"⚠️  Found {len(gaps)} gaps in data for {test_trader['trader_name']}")
            print(f"    Largest gap: {gaps.max()} days")
            issues_found.append("Data gaps detected")
        else:
            print("✓ No significant data gaps found")

# Check 3: Memory usage
print("\n📋 Checking memory usage...")
try:
    # psutil is not among the packages verified by the environment check, so
    # its absence must not abort the remaining checks.
    import psutil
except ImportError:
    print("⚠️  psutil is not installed; skipping memory check")
else:
    process = psutil.Process(os.getpid())
    memory_mb = process.memory_info().rss / 1024 / 1024  # bytes -> MiB
    print(f"  Current memory usage: {memory_mb:.1f} MB")

    if memory_mb > 500:
        print("⚠️  High memory usage detected")
        issues_found.append("High memory usage")
    else:
        print("✓ Memory usage is reasonable")

# Check 4: Feature engineering edge cases
if "feature_engineer" in locals():
    print("\n📋 Checking feature engineering edge cases...")

    # Imported here as well so this check does not depend on Check 2 having run
    import numpy as np
    import pandas as pd

    # Test with minimal data: five days of totals and no fills
    minimal_totals = pd.DataFrame(
        {
            "date": pd.date_range("2024-01-01", periods=5),
            "net_pnl": [100, -50, 200, -100, 50],
            "gross_pnl": [110, -40, 210, -90, 60],
            "total_fees": [10, 10, 10, 10, 10],
            "orders_count": [5, 3, 7, 4, 2],
            "fills_count": [10, 6, 14, 8, 4],
            "quantity": [1000, 600, 1400, 800, 400],
        }
    )

    try:
        minimal_features = feature_engineer.create_features(
            minimal_totals, pd.DataFrame()
        )
        print("✓ Feature engineering handles minimal data correctly")
    except Exception as e:
        print(f"⚠️  Feature engineering fails with minimal data: {str(e)}")
        issues_found.append("Feature engineering edge case failure")

In [None]:
# FINAL SUMMARY
# Aggregates the outcome of the checks above into a per-component status table
# and suggests the next action.
print("\n" + "=" * 60)
print("🎯 SANITY CHECK SUMMARY")
print("=" * 60 + "\n")

# Earlier cells may have been skipped or may have failed part-way, so the names
# they define are looked up defensively instead of being assumed to exist.
namespace = locals()
db_stats = namespace.get("stats") or {}

if "issues_found" not in namespace:
    # A check that never ran must not count as "no issues".
    issues_found = ["Common issues check did not run"]

try:
    models_available = (
        "model_trainer" in namespace and len(model_trainer.get_all_models()) > 0
    )
except Exception as e:
    print(f"⚠️  Could not load models for the summary: {e}\n")
    models_available = False

# Component status summary
components = {
    # Passes only if the dependency check ran and found nothing missing
    "Environment": "missing_packages" in namespace and not missing_packages,
    "Database": "db" in namespace and (db_stats.get("traders_count") or 0) > 0,
    "Data Downloader": "downloader" in namespace,
    "Feature Engineering": "feature_engineer" in namespace,
    "Model Training": models_available,
    "Predictions": "predictor" in namespace,
    # Same criterion as the email check: sender address and password both set
    "Email Service": "email_service" in namespace
    and bool(email_service.from_email and email_service.password),
}

all_good = all(components.values())

print("Component Status:")
for component, status in components.items():
    icon = "✅" if status else "❌"
    print(f"  {icon} {component}")

if issues_found:
    print(f"\n⚠️  Issues found ({len(issues_found)}):")
    for issue in issues_found:
        print(f"  - {issue}")

# Next steps: only the first blocking problem is shown, in dependency order
print("\n📋 Next Steps:")

if not components["Database"]:
    print("1. Run: python scripts/setup_database.py")
elif not components["Model Training"]:
    print("1. Run: python scripts/train_models.py")
elif not components["Email Service"]:
    print("1. Configure email settings in .env file")
elif all_good and not issues_found:
    print("1. System is ready! Run: python scripts/daily_predict.py")
    print(
        "2. Or run the enhanced pipeline: python scripts/daily_prediction_enhanced.py"
    )
else:
    print("1. Address the issues identified above")
    print("2. Re-run this sanity check")

print("\n" + "=" * 60)
if all_good and not issues_found:
    print("✅ ALL CHECKS PASSED - SYSTEM IS READY!")
else:
    print("⚠️  SOME CHECKS FAILED - SEE ABOVE FOR DETAILS")
print("=" * 60)

In [None]:
# OPTIONAL: Quick system test
# Points the reader at the optional end-to-end cell that follows.
quick_test_hint = (
    "\n💡 Run the cell below to perform a quick end-to-end test (if all checks passed)"
)
print(quick_test_hint)

In [None]:
# OPTIONAL QUICK END-TO-END TEST
# Uncomment and run this cell to test the full pipeline with one trader

# if all_good and not issues_found:
#     print("🚀 Running quick end-to-end test...\n")
#
#     # Select a trader with good data
#     good_traders = traders_df[traders_df['trading_days'] >= 100]
#     if len(good_traders) > 0:
#         test_trader = good_traders.iloc[0]
#         account_id = str(test_trader['account_id'])
#
#         print(f"Testing with {test_trader['trader_name']} ({account_id})")
#
#         # 1. Get data
#         totals_df, fills_df = db.get_trader_data(account_id)
#         print(f"✓ Loaded {len(totals_df)} days of data")
#
#         # 2. Create features
#         features_df = feature_engineer.create_features(totals_df, fills_df)
#         print(f"✓ Created {len(features_df)} feature rows")
#
#         # 3. Check if model exists
#         if account_id in model_trainer.get_all_models():
#             print("✓ Model exists")
#
#             # 4. Generate prediction
#             prediction = predictor.predict_trader(account_id)
#             if prediction:
#                 print(f"✓ Generated prediction: {prediction['risk_level']} risk")
#                 print(f"  Predicted P&L: ${prediction['predicted_pnl']:.2f}")
#                 print(f"  Recommendation: {prediction['recommendation']}")
#             else:
#                 print("✗ Prediction failed")
#         else:
#             print("⚠️  No model for this trader")
#
#         print("\n✅ End-to-end test complete!")
#     else:
#         print("⚠️  No traders with sufficient data for testing")
# else:
#     print("⚠️  Cannot run end-to-end test until all checks pass")