# Time Series Forecasting - Refactored Architecture

This notebook demonstrates the new modular architecture for time series forecasting experiments.

## Features:
- 🔧 Configuration-driven experiments
- 📊 Unified logging and metrics
- 🎨 Interactive visualizations
- 🔄 Rolling window validation
- ⚡ Parallel model execution

In [1]:
# ============================================================================
# Cell 1: Configuration and Imports
# ============================================================================

import sys

from pathlib import Path

# Setup paths
current_dir = Path.cwd()
if "ENEXIS" in str(current_dir):
    while current_dir.name != "ENEXIS" and current_dir.parent != current_dir:
        current_dir = current_dir.parent
    project_root = current_dir
else:
    project_root = current_dir

src_path = project_root / "src"
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

# Import directly from files to avoid __init__.py issues
from config.experiment_config import ExperimentConfig
from core.data_manager import DataManager
from core.logging_manager import ExperimentLogger

# Import specific files instead of modules
from core.experiment import TimeSeriesExperiment
from models.factory import ModelFactory
from evaluation.metrics import MetricsCalculator

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

logger.info("📚 All imports loaded successfully")

from utils.build_training_set import build_training_set

build_training_set(
    train_start="2025-01-01 00:00:00",
    train_end="2025-03-14 23:00:00",
    run_date="2025-03-15 00:00:00"
)

NameError: name 'logging' is not defined

In [None]:
# ============================================================================
# Cell 2: Initialize Components
# ============================================================================

# Option 1: Load from YAML config file (recommended)
# config = ExperimentConfig.from_file("config/experiment_config.yaml")

# Option 2: Use default configuration (for quick testing)
config = ExperimentConfig()

# Initialize core components
data_manager = DataManager(config)
experiment_logger = ExperimentLogger(config.logs_database_path)
model_factory = ModelFactory(config.model_configs)

logger.info("🔧 Components initialized")
logger.info(f"📊 Loaded {len(config.model_configs)} model configurations")
logger.info(f"🎯 Target: {config.target_column}")
logger.info(f"📅 Period: {config.train_start} to {config.forecast_end}")

# Display model information
model_info = model_factory.get_model_info()
print(f"\n🤖 MODEL CONFIGURATION:")
print(f"Total models: {model_info['total_models']}")
print(f"Enabled: {model_info['enabled_models']}")
if model_info['disabled_models']:
    print(f"Disabled: {model_info['disabled_models']}")

In [None]:
# ============================================================================
# Cell 3: Fix Config Dates - Work Around Read-Only Properties
# ============================================================================

# Database is working, let's check actual date range and work with what we can set
import sqlite3
import pandas as pd

with sqlite3.connect(config.database_path) as conn:
    date_query = "SELECT MIN(target_datetime) as min_date, MAX(target_datetime) as max_date FROM master_warp"
    date_info = pd.read_sql(date_query, conn)
    min_date = pd.to_datetime(date_info['min_date'].iloc[0])
    max_date = pd.to_datetime(date_info['max_date'].iloc[0])
    
    print(f"📅 ACTUAL DATA RANGE:")
    print(f"Min date: {min_date}")
    print(f"Max date: {max_date}")
    print(f"Total days: {(max_date - min_date).days}")

# Check what we can actually modify in config
print(f"\n⚙️ CURRENT CONFIG:")
print(f"Train start: {config.train_start}")
print(f"Train end: {config.train_end}")
print(f"Forecast start: {config.forecast_start}")
if hasattr(config, 'forecast_end'):
    print(f"Forecast end: {config.forecast_end}")
print(f"Horizon: {config.horizon}")

# Try to update what we can
try:
    # Use a forecast period that exists in your data (last week of data)
    forecast_start = max_date - pd.Timedelta(days=7)
    forecast_end = max_date
    
    config.train_start = min_date
    config.train_end = forecast_start - pd.Timedelta(hours=1)
    config.forecast_start = forecast_start
    
    # Calculate horizon in hours
    horizon_hours = int((forecast_end - forecast_start).total_seconds() / 3600) + 1
    config.horizon = horizon_hours
    
    print(f"\n✅ UPDATED CONFIG (using last week for forecast):")
    print(f"Train start: {config.train_start}")
    print(f"Train end: {config.train_end}")
    print(f"Forecast start: {config.forecast_start}")
    print(f"Forecast period: {horizon_hours} hours ({(horizon_hours/24):.1f} days)")
    
except Exception as e:
    print(f"⚠️ Could not update config: {e}")
    print("Using original config dates - might need manual adjustment")

# Clear cache
data_manager.clear_cache()

# Try with current config
try:
    data_splits = data_manager.create_splits()
    logger.info("✅ Data splits created successfully")
    
    # Get split info
    split_info = data_splits.get_info()
    print(f"\n🔄 SUCCESSFUL DATA SPLITS:")
    print(f"Training samples: {split_info['train_samples']}")
    print(f"Test samples: {split_info['test_samples']}")
    print(f"Training period: {split_info['train_period']}")
    print(f"Forecast period: {split_info['forecast_period']}")
    
except Exception as e:
    print(f"\n❌ Still failed to create splits: {e}")
    
    # Try to create splits manually with good dates
    print("🔧 Trying manual date override...")
    
    try:
        # Override the create_splits call with manual dates
        manual_forecast_start = max_date - pd.Timedelta(days=7)
        manual_train_end = manual_forecast_start - pd.Timedelta(hours=1)
        
        data_splits = data_manager.create_splits(
            train_start=min_date,
            train_end=manual_train_end,
            forecast_start=manual_forecast_start,
            forecast_horizon=168  # 7 days in hours
        )
        
        logger.info("✅ Manual data splits created successfully")
        
        split_info = data_splits.get_info()
        print(f"\n🔄 MANUAL DATA SPLITS:")
        print(f"Training samples: {split_info['train_samples']}")
        print(f"Test samples: {split_info['test_samples']}")
        print(f"Training period: {split_info['train_period']}")
        print(f"Forecast period: {split_info['forecast_period']}")
        
    except Exception as manual_error:
        print(f"❌ Manual override also failed: {manual_error}")
        raise Exception("Could not create data splits with any date configuration")

# If we get here, data_splits is working
print(f"\n📊 DATA READY FOR MODELING!")
print(f"Target column: {config.target_column}")
print(f"Ready to proceed to next cells")

In [None]:
# ============================================================================
# Cell 4: Run Single Experiments
# ============================================================================

# Initialize experiment
experiment = TimeSeriesExperiment(config, data_manager, experiment_logger)

# Run single experiment
logger.info("🚀 Starting single experiment...")
single_run_results = experiment.run_single_experiment(data_splits)

# Display results summary
print("\n📊 SINGLE EXPERIMENT RESULTS:")
print("=" * 60)

successful_models = [name for name, result in single_run_results.items() if result.success]
failed_models = [name for name, result in single_run_results.items() if not result.success]

print(f"✅ Successful models: {len(successful_models)}")
print(f"❌ Failed models: {len(failed_models)}")

# Calculate and display metrics
metrics_calc = MetricsCalculator()
model_metrics = {}

for name, result in single_run_results.items():
    if result.success:
        metrics = metrics_calc.calculate_all_metrics(data_splits.y_test, result.predictions)
        model_metrics[name] = metrics
        print(f"  {name.replace('_', ' ').title()}:")
        print(f"    RMSE: {metrics['rmse']:.6f}")
        print(f"    MAE:  {metrics['mae']:.6f}")
        print(f"    MAPE: {metrics['mape']:.2f}%")
        print(f"    Time: {result.execution_time:.2f}s")
    else:
        print(f"  {name.replace('_', ' ').title()}: FAILED")
        print(f"    Error: {result.error_message}")
        print(f"    Time: {result.execution_time:.2f}s")

# Find best model
if model_metrics:
    best_model = min(model_metrics.items(), key=lambda x: x[1]['rmse'])
    print(f"\n🏆 BEST MODEL: {best_model[0].replace('_', ' ').title()}")
    print(f"    RMSE: {best_model[1]['rmse']:.6f}")

In [None]:
# ============================================================================
# Cell 5: Visualization
# ============================================================================

# Create visualizer
visualizer = ResultsVisualizer(use_plotly=True)

# Create comparison plot
print("📈 Creating comparison plot...")
comparison_plot = visualizer.create_comparison_plot(
    actual_values=data_splits.y_test,
    model_results=single_run_results,
    training_data=data_splits.y_train,
    title="Time Series Forecasting Model Comparison"
)
comparison_plot.show()

# Create performance summary table
performance_summary = visualizer.create_performance_summary(
    actual_values=data_splits.y_test,
    model_results=single_run_results
)

from IPython.display import display, HTML
display(HTML("<h3>📊 Model Performance Summary</h3>"))
display(performance_summary)

# Model diagnostics plot (if available)
has_diagnostics = any(result.diagnostics for result in single_run_results.values() if result.success)
if has_diagnostics:
    print("\n🔍 Creating diagnostics plot...")
    diagnostics_plot = visualizer.create_model_diagnostics_plot(single_run_results)
    diagnostics_plot.show()
else:
    print("\n📊 No diagnostic data available for visualization")

# Residuals analysis for successful models
successful_results = {name: result for name, result in single_run_results.items() if result.success}
if len(successful_results) > 0:
    print(f"\n🔬 Models available for residuals analysis: {len(successful_results)}")
    print("Note: Residuals analysis can be added with visualizer.create_residuals_analysis()")

In [None]:
# ============================================================================
# Cell 6: Rolling Window Validation
# ============================================================================

logger.info("🔄 Starting rolling window validation...")

# Run rolling validation
rolling_results = experiment.run_rolling_validation(
    n_windows=config.rolling_windows,
    parallel=config.parallel_execution
)

if not rolling_results.empty:
    print(f"\n🔄 ROLLING WINDOW VALIDATION RESULTS:")
    print("=" * 60)
    
    # Summary statistics by model
    print("\n📊 Summary by Model:")
    summary_stats = rolling_results.groupby('model_name').agg({
        'rmse': ['mean', 'std', 'min', 'max'],
        'mae': ['mean', 'std'],
        'execution_time': ['mean', 'sum'],
        'status': lambda x: f"{(x == 'completed').sum()}/{len(x)}"
    }).round(6)
    
    print(summary_stats)
    
    # Create rolling validation plot
    print("\n📈 Creating rolling validation plot...")
    rolling_plot = visualizer.create_rolling_validation_plot(rolling_results)
    rolling_plot.show()
    
    # Performance trend analysis
    trends = experiment.validator.analyze_performance_trends(rolling_results)
    
    print(f"\n📈 PERFORMANCE TRENDS:")
    print("-" * 40)
    
    for model_name, trend_info in trends.get('performance_trends', {}).items():
        trend = trend_info.get('trend', 'UNKNOWN')
        degradation = trend_info.get('degradation_percent', 0)
        windows_completed = trend_info.get('windows_completed', 0)
        
        print(f"  {model_name.replace('_', ' ').title()}:")
        print(f"    Trend: {trend} ({degradation:+.1f}%)")
        print(f"    Windows completed: {windows_completed}/{config.rolling_windows}")
        
        if trend in ['SEVERE', 'SIGNIFICANT']:
            print(f"    ⚠️  Performance degradation detected!")
        elif trend == 'IMPROVING':
            print(f"    ✅ Performance improving over time")
    
    # Success rates
    print(f"\n📊 SUCCESS RATES:")
    for model_name, success_info in trends.get('success_rate', {}).items():
        rate = success_info['success_rate']
        count = success_info['success_count']
        total = success_info['total_count']
        print(f"  {model_name.replace('_', ' ').title()}: {rate:.1f}% ({count}/{total})")

else:
    print("\n❌ No rolling validation results available")
    print("This could be due to insufficient data or all models failing")

In [None]:
# ============================================================================
# Cell 7: Full Experiment & Analysis
# ============================================================================

# Run complete experiment with comprehensive logging
experiment_name = f"Complete_Model_Comparison_{pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')}"

print(f"🎯 Running full experiment: {experiment_name}")
full_results = experiment.run_full_experiment(
    experiment_name=experiment_name,
    include_rolling=True
)

# Display experiment summary
summary = full_results.get('summary', {})

print(f"\n🎉 EXPERIMENT SUMMARY:")
print("=" * 60)
print(f"Experiment ID: {full_results['experiment_id']}")
print(f"Name: {full_results['experiment_name']}")
print(f"Status: {full_results.get('status', 'completed')}")

# Single run summary
single_summary = summary.get('single_run_summary', {})
if single_summary:
    print(f"\n📊 Single Run Results:")
    print(f"  Total models: {single_summary.get('total_models', 0)}")
    print(f"  Successful: {single_summary.get('successful_models', 0)}")
    print(f"  Failed: {single_summary.get('failed_models', 0)}")
    print(f"  Best model: {single_summary.get('best_model', 'Unknown')}")
    print(f"  Best RMSE: {single_summary.get('best_rmse', 'N/A')}")

# Rolling validation summary
rolling_summary = summary.get('rolling_validation_summary', {})
if rolling_summary:
    print(f"\n🔄 Rolling Validation Results:")
    print(f"  Total windows: {rolling_summary.get('total_windows', 0)}")
    print(f"  Models tested: {len(rolling_summary.get('models_tested', []))}")
    if 'best_model' in rolling_summary:
        print(f"  Best model (avg): {rolling_summary['best_model']}")
        print(f"  Best avg RMSE: {rolling_summary.get('best_avg_rmse', 'N/A')}")

# Overall recommendation
best_model = summary.get('overall_best_model')
if best_model:
    print(f"\n🏆 OVERALL BEST MODEL: {best_model.replace('_', ' ').title()}")

# Recommendations
recommendations = summary.get('recommendations', [])
if recommendations:
    print(f"\n💡 RECOMMENDATIONS:")
    for rec in recommendations:
        print(f"  {rec}")

print(f"\n📋 Results saved to database with experiment ID: {full_results['experiment_id']}")

In [None]:
# ============================================================================
# Cell 8: Advanced Analysis & Historical Comparison
# ============================================================================

# Advanced model comparison
print("🔍 ADVANCED MODEL ANALYSIS:")
print("=" * 50)

successful_results = {name: result for name, result in single_run_results.items() if result.success}
if len(successful_results) > 1:
    predictions_dict = {name: result.predictions for name, result in successful_results.items()}
    
    comparison = metrics_calc.compare_predictions(data_splits.y_test, predictions_dict)
    
    print(f"\nModels compared: {comparison['models']}")
    
    # Detailed metrics comparison
    print(f"\n📊 DETAILED METRICS COMPARISON:")
    for model_name, metrics in comparison['metrics_comparison'].items():
        print(f"\n  {model_name.replace('_', ' ').title()}:")
        print(f"    RMSE: {metrics.get('rmse', 'N/A'):.6f}")
        print(f"    MAE:  {metrics.get('mae', 'N/A'):.6f}")
        print(f"    MAPE: {metrics.get('mape', 'N/A'):.2f}%")
        print(f"    R²:   {metrics.get('r_squared', 'N/A'):.4f}")
        print(f"    Correlation: {metrics.get('correlation', 'N/A'):.4f}")
    
    # Model ranking
    ranking = comparison.get('ranking', {})
    if ranking:
        print(f"\n🏆 RANKING BY RMSE:")
        for i, entry in enumerate(ranking['by_rmse'], 1):
            model_display = entry['model'].replace('_', ' ').title()
            print(f"  {i}. {model_display}: {entry['rmse']:.6f}")

# Compare with previous experiments
print(f"\n🕒 HISTORICAL COMPARISON:")
comparison = experiment.compare_with_previous_experiments(limit=5)
if 'error' not in comparison:
    print(f"Previous experiments analyzed: {comparison['previous_experiments_count']}")
    
    for rec in comparison.get('recommendations', []):
        print(f"  {rec}")
else:
    print(f"  {comparison['error']}")

# Data quality final assessment
print(f"\n📊 FINAL DATA QUALITY ASSESSMENT:")
print(f"Overall quality score: {quality_report['quality_score']:.1f}%")

target_stats = quality_report.get('target_column_stats', {})
if target_stats:
    print(f"\nTarget column ({config.target_column}) statistics:")
    print(f"  Mean: {target_stats.get('mean', 'N/A'):.4f}")
    print(f"  Std:  {target_stats.get('std', 'N/A'):.4f}")
    print(f"  Range: {target_stats.get('min', 'N/A'):.4f} to {target_stats.get('max', 'N/A'):.4f}")
    print(f"  Missing: {target_stats.get('missing_count', 'N/A')} values")

# Feature importance (if available from models)
print(f"\n🔧 MODEL CONFIGURATION SUMMARY:")
for model_name, result in single_run_results.items():
    if result.success:
        print(f"\n  {model_name.replace('_', ' ').title()}:")
        print(f"    Hyperparameters: {result.hyperparameters}")
        if result.diagnostics:
            print(f"    Diagnostics available: {list(result.diagnostics.keys())}")
        if result.convergence_info:
            converged = result.convergence_info.get('converged', True)
            print(f"    Convergence: {'✅ Yes' if converged else '⚠️ Issues detected'}")

print(f"\n🎉 ANALYSIS COMPLETE!")
print(f"\n💾 All results have been logged to the database.")
print(f"📊 You can query the logs database for detailed historical analysis.")

logger.info("🔬 Advanced analysis complete!")