# Mini Outbreak Detector - Data Exploration

This notebook demonstrates how to use the Mini Outbreak Detector pipeline for data analysis, anomaly detection, and forecasting.

## Setup

In [None]:
# Standard-library modules, used here only to extend the import search path.
import sys
import os

# Add parent directory to path
# The parent of the *current working directory* is appended so that the `src`
# package imported below can be found. This depends on where the kernel was
# started - presumably the folder that contains this notebook (TODO confirm
# when launching Jupyter from another directory).
sys.path.append(os.path.dirname(os.getcwd()))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import modules
# Project-local pipeline components, listed in the order the notebook uses
# them: loading, preprocessing, anomaly detection, forecasting, explanation.
from src.data.loader import DataLoader
from src.data.preprocess import DataPreprocessor
from src.ml.anomalies import AnomalyDetector
from src.ml.forecast import Forecaster
from src.ai.explain import ExplanationGenerator

# Configure plotting
# The 'seaborn-v0_8-*' style names exist in Matplotlib 3.6 and later; older
# releases only know the un-prefixed 'seaborn-*' names.
plt.style.use('seaborn-v0_8-darkgrid')
# Applied after the style sheet so the palette is the one in effect for the
# plots below.
sns.set_palette("husl")
# IPython magic: render figures inside the notebook output area.
%matplotlib inline

## 1. Load Data

In [None]:
# Initialize loader
# `loader` is reused by the filtering cell further down.
loader = DataLoader()

# Load sample data
# The keyword arguments name the CSV columns holding the date, case count,
# country and disease - judging by the parameter names; see
# DataLoader.load_from_csv for the authoritative contract.
# The file name is passed as a bare relative path; how it is resolved is up to
# DataLoader (NOTE(review): confirm which directory it is looked up in).
df = loader.load_from_csv(
    "sample_disease_data.csv",
    date_col="date",
    cases_col="new_cases",
    country_col="location",
    disease_col="disease"
)

# Report the row count, then leave the first rows as the cell's rich output.
print(f"Loaded {len(df)} rows")
df.head()

In [None]:
# Filter for specific country
# The selection lives in named constants so the filter call and the progress
# message are driven by the same values and cannot drift apart when a
# different country or disease is explored.
COUNTRY = "India"
DISEASE = "COVID-19"

# `df_filtered` is the input of the preprocessing cell that follows.
df_filtered = loader.filter_data(df, country=COUNTRY, disease=DISEASE)
print(f"Filtered to {len(df_filtered)} rows for {COUNTRY}")
df_filtered.head()

## 2. Preprocess Data

In [None]:
# Initialize preprocessor
# `preprocessor` is reused by the summary-statistics cell below.
preprocessor = DataPreprocessor()

# Clean and prepare data
# Later cells read the 'cases', 'rolling_mean' and 'rolling_slope' columns of
# `df_processed` and plot them against its index, so prepare_for_analysis is
# expected to provide all of these.
df_processed = preprocessor.prepare_for_analysis(df_filtered)

# Report the row count, then leave the first rows as the cell's rich output.
print(f"Processed {len(df_processed)} rows")
df_processed.head()

In [None]:
# Get summary statistics
# `summary_stats` is iterated with .items() here and later passed to the
# explanation generator, so it has to be a dict-like mapping.
summary_stats = preprocessor.get_summary_stats(df_processed)
print("Summary Statistics:")
# One indented "name: value" line per statistic.
for key, value in summary_stats.items():
    print(f"  {key}: {value}")

## 3. Visualize Time Series

In [None]:
# Two stacked panels: daily cases with their rolling mean on top, the rolling
# slope underneath.
fig, axes = plt.subplots(2, 1, figsize=(14, 8))
cases_ax, slope_ax = axes
dates = df_processed.index

# Top panel - raw counts against the smoothed series.
# NOTE(review): the '7-Day Average' label assumes 'rolling_mean' is computed
# over a 7-day window - confirm against the preprocessing settings.
cases_ax.plot(dates, df_processed['cases'], label='Daily Cases', alpha=0.7)
cases_ax.plot(dates, df_processed['rolling_mean'], label='7-Day Average', linewidth=2)
cases_ax.set(title='Daily Cases and Rolling Average', ylabel='Cases')

# Bottom panel - rolling slope with a dashed reference line at zero.
slope_ax.plot(dates, df_processed['rolling_slope'], label='Rolling Slope', color='green')
slope_ax.axhline(y=0, color='red', linestyle='--', alpha=0.5)
slope_ax.set(title='Trend (Rolling Slope)', xlabel='Date', ylabel='Slope')

# Both panels get a legend and an explicit grid.
for panel in axes:
    panel.legend()
    panel.grid(True)

plt.tight_layout()
plt.show()

## 4. Anomaly Detection

In [None]:
# Initialize anomaly detector
detector = AnomalyDetector()

# Detect anomalies
# Later cells read 'is_anomaly' and 'z_score' from `df_anomalies`, together
# with the 'cases' and 'rolling_mean' columns, so detect_all is expected to
# return the processed frame with those detection columns present.
df_anomalies = detector.detect_all(df_processed)

# Get anomaly statistics
# `anomaly_stats` is iterated with .items() here and later passed to the
# explanation generator, so it has to be a dict-like mapping.
anomaly_stats = detector.get_anomaly_stats(df_anomalies)
print("Anomaly Statistics:")
# One indented "name: value" line per statistic.
for key, value in anomaly_stats.items():
    print(f"  {key}: {value}")

In [None]:
# Overlay the flagged observations on the case curve.
fig, ax = plt.subplots(figsize=(14, 6))
dates = df_anomalies.index

# Base series: daily counts and their rolling mean.
ax.plot(dates, df_anomalies['cases'], label='Daily Cases', alpha=0.7)
ax.plot(dates, df_anomalies['rolling_mean'], label='Rolling Mean', linewidth=2)

# Rows flagged by the detector; 'is_anomaly' is used directly as a row mask.
flagged = df_anomalies['is_anomaly']
anomalies = df_anomalies[flagged]
ax.scatter(
    anomalies.index,
    anomalies['cases'],
    color='red',
    s=100,
    marker='x',
    zorder=5,  # draw the markers above both line series
    label=f'Anomalies ({len(anomalies)})',
)

ax.set(title='Anomaly Detection Results', xlabel='Date', ylabel='Cases')
ax.legend()
ax.grid(True)

plt.tight_layout()
plt.show()

## 5. Forecasting

In [None]:
# Initialize forecaster
forecaster = Forecaster()

# Generate forecast
# Later cells plot the 'ds', 'yhat', 'yhat_lower' and 'yhat_upper' columns of
# `forecast_df`, so generate_forecast is expected to return a frame with them.
# NOTE(review): method="prophet" presumably needs the Prophet package to be
# installed - confirm how Forecaster behaves when it is missing.
forecast_df = forecaster.generate_forecast(df_processed, method="prophet")

# Get forecast statistics
# `forecast_stats` is iterated with .items() here and later passed to the
# explanation generator, so it has to be a dict-like mapping.
forecast_stats = forecaster.get_forecast_stats(forecast_df)
print("Forecast Statistics:")
# One indented "name: value" line per statistic.
for key, value in forecast_stats.items():
    print(f"  {key}: {value}")

In [None]:
# History and forecast drawn on a single axis.
fig, ax = plt.subplots(figsize=(14, 6))

# Observed series, plotted against the processed frame's index.
ax.plot(df_processed.index, df_processed['cases'], label='Historical Cases', color='blue')

# Point forecast plus the band between its lower and upper bounds.
# NOTE(review): the band is labelled as a 95% interval, but Prophet's default
# interval_width is 0.80 - confirm that Forecaster configures 0.95.
forecast_dates = forecast_df['ds']
ax.plot(forecast_dates, forecast_df['yhat'], label='Forecast', color='green', linewidth=2)
ax.fill_between(
    forecast_dates,
    forecast_df['yhat_lower'],
    forecast_df['yhat_upper'],
    color='green',
    alpha=0.3,
    label='95% Confidence Interval',
)

# NOTE(review): the title hard-codes a 14-day horizon - confirm it matches the
# horizon Forecaster actually uses.
ax.set(title='14-Day Forecast', xlabel='Date', ylabel='Cases')
ax.legend()
ax.grid(True)

plt.tight_layout()
plt.show()

## 6. AI Explanation

In [None]:
# Build the explanation generator.
explainer = ExplanationGenerator()

# Ask for a narrative that combines the three statistics mappings computed in
# the earlier cells. The country and disease are repeated here as literals;
# keep them in sync with the values used when the data was filtered.
explanation = explainer.generate_explanation(
    country="India",
    disease="COVID-19",
    summary_stats=summary_stats,
    anomaly_stats=anomaly_stats,
    forecast_stats=forecast_stats,
)

# Text report framed by two 80-character rules. The keys read below
# ('risk_level', 'summary', 'explanation', 'recommendations', 'confidence')
# must all be present; 'risk_level' and 'confidence' must be strings because
# .upper() is called on them.
separator = "=" * 80

print("AI Explanation:")
print(separator)
print(f"\nRISK LEVEL: {explanation['risk_level'].upper()}")
print(f"\nSUMMARY:\n{explanation['summary']}")
print(f"\nEXPLANATION:\n{explanation['explanation']}")
print("\nRECOMMENDATIONS:")
# Recommendations are numbered starting at 1.
for position, recommendation in enumerate(explanation['recommendations'], start=1):
    print(f"  {position}. {recommendation}")
print(f"\nCONFIDENCE: {explanation['confidence'].upper()}")
print(separator)

## 7. Complete Analysis Summary

In [None]:
# Create comprehensive visualization
# Display parameters, named once instead of being repeated as bare literals.
RECENT_DAYS = 30   # rows of history shown next to the forecast
# NOTE(review): presumably mirrors the detector's z-score cut-off - confirm
# against the value configured in src/config/settings.py.
Z_THRESHOLD = 2.5

fig, axes = plt.subplots(3, 1, figsize=(14, 12))
history_ax, forecast_ax, zscore_ax = axes

# Panel 1: Cases with anomalies
history_ax.plot(df_anomalies.index, df_anomalies['cases'], alpha=0.7, label='Daily Cases')
history_ax.plot(df_anomalies.index, df_anomalies['rolling_mean'], linewidth=2, label='7-Day Avg')
# Recomputed here so this cell does not depend on the earlier anomaly plot.
anomalies = df_anomalies[df_anomalies['is_anomaly']]
history_ax.scatter(anomalies.index, anomalies['cases'], color='red', s=100,
                   label='Anomalies', zorder=5, marker='x')
history_ax.set(title='Historical Cases with Anomaly Detection', ylabel='Cases')

# Panel 2: Forecast
# Slice the recent history once and reuse it for both the x and y values.
recent = df_processed.tail(RECENT_DAYS)
forecast_ax.plot(recent.index, recent['cases'], label='Recent Cases', color='blue')
forecast_ax.plot(forecast_df['ds'], forecast_df['yhat'], label='Forecast',
                 color='green', linewidth=2)
# The band is intentionally left without a legend entry, as before.
forecast_ax.fill_between(forecast_df['ds'], forecast_df['yhat_lower'],
                         forecast_df['yhat_upper'], alpha=0.3, color='green')
# NOTE(review): the title hard-codes a 14-day horizon - confirm it matches the
# horizon Forecaster actually uses.
forecast_ax.set(title='14-Day Forecast', ylabel='Cases')

# Panel 3: Z-scores
zscore_ax.plot(df_anomalies.index, df_anomalies['z_score'], label='Z-Score')
# Only the upper line carries a label so the legend shows a single entry.
zscore_ax.axhline(y=Z_THRESHOLD, color='red', linestyle='--', label='Threshold')
zscore_ax.axhline(y=-Z_THRESHOLD, color='red', linestyle='--')
# Shade the band between the two threshold lines.
zscore_ax.fill_between(df_anomalies.index, -Z_THRESHOLD, Z_THRESHOLD, alpha=0.1, color='green')
zscore_ax.set(title='Z-Score Analysis', xlabel='Date', ylabel='Z-Score')

# Every panel gets a legend and an explicit grid.
for panel in axes:
    panel.legend()
    panel.grid(True)

plt.tight_layout()
plt.show()

## Next Steps

- Try loading different data sources
- Experiment with different parameters in `src/config/settings.py`
- Compare multiple countries or diseases
- Integrate with the API for real-time analysis