# Weather Alert EDA
## Exploratory Data Analysis for Weather Anomaly Detection

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# Silence every warning so library chatter does not clutter the cell output.
warnings.filterwarnings('ignore')

# Plot appearance: apply the matplotlib theme, then the seaborn colour palette.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

# Pandas display limits: no column cap, up to 100 rows, 1000-character width.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [None]:
# Read the processed alerts table, then report its shape, column names and
# how many columns there are of each dtype.
df = pd.read_csv('../data/processed/weather_alerts_processed.csv')

for summary_line in (
    f"Data shape: {df.shape}",
    f"\nColumns:\n{df.columns.tolist()}",
    f"\nData types:\n{df.dtypes.value_counts()}",
):
    print(summary_line)

## Basic Statistics

In [None]:
# Quick look at the data: leading rows, structural info, summary statistics.
print("First few rows:")
leading_rows = df.head()
display(leading_rows)

print("\nBasic info:")
df.info()

print("\nDescriptive statistics:")
# include='all' summarises non-numeric columns as well as numeric ones.
summary_table = df.describe(include='all')
display(summary_table)

## Alert Type Distribution

In [None]:
if 'alert_type' in df.columns:
    # Number of alerts per type, most frequent first.
    alert_counts = df['alert_type'].value_counts()

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    bar_ax, pie_ax = axes

    # Left panel: absolute counts as bars.
    alert_counts.plot.bar(ax=bar_ax, color='steelblue')
    bar_ax.set_title('Alert Type Distribution', fontsize=14, fontweight='bold')
    bar_ax.set_xlabel('Alert Type', fontsize=12)
    bar_ax.set_ylabel('Count', fontsize=12)
    bar_ax.tick_params(axis='x', rotation=45)

    # Right panel: the same counts as shares of the total.
    alert_counts.plot.pie(ax=pie_ax, autopct='%1.1f%%', startangle=90)
    pie_ax.set_title('Alert Type Proportion', fontsize=14, fontweight='bold')
    pie_ax.set_ylabel('')  # blank out the y-label on the pie panel

    plt.tight_layout()
    plt.show()

    print(f"\nAlert type counts:\n{alert_counts}")

## Temporal Analysis

In [None]:
if 'timestamp' in df.columns:
    # Parse the timestamps and derive the calendar features used below
    # (these columns are added to df and reused by later cells).
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['date'] = df['timestamp'].dt.date
    df['hour'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df['month'] = df['timestamp'].dt.month

    # Daily alerts. Only dates that occur in the data form a group, so the
    # average printed below is over days that had at least one alert.
    daily_alerts = df.groupby('date').size()

    fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(14, 12))

    # Time series
    daily_alerts.plot(ax=ax1, color='darkred', linewidth=2)
    ax1.set_title('Daily Alert Count Over Time', fontsize=14, fontweight='bold')
    ax1.set_xlabel('Date', fontsize=12)
    ax1.set_ylabel('Number of Alerts', fontsize=12)
    ax1.grid(True, alpha=0.3)

    # Hourly distribution.
    # If any timestamp is missing, .dt.hour comes back as float (NaN for the
    # gaps) and the counts would be indexed by 14.0 rather than 14.
    # value_counts() has already dropped the NaNs, so casting back is safe.
    hourly_counts = df['hour'].value_counts().sort_index()
    hourly_counts.index = hourly_counts.index.astype(int)
    hourly_counts.plot(kind='bar', ax=ax2, color='teal')
    ax2.set_title('Hourly Distribution of Alerts', fontsize=14, fontweight='bold')
    ax2.set_xlabel('Hour of Day', fontsize=12)
    ax2.set_ylabel('Count', fontsize=12)

    # Day of week distribution (0 = Monday ... 6 = Sunday).
    # int() guards the list lookup against the same float-index case, which
    # would otherwise raise "list indices must be integers".
    day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    dow_counts = df['day_of_week'].value_counts().sort_index()
    dow_counts.index = [day_names[int(i)] for i in dow_counts.index]
    dow_counts.plot(kind='bar', ax=ax3, color='purple')
    ax3.set_title('Day of Week Distribution', fontsize=14, fontweight='bold')
    ax3.set_xlabel('Day of Week', fontsize=12)
    ax3.set_ylabel('Count', fontsize=12)
    ax3.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    print(f"\nAverage daily alerts: {daily_alerts.mean():.1f}")
    print(f"Peak hour: {hourly_counts.idxmax()}:00 ({hourly_counts.max()} alerts)")

## Regional Analysis

In [None]:
if 'region' in df.columns:
    # Alert volume per region, largest first.
    region_counts = df['region'].value_counts()

    fig, region_ax = plt.subplots(figsize=(12, 6))
    region_counts.plot.bar(ax=region_ax, color='forestgreen')

    region_ax.set_title('Alert Distribution by Region', fontsize=14, fontweight='bold')
    region_ax.set_xlabel('Region', fontsize=12)
    region_ax.set_ylabel('Number of Alerts', fontsize=12)
    region_ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    # head() keeps the first five entries of the descending counts.
    busiest_regions = region_counts.head()
    print(f"\nTop 5 regions:\n{busiest_regions}")

## Text Analysis

In [None]:
# Text length analysis
if 'description' in df.columns:
    # Character count per description. A missing description counts as
    # length 0; str(NaN) is "nan", which would otherwise be reported as 3
    # characters and skew the histogram and the statistics below.
    df['description_length'] = df['description'].map(
        lambda text: 0 if pd.isna(text) else len(str(text))
    )
    lengths = df['description_length']

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Histogram
    lengths.hist(bins=50, ax=ax1, color='coral', edgecolor='black')
    ax1.set_title('Description Length Distribution', fontsize=14, fontweight='bold')
    ax1.set_xlabel('Description Length (characters)', fontsize=12)
    ax1.set_ylabel('Frequency', fontsize=12)

    # Box plot (horizontal, so the x-axis matches the histogram's)
    df.boxplot(column='description_length', ax=ax2, vert=False)
    ax2.set_title('Description Length Box Plot', fontsize=14, fontweight='bold')
    ax2.set_xlabel('Description Length (characters)', fontsize=12)

    plt.tight_layout()
    plt.show()

    print("\nDescription length statistics:")
    print(f"Mean: {lengths.mean():.1f}")
    print(f"Median: {lengths.median():.1f}")
    print(f"Std: {lengths.std():.1f}")
    print(f"Min: {lengths.min()}")
    print(f"Max: {lengths.max()}")

## Correlation Analysis

In [None]:
# Select numeric columns for correlation
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

if len(numeric_cols) > 1:
    # Keep the heatmap readable: use the first ten numeric columns in
    # DataFrame order (this is not a ranking by importance).
    top_cols = numeric_cols[:10]

    corr_matrix = df[top_cols].corr()

    fig, ax = plt.subplots(figsize=(10, 8))

    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
                center=0, square=True, linewidths=0.5, ax=ax)

    ax.set_title('Feature Correlation Matrix', fontsize=14, fontweight='bold')

    plt.tight_layout()
    plt.show()

    print("\nTop correlations:")
    # The matrix is symmetric with a unit diagonal, so each distinct pair
    # appears exactly once in the strict upper triangle (k=1 skips the
    # diagonal). Reading the pairs from there drops self-correlations and
    # mirrored duplicates without relying on sort order; the previous
    # "every other row after sorting" approach broke on ties in |r|.
    upper_rows, upper_cols = np.triu_indices(len(corr_matrix), k=1)
    unique_pairs = pd.DataFrame({
        'Feature1': corr_matrix.index[upper_rows],
        'Feature2': corr_matrix.columns[upper_cols],
        'Correlation': corr_matrix.to_numpy()[upper_rows, upper_cols],
    })

    # Strongest relationships first, regardless of sign.
    unique_pairs = unique_pairs.sort_values('Correlation', key=abs, ascending=False)

    display(unique_pairs.head(10))

## Missing Values Analysis

In [None]:
# Per-column count of null cells and the share of rows they represent.
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100

missing_df = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage': missing_percentage
})

# Keep only columns that have gaps, worst first.
has_gaps = missing_df['Missing Values'] > 0
missing_df = missing_df.loc[has_gaps].sort_values('Percentage', ascending=False)

if missing_df.empty:
    print("No missing values found in the dataset.")
else:
    print("Columns with missing values:")
    display(missing_df)

    # Bar chart of the percentage missing per affected column.
    fig, gaps_ax = plt.subplots(figsize=(12, 6))
    missing_df['Percentage'].plot.bar(ax=gaps_ax, color='orange')

    gaps_ax.set_title('Missing Values Percentage by Column', fontsize=14, fontweight='bold')
    gaps_ax.set_xlabel('Column', fontsize=12)
    gaps_ax.set_ylabel('Percentage Missing', fontsize=12)
    gaps_ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

## Anomaly Detection Preview

In [None]:
# Load anomaly results if available.
# Only the read can raise FileNotFoundError, so it is the only statement
# inside the try; everything else runs in the else branch.
try:
    anomaly_df = pd.read_csv('../data/output/anomaly_results.csv', index_col=0)
except FileNotFoundError:
    print("Anomaly results not found. Run anomaly detection first.")
else:
    # The first CSV column is used as the index and parsed as datetimes.
    anomaly_df.index = pd.to_datetime(anomaly_df.index)

    if 'is_anomaly' in anomaly_df.columns:
        # Build an explicit boolean mask. Indexing with the raw column only
        # works when pandas parsed it as bool; a 0/1 or NaN-containing column
        # would be treated as label selection and fail.
        # NOTE(review): assumes truthy values mark anomalies; a -1/1 encoding
        # would need an explicit mapping. Confirm against the detection step.
        is_anomaly = anomaly_df['is_anomaly'].fillna(0).astype(bool)
        anomaly_count = int(is_anomaly.sum())
        # Guard the percentage against an empty results file.
        anomaly_share = anomaly_count / len(anomaly_df) * 100 if len(anomaly_df) else 0.0

        print(f"Anomaly data loaded: {anomaly_df.shape}")
        print(f"\nAnomalies detected: {anomaly_count} ({anomaly_share:.1f}%)")

        # Plot anomalies
        fig, ax = plt.subplots(figsize=(14, 6))

        # Plot time series
        if 'total_alerts' in anomaly_df.columns:
            ax.plot(anomaly_df.index, anomaly_df['total_alerts'],
                    color='blue', linewidth=1, label='Daily Alerts')

        # Highlight anomalies (zorder draws the markers above the line)
        anomalies = anomaly_df[is_anomaly]
        if not anomalies.empty and 'total_alerts' in anomalies.columns:
            ax.scatter(anomalies.index, anomalies['total_alerts'],
                       color='red', s=100, zorder=5, label='Anomalies')

        ax.set_title('Daily Alerts with Anomaly Detection', fontsize=14, fontweight='bold')
        ax.set_xlabel('Date', fontsize=12)
        ax.set_ylabel('Number of Alerts', fontsize=12)
        # Only add a legend when something labelled was actually drawn.
        if ax.get_legend_handles_labels()[0]:
            ax.legend()
        ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

## Conclusion

In [None]:
# Final recap: dataset size, time span, most frequent categories, data quality.
print("\n" + "="*60)
print("EDA SUMMARY")
print("="*60)

print("\nDataset Overview:")
print(f"- Total records: {len(df):,}")
print(f"- Total features: {len(df.columns)}")

if 'timestamp' in df.columns:
    time_start, time_end = df['timestamp'].min(), df['timestamp'].max()
else:
    time_start = time_end = 'N/A'
print(f"- Time range: {time_start} to {time_end}")

print("\nKey Findings:")

# value_counts() sorts by frequency, so the first index entry is the most
# common value. It is empty when the column holds only missing values, in
# which case the finding is skipped instead of raising IndexError.
if 'alert_type' in df.columns:
    alert_type_counts = df['alert_type'].value_counts()
    if not alert_type_counts.empty:
        top_alert = alert_type_counts.index[0]
        print(f"- Most common alert type: {top_alert}")

if 'region' in df.columns:
    region_totals = df['region'].value_counts()
    if not region_totals.empty:
        top_region = region_totals.index[0]
        print(f"- Most active region: {top_region}")

if 'hour' in df.columns:
    hour_totals = df['hour'].value_counts()
    if not hour_totals.empty:
        peak_hour = hour_totals.index[0]
        # The hour column is float when some timestamps are missing; show
        # whole-number floats as "14:00" rather than "14.0:00".
        if isinstance(peak_hour, float) and peak_hour.is_integer():
            peak_hour = int(peak_hour)
        print(f"- Peak alert hour: {peak_hour}:00")

print("\nData Quality:")
total_missing = df.isnull().sum().sum()
total_cells = df.shape[0] * df.shape[1]
# An empty frame has zero cells; report 0.0% instead of dividing by zero.
missing_share = total_missing / total_cells * 100 if total_cells else 0.0
print(f"- Missing values: {total_missing:,} ({missing_share:.1f}%)")
print(f"- Duplicate rows: {df.duplicated().sum()}")

print("\n" + "="*60)