# Data Exploration - Disaster Early Warning System

This notebook explores the available weather datasets to understand:
- Data structure and quality
- Missing values and data distributions
- Weather patterns and extreme conditions
- Correlations between weather variables

## Datasets Available:
1. **GlobalWeatherRepository.csv** - Global weather data for 195+ countries
2. **weather_classification_data.csv** - Labeled weather types (13,201 rows)
3. **rain_prediction_2500observations.csv** - Binary rain prediction dataset
4. **weather_data.csv** - Large-scale weather observations
5. **top100cities_weather_data.csv** - Weather data for top 100 cities
6. **seattle-weather.csv** - Historical Seattle weather data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Silence every warning category so library notices do not clutter cell output
warnings.filterwarnings('ignore')

# Plotting defaults shared by all figures in this notebook
plt.style.use('default')
sns.set_palette('husl')
plt.rcParams.update({'figure.figsize': (12, 8)})

print("Libraries imported successfully!")

## 1. Load and Examine Datasets

In [None]:
# Define dataset paths
dataset_dir = Path('../dataset')

# Dataset key -> (label used in the status line, CSV file name inside dataset_dir).
# Add a new dataset by adding one entry here instead of copying a try/except stanza.
dataset_files = {
    'global': ('Global Weather', 'GlobalWeatherRepository.csv'),
    'classification': ('Weather Classification', 'weather_classification_data.csv'),
    'rain_prediction': ('Rain Prediction', 'rain_prediction_2500observations.csv'),
    'weather_large': ('Weather Large', 'weather_data.csv'),
    'top_cities': ('Top Cities', 'top100cities_weather_data.csv'),
    'seattle': ('Seattle Weather', 'seattle-weather.csv'),
}

# Load all CSV datasets. Loading is best-effort: a file that cannot be read is
# reported and skipped, so later cells must check `name in datasets` before use.
datasets = {}
for key, (label, filename) in dataset_files.items():
    try:
        datasets[key] = pd.read_csv(dataset_dir / filename)
    except Exception as e:
        print(f"✗ {label}: {e}")
    else:
        print(f"✓ {label}: {datasets[key].shape}")

print(f"\nLoaded {len(datasets)} datasets successfully")

## 2. Dataset Structure Analysis

In [None]:
# Print a structural overview of every loaded dataset:
# shape, numbered column list, dtype counts and the first two rows.
banner = '=' * 50

for name, df in datasets.items():
    print(f"\n{banner}")
    print(f"Dataset: {name.upper()}")
    print(banner)
    print(f"Shape: {df.shape}")

    print(f"\nColumns ({len(df.columns)}):")
    for position, column in enumerate(df.columns, start=1):
        print(f"{position:2d}. {column}")

    print("\nData Types:")
    print(df.dtypes.value_counts())

    print("\nFirst few rows:")
    print(df.head(2))

## 3. Missing Values Analysis

In [None]:
# One panel per dataset: bar chart of the percentage of missing values per column,
# or a "No Missing Values" badge when the dataset is complete.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

# zip() stops at the shorter input, so datasets beyond the six panels are not drawn
for ax, (name, df) in zip(axes, datasets.items()):
    col_missing = df.isna().sum().div(len(df)).mul(100)
    col_missing = col_missing[col_missing > 0].sort_values(ascending=False)

    if col_missing.empty:
        ax.text(0.5, 0.5, f'{name.title()}\nNo Missing Values',
                ha='center', va='center', transform=ax.transAxes,
                fontsize=12, bbox=dict(boxstyle='round', facecolor='lightgreen'))
        ax.set_xticks([])
        ax.set_yticks([])
        continue

    col_missing.plot(kind='bar', ax=ax, color='coral')
    ax.set_title(f'{name.title()} - Missing Values %')
    ax.set_ylabel('Missing %')
    ax.tick_params(axis='x', rotation=45)

# Drop the panels that no dataset was assigned to
for unused_ax in axes[len(datasets):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()

# Cell-level totals: missing cells relative to rows * columns of each dataset
print("\nDetailed Missing Value Analysis:")
print("=" * 60)
for name, df in datasets.items():
    n_missing = df.isna().sum().sum()
    n_cells = df.shape[0] * df.shape[1]
    share = (n_missing / n_cells) * 100
    print(f"{name.title():20s}: {n_missing:6d} missing ({share:.2f}%)")

## 4. Weather Variable Distributions

In [None]:
# Focus on weather classification dataset for detailed analysis
if 'classification' in datasets:
    df_weather = datasets['classification']

    # Identify numeric columns
    numeric_cols = df_weather.select_dtypes(include=[np.number]).columns.tolist()
    print(f"Numeric columns: {numeric_cols}")

    if not numeric_cols:
        # Nothing to plot, and DataFrame.describe() raises ValueError on a
        # frame without columns, so stop here instead of failing the cell.
        print("No numeric columns found - skipping distribution plots")
    else:
        # Histogram per numeric column on a fixed 3x3 grid
        fig, axes = plt.subplots(3, 3, figsize=(15, 12))
        axes = axes.flatten()

        # zip() caps plotting at the nine available panels
        for ax, col in zip(axes, numeric_cols):
            df_weather[col].hist(bins=30, ax=ax, alpha=0.7, color='skyblue')
            ax.set_title(f'{col} Distribution')
            ax.set_ylabel('Frequency')

            # Mark the column mean on the histogram
            mean_val = df_weather[col].mean()
            ax.axvline(mean_val, color='red', linestyle='--', alpha=0.7, label=f'Mean: {mean_val:.1f}')
            ax.legend()

        # Remove the panels that did not receive a column
        for unused_ax in axes[len(numeric_cols):]:
            fig.delaxes(unused_ax)

        plt.tight_layout()
        plt.show()

        # Summary statistics
        print("\nSummary Statistics:")
        print(df_weather[numeric_cols].describe())

## 5. Extreme Weather Conditions Analysis

In [None]:
# Analyze extreme weather conditions across datasets

# (label, column-name keywords, thresholds) per weather variable.
# Each threshold is (description, comparison side, quantile used as the cutoff).
_EXTREME_RULES = (
    ('Temperature', ('temp', 'celsius', 'fahrenheit'),
     (('Extreme Cold', '<', 0.05), ('Extreme Heat', '>', 0.95))),
    ('Pressure', ('pressure',),
     (('Very Low', '<', 0.05), ('Very High', '>', 0.95))),
    ('Wind', ('wind', 'mph', 'kph'),
     (('High Wind', '>', 0.90), ('Extreme Wind', '>', 0.95))),
    ('Precipitation', ('precip', 'rain', 'precipitation'),
     (('Heavy Rain', '>', 0.90), ('Extreme Rain', '>', 0.95))),
)


def _report_extremes(df, label, keywords, thresholds):
    """Print quantile cutoffs and record counts for one weather variable.

    Columns are scanned in order; the first one whose lower-cased name contains
    any of ``keywords`` AND that yields at least one numeric value is used.
    Nothing is printed when no such column exists.
    """
    for col in df.columns:
        if not any(key in str(col).lower() for key in keywords):
            continue
        values = pd.to_numeric(df[col], errors='coerce')
        # A text column (e.g. a rain/no-rain label) coerces to all-NaN and
        # would only produce "nan" thresholds, so keep looking.
        if values.notna().any():
            break
    else:
        return

    print(f"{label} ({col}):")
    for description, side, q in thresholds:
        cutoff = values.quantile(q)  # computed once, reused for text and count
        hits = (values < cutoff) if side == '<' else (values > cutoff)
        print(f"  {description}: {side} {cutoff:.1f} ({hits.sum()} records)")


def identify_extreme_conditions(df, dataset_name):
    """Identify extreme weather conditions in a dataset.

    Prints a header for ``dataset_name`` and then, for temperature, pressure,
    wind and precipitation, the quantile-based cutoffs together with the number
    of records beyond each cutoff. Columns are located by case-insensitive
    keyword match on the column name; variables without a usable numeric
    column are skipped.

    Args:
        df: DataFrame to inspect.
        dataset_name: Name shown (upper-cased) in the printed header.

    Returns:
        None. The report is written to stdout.
    """
    print(f"\n{dataset_name.upper()} - Extreme Conditions:")
    print("-" * 50)

    for label, keywords, thresholds in _EXTREME_RULES:
        _report_extremes(df, label, keywords, thresholds)

# Run the extreme-condition report on the key datasets that were actually loaded
key_datasets = ['classification', 'rain_prediction', 'global']
for name in key_datasets:
    if name not in datasets:
        continue
    identify_extreme_conditions(datasets[name], name)

## 6. Correlation Analysis

In [None]:
# Pairwise correlations between the numeric columns of the classification dataset
if 'classification' in datasets:
    df_weather = datasets['classification']
    numeric_cols = df_weather.select_dtypes(include=[np.number]).columns

    if len(numeric_cols) > 1:
        corr_matrix = df_weather[numeric_cols].corr()

        # Heatmap with the upper triangle (diagonal included) masked out
        plt.figure(figsize=(12, 10))
        upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool))
        sns.heatmap(
            corr_matrix,
            mask=upper_triangle,
            annot=True,
            cmap='coolwarm',
            center=0,
            square=True,
            linewidths=0.5,
            cbar_kws={"shrink": .8},
        )
        plt.title('Weather Variables Correlation Matrix')
        plt.tight_layout()
        plt.show()

        # List each unordered pair once (strictly above the diagonal)
        print("\nStrong Correlations (|r| > 0.5):")
        print("-" * 40)
        labels = corr_matrix.columns
        for row, col in zip(*np.triu_indices(len(labels), k=1)):
            r_value = corr_matrix.iloc[row, col]
            if abs(r_value) > 0.5:
                print(f"{labels[row]} vs {labels[col]}: {r_value:.3f}")

## 7. Weather Type Distribution (Classification Dataset)

In [None]:
# Distribution of the weather-type label in the classification dataset
if 'classification' in datasets:
    df_weather = datasets['classification']

    # First column whose name mentions "weather" or "type" (case-insensitive)
    weather_col = next(
        (col for col in df_weather.columns
         if 'weather' in col.lower() or 'type' in col.lower()),
        None,
    )

    if weather_col is not None:
        weather_counts = df_weather[weather_col].value_counts()

        plt.figure(figsize=(12, 6))

        # Left panel: absolute counts
        ax_counts = plt.subplot(1, 2, 1)
        weather_counts.plot(kind='bar', color='lightblue', ax=ax_counts)
        ax_counts.set_title('Weather Type Distribution')
        ax_counts.set_ylabel('Count')
        plt.xticks(rotation=45)

        # Right panel: share of each type
        ax_share = plt.subplot(1, 2, 2)
        weather_counts.plot(kind='pie', autopct='%1.1f%%', startangle=90, ax=ax_share)
        ax_share.set_title('Weather Type Percentage')
        ax_share.set_ylabel('')

        plt.tight_layout()
        plt.show()

        print("\nWeather Type Counts:")
        print(weather_counts)

        # Mean of every numeric column within each weather type
        numeric_cols = df_weather.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            print("\nAverage Weather Conditions by Type:")
            type_means = df_weather.groupby(weather_col)[numeric_cols].mean()
            print(type_means.round(2))

## 8. Data Quality Assessment

In [None]:
# Comprehensive data quality assessment
def assess_data_quality(df, dataset_name):
    """Assess data quality for a dataset and print a report.

    The report covers shape, memory usage, per-column missing values,
    duplicate rows, dtype counts and, for each numeric column, range,
    mean/std and the number of values more than 3 standard deviations
    from the mean.

    Args:
        df: DataFrame to assess.
        dataset_name: Name shown (upper-cased) in the report header.

    Returns:
        dict with keys:
            'shape': ``df.shape``
            'missing_pct': percentage of missing cells over the whole frame
                (missing cells / total cells * 100), always within 0-100
            'duplicates': number of fully duplicated rows
            'numeric_cols': number of numeric columns
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"DATA QUALITY ASSESSMENT: {dataset_name.upper()}")
    print(banner)

    n_rows = len(df)

    # Basic info
    print(f"Shape: {df.shape}")
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Missing values (per column); a frame without rows has nothing missing
    missing_counts = df.isnull().sum()
    if n_rows:
        missing_by_col = (missing_counts / n_rows) * 100
    else:
        missing_by_col = missing_counts * 0.0
    columns_with_gaps = missing_by_col[missing_by_col > 0]

    print("\nMissing Values:")
    if columns_with_gaps.empty:
        print("  ✓ No missing values")
    else:
        print(f"  ✗ {len(columns_with_gaps)} columns with missing data")
        for col, pct in columns_with_gaps.items():
            print(f"    {col}: {pct:.1f}%")

    # Overall share of missing cells. Summing the per-column percentages would
    # exceed 100 as soon as several columns have gaps, so normalise by cell count.
    total_cells = df.size
    overall_missing_pct = float(missing_counts.sum()) / total_cells * 100 if total_cells else 0.0

    # Duplicates
    duplicates = df.duplicated().sum()
    duplicate_pct = duplicates / n_rows * 100 if n_rows else 0.0
    print(f"\nDuplicate Rows: {duplicates} ({duplicate_pct:.1f}%)")

    # Data types
    print("\nData Types:")
    for dtype, count in df.dtypes.value_counts().items():
        print(f"  {dtype}: {count} columns")

    # Numeric columns analysis
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        print("\nNumeric Columns Analysis:")
        for col in numeric_cols:
            data = df[col]
            center = data.mean()
            spread = data.std()
            print(f"  {col}:")
            print(f"    Range: {data.min():.2f} to {data.max():.2f}")
            print(f"    Mean: {center:.2f}, Std: {spread:.2f}")

            # Check for outliers (values beyond 3 standard deviations).
            # The z-score is undefined for a constant or single-value column
            # (std is 0 or NaN), so such columns have no outliers by definition.
            if pd.notna(spread) and spread > 0:
                outlier_count = int((((data - center) / spread).abs() > 3).sum())
                if outlier_count > 0:
                    print(f"    ⚠ Outliers: {outlier_count} ({outlier_count/len(data)*100:.1f}%)")

    return {
        'shape': df.shape,
        'missing_pct': overall_missing_pct,
        'duplicates': duplicates,
        'numeric_cols': len(numeric_cols)
    }

# Run the quality assessment on every loaded dataset, keyed by dataset name
quality_summary = {
    dataset_key: assess_data_quality(frame, dataset_key)
    for dataset_key, frame in datasets.items()
}

# One row per dataset, one column per metric
rule = '=' * 80
print(f"\n{rule}")
print("DATA QUALITY SUMMARY")
print(rule)
summary_df = pd.DataFrame(quality_summary).transpose()
print(summary_df)

## 9. Recommendations for Data Processing

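Based on the exploration above, the next cell prints the recommended data-processing steps. The list is static text rather than computed from the outputs, so revisit it if the results above change.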

In [None]:
print("\n" + "="*80)
print("DATA PROCESSING RECOMMENDATIONS")
print("="*80)

recommendations = [
    "1. PRIMARY DATASETS FOR ML TRAINING:",
    "   • weather_classification_data.csv - Best for labeled weather patterns",
    "   • rain_prediction_2500observations.csv - Good for binary classification",
    "   • GlobalWeatherRepository.csv - Rich feature set for current conditions",
    "",
    "2. DATA CLEANING PRIORITIES:",
    "   • Handle missing values using forward fill for time series data",
    "   • Remove duplicate records to avoid bias",
    "   • Standardize units (ensure consistent temperature, pressure, wind units)",
    "   • Validate extreme values and handle outliers appropriately",
    "",
    "3. FEATURE ENGINEERING OPPORTUNITIES:",
    "   • Create rolling window statistics (7-day averages, max, min)",
    "   • Calculate pressure drop rates and wind speed changes",
    "   • Engineer composite features (pressure + wind + precipitation)",
    "   • Create seasonal and location-based features",
    "",
    "4. DISASTER LABELING STRATEGY:",
    "   • Use extreme weather thresholds to create synthetic disaster labels",
    "   • Combine multiple weather factors for more accurate labeling",
    "   • Consider regional variations in extreme weather definitions",
    "",
    "5. MODEL TRAINING CONSIDERATIONS:",
    "   • Use stratified sampling to maintain class balance",
    "   • Consider ensemble methods for better prediction accuracy",
    "   • Implement cross-validation for robust model evaluation",
    "   • Monitor for overfitting with limited disaster examples"
]

for rec in recommendations:
    print(rec)

print("\n" + "="*80)
print("EXPLORATION COMPLETE - Ready for data preprocessing pipeline!")
print("="*80)