In [6]:
import pandas as pd
import numpy as np
import requests
from pyproj import Transformer

df = pd.read_csv('../data/conus_ak/fired_conus_ak_2000_to_2024_events.csv')

transformer = Transformer.from_crs("epsg:32610", "epsg:4326", always_xy=True)  # Adjust EPSG if UTM zone varies
df['lon'], df['lat'] = transformer.transform(df['ig_utm_x'].values, df['ig_utm_y'].values)

df['ig_date'] = pd.to_datetime(df['ig_date'])
df['last_date'] = pd.to_datetime(df['last_date'])

def get_event_dates(row):
    return [row['ig_date'] + pd.Timedelta(days=i) for i in range(int(row['event_dur']))]

df['event_dates'] = df.apply(get_event_dates, axis=1)

df.head()

Unnamed: 0,id,geometry,ig_date,ig_day,ig_month,ig_year,last_date,event_dur,tot_pix,tot_ar_km2,...,lc_name,lc_desc,lc_type,eco_mode,eco_name,eco_type,tot_perim,lon,lat,event_dates
0,1,MULTIPOLYGON (((-10010798.656358264 4834667.34...,2007-02-01,32,2,2007,2007-02-01,1,1,0.214659,...,Evergreen Needleleaf Forests,Dominated by evergreen conifer trees (canopy>2...,IGBP global vegetation classification scheme,10.0,Central Pacific coastal forests,WWF Terrestrial Ecoregions of the World,1861.250866,163.280081,14.680764,[2007-02-01 00:00:00]
1,3,MULTIPOLYGON (((-10013578.656358264 4818451.34...,2003-10-31,304,10,2003,2003-11-04,5,5,1.073293,...,Evergreen Needleleaf Forests,Dominated by evergreen conifer trees (canopy>2...,IGBP global vegetation classification scheme,10.0,Central Pacific coastal forests,WWF Terrestrial Ecoregions of the World,7419.250866,163.312547,14.634074,"[2003-10-31 00:00:00, 2003-11-01 00:00:00, 200..."
2,4,MULTIPOLYGON (((-10026087.656358264 4817061.34...,2007-02-02,33,2,2007,2007-02-02,1,2,0.429317,...,Evergreen Needleleaf Forests,Dominated by evergreen conifer trees (canopy>2...,IGBP global vegetation classification scheme,10.0,Central Pacific coastal forests,WWF Terrestrial Ecoregions of the World,2787.250866,163.281524,14.602774,[2007-02-02 00:00:00]
3,5,MULTIPOLYGON (((-10025624.656358264 4816598.34...,2013-10-30,303,10,2013,2013-10-30,1,1,0.214659,...,Evergreen Needleleaf Forests,Dominated by evergreen conifer trees (canopy>2...,IGBP global vegetation classification scheme,10.0,Central Pacific coastal forests,WWF Terrestrial Ecoregions of the World,1861.250866,163.283768,14.602657,[2013-10-30 00:00:00]
4,6,MULTIPOLYGON (((-10024234.656358264 4816134.84...,2013-10-30,303,10,2013,2013-10-30,1,1,0.214659,...,Evergreen Needleleaf Forests,Dominated by evergreen conifer trees (canopy>2...,IGBP global vegetation classification scheme,10.0,Central Pacific coastal forests,WWF Terrestrial Ecoregions of the World,1861.250866,163.28838,14.604608,[2013-10-30 00:00:00]


In [7]:
df_filtered = df[df['ig_date'] > '2016-01-01'].copy()
len(df_filtered)

122481

In [15]:
import json
import numpy as np
import pandas as pd
import time
import os
from datetime import datetime
from tqdm import tqdm

import openmeteo_requests
import requests_cache
from retry_requests import retry

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

OPEN_MATEO_API_KEY = "0VsL87d9nCTz6eRe"

weather_variables = [
    'weather_code',
    'temperature_2m_max',
    'temperature_2m_min',
    'apparent_temperature_max',
    'apparent_temperature_min',
    'precipitation_sum',
    'rain_sum',
    'snowfall_sum',
    'precipitation_hours',
    'sunshine_duration',
    'daylight_duration',
    'wind_speed_10m_max',
    'wind_gusts_10m_max',
    'wind_direction_10m_dominant',
    'shortwave_radiation_sum',
    'et0_fao_evapotranspiration'
]

def get_weather_history(lat, lon, ignition_date):
    """Get weather data for 14 days before ignition date using openmeteo_requests"""
    url = "https://customer-archive-api.open-meteo.com/v1/archive"
    
    # Convert ignition_date to datetime if it's a string
    if isinstance(ignition_date, str):
        ignition_date = pd.to_datetime(ignition_date)
    
    # Calculate the date 14 days before ignition
    preignition_date = (ignition_date - pd.Timedelta(days=14)).strftime('%Y-%m-%d')
    ignition_date_str = ignition_date.strftime('%Y-%m-%d')
    
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": preignition_date,
        "end_date": ignition_date_str,
        "daily": weather_variables,
        "timezone": "auto",
        "apikey": OPEN_MATEO_API_KEY
    }
    
    try:
        responses = openmeteo.weather_api(url, params=params)
        # Process first location
        response = responses[0]
        
        # Process daily data
        daily = response.Daily()
        
        # Create a dictionary to hold the data
        weather_data = {
            "date": pd.date_range(
                start=pd.to_datetime(daily.Time(), unit="s"),
                end=pd.to_datetime(daily.TimeEnd(), unit="s"),
                freq=pd.Timedelta(days=1),
                inclusive="left"
            )
        }
        
        # Process each weather variable - the order matters and must match the request
        for idx, var in enumerate(weather_variables):
            if daily.Variables(idx) is not None:
                weather_data[var] = daily.Variables(idx).ValuesAsNumpy()
        
        # Create DataFrame
        weather_df = pd.DataFrame(data=weather_data)
        
        # Add metadata
        weather_df['lat'] = lat
        weather_df['lon'] = lon
        weather_df['ignition_date'] = ignition_date_str
        weather_df['days_before_ignition'] = (ignition_date - weather_df['date']).dt.days
        
        return weather_df
    except Exception as e:
        print(f"Exception occurred: {e}")
        return None

# Create output directory if it doesn't exist
os.makedirs('../data/weather_data', exist_ok=True)

# Store fire event IDs with their weather data file paths
weather_metadata = []

# Process each fire event
with tqdm(total=len(df_filtered), desc="Processing fire events") as pbar:
    for idx, row in df_filtered.iterrows():
        fire_id = row['id']
        lat = row['lat']
        lon = row['lon']
        ig_date = row['ig_date']
        
        # Check if file already exists to avoid redundant API calls
        filename = f"../data/weather_data/fire_{fire_id}_{ig_date.strftime('%Y%m%d')}.csv"
        if os.path.exists(filename):
            weather_metadata.append({
                'fire_id': fire_id,
                'ignition_date': ig_date,
                'lat': lat,
                'lon': lon,
                'weather_file': filename
            })
            pbar.update(1)
            continue
        
        # Get weather data
        weather_df = get_weather_history(lat, lon, ig_date)
        
        if weather_df is not None:
            # Save to CSV
            weather_df.to_csv(filename, index=False)
            
            # Record metadata
            weather_metadata.append({
                'fire_id': fire_id,
                'ignition_date': ig_date,
                'lat': lat,
                'lon': lon,
                'weather_file': filename
            })
            
        # Update progress bar
        pbar.update(1)
        
        # The library handles rate limiting internally, but we can add a small
        # delay if needed for very large datasets
        if idx % 100 == 0 and idx > 0:
            time.sleep(1)

# Create a metadata file that links fire events to their weather data files
metadata_df = pd.DataFrame(weather_metadata)
metadata_df.to_csv('../data/weather_data/weather_metadata.csv', index=False)

print(f"Completed processing {len(df_filtered)} fire events.")
print(f"Successfully retrieved weather data for {len(weather_metadata)} events.")

# Helper function to load weather data
def load_fire_weather(fire_id=None, metadata_path='../data/weather_data/weather_metadata.csv'):
    """
    Load weather data for a specific fire or all fires
    """"
    
    # Load metadata
    metadata = pd.read_csv(metadata_path)
    
    if fire_id is not None:
        # Get the specific fire's metadata
        fire_meta = metadata[metadata['fire_id'] == fire_id]
        
        if len(fire_meta) == 0:
            print(f"No weather data found for fire ID {fire_id}")
            return None
        
        # Load the CSV file
        weather_file = fire_meta.iloc[0]['weather_file']
        return pd.read_csv(weather_file, parse_dates=['date'])
    else:
        # Load all fire weather data
        weather_data = {}
        for _, row in metadata.iterrows():
            fire_id = row['fire_id']
            weather_file = row['weather_file']
            weather_data[fire_id] = pd.read_csv(weather_file, parse_dates=['date'])
        
        return weather_data

Processing fire events: 100%|██████████| 122481/122481 [5:08:25<00:00,  6.62it/s]   


Completed processing 122481 fire events.
Successfully retrieved weather data for 122481 events.


In [16]:
metadata_df

Unnamed: 0,fire_id,ignition_date,lat,lon,weather_file
0,16,2018-10-22,14.617790,163.329689,../data/weather_data/fire_16_20181022.csv
1,19,2018-10-20,14.561815,163.289361,../data/weather_data/fire_19_20181020.csv
2,21,2023-09-16,14.538290,163.279034,../data/weather_data/fire_21_20230916.csv
3,25,2020-09-05,14.414217,163.174572,../data/weather_data/fire_25_20200905.csv
4,26,2018-10-20,14.518580,163.299238,../data/weather_data/fire_26_20181020.csv
...,...,...,...,...,...
122476,517086,2019-08-22,39.009169,164.844545,../data/weather_data/fire_517086_20190822.csv
122477,517087,2019-08-16,38.989999,164.892736,../data/weather_data/fire_517087_20190816.csv
122478,517088,2019-08-16,38.982260,164.920055,../data/weather_data/fire_517088_20190816.csv
122479,517089,2021-08-27,38.996342,164.884739,../data/weather_data/fire_517089_20210827.csv


In [28]:
load_fire_weather(25)

Unnamed: 0,date,weather_code,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,...,daylight_duration,wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration,lat,lon,ignition_date,days_before_ignition
0,2020-08-21 13:00:00,53.0,28.65,27.0,35.75303,31.450607,4.3,4.3,0.0,9.0,...,45088.44,19.770523,28.08,102.335045,23.79,5.094182,14.414217,163.174572,2020-09-05,14
1,2020-08-22 13:00:00,51.0,28.75,27.4,35.35414,31.85252,0.3,0.3,0.0,2.0,...,45044.695,16.595179,22.68,94.358986,25.46,5.370487,14.414217,163.174572,2020-09-05,13
2,2020-08-23 13:00:00,53.0,28.65,27.6,34.698513,31.49472,1.6,1.6,0.0,11.0,...,45000.504,19.201874,26.28,95.88589,22.4,4.758129,14.414217,163.174572,2020-09-05,12
3,2020-08-24 13:00:00,61.0,28.6,26.35,34.828796,30.270138,5.900001,5.900001,0.0,13.0,...,44955.9,17.81909,25.199999,99.2996,20.53,4.411226,14.414217,163.174572,2020-09-05,11
4,2020-08-25 13:00:00,53.0,28.3,26.55,35.07344,31.256077,2.1,2.1,0.0,7.0,...,44910.914,10.594036,17.64,166.17657,24.42,4.728853,14.414217,163.174572,2020-09-05,10
5,2020-08-26 13:00:00,51.0,29.1,27.55,35.261597,31.366676,0.3,0.3,0.0,3.0,...,44865.582,18.58451,26.64,95.97424,25.35,5.326609,14.414217,163.174572,2020-09-05,9
6,2020-08-27 13:00:00,51.0,29.5,28.35,34.966286,31.955772,0.9,0.9,0.0,7.0,...,44819.945,25.651104,36.719997,84.850655,24.26,5.530336,14.414217,163.174572,2020-09-05,8
7,2020-08-28 13:00:00,53.0,29.4,27.5,34.754337,30.838264,3.7,3.7,0.0,14.0,...,44774.023,26.319422,34.56,85.31822,23.65,5.390596,14.414217,163.174572,2020-09-05,7
8,2020-08-29 13:00:00,51.0,29.3,27.65,33.59569,30.989243,0.1,0.1,0.0,1.0,...,44727.863,27.607563,36.36,89.07594,24.9,5.590386,14.414217,163.174572,2020-09-05,6
9,2020-08-30 13:00:00,63.0,29.15,26.0,32.393158,27.856647,10.6,10.6,0.0,11.0,...,44681.492,29.17777,43.56,91.28523,10.19,2.967695,14.414217,163.174572,2020-09-05,5


In [29]:
def aggregate_weather_metrics(metadata_path='../data/weather_data/weather_metadata.csv'):
    """
    Aggregate statistical metrics from weather data for each fire ID.
    
    Parameters:
    - metadata_path: Path to the metadata CSV file
    
    Returns:
    - DataFrame with aggregated metrics for each fire ID
    """
    # Load metadata
    metadata_df = pd.read_csv(metadata_path)
    print(f"Found {len(metadata_df)} fire events in metadata")
    
    # List of weather variables to aggregate (excluding weather_code)
    weather_vars = [
        'temperature_2m_max',
        'temperature_2m_min',
        'apparent_temperature_max',
        'apparent_temperature_min',
        'precipitation_sum',
        'rain_sum',
        'snowfall_sum',
        'precipitation_hours',
        'sunshine_duration',
        'daylight_duration',
        'wind_speed_10m_max',
        'wind_gusts_10m_max',
        'wind_direction_10m_dominant',
        'shortwave_radiation_sum',
        'et0_fao_evapotranspiration'
    ]
    
    # Define aggregation functions
    agg_functions = {
        'mean': np.mean,
        'median': np.median,
        'min': np.min,
        'max': np.max,
        'std': np.std,
        'sum': np.sum,
        'range': lambda x: np.max(x) - np.min(x),
        'q25': lambda x: np.percentile(x, 25),
        'q75': lambda x: np.percentile(x, 75),
        'iqr': lambda x: np.percentile(x, 75) - np.percentile(x, 25),
        'last3_mean': lambda x: np.mean(x[-3:]) if len(x) >= 3 else np.nan,  # Mean of last 3 days
        'last7_mean': lambda x: np.mean(x[-7:]) if len(x) >= 7 else np.nan,  # Mean of last 7 days
    }
    
    # Variables to skip certain aggregations for (e.g., sum doesn't make sense for temperature)
    skip_sum = ['temperature_2m_max', 'temperature_2m_min', 'apparent_temperature_max', 
                'apparent_temperature_min', 'wind_direction_10m_dominant']
    
    # List to store aggregated data
    aggregated_data = []
    
    # Process each fire event
    for _, row in tqdm(metadata_df.iterrows(), total=len(metadata_df), desc="Aggregating metrics"):
        fire_id = row['fire_id']
        weather_file = row['weather_file']
        
        # Skip if file doesn't exist
        if not os.path.exists(weather_file):
            print(f"Warning: Weather file not found for fire ID {fire_id}: {weather_file}")
            continue
        
        try:
            # Load weather data
            weather_df = pd.read_csv(weather_file, parse_dates=['date'])
            
            # Check if all required columns are present
            missing_vars = [var for var in weather_vars if var not in weather_df.columns]
            if missing_vars:
                print(f"Warning: Missing variables for fire ID {fire_id}: {missing_vars}")
                continue
                
            # Check for minimum data points (at least 7 days of data)
            if len(weather_df) < 7:
                print(f"Warning: Insufficient data points for fire ID {fire_id}, found {len(weather_df)}")
                continue
            
            # Sort by days_before_ignition to ensure chronological order
            weather_df = weather_df.sort_values('days_before_ignition', ascending=False)
            
            # Create dictionary for this fire's aggregated metrics
            fire_metrics = {'fire_id': fire_id}
            
            # Add metadata
            fire_metrics['ignition_date'] = row['ignition_date']
            fire_metrics['lat'] = row['lat']
            fire_metrics['lon'] = row['lon']
            
            # Calculate metrics for each variable
            for var in weather_vars:
                if var not in weather_df.columns:
                    continue
                    
                # Skip rows with NaN values
                var_data = weather_df[var].dropna()
                
                if len(var_data) == 0:
                    continue
                
                # Apply each aggregation function
                for agg_name, agg_func in agg_functions.items():
                    # Skip sum for variables where it doesn't make sense
                    if agg_name == 'sum' and var in skip_sum:
                        continue
                        
                    try:
                        result = agg_func(var_data)
                        fire_metrics[f"{var}_{agg_name}"] = result
                    except Exception as e:
                        print(f"Error calculating {agg_name} for {var} in fire ID {fire_id}: {e}")
            
            # Add trend metrics (changes over time)
            for var in weather_vars:
                if var in skip_sum:  # Skip directional variables for trend analysis
                    continue
                    
                if var not in weather_df.columns:
                    continue
                    
                var_data = weather_df[var].dropna()
                
                if len(var_data) < 7:
                    continue
                
                # Calculate trend (slope) using last 14 days
                try:
                    days = np.arange(len(var_data))
                    if len(days) > 0 and len(var_data) > 0:
                        trend = np.polyfit(days, var_data, 1)[0]
                        fire_metrics[f"{var}_trend"] = trend
                except Exception as e:
                    print(f"Error calculating trend for {var} in fire ID {fire_id}: {e}")
            
            # Calculate temporal metrics (specific days before ignition)
            days_of_interest = [1, 2, 3, 5, 7, 14]
            for day in days_of_interest:
                day_data = weather_df[weather_df['days_before_ignition'] == day]
                if len(day_data) == 0:
                    continue
                    
                for var in weather_vars:
                    if var not in day_data.columns or pd.isna(day_data[var].iloc[0]):
                        continue
                    
                    fire_metrics[f"{var}_day{day}"] = day_data[var].iloc[0]
            
            # Add to list of aggregated data
            aggregated_data.append(fire_metrics)
            
        except Exception as e:
            print(f"Error processing fire ID {fire_id}: {e}")
    
    if not aggregated_data:
        print("No valid aggregated data found! :(")
        return None
        
    aggregated_df = pd.DataFrame(aggregated_data)
    
    # Print statistics
    print(f"Successfully aggregated metrics for {len(aggregated_df)} out of {len(metadata_df)} fire events")
    print(f"Total features: {len(aggregated_df.columns) - 4}")  # -4 for fire_id, ignition_date, lat, lon
    
    return aggregated_df

aggregated_metrics_df = aggregate_weather_metrics()
aggregated_metrics_df.to_csv('../data/aggregated_weather_metrics.csv', index=False)

print(f"Saved aggregated metrics to ../data/aggregated_weather_metrics.csv")
print("\nSample of aggregated metrics:")
print(aggregated_metrics_df.iloc[:5, :10])
print("\nMetric columns:")
for col in sorted(aggregated_metrics_df.columns):
    print(f"- {col}")

Found 122481 fire events in metadata


Aggregating metrics: 100%|██████████| 122481/122481 [19:01<00:00, 107.27it/s]


Successfully aggregated metrics for 122481 out of 122481 fire events
Total features: 275
Saved aggregated metrics to ../data/aggregated_weather_metrics.csv

Sample of aggregated metrics:
   fire_id ignition_date        lat         lon  temperature_2m_max_mean  \
0       16    2018-10-22  14.617790  163.329689                28.520000   
1       19    2018-10-20  14.561815  163.289361                28.543333   
2       21    2023-09-16  14.538290  163.279034                29.076667   
3       25    2020-09-05  14.414217  163.174572                29.013333   
4       26    2018-10-20  14.518580  163.299238                28.556667   

   temperature_2m_max_median  temperature_2m_max_min  temperature_2m_max_max  \
0                      28.60                   27.40                   29.45   
1                      28.65                   27.45                   29.35   
2                      29.10                   28.40                   29.65   
3                      29.10        

In [35]:
aggregated_metrics_df

Unnamed: 0,fire_id,ignition_date,lat,lon,temperature_2m_max_mean,temperature_2m_max_median,temperature_2m_max_min,temperature_2m_max_max,temperature_2m_max_std,temperature_2m_max_range,...,rain_sum_day14,snowfall_sum_day14,precipitation_hours_day14,sunshine_duration_day14,daylight_duration_day14,wind_speed_10m_max_day14,wind_gusts_10m_max_day14,wind_direction_10m_dominant_day14,shortwave_radiation_sum_day14,et0_fao_evapotranspiration_day14
0,16,2018-10-22,14.617790,163.329689,28.520000,28.60,27.40,29.45,0.493221,2.05,...,1.000000,0.0,7.0,40326.055,42887.695,22.870626,31.319998,70.831480,22.59,5.092394
1,19,2018-10-20,14.561815,163.289361,28.543333,28.65,27.45,29.35,0.478841,1.90,...,6.300000,0.0,16.0,39537.530,42983.520,20.150354,27.720000,67.918900,20.48,4.312694
2,21,2023-09-16,14.538290,163.279034,29.076667,29.10,28.40,29.65,0.358174,1.25,...,1.400000,0.0,8.0,42115.630,44628.650,23.006226,29.880000,72.853900,23.57,5.229069
3,25,2020-09-05,14.414217,163.174572,29.013333,29.10,28.30,29.65,0.363073,1.35,...,4.300000,0.0,9.0,42262.508,45088.440,19.770523,28.080000,102.335045,23.79,5.094182
4,26,2018-10-20,14.518580,163.299238,28.556667,28.70,27.60,29.30,0.453088,1.70,...,6.200000,0.0,12.0,39542.785,42986.562,19.469975,27.359999,66.598015,20.53,4.398854
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122476,517086,2019-08-22,39.009169,164.844545,20.633333,20.65,18.05,22.95,1.728648,4.90,...,8.400000,0.0,12.0,30172.955,50310.695,36.702347,48.600000,214.103550,16.30,2.333407
122477,517087,2019-08-16,38.989999,164.892736,21.213333,21.05,18.60,22.95,1.216753,4.35,...,0.300000,0.0,3.0,30163.836,51018.305,23.210928,31.319998,328.707030,17.56,3.003890
122478,517088,2019-08-16,38.982260,164.920055,21.146667,21.05,18.60,22.80,1.186095,4.20,...,0.100000,0.0,1.0,32964.047,51018.305,23.210928,31.319998,329.747620,17.80,3.031766
122479,517089,2021-08-27,38.996342,164.884739,18.873333,18.65,17.50,20.50,1.017983,3.00,...,0.100000,0.0,1.0,40647.742,49602.240,55.107050,74.880000,282.386870,20.41,4.925669


In [39]:
def join_fire_data_with_metrics(filtered_df, aggregated_metrics_df):
    """
    Join the filtered fire dataframe with the aggregated weather metrics dataframe.
    
    Parameters:
    - filtered_df: Original fire events dataframe
    - aggregated_metrics_df: Dataframe with aggregated weather metrics
    
    Returns:
    - Combined dataframe with fire data and weather metrics
    """
    print(f"Original filtered_df shape: {filtered_df.shape}")
    print(f"Aggregated metrics dataframe shape: {aggregated_metrics_df.shape}")
    
    # Rename columns in aggregated_metrics_df to avoid conflicts
    aggregated_metrics_df = aggregated_metrics_df.rename(columns={
        'ignition_date': 'metrics_ignition_date',
        'lat': 'metrics_lat',
        'lon': 'metrics_lon'
    })
    
    # Join dataframes on fire_id = id
    combined_df = pd.merge(
        filtered_df,
        aggregated_metrics_df,
        left_on='id',
        right_on='fire_id',
        how='inner',
        suffixes=('', '_metrics')
    )
    
    # Drop redundant columns
    columns_to_drop = ['fire_id', 'metrics_ignition_date']
    combined_df = combined_df.drop(columns=columns_to_drop)
    
    # Verify that the lat/lon values are similar
    if 'metrics_lat' in combined_df.columns and 'metrics_lon' in combined_df.columns:
        lat_diff = (combined_df['lat'] - combined_df['metrics_lat']).abs().mean()
        lon_diff = (combined_df['lon'] - combined_df['metrics_lon']).abs().mean()
        print(f"Average difference in lat: {lat_diff}")
        print(f"Average difference in lon: {lon_diff}")
        
        # Drop the metrics lat/lon columns
        combined_df = combined_df.drop(columns=['metrics_lat', 'metrics_lon'])
    
    print(f"Combined dataframe shape: {combined_df.shape}")
    print(f"Number of fires with complete data: {len(combined_df)}")
    
    return combined_df

combined_df = join_fire_data_with_metrics(df_filtered, aggregated_metrics_df)
combined_df.to_csv('../data/fire_events_with_weather_metrics.csv', index=False)

print(f"Saved combined dataset to ../data/fire_events_with_weather_metrics.csv")

print("\nCombined dataset summary:")
print(f"Total rows: {len(combined_df)}")
print(f"Total columns: {len(combined_df.columns)}")
print(f"Memory usage: {combined_df.memory_usage().sum() / 1024**2:.2f} MB")
print("\nSample of combined data (first 5 rows, selected columns):")

sample_columns = ['id', 'ig_date', 'lat', 'lon', 'event_dur']
weather_sample_cols = [col for col in combined_df.columns if 'temperature' in col and 'mean' in col][:3]
sample_columns.extend(weather_sample_cols)
print(combined_df[sample_columns].head())


missing_values = combined_df.isnull().sum()
print("\nColumns with missing values:")
print(missing_values[missing_values > 0].sort_values(ascending=False).head(10))

Original filtered_df shape: (122481, 35)
Aggregated metrics dataframe shape: (122481, 279)
Average difference in lat: 4.55789603218717e-16
Average difference in lon: 8.959448413287765e-16
Combined dataframe shape: (122481, 310)
Number of fires with complete data: 122481
Saved combined dataset to ../data/fire_events_with_weather_metrics.csv

Combined dataset summary:
Total rows: 122481
Total columns: 310
Memory usage: 289.68 MB

Sample of combined data (first 5 rows, selected columns):
   id    ig_date        lat         lon  event_dur  temperature_2m_max_mean  \
0  16 2018-10-22  14.617790  163.329689          1                28.520000   
1  19 2018-10-20  14.561815  163.289361          1                28.543333   
2  21 2023-09-16  14.538290  163.279034          1                29.076667   
3  25 2020-09-05  14.414217  163.174572          7                29.013333   
4  26 2018-10-20  14.518580  163.299238          1                28.556667   

   temperature_2m_max_last3_mean  t

In [45]:
combined_df.columns.tolist()

['id',
 'geometry',
 'ig_date',
 'ig_day',
 'ig_month',
 'ig_year',
 'last_date',
 'event_dur',
 'tot_pix',
 'tot_ar_km2',
 'fsr_px_dy',
 'fsr_km2_dy',
 'mx_grw_px',
 'mn_grw_px',
 'mu_grw_px',
 'mx_grw_km2',
 'mn_grw_km2',
 'mu_grw_km2',
 'mx_grw_dte',
 'x',
 'y',
 'ig_utm_x',
 'ig_utm_y',
 'lc_code',
 'lc_mode',
 'lc_name',
 'lc_desc',
 'lc_type',
 'eco_mode',
 'eco_name',
 'eco_type',
 'tot_perim',
 'lon',
 'lat',
 'event_dates',
 'temperature_2m_max_mean',
 'temperature_2m_max_median',
 'temperature_2m_max_min',
 'temperature_2m_max_max',
 'temperature_2m_max_std',
 'temperature_2m_max_range',
 'temperature_2m_max_q25',
 'temperature_2m_max_q75',
 'temperature_2m_max_iqr',
 'temperature_2m_max_last3_mean',
 'temperature_2m_max_last7_mean',
 'temperature_2m_min_mean',
 'temperature_2m_min_median',
 'temperature_2m_min_min',
 'temperature_2m_min_max',
 'temperature_2m_min_std',
 'temperature_2m_min_range',
 'temperature_2m_min_q25',
 'temperature_2m_min_q75',
 'temperature_2m_min_iqr

# Statistical Analyses on Weather Features

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import math
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split
import shap
import os
import warnings
warnings.filterwarnings('ignore')

combined_df = pd.read_csv('../data/fire_events_with_weather_metrics.csv')
print(f"Loaded dataset with {combined_df.shape[0]} rows and {combined_df.shape[1]} columns")

# Define target variables
target_vars = ['tot_ar_km2', 'fsr_km2_dy', 'mx_grw_km2', 'mu_grw_km2', 'tot_perim']

# Check if target variables exist in the dataset
missing_targets = [var for var in target_vars if var not in combined_df.columns]
if missing_targets:
    print(f"Warning: The following target variables are missing: {missing_targets}")
    target_vars = [var for var in target_vars if var in combined_df.columns]

# Basic statistics for target variables
print("\nBasic statistics for target variables:")
print(combined_df[target_vars].describe())

# Handle potential missing values in target variables
for var in target_vars:
    missing_pct = combined_df[var].isna().mean() * 100
    print(f"Missing values in {var}: {missing_pct:.2f}%")
    if missing_pct > 0:
        # Drop rows with missing target values
        combined_df = combined_df.dropna(subset=[var])

print(f"Dataset after handling missing values: {combined_df.shape[0]} rows")

# Identify wind direction columns
wind_dir_cols = [col for col in combined_df.columns if 'wind_direction' in col]
print(f"Found {len(wind_dir_cols)} wind direction columns")

# Identify wind speed columns
wind_speed_cols = [col for col in combined_df.columns if 'wind_speed' in col or 'wind_gusts' in col]
print(f"Found {len(wind_speed_cols)} wind speed columns")

# Identify other weather metric columns
weather_cols = [col for col in combined_df.columns if any(x in col for x in [
    'temperature', 'apparent', 'precipitation', 'rain', 'snow', 
    'radiation', 'evapotranspiration', 'sunshine', 'daylight'
]) and 'wind' not in col]

print(f"Found {len(weather_cols)} non-wind weather columns")

# Create additional wind direction features
def create_minimal_wind_features(df, wind_dir_cols, wind_speed_cols):
    """Create a minimal set of wind features that combine direction and speed"""
    processed_df = df.copy()
    created_features = []
    
    # Use the dominant wind direction if available, otherwise use another wind direction column
    main_dir_col = [col for col in wind_dir_cols if 'dominant' in col]
    if main_dir_col:
        main_dir_col = main_dir_col[0]
    elif wind_dir_cols:
        main_dir_col = wind_dir_cols[0]
    else:
        print("No wind direction columns found")
        return processed_df, created_features
    
    # Skip if column has all NaN values
    if main_dir_col not in processed_df.columns or processed_df[main_dir_col].isna().all():
        print(f"Wind direction column {main_dir_col} is invalid")
        return processed_df, created_features
    
    # Convert to radians
    wind_rad = np.radians(processed_df[main_dir_col])
    
    # Create north-south and east-west components and cos/sin components
    ns_col = "wind_northward"
    ew_col = "wind_eastward"
    
    processed_df[ns_col] = np.cos(wind_rad)
    processed_df[ew_col] = np.sin(wind_rad)
    
    created_features.extend([ns_col, ew_col])
    
    mean_speed_col = [col for col in wind_speed_cols if 'mean' in col and 'speed' in col]
    max_gust_col = [col for col in wind_speed_cols if 'max' in col and 'gusts' in col]
    
    if mean_speed_col:
        mean_speed_col = mean_speed_col[0]
        # Create northward and eastward mean wind velocities
        north_mean_vel = "wind_north_velocity"
        east_mean_vel = "wind_east_velocity"
        
        processed_df[north_mean_vel] = processed_df[ns_col] * processed_df[mean_speed_col]
        processed_df[east_mean_vel] = processed_df[ew_col] * processed_df[mean_speed_col]
        
        created_features.extend([north_mean_vel, east_mean_vel])
    
    if max_gust_col:
        max_gust_col = max_gust_col[0]
        # Create northward and eastward max gust velocities
        north_gust_vel = "gust_north_velocity"
        east_gust_vel = "gust_east_velocity"
        
        processed_df[north_gust_vel] = processed_df[ns_col] * processed_df[max_gust_col]
        processed_df[east_gust_vel] = processed_df[ew_col] * processed_df[max_gust_col]
        
        created_features.extend([north_gust_vel, east_gust_vel])
    
    return processed_df, created_features

print("\nCreating additional wind features...")
combined_df, wind_features = create_minimal_wind_features(combined_df, wind_dir_cols, wind_speed_cols)


weather_cols.extend(wind_speed_cols)
weather_cols.extend(wind_features)
print(f"Created {len(wind_features)} minimal wind features:")
for feature in wind_features:
    print(f"  - {feature}")

# Remove original wind direction columns from analysis
weather_cols = [col for col in weather_cols if 'wind_direction' not in col]

  from .autonotebook import tqdm as notebook_tqdm


Loaded dataset with 122481 rows and 310 columns

Basic statistics for target variables:
          tot_ar_km2     fsr_km2_dy     mx_grw_km2     mu_grw_km2  \
count  122481.000000  122481.000000  122481.000000  122481.000000   
mean        2.603060       0.445347       1.157670       0.883250   
std        27.973542       1.748412      10.442255       7.903565   
min         0.214659       0.058543       0.214659       0.214659   
25%         0.214659       0.214659       0.214659       0.214659   
50%         0.429317       0.214659       0.214659       0.214659   
75%         1.073293       0.429317       0.643976       0.536647   
max      3584.799844     232.904661    1920.551150    1608.782292   

          tot_perim  
count  1.224810e+05  
mean   6.230137e+03  
std    1.950468e+04  
min    1.861251e+03  
25%    1.861251e+03  
50%    2.787251e+03  
75%    5.568251e+03  
max    1.794116e+06  
Missing values in tot_ar_km2: 0.00%
Missing values in fsr_km2_dy: 0.00%
Missing values in mx

In [50]:
# 1. Correlation Analysis
# -----------------------
def analyze_correlations(df, features, targets, n_top=20):
    """Analyze correlations between features and target variables"""
    all_correlations = pd.DataFrame()
    
    for target in targets:
        # Calculate correlations with target
        correlations = pd.DataFrame({
            'feature': features,
            f'corr_with_{target}': [stats.spearmanr(df[feature], df[target], 
                                                  nan_policy='omit')[0] 
                                  for feature in features]
        })
        
        # Sort by absolute correlation
        correlations[f'abs_corr_with_{target}'] = correlations[f'corr_with_{target}'].abs()
        correlations = correlations.sort_values(f'abs_corr_with_{target}', ascending=False)
        
        if all_correlations.empty:
            all_correlations = correlations
        else:
            all_correlations = pd.merge(all_correlations, correlations, on='feature')
    
    # Return the top correlations
    return all_correlations.head(n_top)

# Calculate correlations
print("\nCalculating correlations between weather features and fire characteristics...")
correlations_df = analyze_correlations(combined_df, weather_cols, target_vars)
print("\nTop correlations with target variables:")
print(correlations_df)

# Visualize top correlations for each target
plt.figure(figsize=(15, 12))

for i, target in enumerate(target_vars):
    plt.subplot(len(target_vars), 1, i+1)
    
    # Get top 10 features by correlation magnitude
    top_features = correlations_df.nlargest(10, f'abs_corr_with_{target}')
    
    # Create horizontal bar plot
    colors = ['red' if x < 0 else 'blue' for x in top_features[f'corr_with_{target}']]
    sns.barplot(x=top_features[f'corr_with_{target}'], y=top_features['feature'], palette=colors)
    
    plt.title(f'Top 10 Weather Features Correlated with {target}')
    plt.xlabel('Spearman Correlation')
    plt.tight_layout()

os.makedirs('../figures', exist_ok=True)
plt.savefig('../figures/top_correlations.png')
plt.close()

Loaded dataset with 122481 rows and 310 columns

Basic statistics for target variables:
          tot_ar_km2     fsr_km2_dy     mx_grw_km2     mu_grw_km2  \
count  122481.000000  122481.000000  122481.000000  122481.000000   
mean        2.603060       0.445347       1.157670       0.883250   
std        27.973542       1.748412      10.442255       7.903565   
min         0.214659       0.058543       0.214659       0.214659   
25%         0.214659       0.214659       0.214659       0.214659   
50%         0.429317       0.214659       0.214659       0.214659   
75%         1.073293       0.429317       0.643976       0.536647   
max      3584.799844     232.904661    1920.551150    1608.782292   

          tot_perim  
count  1.224810e+05  
mean   6.230137e+03  
std    1.950468e+04  
min    1.861251e+03  
25%    1.861251e+03  
50%    2.787251e+03  
75%    5.568251e+03  
max    1.794116e+06  
Missing values in tot_ar_km2: 0.00%
Missing values in fsr_km2_dy: 0.00%
Missing values in mx

In [2]:
# Feature Selection and Model Optimization
# ------------------------------------------------
import time
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor
from sklearn.feature_selection import mutual_info_regression, VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import joblib

# Create directories if they don't exist
os.makedirs('../figures', exist_ok=True)
os.makedirs('../results', exist_ok=True)
os.makedirs('../models', exist_ok=True)

# 1. Mutual Information Analysis
# --------------------------------------
def analyze_mutual_information(X, y, feature_names, target_name, n_top=20):
    """
    Calculate mutual information between features and target
    to identify most informative features
    """
    print(f"\nCalculating mutual information for {target_name}...")
    
    mi_scores = mutual_info_regression(X, y, n_neighbors=3, random_state=42)
    mi_df = pd.DataFrame({
        'feature': feature_names,
        'mutual_info': mi_scores
    }).sort_values('mutual_info', ascending=False)
    
    # Save MI scores
    mi_df.to_csv(f'../results/mutual_info_{target_name}.csv', index=False)
    
    # Plot top features
    plt.figure(figsize=(12, 10))
    sns.barplot(x='mutual_info', y='feature', data=mi_df.head(n_top))
    plt.title(f'Top {n_top} Features by Mutual Information with {target_name}')
    plt.tight_layout()
    plt.savefig(f'../figures/mutual_info_{target_name}.png')
    plt.close()
    
    print(f"Top 5 features by mutual information for {target_name}:")
    for i, row in mi_df.head(5).iterrows():
        print(f"  {row['feature']}: {row['mutual_info']:.6f}")
    
    return mi_df

# 2. Correlation Analysis
# ----------------------
def analyze_correlations(X, y, feature_names, target_name, n_top=20):
    """Calculate correlations between features and target"""
    print(f"\nCalculating correlations for {target_name}...")
    
    # Calculate correlations
    correlations = []
    for i, feature in enumerate(feature_names):
        corr, _ = stats.spearmanr(X[:, i], y, nan_policy='omit')
        correlations.append((feature, corr))
    
    # Convert to DataFrame
    corr_df = pd.DataFrame(correlations, columns=['feature', 'correlation'])
    
    # Add absolute correlation
    corr_df['abs_correlation'] = corr_df['correlation'].abs()
    
    # Sort by absolute correlation
    corr_df = corr_df.sort_values('abs_correlation', ascending=False)
    
    # Save correlations
    corr_df.to_csv(f'../results/correlations_{target_name}.csv', index=False)
    
    # Plot top features
    plt.figure(figsize=(12, 10))
    colors = ['red' if x < 0 else 'blue' for x in corr_df.head(n_top)['correlation']]
    sns.barplot(x='correlation', y='feature', data=corr_df.head(n_top), palette=colors)
    plt.title(f'Top {n_top} Features by Correlation with {target_name}')
    plt.axvline(x=0, color='black', linestyle='-', alpha=0.3)
    plt.tight_layout()
    plt.savefig(f'../figures/correlations_{target_name}.png')
    plt.close()
    
    print(f"Top 5 features by correlation for {target_name}:")
    for i, row in corr_df.head(5).iterrows():
        print(f"  {row['feature']}: {row['correlation']:.6f}")
    
    return corr_df

# 3. Feature Selection
# ------------------
def select_features_from_rankings(mi_df, corr_df, n_from_each=15):
    """
    Select features based on mutual information and correlation rankings
    """
    # Get top features from MI
    top_mi_features = mi_df.head(n_from_each)['feature'].tolist()
    
    # Get top features from correlation
    top_corr_features = corr_df.head(n_from_each)['feature'].tolist()
    
    # Combine and remove duplicates
    selected_features = list(set(top_mi_features + top_corr_features))
    
    return selected_features

# 4. Target Variable Analysis
# -------------------------
def analyze_target_variable(y, target_name):
    """
    Analyze target variable for issues like skewness, outliers, etc.
    """
    print(f"\nAnalyzing target variable {target_name}...")
    
    # Calculate statistics - renamed from 'stats' to 'target_stats' to avoid conflict
    target_stats = {
        'count': len(y),
        'min': y.min(),
        'max': y.max(),
        'mean': y.mean(),
        'median': np.median(y),
        'std': y.std(),
        'skew': stats.skew(y)
    }
    
    print(f"  Statistics for {target_name}:")
    for stat, value in target_stats.items():
        print(f"    {stat}: {value}")
    
    # Check if target is very skewed
    if abs(target_stats['skew']) > 1:
        print(f"  WARNING: {target_name} is highly skewed ({target_stats['skew']:.2f})")
    
    # Plot distribution
    plt.figure(figsize=(12, 8))
    
    plt.subplot(2, 2, 1)
    plt.hist(y, bins=30)
    plt.title(f'Distribution of {target_name}')
    
    plt.subplot(2, 2, 2)
    plt.boxplot(y)
    plt.title(f'Boxplot of {target_name}')
    
    # Try log transformation if data is positive and skewed
    if target_stats['min'] > 0 and target_stats['skew'] > 0.5:
        plt.subplot(2, 2, 3)
        log_y = np.log1p(y)
        plt.hist(log_y, bins=30)
        plt.title(f'Log-transformed {target_name}')
        
        plt.subplot(2, 2, 4)
        plt.boxplot(log_y)
        plt.title(f'Boxplot of Log-transformed {target_name}')
    
    plt.tight_layout()
    plt.savefig(f'../figures/target_analysis_{target_name}.png')
    plt.close()
    
    # Check if target needs transformation
    needs_log_transform = target_stats['min'] > 0 and target_stats['skew'] > 0.5
    
    return {
        'stats': target_stats,
        'needs_log_transform': needs_log_transform
    }

# 5. Enhanced Model Evaluation with Proper Transformation Handling
# --------------------------------------------------------------
def evaluate_models_with_selected_features(X, y, feature_names, selected_features, target_name, log_transform=False):
    """
    Evaluate models with selected features.
    """
    # Get indices of selected features
    selected_indices = [feature_names.index(feature) for feature in selected_features]
    
    # Select features from X
    X_selected = X[:, selected_indices]
    
    # Log transform target if needed
    if log_transform and np.all(y > 0):
        print(f"Applying log transform to {target_name}")
        y_transformed = np.log1p(y)
    else:
        y_transformed = y
        log_transform = False  # Ensure it's False if we didn't transform
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X_selected, y_transformed, test_size=0.2, random_state=42
    )
    
    # Also keep original y_test for proper evaluation
    if log_transform:
        _, _, _, y_test_original = train_test_split(
            X_selected, y, test_size=0.2, random_state=42
        )
    else:
        y_test_original = y_test
    
    # Import additional models
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.ensemble import HistGradientBoostingRegressor
    from sklearn.neural_network import MLPRegressor
    
    models = {
        'Ridge': Pipeline([
            ('scaler', MinMaxScaler()),
            ('model', Ridge(alpha=0.1, solver='lsqr', random_state=42))
        ]),
        
        'Lasso': Pipeline([
            ('scaler', MinMaxScaler()),
            ('model', Lasso(alpha=0.001, max_iter=2000, selection='random', random_state=42))
        ]),
        
        'ElasticNet': Pipeline([
            ('scaler', MinMaxScaler()),
            ('model', ElasticNet(alpha=0.001, l1_ratio=0.3, max_iter=2000, random_state=42))
        ]),
        
        'GradientBoosting': Pipeline([
            ('scaler', MinMaxScaler()),
            ('model', GradientBoostingRegressor(n_estimators=150, learning_rate=0.1, 
                                             max_depth=5, subsample=0.8, random_state=42))
        ]),
        
        'RandomForest': Pipeline([
            ('scaler', MinMaxScaler()),
            ('model', RandomForestRegressor(n_estimators=200, max_depth=15, 
                                         min_samples_split=5, n_jobs=-1, random_state=42))
        ]),
        
        'KNeighbors': Pipeline([
            ('scaler', MinMaxScaler()),
            ('model', KNeighborsRegressor(n_neighbors=5, weights='distance', p=2))
        ]),
        
        'HistGradientBoosting': Pipeline([
            ('scaler', MinMaxScaler()),
            ('model', HistGradientBoostingRegressor(max_iter=200, learning_rate=0.1, 
                                                 max_depth=10, l2_regularization=0.1, random_state=42))
        ]),
        
        'MLPRegressor': Pipeline([
            ('scaler', MinMaxScaler()),
            ('model', MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', 
                                alpha=0.001, learning_rate='adaptive', max_iter=500, random_state=42))
        ])
    }
    
    results = {
        'model_name': [],
        'r2_score': [],
        'rmse': [],
        'mae': [],
        'train_time': [],
        'cv_r2_mean': [],
        'cv_r2_std': []
    }
    
    best_model = None
    best_r2 = -float('inf')
    best_y_pred = None
    
    # Create a custom scorer for cross-validation that transforms predictions back after log-transforming them
    if log_transform:
        def inverse_transform_scorer(estimator, X, y):
            y_pred = estimator.predict(X)
            # Transform predictions back to original space
            y_pred_original = np.expm1(y_pred)
            # Transform true values back
            y_original = np.expm1(y)
            # Calculate R² in original space
            return r2_score(y_original, y_pred_original)
        
        scoring = inverse_transform_scorer
    else:
        scoring = 'r2'
    
    # Cross-validation setup (3-fold for speed)
    cv = KFold(n_splits=3, shuffle=True, random_state=42)
    
    # Evaluate each model
    print(f"\nEvaluating models for {target_name} with {len(selected_features)} selected features:")
    
    for model_name, pipeline in models.items():
        try:
            start_time = time.time()
            
            # Train model
            pipeline.fit(X_train, y_train)
            
            # Record training time
            train_time = time.time() - start_time
            
            # Make predictions
            y_pred = pipeline.predict(X_test)
            
            # Transform predictions back to original space if needed
            if log_transform:
                y_pred_original = np.expm1(y_pred)
            else:
                y_pred_original = y_pred
            
            # Calculate metrics in ORIGINAL space
            r2 = r2_score(y_test_original, y_pred_original)
            rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_original))
            mae = mean_absolute_error(y_test_original, y_pred_original)
            
            cv_start = time.time()
            cv_scores = cross_val_score(pipeline, X_selected, y_transformed, 
                                       cv=cv, scoring=scoring, n_jobs=-1)
            cv_time = time.time() - cv_start
            
            # Store results
            results['model_name'].append(model_name)
            results['r2_score'].append(r2)
            results['rmse'].append(rmse)
            results['mae'].append(mae)
            results['train_time'].append(train_time)
            results['cv_r2_mean'].append(cv_scores.mean())
            results['cv_r2_std'].append(cv_scores.std())
            
            print(f"  {model_name}: R² = {r2:.4f}, CV R² = {cv_scores.mean():.4f}±{cv_scores.std():.4f}, Training Time = {train_time:.2f}s")
            
            # Track best model based on CV score
            if cv_scores.mean() > best_r2:
                best_r2 = cv_scores.mean()
                best_model = pipeline
                best_y_pred = y_pred_original
                best_y_test = y_test_original
                best_model_name = model_name
                
        except Exception as e:
            print(f"  Error training {model_name}: {str(e)}")

    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('cv_r2_mean', ascending=False)
    results_df.to_csv(f'../results/model_evaluation_{target_name}.csv', index=False)

    print(f"\nBest model for {target_name}: {best_model_name} (CV R² = {best_r2:.4f})")
    
    # Visualize model comparison
    plt.figure(figsize=(12, 8))
    
    # Plot CV R² scores with error bars
    plt.errorbar(
        x=results_df['model_name'], 
        y=results_df['cv_r2_mean'], 
        yerr=results_df['cv_r2_std'], 
        fmt='o', 
        capsize=5
    )
    
    plt.title(f'Model Cross-Validation R² Comparison for {target_name}')
    plt.ylabel('Cross-Validation R² Score')
    plt.xlabel('Model')
    plt.xticks(rotation=45)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(f'../figures/model_comparison_{target_name}.png')
    plt.close()
    
    # Plot best model predictions
    plt.figure(figsize=(12, 6))
    
    plt.subplot(1, 2, 1)
    plt.scatter(best_y_test, best_y_pred, alpha=0.5)
    plt.plot([min(best_y_test), max(best_y_test)], [min(best_y_test), max(best_y_test)], 'r--')
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title(f'Actual vs Predicted for {target_name} (Original Scale)')
    
    plt.subplot(1, 2, 2)
    residuals = best_y_test - best_y_pred
    plt.scatter(best_y_pred, residuals, alpha=0.5)
    plt.axhline(y=0, color='r', linestyle='--')
    plt.xlabel('Predicted')
    plt.ylabel('Residuals')
    plt.title('Residual Plot (Original Scale)')
    
    plt.tight_layout()
    plt.savefig(f'../figures/best_model_performance_{target_name}.png')
    plt.close()
    
    # Extract feature importance from the model if available
    model = best_model.named_steps['model']
    
    if hasattr(model, 'feature_importances_'):
        importance = pd.DataFrame({
            'feature': [selected_features[i] for i in range(len(selected_features))],
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        # Plot importance
        plt.figure(figsize=(12, 8))
        sns.barplot(x='importance', y='feature', data=importance.head(20))
        plt.title(f'Feature Importance for {target_name}')
        plt.tight_layout()
        plt.savefig(f'../figures/feature_importance_{target_name}.png')
        plt.close()
        
        # Save importance
        importance.to_csv(f'../results/feature_importance_{target_name}.csv', index=False)
    elif hasattr(model, 'coef_'):
        # For linear models
        if len(np.array(model.coef_).shape) == 1:
            coefs = model.coef_
        else:
            coefs = model.coef_[0]  # For multi-output models
            
        importance = pd.DataFrame({
            'feature': [selected_features[i] for i in range(len(selected_features))],
            'importance': np.abs(coefs)
        }).sort_values('importance', ascending=False)
        
        # Plot importance
        plt.figure(figsize=(12, 8))
        sns.barplot(x='importance', y='feature', data=importance.head(20))
        plt.title(f'Feature Importance for {target_name}')
        plt.tight_layout()
        plt.savefig(f'../figures/feature_importance_{target_name}.png')
        plt.close()
        
        # Save importance
        importance.to_csv(f'../results/feature_importance_{target_name}.csv', index=False)
    
    # Create a function for making predictions with this model
    def make_prediction(X_new, transform_output=log_transform):
        """Make predictions with best model, handling transformations"""
        # Ensure X_new contains only the selected features in the right order
        X_new_selected = np.zeros((X_new.shape[0], len(selected_indices)))
        for i, idx in enumerate(selected_indices):
            if idx < X_new.shape[1]:
                X_new_selected[:, i] = X_new[:, idx]
        
        # Make prediction
        y_pred = best_model.predict(X_new_selected)
        
        # Transform back if needed
        if transform_output:
            return np.expm1(y_pred)
        else:
            return y_pred
    
    # Return results and best model
    return {
        'results_df': results_df,
        'best_model': best_model,
        'best_model_name': best_model_name,
        'best_r2': best_r2,
        'selected_features': selected_features,
        'log_transform': log_transform,
        'make_prediction': make_prediction
    }

# 6. Run analysis pipeline
numeric_cols = combined_df[weather_cols].select_dtypes(include=np.number).columns.tolist()
print(f"\nUsing {len(numeric_cols)} numeric weather features for analysis")

# Handle missing values in features
X = combined_df[numeric_cols].copy()
X = X.fillna(X.mean())

print("Applying MinMaxScaler to features...")
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Save feature names and scaler
joblib.dump(scaler, '../models/feature_scaler.pkl')
with open('../models/feature_names.txt', 'w') as f:
    for feature in numeric_cols:
        f.write(f"{feature}\n")

all_results = {}

for target in target_vars:
    print(f"\n{'='*80}")
    print(f"Analyzing {target}")
    print(f"{'='*80}")
    
    # Get target data
    y = combined_df[target].copy()
    
    # Remove rows with missing target values
    valid_idx = ~y.isna()
    y_valid = y[valid_idx].values
    X_valid = X_scaled[valid_idx]
    
    # Analyze target variable
    target_analysis = analyze_target_variable(y_valid, target)
    
    # Calculate mutual information
    mi_df = analyze_mutual_information(X_valid, y_valid, numeric_cols, target)
    
    # Calculate correlations
    corr_df = analyze_correlations(X_valid, y_valid, numeric_cols, target)
    
    # Select features
    selected_features = select_features_from_rankings(mi_df, corr_df)
    print(f"\nSelected {len(selected_features)} features for {target}:")
    for feature in selected_features[:10]:
        print(f"  - {feature}")
    if len(selected_features) > 10:
        print(f"  - ... and {len(selected_features) - 10} more")
    
    # Evaluate models with selected features
    model_results = evaluate_models_with_selected_features(
        X_valid, y_valid, numeric_cols, selected_features, target, 
        log_transform=target_analysis['needs_log_transform']
    )
    
    # Store all results
    all_results[target] = {
        'target_analysis': target_analysis,
        'mutual_info': mi_df,
        'correlations': corr_df,
        'selected_features': selected_features,
        'model_results': model_results
    }
    
    # Save best model
    # joblib.dump(model_results['best_model'], f'../models/best_model_{target}.pkl')
    
    # # Save selected features
    # with open(f'../models/selected_features_{target}.txt', 'w') as f:
    #     for feature in selected_features:
    #         f.write(f"{feature}\n")

# 7. Create comprehensive summary report
# ------------------------------------
def create_summary_report(all_results):
    """Create a comprehensive summary of all results"""
    summary = []
    
    for target, results in all_results.items():
        model_results = results['model_results']
        
        row = {
            'target': target,
            'best_model': model_results['best_variant'],
            'r2_score': model_results['best_r2'],
            'cv_r2_mean': model_results['cv_scores'].mean(),
            'cv_r2_std': model_results['cv_scores'].std(),
            'num_features': len(model_results['selected_features']),
            'log_transform': model_results['log_transform'],
            'top_mi_feature': results['mutual_info'].iloc[0]['feature'],
            'top_corr_feature': results['correlations'].iloc[0]['feature'],
            'target_skew': results['target_analysis']['stats']['skew']
        }
        
        summary.append(row)
    
    # Convert to DataFrame
    summary_df = pd.DataFrame(summary)
    
    # Sort by R² score
    summary_df = summary_df.sort_values('r2_score', ascending=False)
    
    # Save summary
    summary_df.to_csv('../results/analysis_summary.csv', index=False)
    
    # Create a bar chart of R² scores
    plt.figure(figsize=(12, 6))
    sns.barplot(x='target', y='r2_score', data=summary_df)
    plt.title('Model Performance by Target Variable')
    plt.ylabel('R² Score')
    plt.xlabel('Target Variable')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('../figures/overall_performance.png')
    plt.close()
    
    return summary_df

# Generate summary
print("\nGenerating summary report...")
summary_df = create_summary_report(all_results)

print("\nAnalysis Summary:")
print(summary_df)

print("\nAnalysis complete! Results saved to the 'figures', 'models', and 'results' directories.")


Using 264 numeric weather features for analysis
Applying MinMaxScaler to features...

Analyzing tot_ar_km2

Analyzing target variable tot_ar_km2...
  Statistics for tot_ar_km2:
    count: 122481
    min: 0.2146586732964775
    max: 3584.7998440511747
    mean: 2.603060423339451
    median: 0.429317346592955
    std: 27.973428019759016
    skew: 60.99132860103478

Calculating mutual information for tot_ar_km2...
Top 5 features by mutual information for tot_ar_km2:
  sunshine_duration_max: 0.036338
  daylight_duration_day14: 0.035048
  sunshine_duration_q75: 0.034606
  daylight_duration_median: 0.032555
  apparent_temperature_min_max: 0.031719

Calculating correlations for tot_ar_km2...
Top 5 features by correlation for tot_ar_km2:
  daylight_duration_trend: 0.133740
  wind_east_velocity: 0.106368
  wind_north_velocity: 0.102912
  gust_east_velocity: 0.102811
  gust_north_velocity: 0.102759

Selected 28 features for tot_ar_km2:
  - daylight_duration_day2
  - daylight_duration_trend
  - 

KeyboardInterrupt: 

# ML Analysis with Additional Categorical Features