In [27]:
import streamlit as st
import pandas as pd
import geopandas as gpd
import folium
from streamlit_folium import st_folium
from folium.plugins import Draw
from shapely.geometry import Polygon
import joblib
import requests
from datetime import datetime, timedelta
import numpy as np
import os
import logging

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")


In [28]:
# src/utils.py

import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon
import joblib
import os
import streamlit as st

# Suppress warnings globally in utilities
import warnings
warnings.filterwarnings("ignore")

@st.cache_data
def load_data(csv_path):
    """
    Read a CSV file into a DataFrame, parsing its 'date' column as datetimes.

    Parameters:
        csv_path (str): Location of the CSV file to read.

    Returns:
        DataFrame: Contents of the file, as returned by pandas.read_csv.
    """
    return pd.read_csv(csv_path, parse_dates=['date'])

@st.cache_data
def load_grid(shapefile_path):
    """
    Read the grid geometry file into a GeoDataFrame.

    Parameters:
        shapefile_path (str): Location of the file handed to geopandas.read_file.

    Returns:
        GeoDataFrame: The layer read from the file.
    """
    return gpd.read_file(shapefile_path)

@st.cache_data
def load_grid_centroids(_grid_gdf):
    """
    Compute cell centroids in the grid's own CRS and reproject them to EPSG:4326.

    The decorator is applied once; stacking it twice only adds a second,
    redundant cache layer around the first.

    NOTE(review): per the Streamlit docs, a parameter whose name starts with an
    underscore is excluded from the cache key, so a later call with a different
    grid would return the first cached result - confirm only one grid is used.

    Parameters:
        _grid_gdf (GeoDataFrame): Grid cells with a 'cell_id' column. Centroids
            are taken in whatever CRS the frame carries (presumably a projected
            CRS such as EPSG:3310 - TODO confirm).

    Returns:
        DataFrame: Columns 'cell_id', 'centroid_lon', 'centroid_lat'. The
        geometry column is not included in the result.
    """
    # Centroids are computed before reprojecting, in the grid's own CRS
    centroids_projected = _grid_gdf.geometry.centroid

    # Pair each centroid with its cell_id, keeping the source CRS
    gdf_centroids = gpd.GeoDataFrame(
        _grid_gdf[['cell_id']].copy(),
        geometry=centroids_projected,
        crs=_grid_gdf.crs
    )

    # Reproject centroids to geographic CRS (EPSG:4326)
    gdf_centroids = gdf_centroids.to_crs(epsg=4326)

    # After reprojection, x is longitude and y is latitude
    gdf_centroids['centroid_lon'] = gdf_centroids.geometry.x
    gdf_centroids['centroid_lat'] = gdf_centroids.geometry.y

    # Return only necessary columns
    return gdf_centroids[['cell_id', 'centroid_lon', 'centroid_lat']]

def load_models(model_path='/Users/tobiascanavesi/Documents/wildifre_prevention/models/', scaler_path='/Users/tobiascanavesi/Documents/wildifre_prevention/models/scaler.joblib'):
    """
    Load the scaler and every per-cluster model found in a directory.

    Files are picked up when their name starts with 'LightGBM_cluster_' and ends
    with '.joblib'; the text between the last underscore and '.joblib' is parsed
    as the integer cluster key.

    Parameters:
        model_path (str): Directory that is listed for model files.
        scaler_path (str): File the scaler is loaded from.

    Returns:
        tuple: (models, scaler) where models maps cluster number -> loaded
        object and scaler is whatever joblib.load returns for scaler_path.

    Raises:
        FileNotFoundError: If scaler_path does not exist.
    """
    # The scaler is checked and loaded before the model directory is touched.
    if not os.path.exists(scaler_path):
        raise FileNotFoundError(f"Scaler file not found at {scaler_path}")
    scaler = joblib.load(scaler_path)

    prefix, suffix = 'LightGBM_cluster_', '.joblib'
    models = {}
    for file in os.listdir(model_path):
        if not (file.startswith(prefix) and file.endswith(suffix)):
            continue
        try:
            cluster_token = file.split('_')[-1].split(suffix)[0]
            cluster = int(cluster_token)
            models[cluster] = joblib.load(os.path.join(model_path, file))
        except ValueError:
            # A ValueError here is reported in the UI and the file is skipped.
            st.warning(f"Could not parse cluster number from {file}. Skipping this file.")
    return models, scaler


2025-01-22 16:07:37.056 No runtime found, using MemoryCacheStorageManager
2025-01-22 16:07:37.059 No runtime found, using MemoryCacheStorageManager
2025-01-22 16:07:37.059 No runtime found, using MemoryCacheStorageManager
2025-01-22 16:07:37.060 No runtime found, using MemoryCacheStorageManager


In [64]:
# Logging: INFO level, "<timestamp> - <level> - <message>" lines
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Spatial inputs: the grid polygons and the lon/lat centroid of each cell
GRID_SHP_PATH = '/Users/tobiascanavesi/Documents/wildifre_prevention/data/raw/ca_grid_10km.shp'
grid_gdf = load_grid(GRID_SHP_PATH)
grid_centroids = load_grid_centroids(grid_gdf)

# Per-cluster models and the scaler, loaded from load_models' default paths
models, scaler = load_models()

def map_selection_interface():
    """
    Render a drawable map and return the grid cells inside the drawn shapes.

    A folium map with polygon/rectangle drawing tools is rendered through
    st_folium. Every drawn geometry of GeoJSON type 'Polygon' is reprojected to
    the CRS of the module-level ``grid_gdf`` and joined against it with the
    'within' predicate, i.e. only cells lying entirely inside a drawn shape are
    selected.

    Returns:
        list: Unique 'cell_id' values of the selected cells, in first-seen
        order. Empty when nothing has been drawn yet.
    """
    st.sidebar.header("Spatial Selection")

    # Initial view: zoom level 6 centred on the coordinates below
    m = folium.Map(location=[36.7783, -119.4179], zoom_start=6)
    Draw(
        export=True,
        position='topleft',
        draw_options={'polygon': True, 'rectangle': True, 'circle': False}
    ).add_to(m)

    map_output = st_folium(m, width=700, height=500)

    # st_folium includes the 'all_drawings' key even before anything is drawn,
    # with value None; treat None (or a missing key) as "no drawings" instead
    # of iterating over it.
    drawings = (map_output or {}).get('all_drawings') or []

    selected_cells = []
    for drawing in drawings:
        geometry = (drawing or {}).get('geometry') or {}
        if geometry.get('type') != 'Polygon':
            continue
        rings = geometry.get('coordinates') or []
        if not rings:
            continue

        # First ring is the exterior boundary; drawn coordinates are lon/lat
        polygon = Polygon(rings[0])
        gdf_poly = gpd.GeoDataFrame(
            geometry=[polygon],
            crs="EPSG:4326"
        ).to_crs(grid_gdf.crs)

        # grid_gdf is already in its own CRS, so it is joined as-is
        selected = gpd.sjoin(grid_gdf, gdf_poly, predicate='within')
        selected_cells.extend(selected['cell_id'].tolist())

    # De-duplicate while keeping a stable, first-seen order
    return list(dict.fromkeys(selected_cells))

def station_selection_interface():
    """
    Show a sidebar multiselect of weather stations and return the chosen ids.

    Station id/name pairs are read from the merged forecast CSV. When that file
    does not exist, nothing is rendered and an empty list is returned.

    Returns:
        list: Unique 'station_id' values whose 'station_name' was selected.
    """
    merged_path = '/Users/tobiascanavesi/Documents/wildifre_prevention/data/processed/merged_future_dataset.csv'
    if not os.path.exists(merged_path):
        return []

    # Only the two station columns are needed; collapse repeated pairs
    stations = pd.read_csv(
        merged_path, usecols=['station_id', 'station_name']
    ).drop_duplicates()

    st.sidebar.header("Weather Data Selection")
    chosen_names = st.sidebar.multiselect(
        "Select Weather Stations",
        options=stations['station_name'].unique(),
        help="Select stations to use for weather data"
    )

    # Translate the selected names back to their station ids
    name_mask = stations['station_name'].isin(chosen_names)
    return list(stations.loc[name_mask, 'station_id'].unique())

def prepare_features(forecast_df, scaler, feature_names_path=None, numerical_features_path=None):
    """
    Prepare features for prediction.

    Builds calendar features, per-cell rolling means and an NDVI lag from the
    forecast rows, one-hot encodes the categorical calendar columns, aligns the
    result to the training feature list and scales the numerical columns.

    The input frame is never modified: all work happens on a copy, so the
    function can be called repeatedly on the same frame.

    Parameters:
        forecast_df (DataFrame): Rows with 'cell_id', 'date', 'cluster',
            'forecast_precip_in', 'forecast_tmax_F', 'forecast_tmin_F' and
            'ndvi_latest'. Rolling features follow the existing row order
            within each cell (assumes rows are already sorted by date -
            TODO confirm).
        scaler: Object exposing ``transform`` for the numerical columns.
        feature_names_path (str, optional): Text file with one model feature
            name per line. Defaults to the project's models/feature_names.txt.
        numerical_features_path (str, optional): Text file with one numerical
            column name per line. Defaults to the project's
            models/numerical_features.txt.

    Returns:
        DataFrame: The listed features in file order, followed by 'cell_id',
        'date' and 'cluster'.

    Raises:
        KeyError: If a required input column is missing.
        FileNotFoundError: If either feature list file does not exist.
    """
    logging.info("Starting feature preparation...")

    if feature_names_path is None:
        feature_names_path = '/Users/tobiascanavesi/Documents/wildifre_prevention/models/feature_names.txt'
    if numerical_features_path is None:
        numerical_features_path = '/Users/tobiascanavesi/Documents/wildifre_prevention/models/numerical_features.txt'

    meta_cols = ['cell_id', 'date', 'cluster']
    rolling_sources = {
        'precip': 'forecast_precip_in',
        'tmax': 'forecast_tmax_F',
        'tmin': 'forecast_tmin_F',
        'ndvi': 'ndvi_latest',
    }

    missing_inputs = [
        col for col in meta_cols + list(rolling_sources.values())
        if col not in forecast_df.columns
    ]
    if missing_inputs:
        raise KeyError(f"Missing required input columns: {missing_inputs}")

    def read_name_list(path):
        # One name per line; blank lines would otherwise become a '' feature
        with open(path, 'r') as handle:
            return [line.strip() for line in handle if line.strip()]

    # Read both lists up front so a missing file fails before any computation
    feature_names = read_name_list(feature_names_path)
    numerical_cols = read_name_list(numerical_features_path)

    # Work on a private copy: callers typically pass a filtered slice, and
    # mutating it in place made a second call fail on the renamed columns.
    forecast_df = forecast_df.copy()

    # Feature Engineering
    forecast_df['date'] = pd.to_datetime(forecast_df['date'])
    forecast_df['day_of_year'] = forecast_df['date'].dt.dayofyear
    forecast_df['day_of_month'] = forecast_df['date'].dt.day
    forecast_df['month'] = forecast_df['date'].dt.month_name()
    forecast_df['day_of_week'] = forecast_df['date'].dt.day_name()

    # Any month not listed here (including a missing one) maps to 'Fall'
    season_by_month = {
        'December': 'Winter', 'January': 'Winter', 'February': 'Winter',
        'March': 'Spring', 'April': 'Spring', 'May': 'Spring',
        'June': 'Summer', 'July': 'Summer', 'August': 'Summer',
    }
    forecast_df['season'] = forecast_df['month'].map(season_by_month).fillna('Fall')

    # Moving averages, computed independently for every cell
    window_sizes = [7, 15, 30, 60, 90, 180, 360]
    for window in window_sizes:
        for prefix, source_col in rolling_sources.items():
            forecast_df[f'{prefix}_ma_{window}d'] = (
                forecast_df.groupby('cell_id')[source_col]
                .transform(lambda x, w=window: x.rolling(w, min_periods=1).mean())
            )

    # NDVI lag feature
    forecast_df['ndvi_lag_1d'] = forecast_df.groupby('cell_id')['ndvi_latest'].shift(1)

    # Handle missing values (fillna(method=...) is deprecated in pandas).
    # NOTE(review): the fill runs over the whole frame, not per cell, so the
    # first row of a cell can inherit values from the previous cell. Kept as-is
    # to match existing behaviour - confirm against the training pipeline.
    forecast_df = forecast_df.ffill().bfill()

    # Rename forecast columns
    forecast_df = forecast_df.rename(columns={
        'forecast_precip_in': 'precip_in',
        'forecast_tmax_F': 'tmax_F',
        'forecast_tmin_F': 'tmin_F',
        'ndvi_latest': 'ndvi'
    })

    # One-Hot Encoding.
    # No drop_first here: a short forecast window usually holds a single month
    # and season, and drop_first would discard exactly that category, leaving
    # its indicator at 0. Every present category is encoded instead; indicators
    # the model was not trained on are removed by the alignment below.
    forecast_encoded = pd.get_dummies(forecast_df, columns=['month', 'day_of_week', 'season'])

    # Align to the training layout: report and drop extras, add missing as 0
    ordered_cols = feature_names + meta_cols
    extra_features = set(forecast_encoded.columns) - set(ordered_cols)
    if extra_features:
        logging.warning(f"Dropping extra features: {extra_features}")
    forecast_encoded = forecast_encoded.reindex(columns=ordered_cols, fill_value=0)

    # Scale features
    try:
        forecast_encoded[numerical_cols] = scaler.transform(forecast_encoded[numerical_cols])
    except Exception as e:
        logging.error(f"Scaling error: {e}")
        raise

    logging.info("Feature preparation completed.")
    return forecast_encoded

def make_predictions(forecast_encoded, models):
    """
    Make predictions using cluster-specific models.

    Rows are grouped by their 'cluster' value and each group is scored with one
    batched call to the matching model, instead of two model calls per row.
    Output rows keep the order of the input rows.

    Rows are skipped (best effort) when their cluster is missing or not
    convertible to int, when ``models`` has no entry for the cluster, or when
    scoring the cluster's batch raises; the last two cases are logged.

    Parameters:
        forecast_encoded (DataFrame): Feature columns plus 'cell_id', 'date'
            and 'cluster'. Every other column is passed to the model as a
            float feature.
        models (dict): Maps integer cluster number to an object exposing
            ``predict_proba`` and ``predict``.

    Returns:
        DataFrame: Columns 'cell_id', 'date', 'fire_probability' (second
        column of predict_proba), 'fire_prediction' and 'cluster'.

    Raises:
        ValueError: If a required column is missing, or if no row could be
            scored.
    """
    meta_cols = ['cell_id', 'date', 'cluster']
    if not set(meta_cols).issubset(forecast_encoded.columns):
        raise ValueError("Missing required columns in input data")

    scored_chunks = []
    # .indices maps each cluster value to the integer positions of its rows;
    # rows whose cluster is NaN are left out by groupby.
    cluster_positions = forecast_encoded.groupby('cluster', sort=False).indices
    for cluster_key, positions in cluster_positions.items():
        try:
            cluster = int(cluster_key)
        except (TypeError, ValueError) as e:
            logging.error(f"Prediction error for cluster {cluster_key!r}: {str(e)}")
            continue

        # Compare against None explicitly: an estimator that defines __len__
        # (or __bool__) must not be skipped just because it is "falsy".
        model = models.get(cluster)
        if model is None:
            logging.warning(f"No model for cluster {cluster}; skipping {len(positions)} rows")
            continue

        block = forecast_encoded.iloc[positions]
        try:
            features = block.drop(columns=meta_cols).to_numpy(dtype=float)
            proba = model.predict_proba(features)[:, 1]
            pred = model.predict(features)
        except Exception as e:
            logging.error(f"Prediction error for cluster {cluster}: {str(e)}")
            continue

        chunk = block[['cell_id', 'date']].reset_index(drop=True)
        chunk['fire_probability'] = proba
        chunk['fire_prediction'] = pred
        chunk['cluster'] = cluster
        chunk['_row_pos'] = positions  # remembered to restore input order
        scored_chunks.append(chunk)

    if not scored_chunks:
        raise ValueError("No predictions generated - check input data and models")

    return (
        pd.concat(scored_chunks, ignore_index=True)
        .sort_values('_row_pos')
        .drop(columns='_row_pos')
        .reset_index(drop=True)
    )


In [69]:





# Read the merged forecast dataset, parsing 'date' as datetimes
merged_path = '/Users/tobiascanavesi/Documents/wildifre_prevention/data/processed/merged_future_dataset.csv'
merged_df = pd.read_csv(
    merged_path,
    parse_dates=['date'],
)


    

In [70]:
# Display the merged forecast table as loaded from the CSV
merged_df

Unnamed: 0,cell_id,station_id,station_name,station_name.1,date,forecast_tmax_F,forecast_tmin_F,forecast_precip_in,ndvi_latest,fire_occurred,cluster
0,106,GHCND:USW00023234,Los Angeles International Airport,Los Angeles International Airport,2025-01-22,74.0,49.0,0.00,0.322861,,0
1,106,GHCND:USW00023234,Los Angeles International Airport,Los Angeles International Airport,2025-01-23,78.0,48.0,0.00,0.322861,,0
2,106,GHCND:USW00023234,Los Angeles International Airport,Los Angeles International Airport,2025-01-24,68.0,46.0,0.02,0.322861,,0
3,106,GHCND:USW00023234,Los Angeles International Airport,Los Angeles International Airport,2025-01-25,57.0,45.0,0.07,0.322861,,0
4,106,GHCND:USW00023234,Los Angeles International Airport,Los Angeles International Airport,2025-01-26,57.0,44.0,0.00,0.322861,,0
...,...,...,...,...,...,...,...,...,...,...,...
68749,9009,GHCND:USW00093134,San Francisco International Airport,San Francisco International Airport,2025-01-24,61.0,44.0,0.00,0.322861,,0
68750,9009,GHCND:USW00093134,San Francisco International Airport,San Francisco International Airport,2025-01-25,57.0,42.0,0.00,0.322861,,0
68751,9009,GHCND:USW00093134,San Francisco International Airport,San Francisco International Airport,2025-01-26,58.0,42.0,0.00,0.322861,,0
68752,9009,GHCND:USW00093134,San Francisco International Airport,San Francisco International Airport,2025-01-27,59.0,42.0,0.00,0.322861,,0


In [71]:
# Clean duplicate columns
merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]
merged_df['cell_id'] = merged_df['cell_id'].astype(str)
merged_df['station_id'] = merged_df['station_id'].str.strip() 
# Filter data
filtered = merged_df[
    (merged_df['cell_id'].isin(['106'])) &
    (merged_df['station_id'].isin(['GHCND:USW00023234']))
]


In [72]:
# Display merged_df again after the cleaning cell
merged_df

Unnamed: 0,cell_id,station_id,station_name,station_name.1,date,forecast_tmax_F,forecast_tmin_F,forecast_precip_in,ndvi_latest,fire_occurred,cluster
0,106,GHCND:USW00023234,Los Angeles International Airport,Los Angeles International Airport,2025-01-22,74.0,49.0,0.00,0.322861,,0
1,106,GHCND:USW00023234,Los Angeles International Airport,Los Angeles International Airport,2025-01-23,78.0,48.0,0.00,0.322861,,0
2,106,GHCND:USW00023234,Los Angeles International Airport,Los Angeles International Airport,2025-01-24,68.0,46.0,0.02,0.322861,,0
3,106,GHCND:USW00023234,Los Angeles International Airport,Los Angeles International Airport,2025-01-25,57.0,45.0,0.07,0.322861,,0
4,106,GHCND:USW00023234,Los Angeles International Airport,Los Angeles International Airport,2025-01-26,57.0,44.0,0.00,0.322861,,0
...,...,...,...,...,...,...,...,...,...,...,...
68749,9009,GHCND:USW00093134,San Francisco International Airport,San Francisco International Airport,2025-01-24,61.0,44.0,0.00,0.322861,,0
68750,9009,GHCND:USW00093134,San Francisco International Airport,San Francisco International Airport,2025-01-25,57.0,42.0,0.00,0.322861,,0
68751,9009,GHCND:USW00093134,San Francisco International Airport,San Francisco International Airport,2025-01-26,58.0,42.0,0.00,0.322861,,0
68752,9009,GHCND:USW00093134,San Francisco International Airport,San Francisco International Airport,2025-01-27,59.0,42.0,0.00,0.322861,,0


In [73]:
# Display the rows kept by the cell_id / station_id filter
filtered

Unnamed: 0,cell_id,station_id,station_name,station_name.1,date,forecast_tmax_F,forecast_tmin_F,forecast_precip_in,ndvi_latest,fire_occurred,cluster
0,106,GHCND:USW00023234,Los Angeles International Airport,Los Angeles International Airport,2025-01-22,74.0,49.0,0.0,0.322861,,0
1,106,GHCND:USW00023234,Los Angeles International Airport,Los Angeles International Airport,2025-01-23,78.0,48.0,0.0,0.322861,,0
2,106,GHCND:USW00023234,Los Angeles International Airport,Los Angeles International Airport,2025-01-24,68.0,46.0,0.02,0.322861,,0
3,106,GHCND:USW00023234,Los Angeles International Airport,Los Angeles International Airport,2025-01-25,57.0,45.0,0.07,0.322861,,0
4,106,GHCND:USW00023234,Los Angeles International Airport,Los Angeles International Airport,2025-01-26,57.0,44.0,0.0,0.322861,,0
5,106,GHCND:USW00023234,Los Angeles International Airport,Los Angeles International Airport,2025-01-27,60.0,43.0,0.0,0.322861,,0
6,106,GHCND:USW00023234,Los Angeles International Airport,Los Angeles International Airport,2025-01-28,60.0,44.0,0.0,0.322861,,0


In [74]:
# Build the model-ready feature frame for the filtered rows
forecast_encoded = prepare_features(forecast_df=filtered, scaler=scaler)

2025-01-22 16:28:42,673 - INFO - Starting feature preparation...


FileNotFoundError: [Errno 2] No such file or directory: 'models/numerical_features.txt'

In [None]:
# Score the prepared features with the per-cluster models
predictions = make_predictions(forecast_encoded=forecast_encoded, models=models)