In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import lightgbm as lgb
from lightgbm import LGBMRegressor

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
# Metric helpers from scikit-learn.
# The closing parenthesis must not be followed by a comma:
# `from module import (...),` is a SyntaxError.
from sklearn.metrics import (
    mean_squared_error,
    roc_auc_score,
    r2_score,
    silhouette_score,
)

import holidays

# Notebook-wide configuration.
# A single seed is shared by every stochastic step; NumPy's global RNG is
# seeded here as well.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Every input file is resolved relative to this directory.
DATA_DIR = Path('shared_data/')

In [6]:
# ---- 1.1 Load wildfires ----
# Read only the four columns used downstream.
fires_path = DATA_DIR / 'wildfires_fires.parquet'
if not fires_path.exists():
    # Same exception type pandas would raise, but with the resolved location so
    # a wrong working directory / DATA_DIR is obvious.
    raise FileNotFoundError(
        f"Wildfire data not found at '{fires_path.resolve()}'. "
        "Check DATA_DIR and the notebook's working directory."
    )

fires = pd.read_parquet(
    fires_path,
    columns=['FIRE_SIZE', 'DISCOVERY_DATE', 'STATE', 'STAT_CAUSE_CODE'],
)

# Use pandas' own dtype predicate: np.issubdtype cannot interpret pandas
# extension dtypes (timezone-aware datetimes, string dtype) and raises
# TypeError on them instead of returning False.
# NOTE(review): if DISCOVERY_DATE is stored as a plain number (e.g. Julian
# days), pd.to_datetime reads it as nanoseconds since the epoch — confirm the
# column's storage format.
if not pd.api.types.is_datetime64_any_dtype(fires['DISCOVERY_DATE']):
    fires['DISCOVERY_DATE'] = pd.to_datetime(fires['DISCOVERY_DATE'])

fires = fires.rename(columns={'DISCOVERY_DATE': 'date'})

# Keep the 1992–2015 window (both ends inclusive). The copy makes the result an
# independent frame rather than a view of the unfiltered one.
fires = fires[fires['date'].dt.year.between(1992, 2015)].copy()
fires.head()

FileNotFoundError: [Errno 2] No such file or directory: 'shared_data\\wildfires_fires.parquet'

In [None]:
# ---- 1.2 Aggregate to state–day ----
state_day_keys = ['STATE', 'date']

# Totals per state and day: number of fire records and summed FIRE_SIZE.
daily_agg = fires.groupby(state_day_keys).agg(
    n_fires=('FIRE_SIZE', 'size'),
    area_burned=('FIRE_SIZE', 'sum'),
)
daily_agg = daily_agg.reset_index()

# Wide table: one count column per STAT_CAUSE_CODE value, 0 where a cause did
# not occur on that state-day.
cause_counts = fires.pivot_table(
    index=state_day_keys,
    columns='STAT_CAUSE_CODE',
    values='FIRE_SIZE',
    aggfunc='count',
    fill_value=0,
)
cause_counts = cause_counts.rename_axis(columns='cause_code')
cause_counts.columns = ['cause_' + str(int(code)) for code in cause_counts.columns]
cause_counts = cause_counts.reset_index()

# Attach the per-cause counts to the totals; anything left missing becomes 0.
fires_daily = daily_agg.merge(cause_counts, on=state_day_keys, how='left')
fires_daily = fires_daily.fillna(0)
fires_daily['any_fire'] = fires_daily['n_fires'].gt(0).astype(int)
fires_daily.head()

In [None]:
# ---- 2.1 Load weather ----
weather_path = DATA_DIR / 'us_daily_weather_1992_2015.parquet'
weather = pd.read_parquet(weather_path)

cities_path = DATA_DIR / 'cities.csv'
cities_df = pd.read_csv(cities_path)

# Parse dates, then restrict to 1992–2015 (both ends inclusive).
weather['date'] = pd.to_datetime(weather['date'])
weather_years = weather['date'].dt.year
weather = weather[(weather_years >= 1992) & (weather_years <= 2015)]

# Attach each record's state through the city-name lookup table.
city_to_state = cities_df[['city_name', 'state']].drop_duplicates()
weather = weather.merge(city_to_state, on='city_name', how='left')

# Short column names used by the rest of the notebook.
weather_column_aliases = {
    'state': 'STATE',
    'avg_temp_c': 'tavg',
    'min_temp_c': 'tmin',
    'max_temp_c': 'tmax',
    'precipitation_mm': 'prcp',
    'snow_depth_mm': 'snow',
    'avg_wind_speed_kmh': 'wspd',
    'avg_sea_level_pres_hpa': 'pres',
}
weather = weather.rename(columns=weather_column_aliases)
weather.head()

In [None]:
# ---- 2.2 Aggregate station data to state–day ----
# Reducer applied to each measurement when collapsing all records of a state
# on a given day into one row: temperatures, wind and pressure are averaged,
# precipitation and snow are summed.
station_reducers = {
    'tavg': 'mean',
    'tmin': 'mean',
    'tmax': 'mean',
    'prcp': 'sum',
    'snow': 'sum',
    'wspd': 'mean',
    'pres': 'mean',
}

weather_daily = weather.groupby(['STATE', 'date']).agg(
    **{name: (name, how) for name, how in station_reducers.items()}
)
weather_daily = weather_daily.reset_index()
weather_daily.head()

In [None]:
# ---- 2.3 Lagged features helper ----
def add_lagged_features(df, group_col, date_col, base_cols, windows=(3, 7, 30), shift=1,
                        sum_col='prcp', sum_windows=(7, 30)):
    """Add lagged rolling-window features computed independently per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified (a sorted copy is returned).
    group_col : str
        Column identifying the group; windows and lags never cross groups.
    date_col : str
        Column used, after ``group_col``, to order rows before rolling.
    base_cols : iterable of str
        Columns that get a ``{col}_mean_{w}`` rolling-mean feature per window.
    windows : iterable of int
        Rolling window lengths (in rows) for the mean features.
    shift : int
        Number of rows each feature is lagged by, within its group. With the
        default of 1, a row only sees windows that end on the previous row.
    sum_col : str or None
        Column that additionally gets ``{sum_col}_sum_{w}`` rolling-sum
        features. ``None`` skips them.
    sum_windows : iterable of int
        Window lengths for the rolling-sum features.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` sorted by ``[group_col, date_col]`` with the new
        columns appended. The first ``shift`` rows of every group are NaN in
        the new columns.

    Raises
    ------
    KeyError
        If a requested column (including ``sum_col``) is not in ``df``.
    """
    df = df.sort_values([group_col, date_col]).copy()

    def _lagged_rolling(col, window, how):
        # Roll AND shift inside each group. Shifting the concatenated result
        # instead would push the last value of one group into the first row
        # of the next group.
        return df.groupby(group_col)[col].transform(
            lambda s: getattr(s.rolling(window, min_periods=1), how)().shift(shift)
        )

    for col in base_cols:
        for w in windows:
            df[f'{col}_mean_{w}'] = _lagged_rolling(col, w, 'mean')

    if sum_col is not None:
        for w in sum_windows:
            df[f'{sum_col}_sum_{w}'] = _lagged_rolling(sum_col, w, 'sum')
    return df

In [None]:
# ---- 2.4 Build weather features with state mapping and lags ----
weather_feat = weather_daily.copy()

# Full state name -> two-letter postal abbreviation (50 states plus DC and
# Puerto Rico).
state_name_to_abbrev = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE',
    'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID',
    'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS',
    'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS',
    'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV',
    'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY',
    'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK',
    'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
    'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT',
    'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV',
    'Wisconsin': 'WI', 'Wyoming': 'WY',
    'District of Columbia': 'DC', 'Puerto Rico': 'PR',
}

# Rows whose STATE value is not a key of the mapping become NaN and are dropped.
weather_feat['STATE'] = weather_feat['STATE'].map(state_name_to_abbrev)
weather_feat = weather_feat.dropna(subset=['STATE'])

# Fill gaps in the measurements strictly within each state: forward-fill in
# date order, then back-fill what is still missing at the start of a state's
# series. Both steps are grouped so no value is carried from one state into
# another.
weather_feat = weather_feat.sort_values(['STATE', 'date']).reset_index(drop=True)
fill_cols = [c for c in weather_feat.columns if c not in ('STATE', 'date')]
weather_feat[fill_cols] = weather_feat.groupby('STATE')[fill_cols].ffill()
weather_feat[fill_cols] = weather_feat.groupby('STATE')[fill_cols].bfill()

# Calendar components of the observation date.
weather_feat['year'] = weather_feat['date'].dt.year
weather_feat['month'] = weather_feat['date'].dt.month
weather_feat['doy'] = weather_feat['date'].dt.dayofyear

base_cols = ['tavg', 'tmin', 'tmax', 'prcp', 'wspd', 'pres']
weather_feat = add_lagged_features(
    weather_feat, group_col='STATE', date_col='date', base_cols=base_cols
)
# Drop every row that still has a missing value in any column (this includes
# rows whose lagged features are undefined).
weather_feat = weather_feat.dropna().reset_index(drop=True)
weather_feat.head()

In [None]:
# ---- 3.1 Merge with fires and add targets + calendar / holiday features ----
# Left join keeps every weather state-day; days without a fire record get NaN
# in the fire columns, which is turned into 0 just below.
data = weather_feat.merge(fires_daily, on=['STATE', 'date'], how='left')

fire_cols = [name for name in data.columns if name.startswith('cause_')]
fire_cols += ['n_fires', 'area_burned', 'any_fire']
present_fire_cols = [name for name in fire_cols if name in data.columns]
data[present_fire_cols] = data[present_fire_cols].fillna(0)

# log(1 + area) target
data['log_area_burned'] = np.log1p(data['area_burned'])

# Weekend indicator (weekday 5 = Saturday, 6 = Sunday)
data['is_weekend'] = data['date'].dt.weekday.isin([5, 6]).astype(int)

# US holidays and July 4th indicator
us_holidays = holidays.US()
calendar_days = data['date'].dt.date
data['holiday_name'] = calendar_days.map(us_holidays.get)
data['is_holiday'] = data['holiday_name'].notna().astype(int)
data['is_july4'] = data['holiday_name'].eq('Independence Day').astype(int)

# Previous-day precipitation feature (per state); 0.0 where a state has no
# earlier row.
data = data.sort_values(['STATE', 'date'])
data['prcp_prev_1d'] = data.groupby('STATE')['prcp'].shift(1).fillna(0.0)

data.head()

In [None]:
# ---- 4.1 Choose features for clustering (including calendar & prev-day precipitation) ----
# Only the requested names that actually exist in `data` are kept.
cluster_features = [
    name
    for name in (
        'tavg_mean_7',
        'tavg_mean_30',
        'prcp_sum_7',
        'prcp_sum_30',
        'wspd_mean_7',
        'pres_mean_7',
        # Calendar / holiday / previous-day precipitation features
        'prcp_prev_1d',
        'is_weekend',
        'is_holiday',
        'is_july4',
    )
    if name in data.columns
]

# Rows from years before the cutoff form the training portion.
train_year_cutoff = 2011
train_mask = data['year'] < train_year_cutoff

# Scale features: statistics come from the training rows only and are then
# applied to every row.
scaler = StandardScaler()
scaler.fit(data.loc[train_mask, cluster_features])
X_cluster_train = scaler.transform(data.loc[train_mask, cluster_features])
X_cluster_full = scaler.transform(data[cluster_features])

# ---- 4.2 PCA dimensionality reduction ----
# Keep enough components to explain ~95% of variance; fitted on the training
# rows, then used to project the full matrix.
pca = PCA(n_components=0.95, random_state=RANDOM_STATE)
X_cluster_train_pca = pca.fit_transform(X_cluster_train)
X_cluster_full_pca = pca.transform(X_cluster_full)

# ---- 4.3 KMeans with small automatic K search (in PCA space) ----
candidate_ks = [3, 5, 8, 10]

# The exact silhouette needs all pairwise distances, i.e. work that grows
# quadratically with the number of rows. Above this cap the score is estimated
# on a seeded random subsample; at or below it the exact score is used.
silhouette_max_samples = 10_000
n_train_rows = X_cluster_train_pca.shape[0]
silhouette_sample_size = (
    silhouette_max_samples if n_train_rows > silhouette_max_samples else None
)

best_k = None
best_score = float('-inf')
best_model = None

for k in candidate_ks:
    # silhouette_score requires fewer clusters than samples.
    if k >= n_train_rows:
        continue
    km = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init=10)
    labels = km.fit_predict(X_cluster_train_pca)
    # Silhouette needs at least 2 clusters
    if len(set(labels)) < 2:
        continue
    score = silhouette_score(
        X_cluster_train_pca,
        labels,
        sample_size=silhouette_sample_size,
        random_state=RANDOM_STATE,
    )
    if score > best_score:
        best_score = score
        best_k = k
        best_model = km

if best_model is None:
    # Fail here with a clear message instead of an AttributeError on
    # `None.predict` below.
    raise ValueError(
        f"No candidate K in {candidate_ks} produced a valid clustering "
        f"for {n_train_rows} training rows."
    )

print(f"Chosen K for KMeans (PCA space): {best_k}, silhouette={best_score:.3f}")

# The model was fitted on the training rows; every row gets its nearest centroid.
kmeans = best_model
data['weather_cluster_kmeans'] = kmeans.predict(X_cluster_full_pca)

# ---- 4.4 DBSCAN in PCA space with simple eps heuristic ----
# Imported here so this cell is self-contained; it is only needed for the
# out-of-sample assignment at the bottom.
from sklearn.neighbors import NearestNeighbors

eps_candidates = [0.3, 0.5, 0.8, 1.0]
min_samples = 50

best_eps = None
best_db = None
best_n_clusters = -1

for eps in eps_candidates:
    db = DBSCAN(eps=eps, min_samples=min_samples)
    labels = db.fit_predict(X_cluster_train_pca)
    # Label -1 marks noise and is not a cluster.
    n_clusters = len(set(labels) - {-1})
    # Require at least 2 clusters and not too many
    if n_clusters < 2 or n_clusters > 30:
        continue
    # Among acceptable candidates, prefer the one with the most clusters.
    if n_clusters > best_n_clusters:
        best_n_clusters = n_clusters
        best_eps = eps
        best_db = db

if best_db is None:
    # Fallback DBSCAN if no candidate passes the heuristic
    best_eps = 0.8
    best_db = DBSCAN(eps=best_eps, min_samples=min_samples).fit(X_cluster_train_pca)
    # Report the cluster count of the model actually used.
    best_n_clusters = len(set(best_db.labels_) - {-1})

print(f"Chosen eps for DBSCAN (PCA space): {best_eps}, clusters={best_n_clusters}")

dbscan = best_db

# DBSCAN has no predict(). Calling fit_predict on the full matrix would refit
# on train + test rows, discarding the model selected above and letting test
# rows shape the clusters. Instead:
#   * training rows keep the labels of the selected fit;
#   * every other row takes the label of its nearest core sample when that
#     core sample lies within eps, and is marked as noise (-1) otherwise.
is_train_row = train_mask.to_numpy()
dbscan_labels = np.full(len(data), -1, dtype=int)
dbscan_labels[is_train_row] = dbscan.labels_

if dbscan.components_.shape[0] > 0 and (~is_train_row).any():
    core_labels = dbscan.labels_[dbscan.core_sample_indices_]
    nearest_core = NearestNeighbors(n_neighbors=1).fit(dbscan.components_)
    core_dist, core_idx = nearest_core.kneighbors(X_cluster_full_pca[~is_train_row])
    assigned = core_labels[core_idx[:, 0]]
    assigned[core_dist[:, 0] > best_eps] = -1
    dbscan_labels[~is_train_row] = assigned

data['weather_cluster_dbscan'] = dbscan_labels

data['weather_cluster_kmeans'].value_counts().sort_index()

In [None]:
# ---- 5.1 Final feature matrix and splits ----
# Integer code for the state, so it can be used as a model input.
le_state = LabelEncoder()
data['STATE_LE'] = le_state.fit_transform(data['STATE'])

# Columns that must not be model inputs:
#   * the targets and the per-cause fire counts (they describe the outcome);
#   * the raw identifiers STATE / date;
#   * holiday_name, a text label — the numeric is_holiday / is_july4 columns
#     carry that information instead.
exclude_cols = {
    'n_fires', 'any_fire', 'area_burned', 'log_area_burned', 'STATE', 'date',
    'holiday_name',
} | {c for c in data.columns if c.startswith('cause_')}

# Keep numeric (incl. boolean) columns only, so no text column can slip into
# the feature matrix.
feature_cols = [
    c for c in data.columns
    if c not in exclude_cols and pd.api.types.is_numeric_dtype(data[c])
]

# Time-based split: years before the cutoff train, the rest test.
train_mask = data['year'] < train_year_cutoff
test_mask = ~train_mask

X_train = data.loc[train_mask, feature_cols]
X_test = data.loc[test_mask, feature_cols]

# One target pair per task: fire count, any-fire indicator, log burned area.
y_train_count = data.loc[train_mask, 'n_fires']
y_test_count = data.loc[test_mask, 'n_fires']
y_train_bin = data.loc[train_mask, 'any_fire']
y_test_bin = data.loc[test_mask, 'any_fire']
y_train_log_area = data.loc[train_mask, 'log_area_burned']
y_test_log_area = data.loc[test_mask, 'log_area_burned']