In [1]:
# started 11-17-2025

In [2]:
import xgboost
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.model_selection import train_test_split

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Load the fire records (the file name suggests a SMOGN-resampled synthetic set).
fires = pd.read_csv(filepath_or_buffer="../synthetic_fires/fires_smogn.csv")

In [5]:
# Peek at the first five rows to sanity-check the loaded columns.
fires.head(n=5)

Unnamed: 0,FIRE_YEAR,FIRE_SIZE,LATITUDE,LONGITUDE,temp_max_F,humidity_pct,precip_in,windspeed_mph,ndvi,pop_density,slope
0,2008.0,60.026144,33.120652,-115.470841,76.122085,66.0,0.0,7.709415,925.894234,0.030405,0.0
1,2009.0,45.414847,34.160982,-118.759623,89.364895,81.0,0.0,5.09049,3507.904432,653.045072,264.541299
2,2007.0,59.15493,33.961713,-117.219707,80.965517,60.0,0.0,7.560573,2701.12494,62.490938,608.347391
3,2008.0,56.220472,33.920497,-117.098855,91.666924,68.0,0.0,6.765826,2470.0,2.840684,516.2973
4,2009.0,51.333333,34.508497,-118.088987,80.392702,55.0,0.004306,6.366584,4531.77545,0.59194,1458.844625


In [6]:
# Shuffle all rows (frac=1 keeps every record); the fixed seed makes the
# resulting order reproducible.
fires = fires.sample(
    frac=1,
    random_state=5,
)

In [7]:
# Keep the target (FIRE_SIZE) and the seven predictor columns. The .copy()
# detaches the result from `fires`, so later column assignments only affect
# `fires_data`.
fires_data = fires.loc[
    :,
    [
        "FIRE_SIZE",
        "temp_max_F",
        "humidity_pct",
        "precip_in",
        "windspeed_mph",
        "ndvi",
        "pop_density",
        "slope",
    ],
].copy()
fires_data

Unnamed: 0,FIRE_SIZE,temp_max_F,humidity_pct,precip_in,windspeed_mph,ndvi,pop_density,slope
1200,150.0,76.64,86.0,0.0,9.633313,2638.0,297.180756,143.354950
866,645.0,94.10,51.0,0.0,7.147296,3958.0,1.282237,568.764340
2130,80.0,88.34,33.0,0.0,12.740833,1386.0,0.000000,1139.315900
1376,6.0,74.66,98.0,0.0,8.328154,2050.0,793.120544,419.773930
1763,3.1,99.86,81.0,0.0,6.650093,3372.0,3600.464355,342.488000
...,...,...,...,...,...,...,...,...
1032,50.0,91.40,72.0,0.0,6.712244,5196.0,4.904080,370.977260
2121,7.2,84.02,65.0,0.0,9.011809,2185.0,0.612813,620.171260
1424,110.0,113.54,33.0,0.0,11.684276,3597.0,0.855352,72.046646
1725,42.0,82.40,80.0,0.0,8.141703,3471.0,882.793518,762.268250


In [8]:
# Model the target on a log10 scale. Note that np.log10 maps 0 to -inf and
# negative values to NaN, which is why the next cells check for inf / drop NaN.
fires_data = fires_data.assign(FIRE_SIZE=np.log10(fires_data["FIRE_SIZE"]))

In [9]:
# Count +/-inf entries per column (log10 of a zero fire size would show up here).
print(fires_data.isin([np.inf, -np.inf]).sum())

FIRE_SIZE        0
temp_max_F       0
humidity_pct     0
precip_in        0
windspeed_mph    0
ndvi             0
pop_density      0
slope            0
dtype: int64


In [10]:
# Discard every row that has a missing value in any of the selected columns.
fires_data = fires_data.dropna(axis=0, how="any")

In [11]:
# Columns clipped with IQR-based fences. precip_in is deliberately left out:
# its tail is so long and thin that nearly every non-typical value would end
# up clipped.
iqr_cols = [
    "windspeed_mph",
]

# Columns clipped at a fixed number of standard deviations from the mean.
zscore_cols = [
    "temp_max_F",
    "humidity_pct",
]

In [12]:
# IQR clipping: values further than 1.9 * IQR outside the quartiles are pulled
# back to the fence instead of being dropped.
# NOTE(review): the fences are computed on the full dataset, i.e. before the
# train/test split further down.
for col in iqr_cols:
    q1, q3 = fires_data[col].quantile([0.25, 0.75])
    spread = q3 - q1
    fires_data[col] = fires_data[col].clip(
        lower=q1 - 1.9 * spread,
        upper=q3 + 1.9 * spread,
    )

k = 3  # standard deviations kept on each side of the mean (~99.7% if normal)
for col in zscore_cols:
    center = fires_data[col].mean()
    spread = fires_data[col].std()
    fires_data[col] = fires_data[col].clip(
        lower=center - k * spread,
        upper=center + k * spread,
    )

In [13]:
# Separate the (log10-scaled) target from the predictor columns.
y = fires_data["FIRE_SIZE"]
X = fires_data.drop(columns=["FIRE_SIZE"])

In [14]:
# Hold out 20% of the rows for evaluation; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

In [15]:
# Features that receive plain standardisation (zero mean, unit variance).
standard_cols = [
    "temp_max_F",
    "humidity_pct",
    "windspeed_mph",
    "ndvi",
    "slope",
]
std_scaler = StandardScaler()

# Features that go through PowerTransformer instead (presumably the heavily
# skewed ones -- confirm against the feature distributions).
power_cols = [
    "precip_in",
    "pop_density",
]
pwr_scaler = PowerTransformer()


In [16]:
# Learn the scaling parameters from the training split only.
std_scaler.fit(X_train.loc[:, standard_cols])

# Last expression of the cell, so the fitted PowerTransformer is what renders.
pwr_scaler.fit(X_train.loc[:, power_cols])

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True


In [17]:
# Scale the train/test features in place.
#
# The scalers are always (re)fit on the raw training rows taken from `X`,
# selected by the split's index, rather than on the current contents of
# `X_train`. This keeps the cell idempotent: re-running it no longer fits the
# scalers on data that has already been scaled, and X_train / X_test end up
# with the same values however many times the cell is executed.
assert X.index.is_unique, "row labels must be unique to look up raw rows by index"
raw_train = X.loc[X_train.index]
raw_test = X.loc[X_test.index]

X_train[standard_cols] = std_scaler.fit_transform(raw_train[standard_cols])
X_train[power_cols] = pwr_scaler.fit_transform(raw_train[power_cols])

# The test split only ever sees parameters learned from the training rows.
X_test[standard_cols] = std_scaler.transform(raw_test[standard_cols])
X_test[power_cols] = pwr_scaler.transform(raw_test[power_cols])

In [18]:
# Confirm that scaling introduced no +/-inf values in the training features
# (per-column counts) or in the training target (single count).
train_feature_infs = np.isinf(X_train).sum()
train_target_infs = np.isinf(y_train).sum()
print(train_feature_infs, train_target_infs)

temp_max_F       0
humidity_pct     0
precip_in        0
windspeed_mph    0
ndvi             0
pop_density      0
slope            0
dtype: int64 0


In [19]:
# Hyper-parameters for the gradient-boosted regressor, gathered in one place.
xgb_params = dict(
    n_estimators=5000,     # number of boosting rounds (trees)
    learning_rate=0.001,   # shrinkage applied to each tree's contribution
    max_depth=6,           # maximum depth of each tree
    subsample=0.8,         # fraction of samples used per tree
    colsample_bytree=0.8,  # fraction of features used per tree
    random_state=5,        # fixed seed for the row/column subsampling
)
xgb_model = XGBRegressor(**xgb_params)

In [20]:
# Train on the scaled training features against the log10 fire size.
xgb_model.fit(X=X_train, y=y_train)


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [31]:
# Display the training features as they are after the scaling cell above
# (values are in transformed units, not raw measurements).
X_train

Unnamed: 0,temp_max_F,humidity_pct,precip_in,windspeed_mph,ndvi,pop_density,slope
2007,0.729943,0.103343,-0.306948,-0.070161,-0.271251,1.288017,0.154438
137,1.309252,-1.605737,-0.306948,-1.074095,-1.601398,-0.916982,0.767691
489,1.329067,-0.430744,-0.306948,-0.680834,-1.012663,0.284322,0.651397
810,0.962344,0.637431,-0.306948,-0.098546,0.104404,1.514898,-0.930010
979,0.193632,0.103343,-0.306948,1.207180,0.610423,-0.107723,-0.098976
...,...,...,...,...,...,...,...
1198,0.640558,1.011292,3.827339,-0.126932,-0.136599,0.292395,0.440507
1205,0.122124,1.171519,-0.306948,0.809785,-0.465940,0.680147,-1.341478
1733,-1.039882,0.904475,-0.306948,1.235566,0.118984,1.671537,-0.889983
1478,-1.325914,1.545380,-0.306948,1.093639,0.554675,1.424380,-1.129883


In [33]:
# Feature names in the exact column order the model was trained on.
features = list(X_train.columns)

In [54]:
# Bundle both scalers under stable keys so they can be saved together.
scalers = dict(
    standard_scaler=std_scaler,
    power_scaler=pwr_scaler,
)

In [None]:
# joblib is imported in this cell because the notebook's own `import joblib`
# only appears in a later cell; without it a top-to-bottom run fails here with
# NameError.
import joblib

# Persist both scalers to disk; joblib.dump returns the list of files written.
joblib.dump(scalers, "scalers.pkl")

['scalers.pkl']

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [22]:
# Column-wise preprocessing for the deployable pipeline.
# Each label now names the transformer it is attached to (the two labels were
# previously swapped, which made named_transformers_ and the generated feature
# name prefixes misleading).
# NOTE: the output columns come out grouped by transformer -- standard_cols
# first, then power_cols -- which is not the column order of X_train.
preprocessor = ColumnTransformer(
    transformers=[
        ("standard", std_scaler, standard_cols),
        ("power", pwr_scaler, power_cols)
    ],
    remainder="drop"  # drop any columns not specified
)

In [23]:
# Fit the preprocessor on the *raw* training rows. X_train was scaled in place
# earlier in the notebook, so fitting on it would learn the statistics of
# already-scaled data (mean ~0, std ~1) and leave raw inputs effectively
# unscaled at prediction time. `X` still holds the unscaled features, so the
# training rows are looked up there by index. ColumnTransformer fits clones of
# the scalers it was given, so the original scaler objects are not modified.
preprocessor.fit(X.loc[X_train.index])

0,1,2
,transformers,"[('power', ...), ('standard', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,method,'yeo-johnson'
,standardize,True
,copy,True


In [24]:
from sklearn.preprocessing import FunctionTransformer

# The ColumnTransformer emits its columns grouped by transformer, which is a
# different order from the X_train columns the model was trained on. Because
# the preprocessor hands the model a plain array, a mismatch would not be
# caught by name, so features would silently land in the wrong positions.
# Work out the permutation that restores the training order and apply it
# between the two steps.
preprocessed_order = [
    name.split("__", 1)[-1]  # strip the "<transformer>__" prefix, if present
    for name in preprocessor.get_feature_names_out()
]
model_order = list(X_train.columns)
assert set(preprocessed_order) == set(model_order), (
    "preprocessor output and model input must cover the same features"
)
to_model_order = [preprocessed_order.index(col) for col in model_order]

fire_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    # np.take with keyword arguments keeps this step picklable (no lambda).
    ("reorder", FunctionTransformer(np.take, kw_args={"indices": to_model_order, "axis": 1})),
    ("model", xgb_model)
])

In [25]:
import joblib

In [30]:
# Save the preprocessing + model pipeline as a single artifact.
joblib.dump(value=fire_pipeline, filename='fire_model_pipeline.pkl')

['fire_model_pipeline.pkl']

In [26]:
### TODO: do not forget the LIME explainer

In [46]:
import pickle
# Serialise the trained regressor on its own (without the preprocessing step).
with open("xgb_model.pkl", mode="wb") as model_file:
    pickle.dump(xgb_model, model_file)

In [47]:
import dill
import numpy as np
from lime import lime_tabular

In [48]:
# Read the regressor back from the pickle written above. Only unpickle files
# you produced yourself: pickle.load executes code embedded in the file.
with open("xgb_model.pkl", mode="rb") as model_file:
    model = pickle.load(model_file)

In [49]:
def make_predict_fn(model):
    """Return a one-argument callable that forwards to ``model.predict``.

    The returned function closes over ``model``, so it can be passed around
    (or serialised together with the model) without a separate reference.
    """
    def predict(X):
        return model.predict(X)

    return predict

# Prediction callable bound to the reloaded model; bundled with the explainer below.
predict_fn = make_predict_fn(model=model)

In [51]:
# LIME needs the training data as a plain array, here in the scaled feature
# space and in the same column order as `features`.
lime_training_matrix = X_train.to_numpy()
explainer = lime_tabular.LimeTabularExplainer(
    mode="regression",
    feature_names=features,
    training_data=lime_training_matrix,
)

In [52]:
# Keep the explainer and its matching prediction function together.
bundle = dict(
    explainer=explainer,
    predict_fn=predict_fn,
)

In [53]:
# dill is used here rather than pickle, presumably because the bundle holds a
# locally defined closure (predict_fn).
with open("lime_explainer.dill", mode="wb") as bundle_file:
    dill.dump(bundle, bundle_file)