In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
from joblib import dump, load
from collections import Counter
from pipeline import create_wildfire_pipeline
from data_loader import load_fire_data
from Data_Preparation import prepare_features


from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV


from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline




In [2]:
def evaluate_xgb(X_train, y_train, X_dev, y_dev, *, param_grid=None,
                 numeric_features=None, categorical_features=None,
                 cv=5, n_jobs=None):
    """Grid-search an XGBoost regression pipeline and score it on a dev set.

    A preprocessing step built by ``create_wildfire_pipeline`` is chained with
    an ``XGBRegressor`` (step name ``'algo'``), tuned with ``GridSearchCV``
    on the training data, and the best estimator is then evaluated on the
    held-out dev data.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``GridSearchCV.fit``.
    X_dev, y_dev
        Held-out features and target used for the reported metrics.
    param_grid : dict, optional
        Hyperparameter grid. Keys must carry the ``'algo__'`` prefix because
        the regressor lives in the pipeline step named ``'algo'``. Defaults to
        the 18-candidate grid originally hard-coded here.
    numeric_features, categorical_features : list of str, optional
        Column names forwarded to ``create_wildfire_pipeline``. Defaults to
        the column lists originally hard-coded here.
    cv : int, default 5
        Cross-validation setting forwarded to ``GridSearchCV``.
    n_jobs : int or None, default None
        Parallelism forwarded to ``GridSearchCV``; ``-1`` uses all cores.

    Returns
    -------
    tuple
        ``(best_estimator, rmse, mape, r2)`` where the three metrics are
        computed from ``best_estimator.predict(X_dev)`` against ``y_dev``.
    """
    print("Evaluating XGBoost Regressor...")

    # None sentinels instead of mutable default arguments; the fallbacks
    # reproduce the values that used to be hard-coded in the body.
    if param_grid is None:
        param_grid = {
            'algo__n_estimators': [1000],
            'algo__max_depth': [2, 3, 4],
            'algo__learning_rate': [0.01, 0.05, 0.1],
            'algo__subsample': [0.8, 1.0],
        }
    if numeric_features is None:
        numeric_features = ['LATITUDE', 'LONGITUDE', 'DISCOVERY_DOY', 'DISCOVERY_HOUR']
    if categorical_features is None:
        categorical_features = ['STATE', 'STAT_CAUSE_DESCR', 'OWNER_DESCR', 'SEASON', 'CAUSE_SIMPLE']

    pipeline = create_wildfire_pipeline(
        numeric_features=numeric_features,
        categorical_features=categorical_features
    )

    # Keeping preprocessing inside the searched pipeline means it is re-fit
    # on each CV training fold rather than on the full training set.
    pipeline_with_algo = Pipeline(steps=[
        ('preprocessor', pipeline),
        ('algo', XGBRegressor(objective='reg:squarederror', random_state=42))
    ])

    grid_search = GridSearchCV(
        pipeline_with_algo, param_grid,
        cv=cv,
        scoring='neg_mean_squared_error',
        n_jobs=n_jobs,
        verbose=1
    )
    grid_search.fit(X_train, y_train)

    best_estimator = grid_search.best_estimator_

    # Feature importance is reported on a best-effort basis: any failure here
    # (e.g. a preprocessor without get_feature_names_out, or a length mismatch
    # between names and importances) is printed and must not abort evaluation.
    try:
        model = best_estimator.named_steps["algo"]
        preprocessor = best_estimator.named_steps["preprocessor"]
        feature_names = preprocessor.get_feature_names_out()
        importances = model.feature_importances_

        feature_df = pd.DataFrame({
            "Feature": feature_names,
            "Importance": importances
        }).sort_values(by="Importance", ascending=False)

        print("\nTop 10 Most Important Features:")
        print(feature_df.head(10))
    except Exception as e:
        print("Could not extract feature importances:", e)

    # Dev-set metrics for the best estimator found by the search.
    y_pred = best_estimator.predict(X_dev)
    rmse = np.sqrt(mean_squared_error(y_dev, y_pred))
    # NOTE(review): MAPE divides by |y_true| and becomes very large when
    # targets are close to zero -- confirm it is meaningful for this target.
    mape = mean_absolute_percentage_error(y_dev, y_pred)
    r2 = r2_score(y_dev, y_pred)

    print("Grid searching is done!")
    # best_score_ is the cross-validated score under the 'neg_mean_squared_error'
    # scorer, hence negative; values closer to zero are better.
    print("Best score (neg MSE):", grid_search.best_score_)
    print("Best hyperparameters:", grid_search.best_params_)

    return best_estimator, rmse, mape, r2

In [3]:
# Read the fire records from the SQLite file and run feature preparation
# on them in a single step; `df` is the prepared frame used by later cells.
df = prepare_features(
    load_fire_data('../data/FPA_FOD_20170508.sqlite')
)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['DISCOVERY_HOUR'].fillna(df['DISCOVERY_HOUR'].median(), inplace=True)


In [4]:

# Model inputs, in the column order handed to the pipeline.
feature_columns = [
    'LATITUDE',
    'LONGITUDE',
    'DISCOVERY_DOY',
    'DISCOVERY_HOUR',
    'STATE',
    'OWNER_DESCR',
    'SEASON',
    'STAT_CAUSE_DESCR',
    'CAUSE_SIMPLE',
]
X = df[feature_columns]

# Regression target.
y = df['FIRE_SIZE']

In [5]:
# Hold out 20% of the rows as a dev set; the fixed random_state keeps the
# split identical across re-runs.
X_train, X_dev, y_train, y_dev = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [6]:
# Run the grid search on the training split, then report dev-set metrics.
best_model, rmse, mape, r2 = evaluate_xgb(
    X_train=X_train,
    y_train=y_train,
    X_dev=X_dev,
    y_dev=y_dev,
)
print(f"RMSE: {rmse:.4f} | MAPE: {mape:.4f} | R²: {r2:.4f}")

Evaluating XGBoost Regressor...
Fitting 5 folds for each of 18 candidates, totalling 90 fits

Top 10 Most Important Features:
                            Feature  Importance
31                    cat__STATE_NC    0.094608
21                    cat__STATE_KY    0.089241
14                    cat__STATE_GA    0.080224
53                    cat__STATE_WI    0.058510
56      cat__STAT_CAUSE_DESCR_Arson    0.032693
58   cat__STAT_CAUSE_DESCR_Children    0.031774
40                    cat__STATE_OK    0.029956
62  cat__STAT_CAUSE_DESCR_Lightning    0.025077
9                     cat__STATE_CO    0.023798
1                    num__LONGITUDE    0.022922
Grid searching is done!
Best score (neg MSE): -1.3639341363905115
Best hyperparameters: {'algo__learning_rate': 0.1, 'algo__max_depth': 4, 'algo__n_estimators': 1000, 'algo__subsample': 0.8}
RMSE: 1.1656 | MAPE: 3.8892 | R²: 0.2444
