In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
from joblib import dump, load
from collections import Counter
from pipeline import create_wildfire_pipeline
from data_loader import load_fire_data
from Data_Preparation import prepare_features


from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV


from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline




In [2]:
def evaluate_xgb(X_train, y_train, X_dev, y_dev):
    print("Evaluating XGBoost Regressor...")

    param_grid = {
        'algo__n_estimators': [1000],
        'algo__max_depth': [2, 3, 4],
        'algo__learning_rate': [0.01, 0.05, 0.1],
        'algo__subsample': [0.8, 1.0],
    }

    pipeline = create_wildfire_pipeline(
        numeric_features=['LATITUDE', 'LONGITUDE', 'DISCOVERY_DOY', 'DISCOVERY_HOUR'],
        categorical_features=['STATE', 'STAT_CAUSE_DESCR', 'OWNER_DESCR', 'SEASON', 'CAUSE_SIMPLE']
    )

    pipeline_with_algo = Pipeline(steps=[
        ('preprocessor', pipeline),
        ('algo', XGBRegressor(objective='reg:squarederror', random_state=42))
    ])

    grid_search = GridSearchCV(
        pipeline_with_algo, param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        verbose=1
    )
    grid_search.fit(X_train, y_train)

    best_estimator = grid_search.best_estimator_

    # Feature importance
    try:
        model = best_estimator.named_steps["algo"]
        preprocessor = best_estimator.named_steps["preprocessor"]
        feature_names = preprocessor.get_feature_names_out()
        importances = model.feature_importances_

        feature_df = pd.DataFrame({
            "Feature": feature_names,
            "Importance": importances
        }).sort_values(by="Importance", ascending=False)

        print("\nTop 10 Most Important Features:")
        print(feature_df.head(10))
    except Exception as e:
        print("Could not extract feature importances:", e)

    y_pred = best_estimator.predict(X_dev)
    rmse = np.sqrt(mean_squared_error(y_dev, y_pred))
    mape = mean_absolute_percentage_error(y_dev, y_pred)
    r2 = r2_score(y_dev, y_pred)

    print("Grid searching is done!")
    print("Best score (neg MSE):", grid_search.best_score_)
    print("Best hyperparameters:", grid_search.best_params_)

    return best_estimator, rmse, mape, r2

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
import numpy as np
import pandas as pd

from pipeline import create_wildfire_pipeline

def evaluate_rf(X_train, y_train, X_dev, y_dev, tune_rows=100_000):
    print("Evaluating RF Regressor (memory-safe)...")

    # Subsample for tuning if dataset is big
    if len(X_train) > tune_rows:
        X_tune = X_train.sample(tune_rows, random_state=42)
        y_tune = y_train.loc[X_tune.index]
    else:
        X_tune, y_tune = X_train, y_train

    param_grid = {
        'algo__n_estimators': [100, 300],
        'algo__max_depth': [10, None],
        'algo__min_samples_split': [2, 5],
        'algo__min_samples_leaf': [1, 2],
        'algo__max_features': ['sqrt', 0.3],
        'algo__bootstrap': [True],
        'algo__max_samples': [0.5, 0.75, 1.0],  # 50%, 75%, or all rows
    }

    pre = create_wildfire_pipeline(
        numeric_features=['LATITUDE','LONGITUDE','DISCOVERY_DOY','DISCOVERY_HOUR'],
        categorical_features=['STATE','STAT_CAUSE_DESCR','OWNER_DESCR','SEASON','CAUSE_SIMPLE'],
        encoding="ordinal"   # <- key change
    )

    pipeline_with_algo = Pipeline(steps=[
        ('preprocessor', pre),
        ('algo', RandomForestRegressor(
            random_state=42,
            n_jobs=1,           # avoid double parallelism; let GridSearch parallelize
        ))
    ])

    grid_search = GridSearchCV(
        pipeline_with_algo,
        param_grid,
        cv=3,
        scoring='neg_mean_squared_error',
        verbose=0,
        n_jobs=-1,
        pre_dispatch='2*n_jobs',
        error_score='raise'
    )

    grid_search.fit(X_tune, y_tune)
    best_estimator = grid_search.best_estimator_

    # Retrain best model on full training set with maybe larger n_estimators
    best_params = grid_search.best_params_
    best_params['algo__n_estimators'] = max(best_params['algo__n_estimators'], 500)  # optional bump
    final_model = Pipeline(steps=[
        ('preprocessor', pre),
        ('algo', RandomForestRegressor(
            random_state=42,
            n_jobs=-1,
            **{k.replace('algo__',''): v for k, v in best_params.items() if k.startswith('algo__')}
        ))
    ])
    final_model.fit(X_train, y_train)

    # Evaluate
    y_pred = final_model.predict(X_dev)
    rmse = np.sqrt(mean_squared_error(y_dev, y_pred))
    mape = mean_absolute_percentage_error(y_dev, y_pred)
    r2 = r2_score(y_dev, y_pred)

    print("Grid search done.")
    print("Best hyperparameters (tune subset):", grid_search.best_params_)
    print(f"RMSE: {rmse:.4f} | MAPE: {mape:.4f} | R²: {r2:.4f}")

    # Optional: concise feature importances
    try:
        model = final_model.named_steps["algo"]
        preproc = final_model.named_steps["preprocessor"]
        feat_names = preproc.get_feature_names_out()
        importances = model.feature_importances_
        top = (pd.DataFrame({'Feature': feat_names, 'Importance': importances})
                 .sort_values('Importance', ascending=False).head(10))
        print("\nTop 10 Features:\n", top)
    except Exception as e:
        print("Could not extract feature importances:", e)

    return final_model, rmse, mape, r2


In [4]:
# Load & prep
df = load_fire_data('../data/FPA_FOD_20170508.sqlite')
df = prepare_features(df)

In [5]:

X = df[[
    'LATITUDE', 'LONGITUDE', 'DISCOVERY_DOY', 'DISCOVERY_HOUR',
    'STATE', 'OWNER_DESCR', 'SEASON', 'STAT_CAUSE_DESCR', 'CAUSE_SIMPLE'
]]

y = df['FIRE_SIZE']

In [6]:
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
# Train & evaluate
best_model, rmse, mape, r2 = evaluate_xgb(X_train, y_train, X_dev, y_dev)
print(f"RMSE: {rmse:.4f} | MAPE: {mape:.4f} | R²: {r2:.4f}")

Evaluating XGBoost Regressor...
Fitting 5 folds for each of 18 candidates, totalling 90 fits

Top 10 Most Important Features:
                            Feature  Importance
31                    cat__STATE_NC    0.094608
21                    cat__STATE_KY    0.089241
14                    cat__STATE_GA    0.080224
53                    cat__STATE_WI    0.058510
56      cat__STAT_CAUSE_DESCR_Arson    0.032693
58   cat__STAT_CAUSE_DESCR_Children    0.031774
40                    cat__STATE_OK    0.029956
62  cat__STAT_CAUSE_DESCR_Lightning    0.025077
9                     cat__STATE_CO    0.023798
1                    num__LONGITUDE    0.022922
Grid searching is done!
Best score (neg MSE): -1.3639341363905115
Best hyperparameters: {'algo__learning_rate': 0.1, 'algo__max_depth': 4, 'algo__n_estimators': 1000, 'algo__subsample': 0.8}
RMSE: 1.1656 | MAPE: 3.8892 | R²: 0.2444


In [7]:
# Train & evaluate
best_model, rmse, mape, r2 = evaluate_rf(X_train, y_train, X_dev, y_dev)
print(f"RMSE: {rmse:.4f} | MAPE: {mape:.4f} | R²: {r2:.4f}")

Evaluating RF Regressor (memory-safe)...




Grid search done.
Best hyperparameters (tune subset): {'algo__bootstrap': True, 'algo__max_depth': None, 'algo__max_features': 0.3, 'algo__max_samples': 0.5, 'algo__min_samples_leaf': 2, 'algo__min_samples_split': 2, 'algo__n_estimators': 500}
RMSE: 1.1220 | MAPE: 3.4825 | R²: 0.3000

Top 10 Features:
                                    Feature  Importance
1                           num__LONGITUDE    0.291671
0                            num__LATITUDE    0.265053
2                       num__DISCOVERY_DOY    0.160633
3                      num__DISCOVERY_HOUR    0.065566
56             cat__STAT_CAUSE_DESCR_Arson    0.012893
75  cat__OWNER_DESCR_MISSING/NOT SPECIFIED    0.008367
14                           cat__STATE_GA    0.008299
31                           cat__STATE_NC    0.008001
21                           cat__STATE_KY    0.007934
40                           cat__STATE_OK    0.007713
RMSE: 1.1220 | MAPE: 3.4825 | R²: 0.3000


In [7]:
# Train & evaluate
best_model, rmse, mape, r2 = evaluate_rf(X_train, y_train, X_dev, y_dev)
print(f"RMSE: {rmse:.4f} | MAPE: {mape:.4f} | R²: {r2:.4f}")

Evaluating RF Regressor (memory-safe)...




Grid search done.
Best hyperparameters (tune subset): {'algo__bootstrap': True, 'algo__max_depth': None, 'algo__max_features': 'sqrt', 'algo__max_samples': 0.5, 'algo__min_samples_leaf': 2, 'algo__min_samples_split': 2, 'algo__n_estimators': 500}
RMSE: 1.1208 | MAPE: 3.4817 | R²: 0.3015

Top 10 Features:
                  Feature  Importance
1         num__LONGITUDE    0.312866
0          num__LATITUDE    0.283024
2     num__DISCOVERY_DOY    0.169462
3    num__DISCOVERY_HOUR    0.066942
4             cat__STATE    0.050919
5  cat__STAT_CAUSE_DESCR    0.047276
6       cat__OWNER_DESCR    0.042870
7            cat__SEASON    0.013490
8      cat__CAUSE_SIMPLE    0.013150
RMSE: 1.1208 | MAPE: 3.4817 | R²: 0.3015
