In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import joblib
from joblib import dump, load
from collections import Counter

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV


from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline



In [None]:
from data_loader import load_fire_data
from notebooks.Data_Preparation import prepare_features

# Load raw data
df_raw = load_fire_data('../data/FPA_FOD_20170508.sqlite')



In [3]:
# Apply feature engineering
df_processed, label_encoders = prepare_features(df_raw)

# Inspect processed data
df_processed.head()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['DISCOVERY_HOUR'].fillna(df['DISCOVERY_HOUR'].median(), inplace=True)


Unnamed: 0,OBJECTID,FOD_ID,FPA_ID,SOURCE_SYSTEM_TYPE,SOURCE_SYSTEM,NWCG_REPORTING_AGENCY,NWCG_REPORTING_UNIT_ID,NWCG_REPORTING_UNIT_NAME,SOURCE_REPORTING_UNIT,SOURCE_REPORTING_UNIT_NAME,...,LATITUDE,LONGITUDE,OWNER_CODE,OWNER_DESCR,STATE,COUNTY,Shape,DISCOVERY_HOUR,SEASON,CAUSE_SIMPLE
0,1,1,FS-1418826,FED,FS-FIRESTAT,FS,USCAPNF,Plumas National Forest,511,Plumas National Forest,...,40.036944,-121.005833,5.0,15,4,63,b'\x00\x01\xad\x10\x00\x00\xe8d\xc2\x92_@^\xc0...,13.0,3,2
1,2,2,FS-1418827,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,...,38.933056,-120.404444,5.0,15,4,61,b'\x00\x01\xad\x10\x00\x00T\xb6\xeej\xe2\x19^\...,8.0,1,1
2,3,3,FS-1418835,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,...,38.984167,-120.735556,13.0,12,4,17,b'\x00\x01\xad\x10\x00\x00\xd0\xa5\xa0W\x13/^\...,19.0,1,0
3,4,4,FS-1418845,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,...,38.559167,-119.913333,5.0,15,4,3,b'\x00\x01\xad\x10\x00\x00\x94\xac\xa3\rt\xfa]...,16.0,2,1
4,5,5,FS-1418847,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,...,38.559167,-119.933056,5.0,15,4,3,b'\x00\x01\xad\x10\x00\x00@\xe3\xaa.\xb7\xfb]\...,16.0,2,1


In [4]:
print(df_processed.columns.tolist())

['OBJECTID', 'FOD_ID', 'FPA_ID', 'SOURCE_SYSTEM_TYPE', 'SOURCE_SYSTEM', 'NWCG_REPORTING_AGENCY', 'NWCG_REPORTING_UNIT_ID', 'NWCG_REPORTING_UNIT_NAME', 'SOURCE_REPORTING_UNIT', 'SOURCE_REPORTING_UNIT_NAME', 'FIRE_YEAR', 'DISCOVERY_DATE', 'DISCOVERY_DOY', 'DISCOVERY_TIME', 'STAT_CAUSE_CODE', 'STAT_CAUSE_DESCR', 'FIRE_SIZE', 'FIRE_SIZE_CLASS', 'LATITUDE', 'LONGITUDE', 'OWNER_CODE', 'OWNER_DESCR', 'STATE', 'COUNTY', 'Shape', 'DISCOVERY_HOUR', 'SEASON', 'CAUSE_SIMPLE']


In [None]:
def evaluate_xgb(X_train, y_train, X_dev, y_dev):
    print("Evaluating XGBoost Regressor...")

    # Define the hyperparameter grid search to try combinations of these hyperparameters.
    param_grid = {
        'algo__n_estimators': [1000],
        'algo__max_depth': [2, 3, 4],
        'algo__learning_rate': [0.01, 0.05, 0.1], # smaller learning rate is possibly better as training consisitency increasees.
        'algo__subsample': [0.8, 1.0],

    }

    # This here uses the pipeline to handle missing values, scaling, encoding, etc for teh dataset.
    pipeline = create_fish_pipeline()

    # This combines the preprocessing and XGBoost model into one clean pipeline.
    pipeline_with_algo = Pipeline(steps=[
        ('preprocessor', pipeline),
        ('algo', XGBRegressor(
            objective='reg:squarederror',
            random_state=42
        ))
    ])

    grid_search = GridSearchCV(
        pipeline_with_algo, param_grid,
        cv=5,  # 5-fold cross-validation
        scoring='neg_mean_squared_error',  
        verbose=1  # Show progress in terminal
    )
    grid_search.fit(X_train, y_train)

    # This shows us our best model based on cross-validation R² score.
    best_estimator = grid_search.best_estimator_

    # 📊 FEATURE IMPORTANCE SECTION
    try:
        model = best_estimator.named_steps["algo"]
        preprocessor = best_estimator.named_steps["preprocessor"]
        feature_names = preprocessor.get_feature_names_out()
        importances = model.feature_importances_

        feature_df = pd.DataFrame({
            "Feature": feature_names,
            "Importance": importances
        }).sort_values(by="Importance", ascending=False)

        print("\nTop 10 Most Important Features:")
        print(feature_df.head(10))
    except Exception as e:
        print("Could not extract feature importances:", e)

    # We are making predicitons on the dev set here
    y_pred = best_estimator.predict(X_dev)

    # Here we are calculating the following values
    # Calculate evaluation metrics
    rmse = np.sqrt(mean_squared_error(y_dev, y_pred))
    mape = mean_absolute_percentage_error(y_dev, y_pred)
    r2 = r2_score(y_dev, y_pred)

    # Shows you the best performance from the training phase and the hyperparameters that gave it.
    print("Grid searching is done!")
    print("Best score (neg MSE):", grid_search.best_score_)
    print("Best hyperparameters:")
    print(grid_search.best_params_)

    return best_estimator, rmse, mape, r2

In [5]:

X = df_processed[[
    'LATITUDE', 'LONGITUDE', 'DISCOVERY_DOY', 'DISCOVERY_HOUR',
    'STATE', 'OWNER_DESCR', 'SEASON', 'STAT_CAUSE_DESCR', 'CAUSE_SIMPLE'
]]

y = df_processed['FIRE_SIZE']

In [7]:
# Encode target
label_encoder = LabelEncoder()

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,)

In [20]:
rf = RandomForestRegressor(
    n_estimators=100,
    random_state=42
)
rf.fit(X_train, y_train)

In [21]:
xgb = XGBRegressor(n_estimator=100, learning_rate=0.1, random_state=42)
xgb.fit(X_train, y_train)

Parameters: { "n_estimator" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [24]:
def evaluate_model(model, X_test, y_test, model_name="Model"):
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"{model_name} Evaluation:")
    print(f"RMSE: {rmse:.3f}")
    print(f"MAE: {mae:.3f}")
    print(f"R² : {r2:.3f}")
    return y_pred


In [25]:
rf_preds = evaluate_model(rf, X_test, y_test, "Random Forest")
xgb_preds = evaluate_model(xgb, X_test, y_test, "XCBoost")


Random Forest Evaluation:
RMSE: 1.135
MAE: 0.751
R² : 0.283
XCBoost Evaluation:
RMSE: 1.191
MAE: 0.810
R² : 0.211


In [8]:
# # Model
# clf = RandomForestClassifier(n_estimators=100, random_state=42)
# clf.fit(X_train, y_train)

# # Predictions
# y_pred = clf.predict(X_test)

# # Evaluation
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("\nClassification Report:\n", classification_report(y_test, y_pred))
# print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [9]:
# # Compute class weights
# classes = np.unique(y_train)
# weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
# class_weights = dict(zip(classes, weights))

# print("Class Weights:", class_weights)

In [10]:
# # Random Forest with class_weight
# rf_model = RandomForestClassifier(
#     n_estimators=100,
#     random_state=42,
#     class_weight=class_weights
# )

# rf_model.fit(X_train, y_train)
# y_pred_rf = rf_model.predict(X_test)

# # Evaluation
# print("Accuracy:", accuracy_score(y_test, y_pred_rf))
# print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
# print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

In [11]:
# print(X_train.dtypes)
# print(X_train.select_dtypes(include='object').head())

In [12]:
# # SMOTE
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# print("Original training set shape:", Counter(y_train))
# print("Resampled training set shape:", Counter(y_resampled))

In [13]:
# # Model training
# rf_smote = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_smote.fit(X_resampled, y_resampled)


In [14]:

# # Predict on the original test set
# y_pred_smote = rf_smote.predict(X_test)

# # Evaluate
# print("Accuracy:", accuracy_score(y_test, y_pred_smote))
# print("\nClassification Report:\n", classification_report(y_test, y_pred_smote))
# print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_smote))


In [15]:
# # Save the model
# dump(rf_smote, '../models/rf_wildfire_model.joblib')

# # Save the scaler
# dump(scaler, '../models/scaler.joblib')

# for col, encoder in label_encoders.items():
#     dump(encoder, f'../models/{col.lower()}_encoder.joblib')

# print("Model and scaler saved.")
# print("Scaler expects:", scaler.n_features_in_)


In [16]:
# # Get feature importances
# importances = rf_smote.feature_importances_
# feature_names = X_train.columns  # Make sure X_train is a DataFrame

# # Create a DataFrame for easier plotting
# feat_importance_df = pd.DataFrame({
#     'Feature': feature_names,
#     'Importance': importances
# }).sort_values(by='Importance', ascending=False)

# # Plot
# plt.figure(figsize=(10,6))
# plt.barh(feat_importance_df['Feature'], feat_importance_df['Importance'])
# plt.xlabel('Importance')
# plt.title('Feature Importances - Random Forest')
# plt.gca().invert_yaxis()
# plt.show()

In [18]:
# # Encode target labels
# label_encoder = LabelEncoder()
# y_encoded = label_encoder.fit_transform(y)

# # Train/test split
# X_train, X_test, y_train, y_test = train_test_split(
#     X_encoded, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
# )

# # Initialize XGBoost classifier
# xgb_model = XGBClassifier(
#     objective='multi:softmax',
#     num_class=len(label_encoder.classes_),
#     eval_metric='mlogloss',
#     use_label_encoder=False,
#     random_state=42
# )

# # Fit model
# xgb_model.fit(X_train, y_train)

# # Predict
# y_pred = xgb_model.predict(X_test)

# # Inverse transform predictions and true labels back to original classes
# y_test_labels = label_encoder.inverse_transform(y_test)
# y_pred_labels = label_encoder.inverse_transform(y_pred)

# # Now generate report with the original class names
# print("Accuracy:", accuracy_score(y_test_labels, y_pred_labels))
# print("\nClassification Report:\n", classification_report(y_test_labels, y_pred_labels))
# print("\nConfusion Matrix:\n", confusion_matrix(y_test_labels, y_pred_labels))


In [19]:
# label_encoder = LabelEncoder()
# label_encoder.fit(y)

# xgb_model = XGBClassifier(
#     objective='multi:softmax',
#     num_class=len(np.unique(y_resampled)),
#     eval_metric='mlogloss',
#     use_label_encoder=False,
#     random_state=42
# )

# xgb_model.fit(X_resampled, y_resampled)

# # Predictions on original test set
# y_pred = xgb_model.predict(X_test)

# # Inverse transform predictions and true labels if needed
# y_test_labels = label_encoder.inverse_transform(y_test)
# y_pred_labels = label_encoder.inverse_transform(y_pred)

# # Evaluation
# print("Accuracy:", accuracy_score(y_test_labels, y_pred_labels))
# print("\nClassification Report:\n", classification_report(y_test_labels, y_pred_labels))
# print("\nConfusion Matrix:\n", confusion_matrix(y_test_labels, y_pred_labels))