In [20]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder


In [None]:
from data_loader import load_fire_data
from feature_engineering import prepare_features

# Load raw data
df_raw = load_fire_data('../data/FPA_FOD_20170508.sqlite')



In [None]:
# Apply feature engineering
df_processed, scaler = prepare_features(df_raw)

# Inspect processed data
df_processed.head()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['DISCOVERY_HOUR'].fillna(df['DISCOVERY_HOUR'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['CONT_HOUR'].fillna(df['CONT_HOUR'].median(), inplace=True)


Unnamed: 0,OBJECTID,FOD_ID,FPA_ID,SOURCE_SYSTEM_TYPE,SOURCE_SYSTEM,NWCG_REPORTING_AGENCY,NWCG_REPORTING_UNIT_ID,NWCG_REPORTING_UNIT_NAME,SOURCE_REPORTING_UNIT,SOURCE_REPORTING_UNIT_NAME,...,STATE,COUNTY,Shape,DISCOVERY_HOUR,CONT_HOUR,FIRE_DURATION,DISCOVERY_HOUR_MISSING,CONT_HOUR_MISSING,SEASON,CAUSE_SIMPLE
0,1,1,FS-1418826,FED,FS-FIRESTAT,FS,USCAPNF,Plumas National Forest,511,Plumas National Forest,...,4,63,b'\x00\x01\xad\x10\x00\x00\xe8d\xc2\x92_@^\xc0...,-0.313641,0.417868,-0.090048,0,0,Winter,Unknown
1,2,2,FS-1418827,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,...,4,61,b'\x00\x01\xad\x10\x00\x00T\xb6\xeej\xe2\x19^\...,-1.547012,-0.043689,-0.090048,0,0,Spring,Unknown
2,3,3,FS-1418835,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,...,4,17,b'\x00\x01\xad\x10\x00\x00\xd0\xa5\xa0W\x13/^\...,1.166405,1.110203,-0.090048,0,0,Spring,Unknown
3,4,4,FS-1418845,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,...,4,3,b'\x00\x01\xad\x10\x00\x00\x94\xac\xa3\rt\xfa]...,0.426382,-0.274468,0.271755,0,0,Summer,Unknown
4,5,5,FS-1418847,FED,FS-FIRESTAT,FS,USCAENF,Eldorado National Forest,503,Eldorado National Forest,...,4,3,b'\x00\x01\xad\x10\x00\x00@\xe3\xaa.\xb7\xfb]\...,0.426382,-0.736025,0.271755,0,0,Summer,Unknown


In [5]:
print(df_processed.columns.tolist())

['OBJECTID', 'FOD_ID', 'FPA_ID', 'SOURCE_SYSTEM_TYPE', 'SOURCE_SYSTEM', 'NWCG_REPORTING_AGENCY', 'NWCG_REPORTING_UNIT_ID', 'NWCG_REPORTING_UNIT_NAME', 'SOURCE_REPORTING_UNIT', 'SOURCE_REPORTING_UNIT_NAME', 'FIRE_YEAR', 'DISCOVERY_DATE', 'DISCOVERY_DOY', 'DISCOVERY_TIME', 'STAT_CAUSE_CODE', 'STAT_CAUSE_DESCR', 'CONT_DATE', 'CONT_DOY', 'CONT_TIME', 'FIRE_SIZE', 'FIRE_SIZE_CLASS', 'LATITUDE', 'LONGITUDE', 'OWNER_CODE', 'OWNER_DESCR', 'STATE', 'COUNTY', 'Shape', 'DISCOVERY_HOUR', 'CONT_HOUR', 'FIRE_DURATION', 'DISCOVERY_HOUR_MISSING', 'CONT_HOUR_MISSING', 'SEASON', 'CAUSE_SIMPLE']


In [None]:
X = df_processed[[
    'LATITUDE', 'LONGITUDE', 'DISCOVERY_DOY', 'DISCOVERY_HOUR',
    'STATE', 'OWNER_DESCR', 'STAT_CAUSE_DESCR', 'SEASON', 'CAUSE_SIMPLE'
]]
y = df_processed['RISK_LEVEL']

In [10]:
# Convert categorical features to numeric (if not already)
X = pd.get_dummies(X, columns=['SEASON', 'CAUSE_SIMPLE'], drop_first=True)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predictions
y_pred = clf.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.6626551271846728

Classification Report:
               precision    recall  f1-score   support

           A       0.72      0.77      0.75     76320
           B       0.64      0.70      0.67     77243
           C       0.46      0.24      0.32     18507
           D       0.22      0.05      0.08      2876
           E       0.24      0.07      0.10      1680
           F       0.26      0.12      0.17      1124
           G       0.44      0.31      0.36       652

    accuracy                           0.66    178402
   macro avg       0.43      0.32      0.35    178402
weighted avg       0.64      0.66      0.65    178402


Confusion Matrix:
 [[58741 17056   434    26    23    18    22]
 [19502 54411  3133    89    40    30    38]
 [ 1979 11692  4463   194    81    62    36]
 [  357  1197   988   151    84    72    27]
 [  231   600   482   112   112    99    44]
 [  169   344   234    70    81   139    87]
 [   83   114    65    36    37   115   202]]


In [19]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Now do the train/test split on y_encoded
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Initialize XGBoost classifier
xgb_model = XGBClassifier(
    objective='multi:softmax',
    num_class=len(y.unique()),
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42
)

# Fit model
xgb_model.fit(X_train, y_train)

# Predict
y_pred = xgb_model.predict(X_test)



# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.6694824049057746

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.77      0.75     76167
           1       0.64      0.72      0.68     77350
           2       0.51      0.25      0.34     18505
           3       0.28      0.03      0.06      2924
           4       0.26      0.07      0.11      1709
           5       0.25      0.13      0.17      1117
           6       0.39      0.40      0.39       630

    accuracy                           0.67    178402
   macro avg       0.44      0.34      0.36    178402
weighted avg       0.65      0.67      0.65    178402


Confusion Matrix:
 [[58682 17107   294    12     5    24    43]
 [19247 55442  2492    23    39    56    51]
 [ 1782 11732  4698    89    68    77    59]
 [  377  1194  1051    96    99    67    40]
 [  263   590   500    66   117   108    65]
 [  188   292   204    51    88   149   145]
 [   88    78    54     9    29   119   253]]
