# EDA for Wildfire detection - Part 2 (Dataset Integration)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt

from IPython.display import display
from scipy.stats import ttest_ind, norm, f
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
import statsmodels.api as sm
from sklearn.neighbors import NearestNeighbors

In [1]:
from myLib import check_missing, check_column_sources, check_missing_row_overlap, check_missing_and_plot, plot_missing_spatial, move_column_after

## 1. Prepare data resources (Cleaned)

Read the following csv files:
- Read Parsed_Table_3 into 'df_table3'
- Read Firegrowth_pts_v1_1_2023 into 'df_cfsds_pts_2023'
- Read modis_2023_Canada into 'df_modis_2023'
- Read viirs-snpp_2023_Canada into 'df_virrs_2023'
(Parsed_Table_3 is the csv version of Table 3 mentioned in "The Canadian Fire Spread Dataset.pdf")

In [None]:
## Read CFSDS pts 2023
path = "../output/df_cfsds_pts_2023_clean.csv"
df_cfsds_pts_2023 = pd.read_csv(path)
print(f"Load dataframe from {path}, \nShape: {df_cfsds_pts_2023.shape}")
display(df_cfsds_pts_2023.head())

## Read MODIS 2023 (Canada)
path = "../data/modis_2023_Canada.csv"
df_modis_2023 = pd.read_csv(path)
print(f"Load dataframe from {path}, \nShape: {df_modis_2023.shape}")
display(df_modis_2023.head())

# Read VIIRS 2023 (Canada)
path = "../data/viirs-snpp_2023_Canada.csv"
df_viirs_2023 = pd.read_csv(path)
print(f"Load dataframe from {path}, \nShape: {df_viirs_2023.shape}")
display(df_viirs_2023.head())

Load dataframe from output/df_cfsds_pts_2023_clean.csv, 
Shape: (9805201, 54)


Unnamed: 0,ID,DOB,year,fireday,firearea,ecozone,fwi,isi,ffmc,dmc,...,dem,slope,twi,aspect,cumuarea,pctgrowth,prevgrow,sprdistm,lon,lat
0,2023_1,199,2023,6,34.83,4.0,12.689084,3.413085,87.002037,57.022739,...,155.888885,0.635593,8.542518,19.364168,584.01,6.342183,122.31,82.564522,-120.243573,65.162707
1,2023_1,200,2023,7,8.91,4.0,12.562475,3.274614,87.34787,59.742805,...,159.111115,1.2954,8.378138,65.199997,592.92,1.525659,34.83,20.722663,-120.242807,65.161939
2,2023_1,199,2023,6,34.83,4.0,12.689084,3.413085,87.002037,57.022739,...,157.555557,1.379041,8.435522,3.923952,584.01,6.342183,122.31,82.564522,-120.240982,65.162261
3,2023_1,198,2023,5,122.31,4.0,17.889744,5.501473,87.002037,54.54068,...,158.777771,0.873803,8.435522,3.923952,549.18,28.652751,182.25,312.984996,-120.239156,65.162582
4,2023_1,198,2023,5,122.31,4.0,17.889744,5.501473,87.002037,54.54068,...,156.888885,1.15359,8.31071,333.121857,549.18,28.652751,182.25,312.984996,-120.23733,65.162904


Load dataframe from data/modis_2023_Canada.csv, 
Shape: (324003, 15)


Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_t31,frp,daynight,type
0,53.4971,-117.9781,375.0,1.5,1.2,2023-01-02,1915,Terra,MODIS,100,61.03,265.7,0.0,D,0
1,53.4998,-117.9604,361.8,1.5,1.2,2023-01-02,1915,Terra,MODIS,91,61.03,267.0,0.0,D,0
2,53.4929,-117.9582,377.4,1.5,1.2,2023-01-02,1915,Terra,MODIS,100,61.03,266.3,0.0,D,0
3,53.495,-117.9712,374.5,1.0,1.0,2023-01-02,2048,Aqua,MODIS,100,61.03,267.7,0.0,D,0
4,56.7794,-118.0159,314.9,1.3,1.1,2023-01-03,450,Terra,MODIS,87,61.03,266.6,28.1,N,0


Load dataframe from data/viirs-snpp_2023_Canada.csv, 
Shape: (1754727, 15)


Unnamed: 0,latitude,longitude,bright_ti4,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_ti5,frp,daynight,type
0,58.58209,-122.62513,321.46,0.57,0.43,2023-01-01,957,N,VIIRS,n,2,260.01,5.17,N,0
1,54.41019,-117.53371,339.81,0.39,0.36,2023-01-01,958,N,VIIRS,n,2,263.24,5.02,N,0
2,54.41105,-117.53967,338.31,0.39,0.36,2023-01-01,958,N,VIIRS,n,2,263.12,5.02,N,0
3,54.42263,-117.54747,343.21,0.39,0.36,2023-01-01,958,N,VIIRS,n,2,268.39,14.76,N,0
4,54.42507,-117.54012,337.76,0.39,0.36,2023-01-01,958,N,VIIRS,n,2,262.35,5.15,N,0


In [None]:
# Read CFSDS Table 3
path = "../data/Parsed_Table_3.csv"
df_table3 = pd.read_csv(path)
print(f"Load dataframe from {path}, \nShape: {df_table3.shape}")
display(df_table3.head())

Load dataframe from data/Parsed_Table_3.csv, 
Shape: (32, 4)


Unnamed: 0,attribute,description,mean ± sd,source
0,ID,Fire ID,,NBAC
1,DOB,Day of year burning,203.1 ± 28.5,CFSDS
2,year,Year,2012.5 ± 5.7,NBAC
3,fireday,Day of fire (ignition day -1),219.1 ± 21.1,CFSDS
4,fireera,Fire growth this day (ha),57.4 ± 583.3,CFSDS


## 2. Fetch all unique fire events from CFSDS (2023)

In [7]:
_unique_ids = df_cfsds_pts_2023['ID'].nunique()
print(f"Number of unique ID values: {_unique_ids}")

Number of unique ID values: 411


### 2.1 Create new dataframe to store enique fire events

- Create a new dataframe for each unique fire id (each row represents one unique fire event)

In [None]:
# Filter only rows where fireday == 1 (assumed to be ignition)
fire_events_df = df_cfsds_pts_2023[df_cfsds_pts_2023["fireday"] == 1].copy()

# Drop duplicates to keep only one row per fire ID
fire_events_df = fire_events_df.drop_duplicates(subset="ID").reset_index(drop=True)

# Extract fire_number from ID
fire_events_df["fire_number"] = fire_events_df["ID"].str.split("_").str[1].astype(int)

# Compute ignition_date from DOB (assuming year is 2023)
fire_events_df["ignition_date"] = pd.to_datetime("2023-01-01") + pd.to_timedelta(fire_events_df["DOB"] - 1, unit="D")

# Sort by year and fire_number
fire_events_df = fire_events_df.sort_values(by=["year", "fire_number"]).reset_index(drop=True)

# Reorder columns – put ID, year, fire_number, ignition_date, lon, lat at the front
cols = fire_events_df.columns.tolist()
new_order = ['ID', 'year', 'fire_number', 'ignition_date', 'lon', 'lat'] + \
            [col for col in cols if col not in ['ID', 'year', 'fire_number', 'ignition_date', 'lon', 'lat']]
fire_events_df = fire_events_df[new_order]

# Display the result
print(f"Fire events: shape={fire_events_df.shape}")
display(fire_events_df.head(10))

Fire events: shape=(408, 56)


Unnamed: 0,ID,year,fire_number,ignition_date,lon,lat,DOB,fireday,firearea,ecozone,...,nonfuel5k,nonfuel10k,dem,slope,twi,aspect,cumuarea,pctgrowth,prevgrow,sprdistm
0,2023_1,2023,1,2023-07-13,-120.132411,65.169616,194,1,55.89,4.0,...,0.38866,0.434968,160.333328,2.923349,8.141331,0.184916,55.89,0.0,0.0,843.571918
1,2023_2,2023,2,2023-07-21,-115.510326,66.083561,202,1,75.33,5.0,...,0.422344,0.414876,423.444458,7.380778,4.917335,67.092438,75.33,0.0,0.0,979.352515
2,2023_3,2023,3,2023-07-25,-116.750961,65.775854,206,1,34.02,5.0,...,0.315463,0.292883,386.222229,15.139147,5.079659,180.293304,34.02,0.0,0.0,658.145951
3,2023_10,2023,10,2023-07-01,-131.612507,65.975299,182,1,2.43,4.0,...,0.31733,0.253425,234.0,0.979205,5.813332,161.413071,2.43,0.0,0.0,175.896904
4,2023_11,2023,11,2023-07-29,-127.44726,67.987183,210,1,5.67,3.0,...,0.013489,0.086612,209.888885,2.824383,6.089895,29.511507,5.67,0.0,0.0,268.686959
5,2023_12,2023,12,2023-07-27,-128.01182,67.903474,208,1,1.62,4.0,...,0.082686,0.171423,245.888885,4.610787,5.581316,7.003726,1.62,0.0,0.0,143.619221
6,2023_13,2023,13,2023-07-29,-130.636332,67.60115,210,1,9.72,4.0,...,0.089316,0.11528,235.888885,1.053497,6.569538,101.103996,9.72,0.0,0.0,351.793809
7,2023_14,2023,14,2023-07-22,-129.851246,68.622171,203,1,59.94,4.0,...,0.028045,0.031579,200.555557,3.353917,6.300505,212.779419,59.94,0.0,0.0,873.601616
8,2023_15,2023,15,2023-07-08,-135.664836,67.183122,189,1,34.02,4.0,...,0.013413,0.040349,518.333313,8.221143,5.436932,118.024933,34.02,0.0,0.0,658.145951
9,2023_16,2023,16,2023-07-03,-131.482533,65.91942,184,1,271.35,4.0,...,0.300945,0.300156,248.111115,1.358212,6.176038,96.124161,271.35,0.0,0.0,1858.74568


### 2.2 Remove those fire events missing ignition day

- There are 3 fire events lack of ingition_day. These event will be skipped in this report.

In [None]:
# Get all unique fire IDs from the dataset
all_ids = set(df_cfsds_pts_2023['ID'].unique())

# Get fire IDs that have a row where fireday == 1 (assumed ignition)
ids_with_ignition = set(df_cfsds_pts_2023[df_cfsds_pts_2023['fireday'] == 1]['ID'].unique())

# Find fire IDs that are missing fireday == 1
ids_missing_ignition = all_ids - ids_with_ignition

# Print results
print(f"Number of fire IDs missing fireday == 1: {len(ids_missing_ignition)}")
print("Example IDs missing fireday == 1:")
print(list(ids_missing_ignition)[:10])  # Show first 10 missing IDs

Number of fire IDs missing fireday == 1: 3
Example IDs missing fireday == 1:
['2023_1721', '2023_102', '2023_1177']


### 2.3 Fetch and integrate detection day from MODIS dataset

- Merge detection day

In [None]:
# Extract MODIS hotspot coordinates and acquisition dates
modis_coords = df_modis_2023[["longitude", "latitude"]].to_numpy()
modis_dates = df_modis_2023["acq_date"].to_numpy()

# Build NearestNeighbors model using 1 nearest neighbor
nn_model = NearestNeighbors(n_neighbors=1, algorithm='ball_tree')
nn_model.fit(modis_coords)

# Match each fire event to the nearest MODIS hotspot
fire_coords = fire_events_df[["lon", "lat"]].to_numpy()
distances, indices = nn_model.kneighbors(fire_coords)

# Retrieve matched MODIS acquisition dates
matched_dates = modis_dates[indices.flatten()]

# Assign acquisition dates to fire events dataframe
fire_events_df["modis_acq_date"] = matched_dates

# Move 'modis_acq_date' column to be right after 'lat'
fire_events_df = move_column_after(fire_events_df, "modis_acq_date", "lat")

# Display result
display(fire_events_df.head(10))

Unnamed: 0,ID,year,fire_number,ignition_date,lon,lat,modis_acq_date,DOB,fireday,firearea,...,nonfuel5k,nonfuel10k,dem,slope,twi,aspect,cumuarea,pctgrowth,prevgrow,sprdistm
0,2023_1,2023,1,2023-07-13,-120.132411,65.169616,2023-07-14,194,1,55.89,...,0.38866,0.434968,160.333328,2.923349,8.141331,0.184916,55.89,0.0,0.0,843.571918
1,2023_2,2023,2,2023-07-21,-115.510326,66.083561,2023-07-22,202,1,75.33,...,0.422344,0.414876,423.444458,7.380778,4.917335,67.092438,75.33,0.0,0.0,979.352515
2,2023_3,2023,3,2023-07-25,-116.750961,65.775854,2023-07-26,206,1,34.02,...,0.315463,0.292883,386.222229,15.139147,5.079659,180.293304,34.02,0.0,0.0,658.145951
3,2023_10,2023,10,2023-07-01,-131.612507,65.975299,2023-07-03,182,1,2.43,...,0.31733,0.253425,234.0,0.979205,5.813332,161.413071,2.43,0.0,0.0,175.896904
4,2023_11,2023,11,2023-07-29,-127.44726,67.987183,2023-08-02,210,1,5.67,...,0.013489,0.086612,209.888885,2.824383,6.089895,29.511507,5.67,0.0,0.0,268.686959
5,2023_12,2023,12,2023-07-27,-128.01182,67.903474,2023-07-31,208,1,1.62,...,0.082686,0.171423,245.888885,4.610787,5.581316,7.003726,1.62,0.0,0.0,143.619221
6,2023_13,2023,13,2023-07-29,-130.636332,67.60115,2023-07-30,210,1,9.72,...,0.089316,0.11528,235.888885,1.053497,6.569538,101.103996,9.72,0.0,0.0,351.793809
7,2023_14,2023,14,2023-07-22,-129.851246,68.622171,2023-07-22,203,1,59.94,...,0.028045,0.031579,200.555557,3.353917,6.300505,212.779419,59.94,0.0,0.0,873.601616
8,2023_15,2023,15,2023-07-08,-135.664836,67.183122,2023-07-10,189,1,34.02,...,0.013413,0.040349,518.333313,8.221143,5.436932,118.024933,34.02,0.0,0.0,658.145951
9,2023_16,2023,16,2023-07-03,-131.482533,65.91942,2023-07-04,184,1,271.35,...,0.300945,0.300156,248.111115,1.358212,6.176038,96.124161,271.35,0.0,0.0,1858.74568


- Calculate and fill the detection delay

In [36]:
# Calculate delay days
fire_events_df["detection_delay_days"] = (
    pd.to_datetime(fire_events_df["modis_acq_date"]) -
    pd.to_datetime(fire_events_df["ignition_date"])
).dt.days

fire_events_df = move_column_after(fire_events_df, "detection_delay_days", "modis_acq_date")

display(fire_events_df.head(10))


Unnamed: 0,ID,year,fire_number,ignition_date,lon,lat,modis_acq_date,detection_delay_days,DOB,fireday,...,nonfuel5k,nonfuel10k,dem,slope,twi,aspect,cumuarea,pctgrowth,prevgrow,sprdistm
0,2023_1,2023,1,2023-07-13,-120.132411,65.169616,2023-07-14,1,194,1,...,0.38866,0.434968,160.333328,2.923349,8.141331,0.184916,55.89,0.0,0.0,843.571918
1,2023_2,2023,2,2023-07-21,-115.510326,66.083561,2023-07-22,1,202,1,...,0.422344,0.414876,423.444458,7.380778,4.917335,67.092438,75.33,0.0,0.0,979.352515
2,2023_3,2023,3,2023-07-25,-116.750961,65.775854,2023-07-26,1,206,1,...,0.315463,0.292883,386.222229,15.139147,5.079659,180.293304,34.02,0.0,0.0,658.145951
3,2023_10,2023,10,2023-07-01,-131.612507,65.975299,2023-07-03,2,182,1,...,0.31733,0.253425,234.0,0.979205,5.813332,161.413071,2.43,0.0,0.0,175.896904
4,2023_11,2023,11,2023-07-29,-127.44726,67.987183,2023-08-02,4,210,1,...,0.013489,0.086612,209.888885,2.824383,6.089895,29.511507,5.67,0.0,0.0,268.686959
5,2023_12,2023,12,2023-07-27,-128.01182,67.903474,2023-07-31,4,208,1,...,0.082686,0.171423,245.888885,4.610787,5.581316,7.003726,1.62,0.0,0.0,143.619221
6,2023_13,2023,13,2023-07-29,-130.636332,67.60115,2023-07-30,1,210,1,...,0.089316,0.11528,235.888885,1.053497,6.569538,101.103996,9.72,0.0,0.0,351.793809
7,2023_14,2023,14,2023-07-22,-129.851246,68.622171,2023-07-22,0,203,1,...,0.028045,0.031579,200.555557,3.353917,6.300505,212.779419,59.94,0.0,0.0,873.601616
8,2023_15,2023,15,2023-07-08,-135.664836,67.183122,2023-07-10,2,189,1,...,0.013413,0.040349,518.333313,8.221143,5.436932,118.024933,34.02,0.0,0.0,658.145951
9,2023_16,2023,16,2023-07-03,-131.482533,65.91942,2023-07-04,1,184,1,...,0.300945,0.300156,248.111115,1.358212,6.176038,96.124161,271.35,0.0,0.0,1858.74568


In [37]:
fire_events_df_2023 = fire_events_df

- Save csv

In [38]:
# Save cleaned DataFrame to CSV
path = "output/fire_events_df_2023.csv"
fire_events_df_2023.to_csv(path, index=False)

print(f"Saved cleaned DataFrame to {path}")

Saved cleaned DataFrame to output/fire_events_df_2023.csv


### Test MLR with the integrated dataframe

- MLR: Try to find if there are relations between the columns and detection_delay_days in the integrated dataframe.

In [None]:
# Drop rows with missing values in the relevant columns
df = fire_events_df.dropna(subset=["detection_delay_days"])

# Select numeric columns only (excluding ID and coordinates)
exclude_cols = ["ID", "year", "fire_number", "ignition_date", "modis_acq_date", 
                "DOB", "fireday", "lon", "lat", "detection_delay_days"]

numeric_cols = df.select_dtypes(include=["number"]).columns
predictors = [col for col in numeric_cols if col not in exclude_cols]

# Define X and y
X = df[predictors]
y = df["detection_delay_days"]

# Add constant term to the predictors
X = sm.add_constant(X)

# Fit MLR model 
model = sm.OLS(y, X).fit()

print(model.summary())

                             OLS Regression Results                             
Dep. Variable:     detection_delay_days   R-squared:                       0.107
Model:                              OLS   Adj. R-squared:                 -0.001
Method:                   Least Squares   F-statistic:                    0.9892
Date:                  Fri, 04 Jul 2025   Prob (F-statistic):              0.496
Time:                          23:55:29   Log-Likelihood:                -1912.3
No. Observations:                   408   AIC:                             3915.
Df Residuals:                       363   BIC:                             4095.
Df Model:                            44                                         
Covariance Type:              nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.0238     

## 3. Summary
We selected all unique fire events from the CFSDS 2023 dataset and calculated the detection delay in days by comparing the ignition date from CFSDS with the detection date from MODIS 2023. Finally, we performed multiple linear regression (MLR) using all available features to explain detection delay. The results show that these features can only explain 10.7% of the variation in detection delay. This suggests that, when using satellite sensors to detect wildfires, detection delays may be difficult to predict based on the available features.