# Data preparation
This notebook merges the [Canadian Wildfire Dataset](https://www.kaggle.com/datasets/ulasozdemir/wildfires-in-canada-19502021) from Kaggle and [Canadian Wildland Fire Information System](https://cwfis.cfs.nrcan.gc.ca/downloads/activefires/) with weather data from Environment and Climate Change Canada (ECCC) and Meteorological Service of Canada (MSC) by using the [MSC GeoMet API](https://eccc-msc.github.io/open-data/msc-geomet/readme_en/)

In [1]:
import pandas as pd
import numpy as np

# Merge dataset of wildfires with nearest weather station
## Station data

In [2]:
stations_df = pd.read_csv('stations.csv')

In [3]:
# Convert to standard decimal degrees
stations_df['LATITUDE'] = stations_df['LATITUDE'] / 10000000
stations_df['LONGITUDE'] = stations_df['LONGITUDE'] / 10000000

In [4]:
# Create coordinates column, merges both latitude and longitude with ',' as a seperator
stations_df['COORDINATES'] = stations_df['LATITUDE'].astype(str) + ',' + stations_df['LONGITUDE'].astype(str)

In [5]:
stations_df

Unnamed: 0,STN_ID,STATION_NAME,PROV_STATE_TERR_CODE,ENG_PROV_NAME,FRE_PROV_NAME,COUNTRY,LATITUDE,LONGITUDE,TIMEZONE,ELEVATION,...,HLY_FIRST_DATE,HLY_LAST_DATE,DLY_FIRST_DATE,DLY_LAST_DATE,MLY_FIRST_DATE,MLY_LAST_DATE,HAS_MONTHLY_SUMMARY,HAS_NORMALS_DATA,HAS_HOURLY_DATA,COORDINATES
0,5679,FIRE LAKE,QC,QUEBEC,QUÉBEC,CAN,52.2000,-67.2200,EST,609.6,...,,,1974-07-01 00:00:00,1974-08-31 00:00:00,1974-01-01 00:00:00,1974-12-01 00:00:00,Y,N,N,"52.2,-67.22"
1,5680,FORESTVILLE,QC,QUEBEC,QUÉBEC,CAN,48.4400,-69.0500,EST,76.2,...,,,1963-01-01 00:00:00,1996-10-31 00:00:00,1963-01-01 00:00:00,1996-10-01 00:00:00,Y,N,N,"48.44,-69.05"
2,5682,FORET MONTMORENCY,QC,QUEBEC,QUÉBEC,CAN,47.1900,-71.0900,EST,640.0,...,,,1965-07-01 00:00:00,2002-01-31 00:00:00,1965-01-01 00:00:00,2002-01-01 00:00:00,Y,N,N,"47.19,-71.09"
3,5687,GRANDES BERGERONNES,QC,QUEBEC,QUÉBEC,CAN,48.1500,-69.3100,EST,61.0,...,,,1951-08-01 00:00:00,2023-02-28 00:00:00,1951-01-01 00:00:00,2015-11-01 00:00:00,Y,Y,N,"48.15,-69.31"
4,5688,GRAND FONDS,QC,QUEBEC,QUÉBEC,CAN,47.4500,-70.0700,EST,365.8,...,,,1968-10-01 00:00:00,1991-05-31 00:00:00,1968-01-01 00:00:00,1991-11-01 00:00:00,Y,N,N,"47.45,-70.07"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8547,61,JAMES ISLAND,BC,BRITISH COLUMBIA,COLOMBIE-BRITANNIQUE,CAN,48.3600,-123.2100,PST,53.6,...,,,1914-02-01 00:00:00,1978-09-30 00:00:00,1914-01-01 00:00:00,1978-12-01 00:00:00,Y,N,N,"48.36,-123.21"
8548,78,SAANICHTON MT NEWTON,BC,BRITISH COLUMBIA,COLOMBIE-BRITANNIQUE,CAN,48.3551,-123.2538,PST,42.7,...,,,1980-08-01 00:00:00,2024-07-12 00:00:00,1980-01-01 00:00:00,2007-02-01 00:00:00,Y,Y,N,"48.3551,-123.2538"
8549,85,SAANICH DOM ASTRO OBS,BC,BRITISH COLUMBIA,COLOMBIE-BRITANNIQUE,CAN,48.3100,-123.2500,PST,222.5,...,,,1916-12-01 00:00:00,1977-06-30 00:00:00,1916-01-01 00:00:00,1977-12-01 00:00:00,Y,N,N,"48.31,-123.25"
8550,87,SAANICHTON CDA,BC,BRITISH COLUMBIA,COLOMBIE-BRITANNIQUE,CAN,48.3718,-123.2508,PST,61.0,...,,,1914-03-01 00:00:00,2023-02-09 00:00:00,1914-01-01 00:00:00,2007-02-01 00:00:00,Y,Y,N,"48.3718,-123.2508"


Review data for stations that are at the same location

In [6]:
# Duplicates in location
duplicates_df = stations_df[stations_df.duplicated(subset='COORDINATES', keep=False)]
duplicates_df = duplicates_df[['STATION_NAME', 'PROV_STATE_TERR_CODE', 'LATITUDE', 'LONGITUDE', 'COORDINATES', 'FIRST_DATE', 'LAST_DATE', 'ELEVATION', 'CLIMATE_IDENTIFIER']]
duplicates_df

Unnamed: 0,STATION_NAME,PROV_STATE_TERR_CODE,LATITUDE,LONGITUDE,COORDINATES,FIRST_DATE,LAST_DATE,ELEVATION,CLIMATE_IDENTIFIER
5,HAVRE-SAINT-PIERRE A,QC,50.1700,-63.3600,"50.17,-63.36",1983-12-01 00:00:00,2008-07-24 09:00:00,37.8,7043018
32,MURDOCHVILLE HOLLAND,QC,48.5700,-65.3100,"48.57,-65.31",1973-01-01 00:00:00,1974-12-01 00:00:00,624.8,705ECH0
45,ETANG A LA TRUITE,QC,48.4100,-66.4700,"48.41,-66.47",1982-01-01 00:00:00,1983-12-01 00:00:00,337.1,7052C09
59,KIRKLAND LAKE,ON,48.0900,-80.0000,"48.09,-80.0",1950-01-01 00:00:00,1996-12-31 00:00:00,324.0,6074209
64,NEWMARKET,ON,44.0400,-79.2600,"44.04,-79.26",1871-01-01 00:00:00,1967-12-01 00:00:00,281.9,6155616
...,...,...,...,...,...,...,...,...,...
8499,TOFINO A,BC,49.0456,-125.4621,"49.0456,-125.4621",2014-10-23 11:00:00,2024-07-15 12:30:02,24.4,1038210
8500,STEWART A,BC,55.5600,-129.5900,"55.56,-129.59",2015-01-15 11:00:00,2024-07-15 12:30:02,7.3,1067741
8501,BLUE RIVER A,BC,52.0729,-119.1734,"52.0729,-119.1734",2015-01-15 11:00:00,2024-07-15 12:30:02,690.4,1160898
8503,BLACKCOMB MOUNTAIN 1540,BC,50.0600,-122.5500,"50.06,-122.55",1983-01-01 00:00:00,1994-12-01 00:00:00,1540.2,1100Q7Q


Each wildfire can only be associated to a single station. We want to sort the stations by the latest date, and only keep the station with the most recent data for each coordinate point

In [7]:
# Convert LAST_DATE to date
stations_df['LAST_DATE'] = pd.to_datetime(stations_df['LAST_DATE']).dt.date

In [8]:
# Sort from earliest to latest
stations_df = stations_df.sort_values(by='LAST_DATE')

In [9]:
stations_df

Unnamed: 0,STN_ID,STATION_NAME,PROV_STATE_TERR_CODE,ENG_PROV_NAME,FRE_PROV_NAME,COUNTRY,LATITUDE,LONGITUDE,TIMEZONE,ELEVATION,...,HLY_FIRST_DATE,HLY_LAST_DATE,DLY_FIRST_DATE,DLY_LAST_DATE,MLY_FIRST_DATE,MLY_LAST_DATE,HAS_MONTHLY_SUMMARY,HAS_NORMALS_DATA,HAS_HOURLY_DATA,COORDINATES
608,6776,FORTEAU,NL,NEWFOUNDLAND,TERRE-NEUVE,CAN,51.2800,-56.5800,NST,,...,,,1871-09-01 00:00:00,1878-09-30 00:00:00,1871-01-01 00:00:00,1878-12-01 00:00:00,Y,N,N,"51.28,-56.58"
1456,4770,GRANTON,ON,ONTARIO,ONTARIO,CAN,43.1200,-81.2000,EST,315.5,...,,,1873-01-01 00:00:00,1886-12-31 00:00:00,1873-01-01 00:00:00,1886-12-01 00:00:00,Y,N,N,"43.12,-81.2"
6541,4253,CORNWALL,ON,ONTARIO,ONTARIO,CAN,45.0100,-74.4400,EST,53.3,...,,,1867-01-01 00:00:00,1887-12-31 00:00:00,1867-01-01 00:00:00,1887-12-01 00:00:00,Y,N,N,"45.01,-74.44"
6661,5344,CRANBOURNE,QC,QUEBEC,QUÉBEC,CAN,46.2300,-70.4200,EST,,...,,,1875-07-01 00:00:00,1890-11-30 00:00:00,1875-01-01 00:00:00,1890-12-01 00:00:00,Y,N,N,"46.23,-70.42"
6799,4788,LONDON,ON,ONTARIO,ONTARIO,CAN,42.5900,-81.1200,EST,246.3,...,,,1871-11-01 00:00:00,1891-12-31 00:00:00,1871-01-01 00:00:00,1891-12-01 00:00:00,Y,N,N,"42.59,-81.12"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7760,51077,CLYDE RIVER A,NU,NUNAVUT,NUNAVUT,CAN,70.2909,-68.3101,EST,26.5,...,2013-01-10 11:00:00,2024-07-15 12:30:02,2013-01-10 00:00:00,2024-07-15 00:00:00,,,Y,N,Y,"70.2909,-68.3101"
7759,50822,LA GRANDE RIVIERE A,QC,QUEBEC,QUÉBEC,CAN,53.3731,-77.4215,EST,194.8,...,2012-11-08 10:00:00,2024-07-15 12:30:02,2012-11-08 00:00:00,2024-07-15 00:00:00,,,Y,N,Y,"53.3731,-77.4215"
7758,50722,MUSKRAT DAM,ON,ONTARIO,ONTARIO,CAN,53.2629,-91.4546,EST,277.7,...,2012-11-13 10:00:00,2024-07-15 12:30:02,2012-11-15 00:00:00,2024-07-15 00:00:00,,,Y,N,N,"53.2629,-91.4546"
1357,50090,ROUYN-NORANDA A,QC,QUEBEC,QUÉBEC,CAN,48.1222,-78.5008,EST,301.1,...,2012-12-03 12:00:00,2024-07-15 12:30:02,2018-10-30 00:00:00,2024-07-15 00:00:00,,,Y,N,Y,"48.1222,-78.5008"


In [10]:
# Unique coordinates only, keep only last occurrence of duplicated row (station with most recent data)
unique_stations_df = stations_df.drop_duplicates(subset='COORDINATES', keep='last')

In [11]:
# Drop useless columns
unique_stations_df = unique_stations_df[['STATION_NAME', 'PROV_STATE_TERR_CODE', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'CLIMATE_IDENTIFIER']]
unique_stations_df

Unnamed: 0,STATION_NAME,PROV_STATE_TERR_CODE,LATITUDE,LONGITUDE,ELEVATION,CLIMATE_IDENTIFIER
608,FORTEAU,NL,51.2800,-56.5800,,8501615
1456,GRANTON,ON,43.1200,-81.2000,315.5,6142993
6541,CORNWALL,ON,45.0100,-74.4400,53.3,6101872
6661,CRANBOURNE,QC,46.2300,-70.4200,,7021952
6799,LONDON,ON,42.5900,-81.1200,246.3,6144470
...,...,...,...,...,...,...
7760,CLYDE RIVER A,NU,70.2909,-68.3101,26.5,2400804
7759,LA GRANDE RIVIERE A,QC,53.3731,-77.4215,194.8,7093716
7758,MUSKRAT DAM,ON,53.2629,-91.4546,277.7,6015026
1357,ROUYN-NORANDA A,QC,48.1222,-78.5008,301.1,7086719


## Preparing wildfire dataset

In [12]:
wildfire_df = pd.read_csv('2000-2021+2023-2024.csv')

In [13]:
# Drop response type column
wildfire_df.drop('response_type', axis=1, inplace=True)

In [14]:
# Add new column fire_id which will keep track of unique fires after merging
wildfire_df['fire_id'] = wildfire_df.index
wildfire_df

Unnamed: 0,agency,lat,lon,date,hectares,cause,fire_id
0,BC,50.9050,-126.9292,2000-03-10,6.00,H,0
1,BC,49.2971,-122.2321,2000-08-21,0.20,H,1
2,BC,55.8000,-124.8167,2005-06-20,0.01,L,2
3,BC,49.8770,-121.5730,2000-08-06,0.10,L,3
4,BC,49.0667,-121.8333,2000-07-28,0.10,H,4
...,...,...,...,...,...,...,...
148756,AK,65.0282,-141.2460,2024-05-15,0.00,,148756
148757,BC,55.9370,-121.9170,2024-05-12,295.00,,148757
148758,PC,60.2150,-112.8720,2024-04-26,0.10,,148758
148759,PC,60.1950,-112.7960,2024-04-26,0.10,,148759


In [15]:
wildfire_df

Unnamed: 0,agency,lat,lon,date,hectares,cause,fire_id
0,BC,50.9050,-126.9292,2000-03-10,6.00,H,0
1,BC,49.2971,-122.2321,2000-08-21,0.20,H,1
2,BC,55.8000,-124.8167,2005-06-20,0.01,L,2
3,BC,49.8770,-121.5730,2000-08-06,0.10,L,3
4,BC,49.0667,-121.8333,2000-07-28,0.10,H,4
...,...,...,...,...,...,...,...
148756,AK,65.0282,-141.2460,2024-05-15,0.00,,148756
148757,BC,55.9370,-121.9170,2024-05-12,295.00,,148757
148758,PC,60.2150,-112.8720,2024-04-26,0.10,,148758
148759,PC,60.1950,-112.7960,2024-04-26,0.10,,148759


In [16]:
# Merging datasets takes too much memory, so chunk down the task in 3
# small_wildfire_df = wildfire_df[:50000]
# small_wildfire_df = wildfire_df[50000:100000]
small_wildfire_df = wildfire_df[100000:]

In [17]:
# Reorganize columns and drop cause
small_wildfire_df = small_wildfire_df[['fire_id', 'agency', 'lat', 'lon', 'date', 'hectares']]
small_wildfire_df

Unnamed: 0,fire_id,agency,lat,lon,date,hectares
100000,100000,ON,52.3500,-88.6450,2015-06-30,2.0
100001,100001,ON,52.0929,-88.4385,2015-06-26,7.0
100002,100002,ON,52.4716,-89.8217,2015-06-27,35.0
100003,100003,ON,52.5713,-89.0429,2015-06-29,1.0
100004,100004,ON,52.3231,-90.3257,2016-07-21,15.2
...,...,...,...,...,...,...
148756,148756,AK,65.0282,-141.2460,2024-05-15,0.0
148757,148757,BC,55.9370,-121.9170,2024-05-12,295.0
148758,148758,PC,60.2150,-112.8720,2024-04-26,0.1
148759,148759,PC,60.1950,-112.7960,2024-04-26,0.1


In [18]:
# Select columns for wildfire_df
wildfire_df = wildfire_df[['fire_id', 'agency', 'lat', 'lon', 'date', 'hectares']]
wildfire_df

Unnamed: 0,fire_id,agency,lat,lon,date,hectares
0,0,BC,50.9050,-126.9292,2000-03-10,6.00
1,1,BC,49.2971,-122.2321,2000-08-21,0.20
2,2,BC,55.8000,-124.8167,2005-06-20,0.01
3,3,BC,49.8770,-121.5730,2000-08-06,0.10
4,4,BC,49.0667,-121.8333,2000-07-28,0.10
...,...,...,...,...,...,...
148756,148756,AK,65.0282,-141.2460,2024-05-15,0.00
148757,148757,BC,55.9370,-121.9170,2024-05-12,295.00
148758,148758,PC,60.2150,-112.8720,2024-04-26,0.10
148759,148759,PC,60.1950,-112.7960,2024-04-26,0.10


## Haversine distance

In [19]:
# Haversine formula
def haversine(lon1, lat1, lon2, lat2):
    lati = lat2 - lat1
    long = lon2 - lon1

    haver_formula = np.sin(lati/2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(long / 2.0) ** 2

    dist = 2 * np.arcsin(np.sqrt(haver_formula ))
    # use appropriate value for radius of the earth (this is crude!)
    km = 6367 * dist #6367 for distance in KM for miles use 3958
    return km

In [20]:
# Cross-joining wildfires
df = small_wildfire_df.merge(unique_stations_df, how='cross')

In [21]:
df

Unnamed: 0,fire_id,agency,lat,lon,date,hectares,STATION_NAME,PROV_STATE_TERR_CODE,LATITUDE,LONGITUDE,ELEVATION,CLIMATE_IDENTIFIER
0,100000,ON,52.350,-88.645,2015-06-30,2.0,FORTEAU,NL,51.2800,-56.5800,,8501615
1,100000,ON,52.350,-88.645,2015-06-30,2.0,GRANTON,ON,43.1200,-81.2000,315.5,6142993
2,100000,ON,52.350,-88.645,2015-06-30,2.0,CORNWALL,ON,45.0100,-74.4400,53.3,6101872
3,100000,ON,52.350,-88.645,2015-06-30,2.0,CRANBOURNE,QC,46.2300,-70.4200,,7021952
4,100000,ON,52.350,-88.645,2015-06-30,2.0,LONDON,ON,42.5900,-81.1200,246.3,6144470
...,...,...,...,...,...,...,...,...,...,...,...,...
393452504,148760,NT,60.392,-117.896,2024-05-05,3.0,CLYDE RIVER A,NU,70.2909,-68.3101,26.5,2400804
393452505,148760,NT,60.392,-117.896,2024-05-05,3.0,LA GRANDE RIVIERE A,QC,53.3731,-77.4215,194.8,7093716
393452506,148760,NT,60.392,-117.896,2024-05-05,3.0,MUSKRAT DAM,ON,53.2629,-91.4546,277.7,6015026
393452507,148760,NT,60.392,-117.896,2024-05-05,3.0,ROUYN-NORANDA A,QC,48.1222,-78.5008,301.1,7086719


In [22]:
# Convert fire and weather climate station to radians using haversine
df[['fire_lat_radians', 'fire_lon_radians']] = np.radians(df.loc[:, ['lat', 'lon']])
df[['clim_lat_radians', 'clim_lon_radians']] = np.radians(df.loc[:, ['LATITUDE', 'LONGITUDE']])

In [23]:
df

Unnamed: 0,fire_id,agency,lat,lon,date,hectares,STATION_NAME,PROV_STATE_TERR_CODE,LATITUDE,LONGITUDE,ELEVATION,CLIMATE_IDENTIFIER,fire_lat_radians,fire_lon_radians,clim_lat_radians,clim_lon_radians
0,100000,ON,52.350,-88.645,2015-06-30,2.0,FORTEAU,NL,51.2800,-56.5800,,8501615,0.913680,-1.547147,0.895005,-0.987507
1,100000,ON,52.350,-88.645,2015-06-30,2.0,GRANTON,ON,43.1200,-81.2000,315.5,6142993,0.913680,-1.547147,0.752586,-1.417207
2,100000,ON,52.350,-88.645,2015-06-30,2.0,CORNWALL,ON,45.0100,-74.4400,53.3,6101872,0.913680,-1.547147,0.785573,-1.299223
3,100000,ON,52.350,-88.645,2015-06-30,2.0,CRANBOURNE,QC,46.2300,-70.4200,,7021952,0.913680,-1.547147,0.806866,-1.229061
4,100000,ON,52.350,-88.645,2015-06-30,2.0,LONDON,ON,42.5900,-81.1200,246.3,6144470,0.913680,-1.547147,0.743336,-1.415811
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393452504,148760,NT,60.392,-117.896,2024-05-05,3.0,CLYDE RIVER A,NU,70.2909,-68.3101,26.5,2400804,1.054039,-2.057673,1.226808,-1.192236
393452505,148760,NT,60.392,-117.896,2024-05-05,3.0,LA GRANDE RIVIERE A,QC,53.3731,-77.4215,194.8,7093716,1.054039,-2.057673,0.931536,-1.351260
393452506,148760,NT,60.392,-117.896,2024-05-05,3.0,MUSKRAT DAM,ON,53.2629,-91.4546,277.7,6015026,1.054039,-2.057673,0.929613,-1.596184
393452507,148760,NT,60.392,-117.896,2024-05-05,3.0,ROUYN-NORANDA A,QC,48.1222,-78.5008,301.1,7086719,1.054039,-2.057673,0.839891,-1.370097


In [24]:
# Create new column containing distance in km
df['distance'] = haversine(df['fire_lon_radians'], df['fire_lat_radians'], df['clim_lon_radians'], df['clim_lat_radians'])

In [25]:
df

Unnamed: 0,fire_id,agency,lat,lon,date,hectares,STATION_NAME,PROV_STATE_TERR_CODE,LATITUDE,LONGITUDE,ELEVATION,CLIMATE_IDENTIFIER,fire_lat_radians,fire_lon_radians,clim_lat_radians,clim_lon_radians,distance
0,100000,ON,52.350,-88.645,2015-06-30,2.0,FORTEAU,NL,51.2800,-56.5800,,8501615,0.913680,-1.547147,0.895005,-0.987507,2187.912550
1,100000,ON,52.350,-88.645,2015-06-30,2.0,GRANTON,ON,43.1200,-81.2000,315.5,6142993,0.913680,-1.547147,0.752586,-1.417207,1165.459327
2,100000,ON,52.350,-88.645,2015-06-30,2.0,CORNWALL,ON,45.0100,-74.4400,53.3,6101872,0.913680,-1.547147,0.785573,-1.299223,1319.538896
3,100000,ON,52.350,-88.645,2015-06-30,2.0,CRANBOURNE,QC,46.2300,-70.4200,,7021952,0.913680,-1.547147,0.806866,-1.229061,1480.046118
4,100000,ON,52.350,-88.645,2015-06-30,2.0,LONDON,ON,42.5900,-81.1200,246.3,6144470,0.913680,-1.547147,0.743336,-1.415811,1221.496902
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393452504,148760,NT,60.392,-117.896,2024-05-05,3.0,CLYDE RIVER A,NU,70.2909,-68.3101,26.5,2400804,1.054039,-2.057673,1.226808,-1.192236,2456.121125
393452505,148760,NT,60.392,-117.896,2024-05-05,3.0,LA GRANDE RIVIERE A,QC,53.3731,-77.4215,194.8,7093716,1.054039,-2.057673,0.931536,-1.351260,2531.919884
393452506,148760,NT,60.392,-117.896,2024-05-05,3.0,MUSKRAT DAM,ON,53.2629,-91.4546,277.7,6015026,1.054039,-2.057673,0.929613,-1.596184,1775.853433
393452507,148760,NT,60.392,-117.896,2024-05-05,3.0,ROUYN-NORANDA A,QC,48.1222,-78.5008,301.1,7086719,1.054039,-2.057673,0.839891,-1.370097,2839.076963


## Create new dataset with unique fire-station pair

In [26]:
# Group by the entire dataset on the basis of fire identity (fire_id) and then select the rows with
# minimum value for distance from weather station
fire_station = df.groupby(['fire_id'])['distance'].min()

In [27]:
fire_stations_df = pd.DataFrame(fire_station, columns=['distance'])
fire_stations_df['fire_id'] = fire_stations_df.index
fire_stations_df.index = range(fire_stations_df.shape[0]) # Make label column

In [28]:
fire_stations_df

Unnamed: 0,distance,fire_id
0,78.276220,100000
1,59.943096,100001
2,114.308566,100002
3,106.771430,100003
4,114.839353,100004
...,...,...
48756,91.556968,148756
48757,15.005006,148757
48758,18.156877,148758
48759,22.870821,148759


In [29]:
# Final merge
final_df = pd.merge(df, fire_stations_df, on=['fire_id', 'distance'])

In [30]:
final_df

Unnamed: 0,fire_id,agency,lat,lon,date,hectares,STATION_NAME,PROV_STATE_TERR_CODE,LATITUDE,LONGITUDE,ELEVATION,CLIMATE_IDENTIFIER,fire_lat_radians,fire_lon_radians,clim_lat_radians,clim_lon_radians,distance
0,100000,ON,52.3500,-88.6450,2015-06-30,2.0,LANSDOWNE HOUSE (AUT),ON,52.1146,-87.5610,253.4,6014353,0.913680,-1.547147,0.909571,-1.528228,78.276220
1,100001,ON,52.0929,-88.4385,2015-06-26,7.0,LANSDOWNE HOUSE A,ON,52.1145,-87.5610,254.2,6014355,0.909193,-1.543543,0.909570,-1.528228,59.943096
2,100002,ON,52.4716,-89.8217,2015-06-27,35.0,BIG TROUT LAKE READAC,ON,53.4856,-89.5344,222.2,6010739,0.915802,-1.567684,0.933500,-1.562670,114.308566
3,100003,ON,52.5713,-89.0429,2015-06-29,1.0,BIG TROUT LAKE,ON,53.4858,-89.5330,222.5,6010740,0.917542,-1.554092,0.933503,-1.562646,106.771430
4,100004,ON,52.3231,-90.3257,2016-07-21,15.2,CENTRAL PATRICIA,ON,51.3000,-90.0900,345.0,6011305,0.913210,-1.576481,0.895354,-1.572367,114.839353
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48757,148756,AK,65.0282,-141.2460,2024-05-15,0.0,CLINTON CREEK,YT,64.2800,-140.4400,576.1,2100366,1.134956,-2.465208,1.121898,-2.451140,91.556968
48758,148757,BC,55.9370,-121.9170,2024-05-12,295.0,HUDSON HOPE BCHPA DAM,BC,56.0100,-122.1200,678.2,1183FL0,0.976285,-2.127853,0.977559,-2.131396,15.005006
48759,148758,PC,60.2150,-112.8720,2024-04-26,0.1,BEAR TOWER,NT,60.3100,-113.1400,213.4,2200554,1.050950,-1.969988,1.052608,-1.974666,18.156877
48760,148759,PC,60.1950,-112.7960,2024-04-26,0.1,BEAR TOWER,NT,60.3100,-113.1400,213.4,2200554,1.050601,-1.968662,1.052608,-1.974666,22.870821


In [31]:
# Select columns
final_df = final_df[['fire_id', 'agency', 'lat', 'lon', 'STATION_NAME', 'PROV_STATE_TERR_CODE', 'LATITUDE', 'LONGITUDE', 'distance', 'ELEVATION', 'hectares']]

In [32]:
# final_df.to_csv('fires-merged-1.csv', index=False)
# final_df.to_csv('fires-merged-2.csv', index=False)
final_df.to_csv('fires-merged-3.csv', index=False)

# Merge datasets

In [5]:
# Read
fires_1 = pd.read_csv('fires-merged-1.csv')
fires_2 = pd.read_csv('fires-merged-2.csv')
fires_3 = pd.read_csv('fires-merged-3.csv')

In [6]:
fires_merged_df = pd.concat([fires_1, fires_2, fires_3])

In [7]:
fires_merged_df

Unnamed: 0,fire_id,agency,lat,lon,STATION_NAME,PROV_STATE_TERR_CODE,LATITUDE,LONGITUDE,distance,ELEVATION,hectares
0,0,BC,50.9050,-126.9292,EGG ISLAND,BC,51.1450,-127.50073,48.031939,14.0,6.00
1,1,BC,49.2971,-122.2321,PITT LAKE,BC,49.2600,-122.31000,6.992181,7.6,0.20
2,2,BC,55.8000,-124.8167,MESILINKA CAMP,BC,56.0700,-124.30000,43.984046,803.0,0.01
3,3,BC,49.8770,-121.5730,LYTTON A,BC,50.1328,-121.34550,32.742051,224.9,0.10
4,4,BC,49.0667,-121.8333,VEDDER SOUTH LICKMAN,BC,49.0600,-122.00000,12.160548,15.2,0.10
...,...,...,...,...,...,...,...,...,...,...,...
48757,148756,AK,65.0282,-141.2460,CLINTON CREEK,YT,64.2800,-140.44000,91.556968,576.1,0.00
48758,148757,BC,55.9370,-121.9170,HUDSON HOPE BCHPA DAM,BC,56.0100,-122.12000,15.005006,678.2,295.00
48759,148758,PC,60.2150,-112.8720,BEAR TOWER,NT,60.3100,-113.14000,18.156877,213.4,0.10
48760,148759,PC,60.1950,-112.7960,BEAR TOWER,NT,60.3100,-113.14000,22.870821,213.4,0.10


In [8]:
fires_merged_df.to_csv('fires-merged.csv', index=False)