# Weather data
This notebook merges the [Canadian Wildfire Dataset](https://www.kaggle.com/datasets/ulasozdemir/wildfires-in-canada-19502021) from Kaggle and the [Canadian Wildland Fire Information System](https://cwfis.cfs.nrcan.gc.ca/downloads/activefires/) with [weather data](https://api.weather.gc.ca/collections/climate-daily?lang=en) from Environment and Climate Change Canada (ECCC) and the Meteorological Service of Canada (MSC), using the [MSC GeoMet API](https://eccc-msc.github.io/open-data/msc-geomet/readme_en/). To do this, we take the previously merged dataset (each fire paired with its nearest station) and associate each wildfire point with the matching weather record in the [daily climate observation collection](https://api.weather.gc.ca/collections/climate-daily?lang=en).

In [1]:
import time

import pandas as pd
from owslib.ogcapi.features import Features

In [10]:
# Client for the OGC API - Features endpoint at api.weather.gc.ca;
# every weather query in this notebook goes through it.
geomet_url = 'https://api.weather.gc.ca/'
features = Features(geomet_url)

In [3]:
# The merged fire + nearest-station data comes in three files
# (fires-merged-1.csv, fires-merged-2.csv, fires-merged-3.csv).
# Select the part to process here instead of commenting lines in and out
# between runs.
fires_part = 1  # 1, 2 or 3
wildfire_df = pd.read_csv(f'fires-merged-{fires_part}.csv')

In [4]:
# The API calls crash on long runs, so each input file is cut into two
# smaller pieces (first 25000 rows, then the remainder) that are queried
# separately.
split_at = 25000
small_df1 = wildfire_df.iloc[:split_at]
small_df2 = wildfire_df.iloc[split_at:]

## Querying MSC GeoMet API

In [21]:
# Weather property names used to build an all-None placeholder row when the
# API returns no observation for a fire. Order is kept as originally listed.
columns = [
    'TOTAL_RAIN_FLAG', 'MEAN_TEMPERATURE_FLAG', 'MIN_REL_HUMIDITY',
    'HEATING_DEGREE_DAYS_FLAG', 'ID', 'MAX_TEMPERATURE',
    'MAX_TEMPERATURE_FLAG', 'MIN_TEMPERATURE', 'TOTAL_RAIN',
    'DIRECTION_MAX_GUST_FLAG', 'LOCAL_DAY', 'LOCAL_DATE',
    'SNOW_ON_GROUND', 'PROVINCE_CODE', 'DIRECTION_MAX_GUST',
    'COOLING_DEGREE_DAYS', 'SPEED_MAX_GUST_FLAG', 'MAX_REL_HUMIDITY_FLAG',
    'MIN_TEMPERATURE_FLAG', 'TOTAL_PRECIPITATION_FLAG', 'STATION_NAME',
    'LOCAL_YEAR', 'LOCAL_MONTH', 'MIN_REL_HUMIDITY_FLAG',
    'TOTAL_SNOW_FLAG', 'MEAN_TEMPERATURE', 'CLIMATE_IDENTIFIER',
    'SNOW_ON_GROUND_FLAG', 'TOTAL_SNOW', 'COOLING_DEGREE_DAYS_FLAG',
    'SPEED_MAX_GUST', 'HEATING_DEGREE_DAYS', 'MAX_REL_HUMIDITY',
    'TOTAL_PRECIPITATION',
]

In [6]:
rows = []
delay = 5  # seconds to wait before retrying a failed request
max_attempts = 5  # give up on a query after this many consecutive failures

# Placeholder weather record for fires with no matching observation: every
# weather column is None. STATION_NAME is left out so the fire's own
# STATION_NAME is not overwritten with None when the two dicts are merged.
empty_weather = {column: None for column in columns if column != 'STATION_NAME'}

# Loop over every wildfire row in this chunk
for _, row in small_df2.iterrows():
    # Query the daily climate collection by date and station name.
    # Some stations in stations.csv share a name; per the original note this
    # only happens when they are at the same location, so matching on the
    # name is enough.
    for attempt in range(1, max_attempts + 1):
        try:
            climate_data = features.collection_items(
                'climate-daily', LOCAL_DATE=row['date'], STATION_NAME=row['STATION_NAME']
            )
            break
        except Exception as e:
            # A single failed request should not abort the whole run:
            # wait and retry, but re-raise once the attempts are used up.
            if attempt == max_attempts:
                raise
            print(e)
            time.sleep(delay)

    # Use the first returned observation if there is one, otherwise the
    # all-None placeholder, and merge it onto the original fire data.
    matches = climate_data['features']
    properties = matches[0]['properties'] if matches else empty_weather
    rows.append({**row.to_dict(), **properties})

In [7]:
# Turn the collected row dicts into one table and write this chunk to disk
# (index omitted). NOTE(review): the "32" suffix presumably means input
# file 3, piece 2 — confirm before re-running for another chunk.
chunk_output = 'fires-weather-32.csv'
df = pd.DataFrame(data=rows)
df.to_csv(chunk_output, index=False)

# Merge smaller datasets

In [1]:
import pandas as pd

In [3]:
# Load the six per-chunk result files written by the query step
chunk_paths = [
    'fires-weather-11.csv',
    'fires-weather-12.csv',
    'fires-weather-21.csv',
    'fires-weather-22.csv',
    'fires-weather-31.csv',
    'fires-weather-32.csv',
]
fires_11, fires_12, fires_21, fires_22, fires_31, fires_32 = map(pd.read_csv, chunk_paths)

In [4]:
# Stack the per-chunk tables row-wise, in the order listed
chunk_frames = [fires_11, fires_12, fires_21, fires_22, fires_31, fires_32]
fires_weather_concat = pd.concat(chunk_frames)

In [5]:
# Write the combined fire + weather table to disk without the index column
combined_output = 'fires-weather.csv'
fires_weather_concat.to_csv(combined_output, index=False)

# Get non-wildfire weather data
After the wildfire and weather data were merged, duplicates and wildfires without an `ID` (i.e. with no corresponding weather data) were removed.
To obtain data points with no wildfire, we query the same locations again, but for the date 50 days before each fire.

In [16]:
# Load the fire + weather table that the no-fire samples are derived from
fire_weather_path = '2000-2021+2023-2024-fire-weather.csv'
wildfire_climate_df = pd.read_csv(fire_weather_path)

In [17]:
# Keep only the fire and station columns; the weather columns are dropped
# because they will be queried again for the shifted dates.
fire_station_columns = [
    'fire_id', 'lat', 'lon', 'date', 'hectares',
    'STATION_NAME', 'PROV_STATE_TERR_CODE',
    'LATITUDE', 'LONGITUDE', 'ELEVATION', 'distance',
]
wildfire_climate_df = wildfire_climate_df[fire_station_columns]

In [18]:
# Parse the date column and move every date back by 50 days, so each row
# points at the same station on a day 50 days before the fire.
# NOTE: this cell is not idempotent — running it again shifts the dates by
# another 50 days.
no_fire_offset = pd.Timedelta(days=50)
fire_dates = pd.to_datetime(wildfire_climate_df['date'])
wildfire_climate_df['date'] = fire_dates - no_fire_offset

In [19]:
# Preview the fire/station table with the shifted dates
wildfire_climate_df

Unnamed: 0,fire_id,lat,lon,date,hectares,STATION_NAME,PROV_STATE_TERR_CODE,LATITUDE,LONGITUDE,ELEVATION,distance
0,0,50.9050,-126.9292,2000-01-20,6.00,EGG ISLAND,BC,51.145000,-127.500730,14.0,48.031939
1,11,59.9617,-121.3608,2000-05-08,1000.00,SAMBAA K'E,NT,60.262700,-121.142000,498.0,35.575309
2,12,59.9752,-121.0342,2000-05-22,12.00,SAMBAA K'E,NT,60.262700,-121.142000,498.0,32.501109
3,13,59.1767,-122.0190,2001-04-24,5.00,FORT NELSON UA,BC,58.502900,-122.342207,378.3,77.147719
4,24,59.4008,-120.6438,2000-05-07,0.10,PETITOT LO,AB,59.320000,-119.370000,777.2,72.695087
...,...,...,...,...,...,...,...,...,...,...,...
35953,148730,55.8120,-108.9650,2024-05-23,0.10,BUFFALO NARROWS A,SK,55.520000,-108.290000,421.2,53.317052
35954,148741,42.4775,-122.6030,2024-05-18,1652.81,RACE ROCKS,BC,48.175274,-123.315319,7.4,635.598709
35955,148747,66.8992,-141.3160,2024-05-10,2512.57,OLD CROW A,YT,67.341200,-139.502400,250.2,92.473700
35956,148750,66.1294,-141.0790,2024-04-30,21266.30,OLD CROW A,YT,67.341200,-139.502400,250.2,151.390408


In [22]:
rows = []
delay = 5  # seconds to wait before retrying a failed request
max_attempts = 60  # roughly 5 minutes of retries per query before giving up

# Placeholder weather record for rows with no matching observation: every
# weather column is None. STATION_NAME is left out so the row's own
# STATION_NAME is not overwritten with None when the two dicts are merged.
empty_weather = {column: None for column in columns if column != 'STATION_NAME'}

# Loop over every (shifted-date) row
for _, row in wildfire_climate_df.iterrows():
    # Retry on errors, waiting `delay` seconds between attempts. The number
    # of attempts is bounded so that a persistent or deterministic error
    # surfaces instead of looping forever.
    for attempt in range(1, max_attempts + 1):
        try:
            climate_data = features.collection_items(
                'climate-daily', LOCAL_DATE=row['date'], STATION_NAME=row['STATION_NAME']
            )
            break
        except Exception as e:
            if attempt == max_attempts:
                raise
            print(e)
            time.sleep(delay)

    # Use the first returned observation if there is one, otherwise the
    # all-None placeholder, and merge it onto the original row data.
    matches = climate_data['features']
    properties = matches[0]['properties'] if matches else empty_weather
    rows.append({**row.to_dict(), **properties})

In [23]:
# Build the no-fire table from the collected row dicts and save the raw
# result (index omitted) before any cleaning is applied.
no_fire_output = 'no-fires-weather.csv'
df = pd.DataFrame(data=rows)
df.to_csv(no_fire_output, index=False)

In [5]:
# Rows with a null ID are those where weather data existed for the fire
# date but nothing came back for the date 50 days earlier.
missing = df['ID'].isna().sum()
missing_percent = missing / len(df) * 100
for label, value in (('Missing amount: ', missing), ('Missing percentage: ', missing_percent)):
    print(label + str(value))

Missing amount: 3626
Missing percentage: 10.083986873574725


In [6]:
# Drop, in place, every row whose ID is null (no weather record was found)
df.dropna(subset=['ID'], inplace=True)

In [8]:
# Preview the no-fire table after rows without weather data were removed
df

Unnamed: 0,fire_id,lat,lon,date,hectares,STATION_NAME,PROV_STATE_TERR_CODE,LATITUDE,LONGITUDE,ELEVATION,...,HEATING_DEGREE_DAYS_FLAG,TOTAL_SNOW_FLAG,LOCAL_DATE,MIN_TEMPERATURE,PROVINCE_CODE,SPEED_MAX_GUST_FLAG,SPEED_MAX_GUST,MAX_TEMPERATURE,MEAN_TEMPERATURE_FLAG,SNOW_ON_GROUND_FLAG
0,0,50.9050,-126.9292,2000-01-20,6.00,EGG ISLAND,BC,51.145000,-127.500730,14.0,...,,,2000-01-20 00:00:00,-1.3,BC,,,3.3,,
1,11,59.9617,-121.3608,2000-05-08,1000.00,SAMBAA K'E,NT,60.262700,-121.142000,498.0,...,,,2000-05-08 00:00:00,-2.2,NT,,,14.5,,
2,12,59.9752,-121.0342,2000-05-22,12.00,SAMBAA K'E,NT,60.262700,-121.142000,498.0,...,,,2000-05-22 00:00:00,-0.9,NT,,,15.1,,
3,13,59.1767,-122.0190,2001-04-24,5.00,FORT NELSON UA,BC,58.502900,-122.342207,378.3,...,,,2001-04-24 00:00:00,0.4,BC,,,18.5,,
4,24,59.4008,-120.6438,2000-05-07,0.10,PETITOT LO,AB,59.320000,-119.370000,777.2,...,M,M,2000-05-07 00:00:00,,AB,,,,M,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35953,148730,55.8120,-108.9650,2024-05-23,0.10,BUFFALO NARROWS A,SK,55.520000,-108.290000,421.2,...,,,2024-05-23 00:00:00,4.6,SK,,32.0,11.4,,
35954,148741,42.4775,-122.6030,2024-05-18,1652.81,RACE ROCKS,BC,48.175274,-123.315319,7.4,...,,,2024-05-18 00:00:00,8.1,BC,,64.0,10.6,,
35955,148747,66.8992,-141.3160,2024-05-10,2512.57,OLD CROW A,YT,67.341200,-139.502400,250.2,...,,,2024-05-10 00:00:00,0.7,YT,,33.0,8.1,,
35956,148750,66.1294,-141.0790,2024-04-30,21266.30,OLD CROW A,YT,67.341200,-139.502400,250.2,...,,,2024-04-30 00:00:00,-14.1,YT,,53.0,-8.0,,


Since these rows represent days without a wildfire, we set `hectares` to 0.

In [10]:
# These rows describe days without a fire, so the burned area is zero
df = df.assign(hectares=0)

In [11]:
# Overwrite the earlier raw export with the cleaned no-fire table (index omitted)
no_fire_output = 'no-fires-weather.csv'
df.to_csv(no_fire_output, index=False)