# Weather Data Processing
This notebook processes weather data: feature engineering and region-station matching.

## 1. Setup

In [1]:
import os
import sys
import numpy as np
import pandas as pd

sys.path.append(os.path.dirname(os.path.abspath('.')))
from src.data_connection import DatabaseConnector

db_connector = DatabaseConnector()



In [2]:
# Input/Output directories
RAW_INPUT_DIR = 'data/weather_data/raw'
AGG_OUTPUT_DIR = 'data/weather_data/agg'
FULL_OUTPUT_DIR = 'data/weather_data/full'

os.makedirs(AGG_OUTPUT_DIR, exist_ok=True)
os.makedirs(FULL_OUTPUT_DIR, exist_ok=True)

In [3]:
# Load excluded stations list from exploration notebook
exclude_df = pd.read_parquet(f'{FULL_OUTPUT_DIR}/excluded_stations.parquet')
exclude_stations = exclude_df['station_name'].tolist()
print(f"Stations to exclude: {exclude_stations}")

Stations to exclude: ['EAGLE RIDGE', 'ALEX FRASER CROSS BEAM', 'Annacis Island', 'Vancouver International', 'Vancouver Boundary Bay', 'SANDHEADS CS', 'BRADNER ROAD', 'ALEX FRASER BRIDGE', 'Vancouver Harbour', 'Rocky Point Park', 'Pitt Meadows', 'RICEMILL ROAD', 'PORT MANN BRIDGE MID SPAN', 'TSAWWASSEN FERRY AUTO', 'PORT MANN BRIDGE JOHNSTON HILL', 'Second Narrows', 'POINT ATKINSON', 'ANNACIS ISLAND', 'ALEX FRASER TOP', 'Langley Regional']


## 2. Weather Feature Engineering

In [4]:
data_list = os.listdir(RAW_INPUT_DIR)
weather_all = pd.DataFrame()

for file in data_list:
    if file.endswith('.parquet'):
        file_path = os.path.join(RAW_INPUT_DIR, file)
        weather = pd.read_parquet(file_path)
        weather = weather.copy()
        
        # Filter out excluded stations
        weather = weather[~weather['stn_nam-value'].isin(exclude_stations)]

        # Create local_time and time_bucket
        weather['date_tm-value'] = pd.to_datetime(weather['date_tm-value'])
        weather['local_time'] = weather['date_tm-value'].dt.tz_convert('America/Vancouver')
        weather['time_bucket'] = weather['local_time'].dt.floor('10min')

        # Aggregation by station and time bucket
        agg_wind = weather.groupby(['stn_nam-value', 'time_bucket'])['avg_wnd_spd_10m_pst10mts'].mean().round(2)
        agg_temp = weather.groupby(['stn_nam-value', 'time_bucket'])['air_temp'].mean().round(2)
        agg_dew_point = weather.groupby(['stn_nam-value', 'time_bucket'])['dwpt_temp'].mean().round(2)
        agg_hum = weather.groupby(['stn_nam-value', 'time_bucket'])['rel_hum'].mean().round(2)
        agg_rain = weather.groupby(['stn_nam-value', 'time_bucket'])['rnfl_amt_pst1hr'].mean().round(2)

        # Merge aggregations
        weather_agg = pd.concat([agg_wind, agg_temp, agg_dew_point, agg_hum, agg_rain], axis=1).reset_index()
        weather_agg.columns = ['station_name', 'time_bucket', 'wind_speed', 'temperature', 'dew_point', 'relative_humidity', 'rainfall_amount']

        # Calculate humidex
        weather_agg['humidex'] = round(weather_agg['temperature'] + (0.5555 * (6.11 * np.exp(5417.7530 * (1/273.15 - 1/(weather_agg['dew_point'] + 273.15))) - 10)), 2)
        
        # Generate output filename
        output_filename = file.replace('weather_vancouver_', 'aggregated_weather_vancouver_')
        output_file = f"{AGG_OUTPUT_DIR}/{output_filename}"
        weather_agg.to_parquet(output_file, index=False)
        weather_all = pd.concat([weather_all, weather_agg], ignore_index=True)
        print(f"Processed and saved: {output_file}")

Processed and saved: data/weather_data/agg/aggregated_weather_vancouver_20251118_20251125.parquet
Processed and saved: data/weather_data/agg/aggregated_weather_vancouver_20251125_20251202.parquet
Processed and saved: data/weather_data/agg/aggregated_weather_vancouver_20251202_20251209.parquet
Processed and saved: data/weather_data/agg/aggregated_weather_vancouver_20251209_20251216.parquet
Processed and saved: data/weather_data/agg/aggregated_weather_vancouver_20251217_20251224.parquet
Processed and saved: data/weather_data/agg/aggregated_weather_vancouver_20251223_20251229.parquet
Processed and saved: data/weather_data/agg/aggregated_weather_vancouver_20251224_20251228.parquet


In [5]:
# Check for duplicates before deduplication
duplicates_before = weather_all.duplicated(subset=['station_name', 'time_bucket']).sum()
print(f"Total records before deduplication: {len(weather_all)}")
print(f"Duplicate records found: {duplicates_before}")

# Remove duplicates - keep the last occurrence (most recent data)
weather_all = weather_all.drop_duplicates(subset=['station_name', 'time_bucket'], keep='last')
print(f"Total records after deduplication: {len(weather_all)}")

weather_all.head()

Total records before deduplication: 27874
Duplicate records found: 2916
Total records after deduplication: 24958


Unnamed: 0,station_name,time_bucket,wind_speed,temperature,dew_point,relative_humidity,rainfall_amount,humidex
0,DELTA BURNS BOG,2025-11-18 00:00:00-08:00,0.01,5.96,5.95,100.0,0.0,5.59
1,DELTA BURNS BOG,2025-11-18 00:10:00-08:00,0.1,6.06,6.05,100.0,0.0,5.72
2,DELTA BURNS BOG,2025-11-18 00:20:00-08:00,0.07,6.04,6.03,100.0,0.0,5.69
3,DELTA BURNS BOG,2025-11-18 00:30:00-08:00,0.0,6.03,6.03,100.0,0.0,5.68
4,DELTA BURNS BOG,2025-11-18 00:40:00-08:00,0.0,6.04,6.03,100.0,0.0,5.69


In [6]:
weather_all.isnull().sum()

station_name           0
time_bucket            0
wind_speed           955
temperature            0
dew_point              0
relative_humidity      0
rainfall_amount        0
humidex                0
dtype: int64

In [7]:
weather_all.to_parquet(f'{FULL_OUTPUT_DIR}/weather_data_aggregated_all_stations.parquet', index=False)
print(f"All aggregated data saved to {FULL_OUTPUT_DIR}/weather_data_aggregated_all_stations.parquet")

All aggregated data saved to data/weather_data/full/weather_data_aggregated_all_stations.parquet


## 3. Match Regions to Nearest Weather Stations
Calculate the distance between each region's center and all weather stations to find the nearest one.

In [8]:
region = db_connector.read_sql("""
select region_id, center_lon, center_lat from 
gtfs_static.regions;
""")
print(f"Regions loaded: {len(region)}")
region.head()

Regions loaded: 21


Unnamed: 0,region_id,center_lon,center_lat
0,bowen_island_municipality,-123.370399,49.374255
1,burnaby,-122.964999,49.247917
2,coquitlam,-122.771894,49.29712
3,delta,-123.073195,49.077525
4,maple_ridge,-122.515789,49.261984


In [9]:
# Load station coordinates
station_coords = pd.read_parquet(f'{FULL_OUTPUT_DIR}/station_coordinates_mst.parquet')
station_coords = station_coords.set_index('station_name')
station_coords

Unnamed: 0_level_0,lon,lat
station_name,Unnamed: 1_level_1,Unnamed: 2_level_1
DELTA BURNS BOG,-123.002436,49.125992
PITT MEADOWS CS,-122.690076,49.208307
VANCOUVER HARBOUR CS,-123.121687,49.295348
VANCOUVER SEA ISLAND CCG,-123.187254,49.182552
WEST VANCOUVER AUT,-123.193392,49.347059
WHITE ROCK,-122.7839,49.0181


In [10]:
def get_nearest_station(row, stations_df):
    """Calculate Euclidean distance and return nearest station name."""
    distances = np.sqrt(
        (stations_df['lon'] - row['center_lon'])**2 + 
        (stations_df['lat'] - row['center_lat'])**2
    )
    nearest_idx = distances.idxmin()
    return nearest_idx

# Find nearest station for each region
region['nearest_station'] = region.apply(lambda row: get_nearest_station(row, station_coords), axis=1)

print(f"Regions with nearest stations assigned: {len(region)}")
region

Regions with nearest stations assigned: 21


Unnamed: 0,region_id,center_lon,center_lat,nearest_station
0,bowen_island_municipality,-123.370399,49.374255,WEST VANCOUVER AUT
1,burnaby,-122.964999,49.247917,DELTA BURNS BOG
2,coquitlam,-122.771894,49.29712,PITT MEADOWS CS
3,delta,-123.073195,49.077525,DELTA BURNS BOG
4,maple_ridge,-122.515789,49.261984,PITT MEADOWS CS
5,new_westminster,-122.918903,49.20824,DELTA BURNS BOG
6,pitt_meadows,-122.663145,49.272184,PITT MEADOWS CS
7,port_coquitlam,-122.762487,49.253257,PITT MEADOWS CS
8,richmond,-123.174548,49.169626,VANCOUVER SEA ISLAND CCG
9,port_moody,-122.873204,49.297214,PITT MEADOWS CS


In [11]:
region.to_parquet(f'{FULL_OUTPUT_DIR}/regions_with_nearest_stations.parquet', index=False)
print(f"Region-station mapping saved to {FULL_OUTPUT_DIR}/regions_with_nearest_stations.parquet")

Region-station mapping saved to data/weather_data/full/regions_with_nearest_stations.parquet


In [12]:
# Check station distribution across regions
region['nearest_station'].value_counts()

nearest_station
PITT MEADOWS CS             7
DELTA BURNS BOG             4
VANCOUVER HARBOUR CS        4
WEST VANCOUVER AUT          3
WHITE ROCK                  2
VANCOUVER SEA ISLAND CCG    1
Name: count, dtype: int64

In [13]:
# Check aggregated weather data station distribution
weather_all['station_name'].value_counts()

station_name
DELTA BURNS BOG             5762
PITT MEADOWS CS             5762
WHITE ROCK                  5762
WEST VANCOUVER AUT          5755
VANCOUVER SEA ISLAND CCG     962
VANCOUVER HARBOUR CS         955
Name: count, dtype: int64