# Weather Data Exploration
This notebook analyzes the retrieved weather data and identifies potential data quality issues.

## 1. Setup
Prepare the environment by importing necessary libraries and setting up input/output directories.

In [13]:
import ast
import os

import folium
import pandas as pd

In [14]:
# Input/Output directories
# RAW_INPUT_DIR is scanned for the raw parquet files; FULL_OUTPUT_DIR receives
# the parquet files written by this notebook.
RAW_INPUT_DIR = 'data/weather_data/raw'
FULL_OUTPUT_DIR = 'data/weather_data/full'
# Create the output directory up front; exist_ok=True keeps re-runs from failing.
os.makedirs(FULL_OUTPUT_DIR, exist_ok=True)

## 2. Load Raw Weather Data
Load and combine all raw weather data files into a single DataFrame for analysis.

In [15]:
# Load all raw weather data files
# os.listdir returns entries in arbitrary order, so sort the listing to make the
# row order of the combined frame reproducible across runs and machines.
data_list = sorted(os.listdir(RAW_INPUT_DIR))

# Read every parquet file first and concatenate once at the end; concatenating
# inside the loop re-copies the accumulated frame on every iteration.
raw_frames = [
    pd.read_parquet(os.path.join(RAW_INPUT_DIR, file))
    for file in data_list
    if file.endswith('.parquet')
]

# pd.concat raises on an empty list, so fall back to an empty frame when the
# directory contains no parquet files.
weather = pd.concat(raw_frames, ignore_index=True) if raw_frames else pd.DataFrame()

print(f"Total records loaded: {len(weather)}")
print(f"Columns: {weather.columns.tolist()}")

Total records loaded: 404430
Columns: ['geometry.coordinates', 'date_tm-value', 'stn_nam-value', 'avg_wnd_spd_10m_pst10mts', 'avg_wnd_spd_10m_pst1hr', 'air_temp', 'air_temp-qa', 'dwpt_temp', 'rel_hum', 'rel_hum-qa', 'rnfl_amt_pst1mt', 'rnfl_amt_pst1mt-qa', 'rnfl_amt_pst1hr', 'dwpt_temp-qa', 'avg_vis_pst10mts', 'rnfl_amt_pst1hr-qa', 'vis', 'vis-qa']


In [16]:
# Preview the first rows of the combined raw data
weather.head()

Unnamed: 0,geometry.coordinates,date_tm-value,stn_nam-value,avg_wnd_spd_10m_pst10mts,avg_wnd_spd_10m_pst1hr,air_temp,air_temp-qa,dwpt_temp,rel_hum,rel_hum-qa,rnfl_amt_pst1mt,rnfl_amt_pst1mt-qa,rnfl_amt_pst1hr,dwpt_temp-qa,avg_vis_pst10mts,rnfl_amt_pst1hr-qa,vis,vis-qa
0,"[-123.1333, 49.0039, 8.3]",2025-11-18T08:00:00.000Z,TSAWWASSEN FERRY AUTO,4.0,3.1,,,,,,,,,,,,,
1,"[-122.7839, 49.0181, 13.0]",2025-11-18T08:00:00.000Z,WHITE ROCK,1.3,1.1,6.6,100.0,5.9,95.0,100.0,0.0,100.0,0.0,,,,,
2,"[-123.193392, 49.347059, 170.885]",2025-11-18T08:00:00.000Z,WEST VANCOUVER AUT,2.7,2.9,4.9,100.0,4.3,96.0,100.0,0.0,100.0,0.0,,,,,
3,"[-122.690076, 49.208307, 6.3]",2025-11-18T08:00:00.000Z,PITT MEADOWS CS,2.3,2.6,7.2,100.0,6.7,97.0,100.0,0.0,100.0,0.0,,,,,
4,"[-122.631, 49.1008, 10.4]",2025-11-18T08:00:00.000Z,Langley Regional,2.5,,6.2,100.0,5.4,95.0,100.0,,,,100.0,,,,


In [17]:
# Summarize column dtypes and non-null counts to spot sparsely populated columns
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404430 entries, 0 to 404429
Data columns (total 18 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   geometry.coordinates      404430 non-null  object 
 1   date_tm-value             404430 non-null  object 
 2   stn_nam-value             404430 non-null  object 
 3   avg_wnd_spd_10m_pst10mts  399458 non-null  float64
 4   avg_wnd_spd_10m_pst1hr    394695 non-null  float64
 5   air_temp                  337451 non-null  float64
 6   air_temp-qa               337451 non-null  float64
 7   dwpt_temp                 337164 non-null  float64
 8   rel_hum                   337164 non-null  float64
 9   rel_hum-qa                337451 non-null  float64
 10  rnfl_amt_pst1mt           257164 non-null  float64
 11  rnfl_amt_pst1mt-qa        257164 non-null  float64
 12  rnfl_amt_pst1hr           263844 non-null  float64
 13  dwpt_temp-qa              7070 non-null    f

## 3. Check Missing Values by Station
Analyze missing values for key features grouped by station. Identify stations with incomplete or insufficient data, such as missing rainfall measurements.

In [18]:
cols_to_check = ['vis', 'rnfl_amt_pst1mt', 'rnfl_amt_pst1hr', 'air_temp', 'rel_hum', 'dwpt_temp', 'avg_wnd_spd_10m_pst10mts', 'avg_wnd_spd_10m_pst1hr']

# Count nulls per station: flag missing cells once, then sum the flags within each station
station_names = weather['stn_nam-value']
null_flags = weather[cols_to_check].isnull()
result = null_flags.groupby(station_names).sum()

# Add the row count per station so null counts can be read against the total
result['total_rows'] = weather.groupby('stn_nam-value').size()
result[['total_rows', *cols_to_check]]

Unnamed: 0_level_0,total_rows,vis,rnfl_amt_pst1mt,rnfl_amt_pst1hr,air_temp,rel_hum,dwpt_temp,avg_wnd_spd_10m_pst10mts,avg_wnd_spd_10m_pst1hr
stn_nam-value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ALEX FRASER BRIDGE,145,145,145,145,0,0,0,145,145
ALEX FRASER CROSS BEAM,143,143,143,143,0,143,143,143,143
ALEX FRASER TOP,142,142,142,142,0,0,0,142,142
ANNACIS ISLAND,144,144,144,144,0,144,144,144,144
Annacis Island,778,778,778,778,778,778,778,778,778
BRADNER ROAD,136,136,136,136,0,0,0,136,136
DELTA BURNS BOG,65442,65442,1116,0,0,0,0,0,0
EAGLE RIDGE,127,127,127,127,0,0,0,127,127
Langley Regional,1116,1116,1116,1116,0,0,0,1,1116
PITT MEADOWS CS,65444,65444,1116,0,0,0,0,0,0


## 4. Identify Stations to Exclude
Based on the analysis, identify stations to exclude. Since rainfall data is critical for this project, stations with no hourly rainfall measurements (`rnfl_amt_pst1hr` null for every record) are excluded.

In [19]:
# Identify stations where rnfl_amt_pst1hr is all null
# Flag missing hourly rainfall per row, then require every row of a station to be flagged.
rain_null_check = weather['rnfl_amt_pst1hr'].isnull().groupby(weather['stn_nam-value']).all()

# The group index already holds each station exactly once, in sorted order, so no
# de-duplication is needed. Round-tripping through set() would only scramble the
# order and make the printed and saved list differ from run to run.
exclude_stations = rain_null_check[rain_null_check].index.tolist()

print(f"Stations to exclude (no rainfall data): {exclude_stations}")

Stations to exclude (no rainfall data): ['EAGLE RIDGE', 'ALEX FRASER CROSS BEAM', 'Annacis Island', 'Vancouver International', 'Vancouver Boundary Bay', 'SANDHEADS CS', 'BRADNER ROAD', 'ALEX FRASER BRIDGE', 'Vancouver Harbour', 'Rocky Point Park', 'Pitt Meadows', 'RICEMILL ROAD', 'PORT MANN BRIDGE MID SPAN', 'TSAWWASSEN FERRY AUTO', 'PORT MANN BRIDGE JOHNSTON HILL', 'Second Narrows', 'POINT ATKINSON', 'ANNACIS ISLAND', 'ALEX FRASER TOP', 'Langley Regional']


In [20]:
# Keep only the rows whose station is not on the exclusion list
is_excluded = weather['stn_nam-value'].isin(exclude_stations)
weather_filtered = weather.loc[~is_excluded]
print(f"Records after filtering: {len(weather_filtered)}")

Records after filtering: 263844


In [21]:
# Recompute the per-station null counts on the filtered data:
# flag missing cells once, then sum the flags within each station
filtered_null_flags = weather_filtered[cols_to_check].isnull()
result = filtered_null_flags.groupby(weather_filtered['stn_nam-value']).sum()

# Add the row count per station so null counts can be read against the total
result['total_rows'] = weather_filtered.groupby('stn_nam-value').size()
result[['total_rows', *cols_to_check]]

Unnamed: 0_level_0,total_rows,vis,rnfl_amt_pst1mt,rnfl_amt_pst1hr,air_temp,rel_hum,dwpt_temp,avg_wnd_spd_10m_pst10mts,avg_wnd_spd_10m_pst1hr
stn_nam-value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
DELTA BURNS BOG,65442,65442,1116,0,0,0,0,0,0
PITT MEADOWS CS,65444,65444,1116,0,0,0,0,0,0
VANCOUVER HARBOUR CS,1104,1104,1104,0,0,0,0,1104,1104
VANCOUVER SEA ISLAND CCG,1116,1116,1116,0,0,0,0,0,0
WEST VANCOUVER AUT,65295,65295,1112,0,0,0,0,0,0
WHITE ROCK,65443,65443,1116,0,0,0,0,0,0


## 5. Time Bucket Analysis
Group the data into 10-minute intervals to ensure stable and consistent values for analysis.

In [22]:
# Work on an explicit copy so the column assignments below modify an independent
# frame and do not trigger SettingWithCopyWarning
weather_filtered = weather_filtered.copy()

# Convert to datetime with UTC timezone
weather_filtered['date_tm-value'] = pd.to_datetime(weather_filtered['date_tm-value'], utc=True)

# Convert to local timezone
weather_filtered['local_time'] = weather_filtered['date_tm-value'].dt.tz_convert('America/Vancouver')

# Floor in UTC and convert afterwards. Flooring the local timestamps directly goes
# through naive wall-clock times and raises on the ambiguous hour when DST ends.
# NOTE(review): this equals the local-time floor as long as the zone's UTC offset
# is a multiple of 10 minutes, which holds for whole-hour offsets.
weather_filtered['time_bucket'] = (
    weather_filtered['date_tm-value']
    .dt.floor('10min')
    .dt.tz_convert('America/Vancouver')
)

weather_filtered[['date_tm-value', 'local_time', 'time_bucket']].head()

Unnamed: 0,date_tm-value,local_time,time_bucket
1,2025-11-18 08:00:00+00:00,2025-11-18 00:00:00-08:00,2025-11-18 00:00:00-08:00
2,2025-11-18 08:00:00+00:00,2025-11-18 00:00:00-08:00,2025-11-18 00:00:00-08:00
3,2025-11-18 08:00:00+00:00,2025-11-18 00:00:00-08:00,2025-11-18 00:00:00-08:00
7,2025-11-18 08:00:00+00:00,2025-11-18 00:00:00-08:00,2025-11-18 00:00:00-08:00
9,2025-11-18 08:00:00+00:00,2025-11-18 00:00:00-08:00,2025-11-18 00:00:00-08:00


In [23]:
# Average each feature per station and 10-minute bucket, then count the buckets
# that still have no value for each feature
bucket_keys = ['stn_nam-value', 'time_bucket']
bucket_means = weather_filtered.groupby(bucket_keys)[cols_to_check].mean()
bucket_means.isnull().sum()

vis                         24958
rnfl_amt_pst1mt              1917
rnfl_amt_pst1hr                 0
air_temp                        0
rel_hum                         0
dwpt_temp                       0
avg_wnd_spd_10m_pst10mts      955
avg_wnd_spd_10m_pst1hr        955
dtype: int64

## 6. Extract and Save Station Coordinates
Extract the longitude and latitude of each weather station to facilitate mapping and integration with other datasets.

In [24]:
# Extract geometry info
def parse_coordinates(coord):
    """Normalize one coordinate value.

    Strings are evaluated as Python literals, objects exposing ``tolist``
    (such as numpy arrays) are converted through it, and any other value
    is returned unchanged.
    """
    if isinstance(coord, str):
        parsed = ast.literal_eval(coord)
    elif hasattr(coord, 'tolist'):
        parsed = coord.tolist()
    else:
        parsed = coord
    return parsed

# Normalize every coordinate value in place
weather_filtered['geometry.coordinates'] = weather_filtered['geometry.coordinates'].apply(parse_coordinates)

# Take the first coordinate entry recorded for each station
station_coords = weather_filtered.groupby('stn_nam-value')['geometry.coordinates'].first()

# Expand the coordinate lists into columns and keep only the first two as lon/lat
coords_df = (
    pd.DataFrame(station_coords.tolist(), index=station_coords.index)
    .iloc[:, :2]
    .set_axis(['lon', 'lat'], axis=1)
    .rename_axis('station_name')
    .reset_index()
)
coords_df

Unnamed: 0,station_name,lon,lat
0,DELTA BURNS BOG,-123.002436,49.125992
1,PITT MEADOWS CS,-122.690076,49.208307
2,VANCOUVER HARBOUR CS,-123.121687,49.295348
3,VANCOUVER SEA ISLAND CCG,-123.187254,49.182552
4,WEST VANCOUVER AUT,-123.193392,49.347059
5,WHITE ROCK,-122.7839,49.0181


In [25]:
# Create a map centered on the mean position of the retained stations
# (folium is imported in the notebook's import cell at the top)
center_lat = coords_df['lat'].mean()
center_lon = coords_df['lon'].mean()
m = folium.Map(location=[center_lat, center_lon], zoom_start=10)

# Add one marker per station; itertuples avoids building a Series for every row
for station in coords_df.itertuples(index=False):
    folium.Marker(
        location=[station.lat, station.lon],
        popup=station.station_name,
        tooltip=station.station_name,
        icon=folium.Icon(color='blue', icon='cloud')
    ).add_to(m)

m

In [26]:
# Write the station coordinate table to the output directory; the path is built
# once so the file written and the message printed cannot drift apart
coords_path = f'{FULL_OUTPUT_DIR}/station_coordinates_mst.parquet'
coords_df.to_parquet(coords_path, index=False)
print(f"Station coordinates saved to {coords_path}")

Station coordinates saved to data/weather_data/full/station_coordinates_mst.parquet


## 7. Save Excluded Stations List
Save the list of excluded stations for reference and use in subsequent processing steps.

In [27]:
# Save excluded stations list for use in processing notebook
excluded_path = f'{FULL_OUTPUT_DIR}/excluded_stations.parquet'
exclude_df = pd.DataFrame({'station_name': exclude_stations})
exclude_df.to_parquet(excluded_path, index=False)
print(f"Excluded stations saved to {excluded_path}")

Excluded stations saved to data/weather_data/full/excluded_stations.parquet
