# Retrieve Data

In [None]:
import pandas as pd
import requests
import time
import os

# Download weather observations one local day at a time and store them as
# CSV chunks of up to seven days each inside `output_dir`.

# Create the directory that receives the CSV chunks.
output_dir = 'weather_data'
os.makedirs(output_dir, exist_ok=True)

# Collection period, expressed in Vancouver local time.
start_date_local = '2025-12-17'
end_date_local = '2025-12-24'

# Day boundaries at local midnight; each consecutive pair is one request window.
dates = pd.date_range(start=start_date_local, end=end_date_local, freq='D', tz='America/Vancouver')

weekly_data = []
week_start_date = dates[0]
# Start from an empty frame so a `weather` left over from an earlier run is
# never mistaken for freshly collected data.
weather = pd.DataFrame()

base_url = "https://api.weather.gc.ca/collections/swob-realtime/items"
bbox = "-123.35,49.00,-122.40,49.40"
properties = "date_tm-value,stn_nam-value,air_temp,air_temp-qa,dwpt_temp,dwpt_temp-qa,rel_hum,rel_hum-qa,avg_wnd_spd_10m_pst10mts,avg_wnd_spd_10m_pst1hr,rnfl_amt_pst1mt,rnfl_amt_pst1mt-qa,rnfl_amt_pst1hr,rnfl_amt_pst1hr-qa,vis,vis-qa,avg_vis_pst10mts"
request_limit = 10000
# Seconds to wait for the server; without a timeout a stalled connection
# would block the whole collection run indefinitely.
request_timeout = 60

print(f"Fetching data from {dates[0]} to {dates[-1]} (Vancouver Time)...")

last_index = len(dates) - 2
for i, (start_dt, end_dt) in enumerate(zip(dates[:-1], dates[1:])):
    # The request window is sent to the API in UTC.
    start_utc = start_dt.tz_convert('UTC').strftime('%Y-%m-%dT%H:%M:%SZ')
    end_utc = end_dt.tz_convert('UTC').strftime('%Y-%m-%dT%H:%M:%SZ')

    print(f"Processing: {start_utc} -> {end_utc}")

    params = {
        "f": "json",
        "bbox": bbox,
        "datetime": f"{start_utc}/{end_utc}",
        "properties": properties,
        "sortby": "date_tm-value",
        "limit": str(request_limit),
    }

    try:
        response = requests.get(base_url, params=params, timeout=request_timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # Best effort: report the failed day and carry on with the next one.
        print(f"  Error: {e}")
    else:
        features = data.get('features') if isinstance(data, dict) else None
        if features:
            df = pd.json_normalize(features)
            # Strip the 'properties.' prefix added by json_normalize.
            df.columns = [c.replace('properties.', '') for c in df.columns]
            weekly_data.append(df)
            print(f"  Fetched {len(df)} records")
            if len(df) >= request_limit:
                # The response filled the whole page, so records beyond the
                # limit may be missing for this day.
                print(f"  Warning: reached the limit of {request_limit} records; data may be truncated")
        else:
            print("  No data found")

    time.sleep(3)  # be gentle with the API

    # Save after every seventh day and after the final day.
    if (i + 1) % 7 == 0 or i == last_index:
        if weekly_data:
            weather_chunk = pd.concat(weekly_data, ignore_index=True)

            # Drop columns that are not needed downstream (if present).
            cols_to_drop = ['id', 'type', 'geometry.type']
            weather_chunk = weather_chunk.drop(columns=cols_to_drop, errors='ignore')

            # File name: <chunk start date>_<end boundary of the last window>.
            filename = f"{output_dir}/weather_vancouver_{week_start_date.strftime('%Y%m%d')}_{end_dt.strftime('%Y%m%d')}.csv"
            weather_chunk.to_csv(filename, index=False)

            print(f"Saved {len(weather_chunk)} records to {filename}")

            # Reset the buffer for the next chunk.
            weekly_data = []
            # Keep the most recent chunk in `weather` for the following cells.
            weather = weather_chunk

        # The next chunk starts where this one ended.
        if i < last_index:
            week_start_date = dates[i + 1]

print("Data collection complete.")
if not weather.empty:
    display(weather.head())
else:
    print("No data collected.")

# Exploring Data

In [None]:
import os
import pandas as pd
output_dir = 'weather_data'

# Load every CSV chunk in `output_dir` into a single frame.
# os.listdir returns names in arbitrary order, so sort them to make the row
# order of the combined frame reproducible.
data_list = sorted(os.listdir(output_dir))
csv_frames = [
    pd.read_csv(os.path.join(output_dir, file))
    for file in data_list
    if file.endswith('.csv')
]
# Concatenate once instead of growing the frame inside the loop; fall back to
# an empty frame when the directory holds no CSV files.
weather = pd.concat(csv_frames, ignore_index=True) if csv_frames else pd.DataFrame()

In [None]:
# Measurement columns whose completeness is inspected per station.
cols_to_check = [
    'vis',
    'rnfl_amt_pst1mt',
    'rnfl_amt_pst1hr',
    'air_temp',
    'rel_hum',
    'dwpt_temp',
    'avg_wnd_spd_10m_pst10mts',
    'avg_wnd_spd_10m_pst1hr',
]

# Flag missing cells first, then add the flags up per station name.
missing_flags = weather[cols_to_check].isnull()
result = missing_flags.groupby(weather['stn_nam-value']).sum()
# Row count per station, for comparison with the missing counts.
result['total_rows'] = weather.groupby('stn_nam-value').size()
result[['total_rows'] + cols_to_check]

In [None]:
# Identify stations whose rnfl_amt_pst1hr value is missing in every row.
rain_null_check = weather.groupby('stn_nam-value')['rnfl_amt_pst1hr'].apply(lambda x: x.isnull().all())
# The groupby index holds each station once, so no de-duplication is needed.
exclude_stations = rain_null_check[rain_null_check].index.tolist()

# Remove those stations. .copy() turns the filtered result into an independent
# frame, so assigning new columns to it afterwards does not raise
# SettingWithCopyWarning.
weather = weather[~weather['stn_nam-value'].isin(exclude_stations)].copy()

In [None]:
# Recompute the per-station missing-value counts for the current `weather` frame.
missing_flags = weather[cols_to_check].isnull()
result = missing_flags.groupby(weather['stn_nam-value']).sum()
# Row count per station, for comparison with the missing counts.
result['total_rows'] = weather.groupby('stn_nam-value').size()
result[['total_rows'] + cols_to_check]

In [None]:
# Parse the observation timestamps as timezone-aware UTC values. utc=True
# guarantees an aware result, so tz_convert below also works when the strings
# carry no explicit offset.
# NOTE(review): assumes the source timestamps are in UTC — confirm.
weather['date_tm-value'] = pd.to_datetime(weather['date_tm-value'], utc=True)
# Same instant expressed in Vancouver local time.
weather['local_time'] = weather['date_tm-value'].dt.tz_convert('America/Vancouver')
# Start of the 10-minute interval the local time falls into.
weather['time_bucket'] = weather['local_time'].dt.floor('10min')

In [None]:
# Average each measurement per station and 10-minute bucket, then count the
# buckets that still have no value for each column.
bucket_means = weather.groupby(['stn_nam-value', 'time_bucket'])[cols_to_check].mean()
bucket_means.isnull().sum()


In [None]:
import ast
# Build a station -> (lon, lat) lookup table from the geometry column.
coord_col = 'geometry.coordinates'
first_coords = weather[coord_col].iloc[0]
if isinstance(first_coords, str):
    # Coordinates read back from CSV are stored as text such as "[lon, lat, ...]";
    # turn them into Python lists.
    weather[coord_col] = weather[coord_col].apply(ast.literal_eval)

# One coordinate entry per station: the first one encountered.
station_coords = weather.groupby('stn_nam-value')[coord_col].first()
coords_table = pd.DataFrame(station_coords.tolist(), index=station_coords.index)
# Keep only the first two components of each coordinate entry.
coords_df = coords_table.iloc[:, :2]
coords_df.columns = ['lon', 'lat']
coords_df.index.name = 'station_name'

In [None]:
# Label the index of the coordinate table as 'station_name'.
coords_df.index = coords_df.index.rename('station_name')

In [None]:
import os

# DataFrame.to_csv does not create missing parent directories, and nothing
# else in this notebook creates this one, so make sure it exists first.
os.makedirs('weather_data_full', exist_ok=True)
# Write the station coordinate table (the index is written as the first column).
coords_df.to_csv('weather_data_full/station_coordinates_mst.csv')

# Feature Engineering

In [None]:
import numpy as np
import os
import pandas as pd

# Aggregate every raw CSV chunk to 10-minute means per station, add a humidex
# column, write one aggregated CSV per input file and collect all results in
# `weather_all`.
output_dir = 'weather_data'
agg_dir = f"{output_dir}_agg"
# DataFrame.to_csv does not create missing directories, so create the output
# directory up front.
os.makedirs(agg_dir, exist_ok=True)

# Raw column name -> column name used in the aggregated output.
feature_columns = {
    'avg_wnd_spd_10m_pst10mts': 'wind_speed',
    'air_temp': 'temperature',
    'dwpt_temp': 'dew_point',
    'rel_hum': 'relative_humidity',
    'rnfl_amt_pst1hr': 'rainfall_amount',
}

# Sorted so files are processed (and rows appended) in a reproducible order.
data_list = sorted(os.listdir(output_dir))
aggregated_frames = []
for file in data_list:
    if not file.endswith('.csv'):
        continue

    file_path = os.path.join(output_dir, file)
    weather = pd.read_csv(file_path)

    # Drop the excluded stations; .copy() gives an independent frame so the
    # column assignments below do not raise SettingWithCopyWarning.
    weather = weather[~weather['stn_nam-value'].isin(exclude_stations)].copy()

    # Timezone-aware timestamps, Vancouver local time and its 10-minute bucket.
    # NOTE(review): assumes the source timestamps are in UTC — confirm.
    weather['date_tm-value'] = pd.to_datetime(weather['date_tm-value'], utc=True)
    weather['local_time'] = weather['date_tm-value'].dt.tz_convert('America/Vancouver')
    weather['time_bucket'] = weather['local_time'].dt.floor('10min')

    # Mean of every feature per station and bucket, in a single groupby pass.
    weather_agg = (
        weather.groupby(['stn_nam-value', 'time_bucket'])[list(feature_columns)]
        .mean()
        .round(2)
        .reset_index()
        .rename(columns={'stn_nam-value': 'station_name', **feature_columns})
    )

    # Humidex computed from the aggregated temperature and dew point.
    exp_term = 6.11 * np.exp(5417.7530 * (1 / 273.15 - 1 / (weather_agg['dew_point'] + 273.15)))
    weather_agg['humidex'] = (weather_agg['temperature'] + 0.5555 * (exp_term - 10)).round(2)

    output_file = f"{agg_dir}/aggregated_{file}"
    weather_agg.to_csv(output_file, index=False)
    aggregated_frames.append(weather_agg)
    print(f"Processed and saved: {output_file}")

# Concatenate once at the end; an empty frame is used when nothing was processed.
weather_all = pd.concat(aggregated_frames, ignore_index=True) if aggregated_frames else pd.DataFrame()

In [None]:
# Preview the first five aggregated rows.
weather_all.head(5)

In [None]:
# Number of missing values in each aggregated column.
weather_all.isna().sum()

In [None]:
import os

# DataFrame.to_csv does not create missing parent directories, so make sure
# the target directory exists before writing.
os.makedirs('weather_data_full', exist_ok=True)
# Write the combined aggregated data without the row index.
weather_all.to_csv('weather_data_full/weather_data_aggregated_all_stations.csv', index=False)

In [None]:
import os
import sys

# Put the parent of the current working directory on the import path so the
# `src` package can be imported.
parent_dir = os.path.dirname(os.path.abspath('.'))
sys.path.append(parent_dir)
from src.data_connection import DatabaseConnector

# Connector used by the following cells to query the database.
db_connector = DatabaseConnector()

In [None]:

# Read the id and centre coordinates of every row in gtfs_static.regions.
region_query = """
select region_id, center_lon, center_lat from 
gtfs_static.regions;
"""
region = db_connector.read_sql(region_query)
region

In [None]:
import pandas as pd
# Reload the combined aggregated data from disk.
weather_all_path = 'weather_data_full/weather_data_aggregated_all_stations.csv'
weather_all = pd.read_csv(weather_all_path)

In [None]:
# Preview the first five rows of the reloaded data.
weather_all.head(5)

# Match Regions to Nearest Weather Stations
Calculate the distance between each region's center and all weather stations to find the nearest one.

In [None]:
import numpy as np

# Read the station coordinate table from disk.
station_coords_path = 'weather_data_full/station_coordinates_mst.csv'
station_coords = pd.read_csv(station_coords_path)

def get_nearest_station(row, stations_df):
    """Return the name of the station closest to a region centre.

    Distances are great-circle (haversine) distances. A plain Euclidean
    distance on raw degrees treats one degree of longitude like one degree
    of latitude, which overstates east-west separation away from the
    equator and can therefore select the wrong station.

    Args:
        row: Mapping or Series providing 'center_lon' and 'center_lat'
            in decimal degrees.
        stations_df: DataFrame with 'station_name', 'lon' and 'lat'
            columns (decimal degrees).

    Returns:
        The 'station_name' value of the nearest station.
    """
    center_lon = np.radians(float(row['center_lon']))
    center_lat = np.radians(float(row['center_lat']))
    station_lon = np.radians(stations_df['lon'])
    station_lat = np.radians(stations_df['lat'])

    # Haversine formula. The result is the central angle in radians; the
    # Earth radius is a constant factor and does not affect which is smallest.
    half_chord_sq = (
        np.sin((station_lat - center_lat) / 2) ** 2
        + np.cos(center_lat) * np.cos(station_lat) * np.sin((station_lon - center_lon) / 2) ** 2
    )
    distances = 2 * np.arcsin(np.sqrt(half_chord_sq))

    nearest_idx = distances.idxmin()
    return stations_df.loc[nearest_idx, 'station_name']

# For every region row, look up the closest station and store its name.
region['nearest_station'] = region.apply(get_nearest_station, axis=1, stations_df=station_coords)

region

In [None]:
# Write the region table, including the matched station, without the row index.
regions_output_path = 'weather_data_full/regions_with_nearest_stations.csv'
region.to_csv(regions_output_path, index=False)

In [None]:
# Number of aggregated rows per station.
station_row_counts = weather_all['station_name'].value_counts()
station_row_counts