The raw data sources—rainfall logs, traffic volume, and accident statistics—are initially unstructured, spatially scattered, or time-based. Without proper transformation, they can't be directly linked to individual roads in the road network graph. Preprocessing steps like spatial joins (for rainfall and casualty mapping), aggregation (e.g., average traffic volume per road), type normalization (ensuring all osmid values are strings), and missing value handling ensure that each node in the graph has a complete, aligned, and usable feature vector

In [70]:
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point


In [71]:
rainfall_df = pd.read_csv("data/rainfall_data.csv")
casualty_df = pd.read_csv("data/RoadTrafficAccidentCasualtiesMonthly.csv")
traffic_df = pd.read_csv("data/traffic_flow_data.csv")
road_gdf = gpd.read_file("data/singapore_road_network.gpkg", layer="edges")


RainFall feature engineering

In [None]:

# Step 2: Convert rainfall DataFrame to GeoDataFrame
rainfall_gdf = gpd.GeoDataFrame(
    rainfall_df,
    geometry=gpd.points_from_xy(rainfall_df.longitude, rainfall_df.latitude),
    crs="EPSG:4326"  # WGS 84 (latitude/longitude)
)

#project both datasets to SVY21 (meters), CRS: EPSG:3414
rainfall_gdf = rainfall_gdf.to_crs(epsg=3414)
road_gdf = road_gdf.to_crs(epsg=3414)

#nearest spatial join (each rainfall point to nearest road)
joined_rain = gpd.sjoin_nearest(
    rainfall_gdf,
    road_gdf,
    how="left",
    distance_col="dist_to_road"
)

# Aggregate rainfall per road segment using osmid
avg_rainfall_per_road = joined_rain.groupby('osmid')['rainfall_mm'].mean().reset_index()
avg_rainfall_per_road.columns = ['osmid', 'rainfall_mm'] 

# Normalize road_gdf osmid so each row has a single ID
def extract_first_osmid(val):
    if isinstance(val, list):
        return str(val[0])
    elif isinstance(val, str) and val.startswith("["):
        try:
            val = eval(val)
            return str(val[0]) if isinstance(val, list) else str(val)
        except:
            return str(val)
    else:
        return str(val)

# Create a new GeoDataFrame for road features
road_features = road_gdf[['osmid', 'geometry']].copy()
road_features['osmid'] = road_features['osmid'].apply(extract_first_osmid)
avg_rainfall_per_road['osmid'] = avg_rainfall_per_road['osmid'].astype(str)

# Merge rainfall into road features
road_features = road_features.merge(avg_rainfall_per_road, on='osmid', how='left')


road_features['rainfall_mm'] = road_features['rainfall_mm'].fillna(0).round(3)



In [73]:
rainfall_gdf = gpd.GeoDataFrame(
    rainfall_df,
    geometry=gpd.points_from_xy(rainfall_df.longitude, rainfall_df.latitude),
    crs="EPSG:4326"
)
#convert rainrall_df to a geodataframe , each row is now a geographic point
rainfall_gdf = gpd.GeoDataFrame(
    rainfall_df,
    geometry=gpd.points_from_xy(rainfall_df.longitude, rainfall_df.latitude),
    crs="EPSG:4326"
)
# project both to EPSG:3414 (SVY21 – projected CRS for Singapore)
rainfall_gdf = rainfall_gdf.to_crs(epsg=3414)
road_gdf = road_gdf.to_crs(epsg=3414)

joined_rain = gpd.sjoin_nearest(
    rainfall_gdf,
    road_gdf,
    how="left",
    distance_col="dist_to_road"
)

#This builds a GeoDataFrame from the rainfall data using lat/lon and sets the correct geographic CRS

In [74]:
avg_rainfall_per_road = joined_rain.groupby('osmid')['rainfall_mm'].mean().reset_index()
avg_rainfall_per_road.columns = ['osmid', 'rainfall_mm']  # Rename for clarity

# Step 6: Normalize osmid in road_gdf to ensure it's a single ID for joining

def extract_first_osmid(val):
    if isinstance(val, list):
        return str(val[0])
    elif isinstance(val, str) and val.startswith("["):
        try:
            val = eval(val)
            return str(val[0]) if isinstance(val, list) else str(val)
        except:
            return str(val)
    else:
        return str(val)

# Create a copy for features
road_features = road_gdf[['osmid', 'geometry']].copy()

# Apply osmid normalization
road_features['osmid'] = road_features['osmid'].apply(extract_first_osmid)
avg_rainfall_per_road['osmid'] = avg_rainfall_per_road['osmid'].astype(str)

# Step 7: Merge rainfall with road features
road_features = road_features.merge(avg_rainfall_per_road, on='osmid', how='left')

# Step 8: Handle missing data (rainfall for unmatched roads)
road_features['rainfall_mm'] = road_features['rainfall_mm'].fillna(0).round(3)

road_features[['osmid', 'rainfall_mm']].to_csv("data/rainfall_node_features.csv", index=False)
road_features.to_file("data/processed_rainfall_features.geojson", driver="GeoJSON")



In [75]:
# #project both to EPSG:3414 (SVY21 – projected CRS for Singapore)
# rainfall_gdf = rainfall_gdf.to_crs(epsg=3414)
# road_gdf = road_gdf.to_crs(epsg=3414)

# joined_rain = gpd.sjoin_nearest(
#     rainfall_gdf,
#     road_gdf,
#     how="left",
#     distance_col="dist_to_road"
# )
# #for rainfall and roadnetwork to use same coordinate system in meters for accurate spatial distance ops

In [76]:
avg_rainfall_per_road = joined_rain.groupby('osmid')['rainfall_mm'].mean().reset_index()
avg_rainfall_per_road.columns = ['osmid', 'rainfall_mm']

road_features['osmid'] = road_features['osmid'].astype(str)
avg_rainfall_per_road['osmid'] = avg_rainfall_per_road['osmid'].astype(str)

print(road_features[['osmid', 'rainfall_mm']].head())

       osmid  rainfall_mm
0   49961799          0.0
1   46337834          0.0
2  627326844          0.0
3  150829205          0.0
4  633215386          0.0


In [77]:
print(avg_rainfall_per_road.columns)
print(avg_rainfall_per_road.head())

Index(['osmid', 'rainfall_mm'], dtype='object')
        osmid  rainfall_mm
0   110858935     0.023621
1  1187360472     0.023162
2   172367140     0.013747
3   174886143     0.007576
4    22762645     0.008004


Casualty Data Transformation

In [78]:
casualty_long = casualty_df.melt(
    id_vars='DataSeries',
    var_name='Month',
    value_name='Value'
)
#data is in wide format which is hard to work with -> convert to melt

In [79]:
casualty_long["Value"] = pd.to_numeric(casualty_long["Value"], errors="coerce")

casualty_pivot = casualty_long.pivot_table(
    index='Month',
    columns='DataSeries',
    values='Value',
    aggfunc='mean'   # default, but you can change to sum, max, etc.
)

casualty_pivot = casualty_pivot.fillna(0).reset_index()

Traffic Volume Aggregation

In [80]:
traffic_df['Date'] = pd.to_datetime(traffic_df['Date'], dayfirst=True)
avg_traffic_volume = traffic_df.groupby('LinkID')['Volume'].mean().reset_index() #group traffic by linkID
avg_traffic_volume.columns = ['osmid', 'avg_volume'] #calc avg vehicle volume per road over dates

Merge all features into GCN Node Table

In [81]:
print(road_features['osmid'].dtype)
print(avg_traffic_volume['osmid'].dtype)


object
int64


In [85]:
# Step 1: Start from road geometries
road_features = road_gdf[['osmid', 'geometry']].copy()

# Step 2: Normalize osmid types
road_features['osmid'] = road_features['osmid'].astype(str)
avg_rainfall_per_road['osmid'] = avg_rainfall_per_road['osmid'].astype(str)
avg_traffic_volume['osmid'] = avg_traffic_volume['osmid'].astype(str)
accident_stats['osmid'] = accident_stats['osmid'].astype(str)  # <- assumes you've created this

# Step 3: Merge all feature data (rainfall, traffic, casualties)
road_features = road_features.merge(avg_rainfall_per_road, on='osmid', how='left')
road_features = road_features.merge(avg_traffic_volume, on='osmid', how='left')
road_features = road_features.merge(accident_stats, on='osmid', how='left')  # <- casualty data

# Step 4: Fill missing values with 0
road_features['rainfall_mm'] = road_features['rainfall_mm'].fillna(0)
road_features['avg_volume'] = road_features['avg_volume'].fillna(0)
road_features['casualties'] = road_features['casualties'].fillna(0)  # <- casualty data

# Step 5: Optionally create a binary label: 1 = hotspot, 0 = safe
road_features['label'] = (road_features['casualties'] > 5).astype(int)  # you can adjust threshold

# Step 6: Round numerical columns
road_features['rainfall_mm'] = road_features['rainfall_mm'].round(2)
road_features['avg_volume'] = road_features['avg_volume'].round(2)
road_features['casualties'] = road_features['casualties'].astype(int)  # or round if float

# Step 7: Save all useful features for GCN
road_features.to_file("data/node_features.geojson", driver="GeoJSON")

road_features[['osmid', 'rainfall_mm', 'avg_volume', 'casualties', 'label']].to_csv("data/node_features.csv", index=False)


NameError: name 'accident_stats' is not defined

In [None]:
# road_features.to_file("processed_gcn_node_features.geojson", driver="GeoJSON")
# road_features[['osmid', 'rainfall_mm', 'avg_volume']].to_csv("data/gcn_node_features.csv", index=False)
