In [4]:
# notebooks/2_Feature_Engineering.ipynb
# ---------------------------------------------------
# PURPOSE: Merge UHI, Weather (Bronx & Manhattan), and optionally other datasets,
# create new features, and save the final processed dataset.

import math
import os

import geopandas as gpd
import pandas as pd

# Append project root to sys.path so we can import from src/
import sys
import os
# Resolve the parent directory to an absolute path and register it on sys.path
# exactly once, so re-running the cell does not pile up duplicate entries.
project_root = os.path.abspath("..")
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from src.features.spatial_features import extract_spatial_features
from src.features.temporal_features import extract_temporal_features

# Create the output folder up front; exist_ok makes this safe to re-run.
os.makedirs("../data/processed", exist_ok=True)

# === 1. Load UHI Data (CSV) ===
uhi_file = "../data/raw/UHI_data.csv"
uhi_df = pd.read_csv(uhi_file)
print("UHI Data Columns:", uhi_df.columns)

# Parse the 'datetime' column when present, reading ambiguous dates day-first.
uhi_has_datetime = 'datetime' in uhi_df.columns
if uhi_has_datetime:
    uhi_df = uhi_df.assign(
        datetime=pd.to_datetime(uhi_df['datetime'], dayfirst=True)
    )
print("UHI Data Preview:", uhi_df.head(), sep="\n")

# === 2. Load Weather Data from Excel (Two Worksheets: Bronx and Manhattan) ===
excel_file = "../data/raw/NY_Mesonet_Weather.xlsx"
xls = pd.ExcelFile(excel_file)
print("Excel Sheets Found:", xls.sheet_names)

# Each sheet is read once and tagged with constant location columns so the
# rows stay distinguishable after the two frames are stacked.
bronx_weather = pd.read_excel(xls, sheet_name="Bronx").assign(
    Latitude=40.87248,
    Longitude=-73.89352,
    Altitude=57.5,
)
manhattan_weather = pd.read_excel(xls, sheet_name="Manhattan").assign(
    Latitude=40.76754,
    Longitude=-73.96449,
    Altitude=94.8,
)

# Stack both sheets into one frame with a fresh 0..n-1 index.
weather_df = pd.concat([bronx_weather, manhattan_weather], ignore_index=True)
print("Combined Weather Data Preview:", weather_df.head(), sep="\n")

# === 2b. Parse the weather observation time ===
# The weather sheets label their time column 'Date / Time' and store values such
# as '2021-07-24 06:00:00 EDT', so that header is handled in addition to the
# generic 'timestamp' / 'datetime' names. Without it no time column is parsed,
# 'hour' is never created, and the merge below silently falls back to UHI-only.
weather_time_col = None
if 'timestamp' in weather_df.columns:
    weather_time_col = 'timestamp'
    weather_df['timestamp'] = pd.to_datetime(weather_df['timestamp'])
elif 'datetime' in weather_df.columns:
    weather_time_col = 'datetime'
    weather_df['datetime'] = pd.to_datetime(weather_df['datetime'], dayfirst=True)
elif 'Date / Time' in weather_df.columns:
    weather_time_col = 'datetime'
    # Strip a trailing upper-case zone abbreviation (e.g. ' EDT') and keep the
    # local wall-clock time; values that still fail to parse become NaT.
    # NOTE(review): assumes every row uses the same zone and that the UHI
    # 'datetime' values are expressed in that same local time -- confirm.
    local_time_text = weather_df['Date / Time'].astype(str).str.replace(
        r"\s+[A-Z]{3,4}$", "", regex=True
    )
    weather_df['datetime'] = pd.to_datetime(local_time_text, errors="coerce")

if weather_time_col is not None:
    weather_df['hour'] = weather_df[weather_time_col].dt.hour

# Save processed weather data for reference
weather_csv_path = "../data/processed/weather_data.csv"
weather_df.to_csv(weather_csv_path, index=False)
print("Processed weather data saved to:", weather_csv_path)

# === 3. Merge UHI Data with Weather Data ===
# Each UHI observation receives the weather reading closest in time. When both
# frames carry Latitude/Longitude, the reading is taken from the geographically
# nearest station; otherwise the match is on time alone.
if 'datetime' in uhi_df.columns and weather_time_col is not None:
    uhi_df = uhi_df.sort_values("datetime")
    weather_df = weather_df.sort_values(weather_time_col)

    uhi_obs = uhi_df
    # merge_asof rejects null keys, so rows whose time could not be parsed are
    # excluded from the merge (they remain in weather_df and in the saved CSV).
    weather_obs = weather_df.dropna(subset=[weather_time_col])
    skipped_rows = len(weather_df) - len(weather_obs)
    if skipped_rows:
        print("Weather rows skipped (unparseable time):", skipped_rows)

    # The empty left suffix keeps the UHI 'Latitude'/'Longitude' names intact;
    # the clashing weather columns become 'Latitude_station'/'Longitude_station'.
    asof_kwargs = {"direction": "nearest", "suffixes": ("", "_station")}
    if weather_time_col == "datetime":
        asof_kwargs["on"] = "datetime"
    else:
        asof_kwargs["left_on"] = "datetime"
        asof_kwargs["right_on"] = weather_time_col

    coord_cols = ["Latitude", "Longitude"]
    can_match_station = (
        not weather_obs.empty
        and set(coord_cols) <= set(uhi_obs.columns)
        and set(coord_cols) <= set(weather_obs.columns)
    )
    if can_match_station:
        # The stacked sheets share timestamps, so a time-only match would pick
        # one station arbitrarily. Number the distinct coordinate pairs and
        # restrict each UHI row to its nearest station via `by`.
        weather_obs = weather_obs.assign(
            station_id=weather_obs.groupby(coord_cols, sort=False).ngroup()
        )
        stations = weather_obs.drop_duplicates("station_id")
        # Scale longitude differences by cos(latitude) so both axes are
        # comparable; squared planar distance is enough to rank stations.
        lon_scale = math.cos(math.radians(stations["Latitude"].mean()))
        sq_dist = pd.DataFrame(
            {
                station_id: (uhi_obs["Latitude"] - lat) ** 2
                + ((uhi_obs["Longitude"] - lon) * lon_scale) ** 2
                for station_id, lat, lon in zip(
                    stations["station_id"],
                    stations["Latitude"],
                    stations["Longitude"],
                )
            }
        )
        # NOTE(review): assumes UHI coordinates are never null -- confirm.
        uhi_obs = uhi_obs.assign(station_id=sq_dist.idxmin(axis=1))
        asof_kwargs["by"] = "station_id"

    merged_df = pd.merge_asof(uhi_obs, weather_obs, **asof_kwargs)
else:
    print("No common datetime field found. Using UHI data only.")
    merged_df = uhi_df.copy()

print("Merged Data Preview:")
print(merged_df.head())

# === 4. (Optional) Merge Building Footprint Data ===
# If you need to incorporate building footprints, you could load and perform a spatial join.
# For example:
# footprint_gdf = gpd.read_file("../data/raw/Building_Footprint.kml", driver="KML")
# footprint_gdf = footprint_gdf.to_crs(epsg=4326)
# If merged_df contains 'Longitude' and 'Latitude', convert to GeoDataFrame and join:
# merged_gdf = gpd.GeoDataFrame(
#     merged_df, geometry=gpd.points_from_xy(merged_df.Longitude, merged_df.Latitude), crs="EPSG:4326"
# )
# merged_gdf = gpd.sjoin(merged_gdf, footprint_gdf, how="left", predicate="intersects")
# merged_df = pd.DataFrame(merged_gdf.drop(columns="geometry"))

# === 5. Apply Custom Feature Extraction Functions ===
# Run the project's extractors in sequence: spatial first, then temporal, each
# receiving the frame returned by the previous step.
merged_df = merged_df.pipe(extract_spatial_features).pipe(extract_temporal_features)

# === 6. Save Final Processed Data ===
final_features_path = "../data/processed/UHI_features.csv"
merged_df.to_csv(final_features_path, index=False)  # index omitted from the CSV
print("Final feature dataset saved to:", final_features_path)


UHI Data Columns: Index(['Longitude', 'Latitude', 'datetime', 'UHI Index'], dtype='object')
UHI Data Preview:
   Longitude   Latitude            datetime  UHI Index
0 -73.909167  40.813107 2021-07-24 15:53:00   1.030289
1 -73.909187  40.813045 2021-07-24 15:53:00   1.030289
2 -73.909215  40.812978 2021-07-24 15:53:00   1.023798
3 -73.909242  40.812908 2021-07-24 15:53:00   1.023798
4 -73.909257  40.812845 2021-07-24 15:53:00   1.021634
Excel Sheets Found: ['Summary', 'Terms', 'Location', 'Bronx', 'Manhattan']
Combined Weather Data Preview:
               Date / Time  Air Temp at Surface [degC]  \
0  2021-07-24 06:00:00 EDT                        19.3   
1  2021-07-24 06:05:00 EDT                        19.4   
2  2021-07-24 06:10:00 EDT                        19.3   
3  2021-07-24 06:15:00 EDT                        19.4   
4  2021-07-24 06:20:00 EDT                        19.4   

   Relative Humidity [percent]  Avg Wind Speed [m/s]  \
0                         88.2                   