In [12]:
# 📌 Feature Engineering Script

import pandas as pd
import numpy as np
import os
import joblib
from haversine import haversine
from sklearn.preprocessing import OneHotEncoder

# ----------------- 1️⃣ Load & Preprocess Data -----------------
file_path = "../data/processed/UHI_Weather_Building_Sentinel_LST_Merged.csv"
df = pd.read_csv(file_path)

print(f"✅ Dataset Loaded. Shape: {df.shape}")

# Show the header exactly as it appears in the CSV, before any renaming.
print("📌 Available Columns in Dataset:", df.columns.tolist())

# Normalise the header: trim, lowercase, then replace every character that is
# neither a word character nor whitespace with "_".
trimmed_lower = df.columns.str.strip().str.lower()
df.columns = trimmed_lower.str.replace(r"[^\w\s]", "_", regex=True)
print("✅ Column Names Sanitized:", df.columns.tolist())

# ----------------- 2️⃣ Extract Temporal Features -----------------
if "datetime" in df.columns:
    # Unparseable timestamps become NaT (errors="coerce") instead of raising.
    timestamps = pd.to_datetime(df["datetime"], errors="coerce")
    df = df.assign(
        hour=timestamps.dt.hour,
        weekday=timestamps.dt.weekday,
        month=timestamps.dt.month,
    ).drop(columns=["datetime"])  # the raw timestamp is not kept as a feature
    print("✅ Extracted Temporal Features: 'hour', 'weekday', 'month'")

# ----------------- 3️⃣ Calculate Haversine Distance -----------------
# The distance step needs both coordinate pairs; fail on the first one missing.
required_columns = ["latitude", "longitude", "nearest_building_lat", "nearest_building_lon"]
for col in required_columns:
    if col in df.columns:
        continue
    raise ValueError(f"❌ Missing required column: {col}")

# Function to calculate distance from nearest building
def calculate_haversine_distance(row):
    """Return the great-circle distance, in metres, from a row's point to its nearest building.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing "latitude",
            "longitude", "nearest_building_lat" and "nearest_building_lon".

    Returns:
        Distance in metres as a float.
    """
    # haversine() returns kilometres by default; the value is stored in a
    # column suffixed `_m`, so convert to metres here.
    distance_km = haversine(
        (row["latitude"], row["longitude"]),
        (row["nearest_building_lat"], row["nearest_building_lon"]),
    )
    return distance_km * 1000.0

df["building_distance_m"] = df.apply(calculate_haversine_distance, axis=1)
print("✅ Haversine Distance Calculated!")

# ----------------- 4️⃣ Categorize Wind Direction -----------------
# Function to classify wind direction
def categorize_wind_direction(degrees):
    """Map a wind bearing in degrees to one of four quadrant labels.

    The bearing is wrapped into [0, 360) first, so 360 is treated like 0 and
    negative or >360 bearings land in their true quadrant instead of falling
    through to "North-West".

    Args:
        degrees: Bearing in degrees; may be missing (NaN/None).

    Returns:
        "North-East" for [0, 90), "South-East" for [90, 180),
        "South-West" for [180, 270), "North-West" for [270, 360),
        or NaN when the input is missing.
    """
    # A missing bearing must stay missing rather than be labelled "North-West".
    if pd.isna(degrees):
        return np.nan

    bearing = degrees % 360  # Python's modulo result is non-negative for a positive divisor
    if bearing < 90:
        return "North-East"
    if bearing < 180:
        return "South-East"
    if bearing < 270:
        return "South-West"
    return "North-West"

# Find the wind-direction column by substring rather than by exact name.
wind_direction_col = [col for col in df.columns if "wind_direction" in col.lower()]
if wind_direction_col:
    df["wind_direction_category"] = df[wind_direction_col[0]].apply(categorize_wind_direction)
    print("✅ Wind Direction Categorized!")

# ----------------- 5️⃣ Categorize Time of Day -----------------
# `hour` only exists when a datetime column was present upstream, so guard it.
if "hour" in df.columns:
    # right=False makes the bins left-closed: [0,6) Night, [6,12) Morning,
    # [12,18) Afternoon, [18,24) Evening. With the default right-closed bins,
    # hours 6, 12 and 18 fall into the preceding period.
    df["hour_category"] = pd.cut(
        df["hour"],
        bins=[0, 6, 12, 18, 24],
        labels=["Night", "Morning", "Afternoon", "Evening"],
        right=False,
    )
    print("✅ Created `hour_category` for time-based analysis.")
else:
    print("⚠️ 'hour' column not found; skipping `hour_category`.")

# ----------------- 6️⃣ One-Hot Encoding for Categorical Features -----------------
# Encode only the categorical columns that were actually created above;
# both are optional, and indexing a missing one would raise KeyError.
categorical_columns = [
    col for col in ("hour_category", "wind_direction_category") if col in df.columns
]

if categorical_columns:
    # ✅ One-Hot Encoding
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    encoded_features = encoder.fit_transform(df[categorical_columns])

    # Convert to DataFrame
    encoded_df = pd.DataFrame(
        encoded_features,
        columns=encoder.get_feature_names_out(categorical_columns),
    )

    # ✅ Save the encoder for later use in predictions
    os.makedirs("models", exist_ok=True)
    joblib.dump(encoder, "models/encoder.pkl")
    print("✅ Encoder saved at models/encoder.pkl")

    # Drop original categorical columns & concatenate encoded ones.
    # The index is reset so it lines up with encoded_df's default RangeIndex.
    df = df.drop(columns=categorical_columns).reset_index(drop=True)
    df = pd.concat([df, encoded_df], axis=1)

    print("✅ Categorical Features Encoded Successfully!")
else:
    print("⚠️ No categorical columns to encode; skipping One-Hot Encoding.")

# ----------------- 7️⃣ Feature Selection -----------------
# ✅ Drop Unused Columns
unused_cols = ["nearest_building_lon", "nearest_building_lat"]
df = df.drop(columns=[col for col in unused_cols if col in df.columns], errors="ignore")

# Ensure all features are numeric before training.
# Collect the offenders first, then drop once, instead of mutating `df`
# while looping over its columns.
non_numeric_cols = [col for col in df.columns if df[col].dtype == "object"]
for col in non_numeric_cols:
    print(f"⚠️ Non-numeric column detected: {col} (Dropping)")
df = df.drop(columns=non_numeric_cols)

# ----------------- 8️⃣ Save Processed Data -----------------
processed_path = "../data/processed/UHI_Weather_Building_Sentinel_LST_Featured_Cleaned.csv"
df.to_csv(processed_path, index=False)
print(f"✅ Processed Data Saved: {processed_path}")

✅ Dataset Loaded. Shape: (11229, 16)
📌 Available Columns in Dataset: ['datetime', 'longitude', 'latitude', 'uhi_index', 'land_surface_temp', 'band1', 'band2', 'band3', 'band4', 'air_temp_at_surface_', 'relative_humidity_', 'avg_wind_speed_', 'wind_direction_', 'solar_flux_', 'nearest_building_lon', 'nearest_building_lat']
✅ Column Names Sanitized: ['datetime', 'longitude', 'latitude', 'uhi_index', 'land_surface_temp', 'band1', 'band2', 'band3', 'band4', 'air_temp_at_surface_', 'relative_humidity_', 'avg_wind_speed_', 'wind_direction_', 'solar_flux_', 'nearest_building_lon', 'nearest_building_lat']
✅ Extracted Temporal Features: 'hour', 'weekday', 'month'
✅ Haversine Distance Calculated!
✅ Wind Direction Categorized!
✅ Created `hour_category` for time-based analysis.
✅ Encoder saved at models/encoder.pkl
✅ Categorical Features Encoded Successfully!
✅ Processed Data Saved: ../data/processed/UHI_Weather_Building_Sentinel_LST_Featured_Cleaned.csv


In [15]:
# Show the dtype of every column currently in `df`.
# NOTE(review): the captured output lists `datetime`, `building_density_*`,
# `is_weekend` and a still-categorical `hour_category`, none of which match the
# frame left by the In [12] cell (which drops `datetime` and one-hot encodes
# `hour_category`). This looks like output from a different kernel state.
# Confirm with Restart & Run All.
df.dtypes

datetime                 datetime64[ns]
longitude                       float64
latitude                        float64
uhi_index                       float64
land_surface_temp               float64
band1                           float64
band2                           float64
band3                           float64
band4                           float64
air_temp_at_surface_            float64
relative_humidity_              float64
avg_wind_speed_                 float64
wind_direction_                 float64
solar_flux_                     float64
nearest_building_lon            float64
nearest_building_lat            float64
building_distance_m             float64
building_density_50m              int64
building_density_100m             int64
building_density_200m             int64
hour                              int32
weekday                           int32
month                             int32
hour_category                  category
is_weekend                        int64


In [16]:
# Count missing values per column of `df`.
df.isnull().sum()

datetime                 0
longitude                0
latitude                 0
uhi_index                0
land_surface_temp        0
band1                    0
band2                    0
band3                    0
band4                    0
air_temp_at_surface_     0
relative_humidity_       0
avg_wind_speed_          0
wind_direction_          0
solar_flux_              0
nearest_building_lon     0
nearest_building_lat     0
building_distance_m      0
building_density_50m     0
building_density_100m    0
building_density_200m    0
hour                     0
weekday                  0
month                    0
hour_category            0
is_weekend               0
dtype: int64