In [None]:
import pandas as pd

# ------------------------- 1️⃣ Load UHI Dataset -------------------------
# Reads the processed UHI table from disk and parses its "datetime" column.
uhi_path = "../data/processed/UHI_with_LST_Sentinel_Final.csv"
print("\n🔹 Loading UHI Dataset...")

# Timestamps are parsed day-first; values that cannot be parsed become NaT
# (errors="coerce") instead of raising.
uhi_df = pd.read_csv(uhi_path).assign(
    datetime=lambda frame: pd.to_datetime(
        frame["datetime"], dayfirst=True, errors="coerce"
    )
)

print(f"✅ UHI Dataset Loaded. Shape: {uhi_df.shape}")
print(f"🔹 UHI Columns: {uhi_df.columns}\n")


# ------------------------- 2️⃣ Load Weather Dataset -------------------------
# Reads the "Bronx" and "Manhattan" sheets of the weather workbook, normalises
# their column names, parses the timestamps, and combines both sheets into a
# single table (`weather_df`) with one averaged row per timestamp and gaps
# filled by time-based interpolation.
print("\n🔹 Loading Weather Dataset...")
weather_path = "../data/raw/NY_Mesonet_Weather.xlsx"

weather_dfs = []

# Open the workbook once and let the context manager close the file handle;
# every sheet is then parsed from the same open workbook.
with pd.ExcelFile(weather_path) as workbook:
    weather_sheets = workbook.sheet_names
    print(f"📌 Available Weather Sheets: {weather_sheets}")

    for sheet in ["Bronx", "Manhattan"]:
        df = pd.read_excel(workbook, sheet_name=sheet)

        # Normalise headers (lower-case, spaces -> underscores) so that a
        # "Date / Time" header becomes "date_/_time", then rename it.
        df.columns = df.columns.str.lower().str.replace(" ", "_")
        df.rename(columns={"date_/_time": "datetime"}, inplace=True)

        if "datetime" not in df.columns:
            print(f"⚠️ 'datetime' column not found in {sheet}, skipping merge.")
            continue

        # Strip the literal " EDT" suffix, then parse; unparseable values
        # become NaT (errors="coerce").
        df["datetime"] = df["datetime"].astype(str).str.replace(" EDT", "", regex=False)
        df["datetime"] = pd.to_datetime(df["datetime"], dayfirst=True, errors="coerce")

        weather_dfs.append(df)

# Fail with an explicit message instead of pd.concat's generic
# "No objects to concatenate" when every sheet was skipped.
if not weather_dfs:
    raise ValueError(
        f"No weather sheet with a 'datetime' column could be loaded from {weather_path}"
    )

# Stack both sheets into one table.
weather_df = pd.concat(weather_dfs, ignore_index=True)

# Collapse duplicate timestamps by averaging. numeric_only=True keeps this
# working on pandas >= 2.0, where averaging non-numeric columns raises
# TypeError; rows whose timestamp is NaT are dropped by groupby.
weather_df = weather_df.groupby("datetime").mean(numeric_only=True).reset_index()

# method="time" requires a DatetimeIndex, so index by timestamp for the
# interpolation and restore "datetime" as a regular column afterwards.
weather_df = (
    weather_df.set_index("datetime").interpolate(method="time").reset_index()
)

print(f"✅ Weather Data Processed. Shape: {weather_df.shape}")
print(f"🔹 Weather Columns: {weather_df.columns}\n")


# ------------------------- 3️⃣ Merge UHI with Weather Data -------------------------
# Left-joins the weather table onto the UHI rows by timestamp, fills missing
# values, and writes the merged table to CSV.
print("\n🔹 Merging UHI with Weather Data...")

# NOTE(review): this is an exact-match join on "datetime". UHI rows whose
# timestamp has no identical entry in weather_df receive NaN weather values
# and are only filled by the steps below — confirm that is intended (a
# nearest-timestamp join such as pd.merge_asof may be more appropriate).
merged_df = pd.merge(uhi_df, weather_df, on="datetime", how="left")

# Fill missing values in three passes:
#   1. time-based interpolation (numeric columns only),
#   2. forward fill,
#   3. backward fill for anything still missing at the start.
merged_df = merged_df.set_index("datetime")  # method="time" needs a DatetimeIndex

# Interpolating object-dtype columns is deprecated in pandas 2.1+ and raises
# in later versions, so restrict the interpolation to numeric columns.
numeric_cols = merged_df.select_dtypes(include="number").columns
merged_df[numeric_cols] = merged_df[numeric_cols].interpolate(method="time")

# DataFrame.ffill()/bfill() replace the deprecated fillna(method=...) calls.
merged_df = merged_df.ffill().bfill()
merged_df = merged_df.reset_index()  # Restore datetime as a column

print(f"✅ Merged Weather Data. Shape: {merged_df.shape}")

# Save the final merged dataset
output_path = "../data/processed/UHI_Weather_Merged.csv"
merged_df.to_csv(output_path, index=False)
print(f"✅ Final dataset saved: {output_path}")


# ------------------------- 4️⃣ Summary Statistics -------------------------
# Prints the structure and the first rows of the merged dataset.
print("\n📌 Final Merged Dataset Summary:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
merged_df.info()
print(merged_df.head())

✅ Models and Scaler Loaded Successfully
✅ Loaded Submission Data. Shape: (1040, 3)


Extracting Sentinel-2 Data: 100%|██████████| 1040/1040 [00:02<00:00, 456.64it/s]


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- NDBI
- NDVI
- UHI Index
Feature names seen at fit time, yet now missing:
- air_temp_at_surface_
- avg_wind_speed_
- evi
- heat_index
- humidity_temp_interaction
- ...
