In [1]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

# 1. Load the final feature dataset
df = pd.read_csv("../data/processed/UHI_Weather_Building_Sentinel_LST_Featured_Cleaned.csv")

# Summary statistics. A bare `df.describe()` in the middle of a cell is
# evaluated and then thrown away (a notebook only displays a cell's last
# expression), so print the summary explicitly.
print(df.describe())

# Print initial data types for debugging
print("Before datetime processing:")
print(df.dtypes)

# 2. Derive numeric calendar features from 'datetime' (only when the column exists)
if 'datetime' in df.columns:
    # Parse with a fixed layout; values that do not match it become NaT.
    timestamps = pd.to_datetime(df['datetime'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
    # Write hour / day / month / weekday (overwriting same-named columns if
    # they already exist), then discard the non-numeric source column.
    df = df.assign(
        hour=timestamps.dt.hour,
        day=timestamps.dt.day,
        month=timestamps.dt.month,
        weekday=timestamps.dt.weekday,
    ).drop(columns=["datetime"])

# Clean column names: strip every character that is not a letter, digit or
# underscore (spaces, hyphens, brackets, ...).
cleaned_columns = df.columns.str.replace('[^A-Za-z0-9_]+', '', regex=True)

# Stripping characters can make two distinct names identical (e.g. 'a-b' and
# 'ab'). Duplicate labels make `df[name]` ambiguous in the later steps, so stop
# here with a clear message instead of continuing.
duplicated_names = cleaned_columns[cleaned_columns.duplicated()].unique().tolist()
if duplicated_names:
    raise ValueError(f"Column name cleaning produced duplicate names: {duplicated_names}")
df.columns = cleaned_columns

# Print cleaned column names for debugging
print("\nCleaned Column Names:")
print(df.columns)

# Print data types after processing
print("\nAfter datetime processing:")
print(df.dtypes)

# 3. Handle categorical columns (if any)
# Every object-dtype column is replaced by integer codes from its own
# LabelEncoder. The fitted encoders are kept in `label_encoders` (keyed by
# column name) so the category -> integer mapping is not lost and can be
# inverted or reused on new data later.
# NOTE(review): LabelEncoder assigns arbitrary integer codes; confirm that an
# implied ordering between categories is acceptable for these features.
categorical_columns = df.select_dtypes(include=['object']).columns
label_encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

# Print the dtypes so it can be checked that all columns are now numeric
print("\nData types after encoding categorical columns:")
print(df.dtypes)

# 4. Separate the regression target from the feature matrix
target_name = "uhi_index"  # must match the column name in the CSV exactly
y = df[target_name]
X = df.drop(columns=[target_name])

# 5. Hold out 30% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# 6. Fit the baseline XGBoost regressor on the training split
baseline_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "random_state": 42,
}
model = XGBRegressor(**baseline_params).fit(X_train, y_train)

# 7. Score the model on the held-out test split
y_pred = model.predict(X_test)
print(f"R² Score: {r2_score(y_test, y_pred):.4f}")
print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")

# 8. Save the trained model
# The path is defined once so the file written and the message printed cannot
# drift apart.
model_path = "../models/UHI_xgboost.pkl"
# joblib.dump does not create missing parent directories; make sure the output
# folder exists so a fresh checkout does not fail on the very last step.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
# NOTE(review): a joblib/pickle dump is tied to the installed xgboost version;
# consider model.save_model() if this artifact must load across versions.
joblib.dump(model, model_path)
print(f"Model saved successfully to {model_path}")

Before datetime processing:
longitude                             float64
latitude                              float64
uhi_index                             float64
land_surface_temp                     float64
band1                                 float64
band2                                 float64
band3                                 float64
band4                                 float64
air_temp_at_surface_                  float64
relative_humidity_                    float64
avg_wind_speed_                       float64
wind_direction_                       float64
solar_flux_                           float64
hour                                    int64
weekday                                 int64
month                                   int64
building_distance_m                   float64
hour_category_Afternoon               float64
wind_direction_category_South-East    float64
wind_direction_category_South-West    float64
dtype: object

Cleaned Column Names:
Index(['longitu