# Feature Engineering

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import scipy
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Load the HVAC Energy Dataset and parse the timestamp column
df = pd.read_csv('dataset/HVAC Energy Data.csv')
df_normalized = pd.read_csv('dataset/HVAC Energy Data Cleaned and Split.csv')
timestamp_column = 'Local Time (Timezone : GMT+8h)'
df[timestamp_column] = pd.to_datetime(df[timestamp_column])
df = df.set_index(timestamp_column)

## Lag Features (past 1-6 time steps)

In [14]:
# Make a copy of the dataframe for feature engineering
df_fe = df.copy()

print("\n" + "="*70)
print("STEP 1: LAG FEATURES")
print("="*70)
print("Creating lag features to capture temporal dependencies...")

# Create lag features for key variables
lag_periods = [1, 2, 3, 6]  # 30 min, 1 hour, 1.5 hours, 3 hours (assuming 30-min intervals)

for lag in lag_periods:
    # Energy consumption lag
    df_fe[f'Energy_Lag_{lag}'] = df_fe['Chiller Energy Consumption (kWh)'].shift(lag)

    # Building load lag
    df_fe[f'BuildingLoad_Lag_{lag}'] = df_fe['Building Load (RT)'].shift(lag)

    # Temperature lag
    df_fe[f'OutsideTemp_Lag_{lag}'] = df_fe['Outside Temperature (F)'].shift(lag)

    # Cooling water temperature lag
    df_fe[f'CoolingWaterTemp_Lag_{lag}'] = df_fe['Cooling Water Temperature (C)'].shift(lag)

print(f"✓ Created lag features for periods: {lag_periods}")
print(f"✓ Total lag features created: {len(lag_periods) * 4}")


STEP 1: LAG FEATURES
Creating lag features to capture temporal dependencies...
✓ Created lag features for periods: [1, 2, 3, 6]
✓ Total lag features created: 16


## Rolling Average Features

In [15]:
# Rolling window sizes (in number of periods)
window_sizes = [3, 6, 12]  # 1.5 hours, 3 hours, 6 hours

for window in window_sizes:
    # Energy consumption rolling average
    df_fe[f'Energy_RollingAvg_{window}'] = df_fe['Chiller Energy Consumption (kWh)'].rolling(window=window, min_periods=1).mean()

    # Building load rolling average
    df_fe[f'BuildingLoad_RollingAvg_{window}'] = df_fe['Building Load (RT)'].rolling(window=window, min_periods=1).mean()

    # Outside temperature rolling average
    df_fe[f'OutsideTemp_RollingAvg_{window}'] = df_fe['Outside Temperature (F)'].rolling(window=window, min_periods=1).mean()

    # Energy rolling standard deviation (volatility)
    df_fe[f'Energy_RollingStd_{window}'] = df_fe['Chiller Energy Consumption (kWh)'].rolling(window=window, min_periods=1).std()

print(f"✓ Created rolling average features for windows: {window_sizes}")
print(f"✓ Total rolling features created: {len(window_sizes) * 4}")

✓ Created rolling average features for windows: [3, 6, 12]
✓ Total rolling features created: 12


## Cyclical Encoding for Temporal Features

In [18]:
import numpy as np

df_fe['Hour'] = df.index.hour
df_fe['DayOfWeek'] = df.index.dayofweek  # 0=Monday, 6=Sunday
df_fe['Month'] = df.index.month

# Encode hour of day (0-23) as sine and cosine
df_fe['Hour_Sin'] = np.sin(2 * np.pi * df_fe['Hour'] / 24)
df_fe['Hour_Cos'] = np.cos(2 * np.pi * df_fe['Hour'] / 24)

# Encode day of week (0-6) as sine and cosine
df_fe['DayOfWeek_Sin'] = np.sin(2 * np.pi * df_fe['DayOfWeek'] / 7)
df_fe['DayOfWeek_Cos'] = np.cos(2 * np.pi * df_fe['DayOfWeek'] / 7)

# Encode month (1-12) as sine and cosine
df_fe['Month_Sin'] = np.sin(2 * np.pi * df_fe['Month'] / 12)
df_fe['Month_Cos'] = np.cos(2 * np.pi * df_fe['Month'] / 12)

print("✓ Created cyclical encoding features:")
print("  - Hour: Hour_Sin, Hour_Cos")
print("  - Day of Week: DayOfWeek_Sin, DayOfWeek_Cos")
print("  - Month: Month_Sin, Month_Cos")
print("✓ Total cyclical features created: 6")

✓ Created cyclical encoding features:
  - Hour: Hour_Sin, Hour_Cos
  - Day of Week: DayOfWeek_Sin, DayOfWeek_Cos
  - Month: Month_Sin, Month_Cos
✓ Total cyclical features created: 6


## Interaction Features

In [19]:
# Based on EDA insights, create meaningful interactions
# Building Load × Temperature (Higher load in hot weather = more energy)
df_fe['Load_Temp_Interaction'] = df_fe['Building Load (RT)'] * df_fe['Outside Temperature (F)']

# Chilled Water Rate × Building Load (Flow rate needed for load)
df_fe['ChilledWater_Load_Interaction'] = df_fe['Chilled Water Rate (L/sec)'] * df_fe['Building Load (RT)']

# Temperature × Humidity (Heat index effect)
df_fe['Temp_Humidity_Interaction'] = df_fe['Outside Temperature (F)'] * df_fe['Humidity (%)']

# Hour × Building Load (Temporal load pattern)
df_fe['Hour_Load_Interaction'] = df_fe['Hour'] * df_fe['Building Load (RT)']

print("✓ Created interaction features:")
print("  - Load_Temp_Interaction (Building Load × Temperature)")
print("  - ChilledWater_Load_Interaction (Chilled Water × Load)")
print("  - Temp_Humidity_Interaction (Temperature × Humidity)")
print("  - Hour_Load_Interaction (Hour × Building Load)")
print("✓ Total interaction features created: 4")

✓ Created interaction features:
  - Load_Temp_Interaction (Building Load × Temperature)
  - ChilledWater_Load_Interaction (Chilled Water × Load)
  - Temp_Humidity_Interaction (Temperature × Humidity)
  - Hour_Load_Interaction (Hour × Building Load)
✓ Total interaction features created: 4


## Handling missing values and prepare final dataset

In [20]:
# Check for missing values created by lag and rolling features
print("\nChecking for missing values after feature engineering...")
missing_counts = df_fe.isnull().sum()
missing_features = missing_counts[missing_counts > 0]

if len(missing_features) > 0:
    print(f"\nFeatures with missing values:")
    for feature, count in missing_features.items():
        print(f"  - {feature}: {count} missing values")

    # Drop rows with missing values (from lag features)
    rows_before = len(df_fe)
    df_fe = df_fe.dropna()
    rows_after = len(df_fe)
    print(f"\n✓ Dropped {rows_before - rows_after} rows with missing values")
    print(f"✓ Final dataset size: {rows_after} rows")
else:
    print("✓ No missing values found!")

print(f"\n✓ Final feature count: {len(df_fe.columns)} features")


Checking for missing values after feature engineering...

Features with missing values:
  - Energy_Lag_1: 1 missing values
  - BuildingLoad_Lag_1: 1 missing values
  - OutsideTemp_Lag_1: 1 missing values
  - CoolingWaterTemp_Lag_1: 1 missing values
  - Energy_Lag_2: 2 missing values
  - BuildingLoad_Lag_2: 2 missing values
  - OutsideTemp_Lag_2: 2 missing values
  - CoolingWaterTemp_Lag_2: 2 missing values
  - Energy_Lag_3: 3 missing values
  - BuildingLoad_Lag_3: 3 missing values
  - OutsideTemp_Lag_3: 3 missing values
  - CoolingWaterTemp_Lag_3: 3 missing values
  - Energy_Lag_6: 6 missing values
  - BuildingLoad_Lag_6: 6 missing values
  - OutsideTemp_Lag_6: 6 missing values
  - CoolingWaterTemp_Lag_6: 6 missing values
  - Energy_RollingStd_3: 1 missing values
  - Energy_RollingStd_6: 1 missing values
  - Energy_RollingStd_12: 1 missing values

✓ Dropped 6 rows with missing values
✓ Final dataset size: 13609 rows

✓ Final feature count: 50 features


In [22]:
# Save the feature-engineered dataset
df_fe.to_csv('dataset/HVAC Energy Data Feature Engineered.csv')