In [None]:
%pip install pandas
%pip install numpy
%pip install scikit-learn

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error
import os

In [None]:
# Load the raw train/test CSVs for a first look; paths are relative to the notebook's working directory.
df_train = pd.read_csv("../data/regression/regression-train.csv")
df_test = pd.read_csv("../data/regression/regression-test.csv")

In [None]:
# Preview the first rows of the training frame.
df_train.head()

# How this RandomForest Baseline Works

This baseline script uses a RandomForest model to predict `Demand_Response_Capacity_kW` for flagged DR events. Here's a step-by-step explanation:

1. **Data Loading**
   - The training set contains building power, weather, DR flags, and capacity.
   - The test set is what we want to predict.

2. **Handling Missing Flags**
   - Some sites may have missing DR flags (e.g., site E and F).
   - Missing flags in the test set are filled with `0`, indicating no DR event.

3. **Feature Engineering**
   - Time features: `hour`, `day of week`, `minute of week`, `is_weekend`.
   - Lag features: previous building power values.
   - Rolling averages: mean building power over the last few intervals.
   - Weekly profile: median building power at each minute-of-week during non-DR periods.
   - These features give the model temporal context and typical building behavior.

4. **Training Data**
   - Only rows with `Demand_Response_Flag` equal to `1` or `-1` are used for training.
   - `X_train` contains the features, and `y_train` contains the target capacities.

5. **RandomForest Regression**
   - Builds many decision trees that learn patterns like:
     > "If hour = 15 and building power is high, expected capacity is ~10 kW."
   - Predictions are the **average across all trees**.

6. **Prediction and Post-processing**
   - Predictions are made for all test rows.
   - For flagged rows (`Flag != 0`):
     - Predictions are clipped to a minimum of `0.01`, so every flagged row gets a strictly positive capacity.
   - For unflagged rows (`Flag = 0`):
     - Capacity is set exactly to `0`.

7. **Submission**
   - Submission contains `Site`, `Timestamp_Local`, `Demand_Response_Flag`, and predicted capacity.
   - Sanity check ensures **all flagged rows have non-zero capacities**, and unflagged rows remain zero.

**Summary:**  
The RandomForest learns how building power, weather, and time influence DR capacity, predicting varying capacities only for flagged events and keeping zeros elsewhere. Sites without DR events naturally have all zero capacities.


In [None]:
# -----------------------------
# Load data
# -----------------------------
train = pd.read_csv("../data/regression/regression-train.csv")
test  = pd.read_csv("../data/regression/regression-test.csv")

# -----------------------------
# Fill missing flags in test
# -----------------------------
# A missing flag is treated as "no DR event", so it is replaced by 0
test = test.assign(Demand_Response_Flag=test['Demand_Response_Flag'].fillna(0))

# -----------------------------
# Feature engineering
# -----------------------------
def add_time_features(df):
    """Add calendar features derived from ``Timestamp_Local``.

    The frame is modified in place (and also returned): ``Timestamp_Local`` is
    parsed to datetime and the columns ``hour``, ``minute``, ``dayofweek``,
    ``is_weekend``, ``minute_of_day`` and ``minute_of_week`` are added.
    """
    df['Timestamp_Local'] = pd.to_datetime(df['Timestamp_Local'])
    stamp = df['Timestamp_Local'].dt

    df['hour'] = stamp.hour
    df['minute'] = stamp.minute
    df['dayofweek'] = stamp.dayofweek
    # dayofweek runs Monday=0 .. Sunday=6, so 5 and 6 are the weekend
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)

    minutes_per_day = 24 * 60
    df['minute_of_day'] = 60 * df['hour'] + df['minute']
    df['minute_of_week'] = minutes_per_day * df['dayofweek'] + df['minute_of_day']
    return df

def add_lag_rolling(df, group='Site', value_col='Building_Power_kW', lags=[1,2,3,4], rolls=[4,12,96]):
    """Add per-group lag and trailing rolling-mean features of ``value_col``.

    Parameters
    ----------
    df : DataFrame containing ``group``, ``value_col`` and ``Timestamp_Local``.
    group : column that identifies independent series.
    value_col : column whose history is turned into features.
    lags : shifts (in rows) for the ``lag_<k>`` columns.
    rolls : window lengths (in rows) for the ``roll_mean_<r>`` columns.

    Returns
    -------
    A new DataFrame sorted by ``[group, 'Timestamp_Local']`` with the extra
    columns added; the input frame is not modified.

    Notes
    -----
    The rolling mean uses only past values (the series is shifted by one row
    first) and is computed strictly inside each group, so the first row of a
    group has no history and yields NaN instead of borrowing values from the
    preceding group.
    """
    df = df.sort_values([group, 'Timestamp_Local'])

    for lag in lags:
        df[f'lag_{lag}'] = df.groupby(group)[value_col].shift(lag)

    for r in rolls:
        # transform() keeps the frame's own index, so the result aligns row by
        # row on assignment, and the shift + rolling both stay within a group.
        df[f'roll_mean_{r}'] = df.groupby(group)[value_col].transform(
            lambda s, window=r: s.shift(1).rolling(window=window, min_periods=1).mean()
        )
    return df

def compute_weekly_profile(df, group='Site', value_col='Building_Power_kW'):
    """Median ``value_col`` per (group, minute_of_week) over rows with flag 0.

    Returns a DataFrame with the columns ``group``, ``minute_of_week`` and
    ``profile_med``. Rows whose ``Demand_Response_Flag`` is not 0 are ignored.
    """
    baseline = df.loc[df['Demand_Response_Flag'].eq(0)]
    medians = baseline.groupby([group, 'minute_of_week'])[value_col].median()
    return medians.rename('profile_med').reset_index()

def merge_weekly_profile(df, profile, group='Site'):
    """Left-join ``profile`` onto ``df`` by (group, minute_of_week).

    Rows without a matching profile entry receive the median of the
    ``profile_med`` values present in their own group; if the group has none,
    the value stays NaN. Returns the merged frame.
    """
    def _fill_with_group_median(values):
        return values.fillna(values.median())

    merged = pd.merge(df, profile, how='left', on=[group, 'minute_of_week'])
    merged['profile_med'] = (
        merged.groupby(group)['profile_med'].transform(_fill_with_group_median)
    )
    return merged

# Apply features: calendar columns first, then per-site lags and rolling means
train = add_lag_rolling(add_time_features(train))
test  = add_lag_rolling(add_time_features(test))

# The weekly profile is computed from train only and joined onto both frames
profile = compute_weekly_profile(train)
train, test = (merge_weekly_profile(frame, profile) for frame in (train, test))

# Fill missing lag/roll/profile values with training median
fill_cols = [
    col for col in train.columns
    if col.startswith(('lag_', 'roll_')) or col == 'profile_med'
]
for col in fill_cols:
    # Median is taken from train before train itself is filled, then reused for test
    train_median = train[col].median()
    for frame in (train, test):
        frame[col] = frame[col].fillna(train_median)

# -----------------------------
# Target
# -----------------------------
target = 'Demand_Response_Capacity_kW'
# Rows flagged 0 (no DR event) get a target of exactly 0
train.loc[train['Demand_Response_Flag']==0, target] = 0.0

# -----------------------------
# Features
# -----------------------------
base_features = [
    'hour','dayofweek','is_weekend','minute_of_week',
    'Dry_Bulb_Temperature_C','Global_Horizontal_Radiation_W/m2',
    'Building_Power_kW','profile_med'
]
# fill_cols already contains 'profile_med'; de-duplicate (keeping first-seen order)
# so the same column is not selected twice into the model matrix.
features = list(dict.fromkeys(base_features + fill_cols))

# -----------------------------
# Train RandomForest on active events
# -----------------------------
# Keep only rows whose flag is non-zero
event_rows = train.loc[train['Demand_Response_Flag'].ne(0)]
X_train = event_rows[features].to_numpy()
y_train = event_rows[target].to_numpy()

# fit() returns the estimator itself, so construction and fitting can be chained
rf = RandomForestRegressor(
    n_estimators=200, max_depth=12, random_state=42, n_jobs=-1
).fit(X_train, y_train)

# -----------------------------
# Predict on test
# -----------------------------
X_test = test[features].to_numpy()
raw_pred = rf.predict(X_test)

# -----------------------------
# Force zeros for flag=0 and clip negatives for flagged rows
# -----------------------------
flag_mask = test['Demand_Response_Flag'].to_numpy() != 0

# Flagged rows: floor the prediction at 0.01; unflagged rows: exactly 0
y_pred = np.where(flag_mask, np.maximum(raw_pred, 0.01), 0.0)

# -----------------------------
# Build submission
# -----------------------------
submission = test[['Site','Timestamp_Local','Demand_Response_Flag']].assign(
    Demand_Response_Capacity_kW=y_pred
)

# -----------------------------
# Sanity check
# -----------------------------
# The number of non-zero capacities must equal the number of non-zero flags
num_flags = test['Demand_Response_Flag'].ne(0).sum()
num_pred = submission['Demand_Response_Capacity_kW'].ne(0).sum()
print("Non-zero flags in test:", num_flags)
print("Non-zero capacities in submission:", num_pred)
assert num_flags == num_pred, "Mismatch between flags and predicted non-zeros!"

# -----------------------------
# Save submission
# -----------------------------
os.makedirs("submissions", exist_ok=True)
submission.to_csv("submissions/baseline_regression.csv", index=False)

print("Saved submission to submissions/baseline_regression.csv")
print(submission.head(20))

In [None]:
# Read the saved submission back from disk to check the file as written.
df_submission = pd.read_csv("submissions/baseline_regression.csv")

In [None]:
# Preview the first rows of the re-loaded submission.
df_submission.head()