In [None]:
import sys
import gymnasium
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from stable_baselines3 import PPO, SAC
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.monitor import Monitor

from building_energy_storage_simulation import BuildingSimulation, Environment

from observation_wrapper import ObservationWrapper
from helper import read_data, TEST_INDEX_START, TEST_INDEX_END, BATTERY_CAPACITY, BATTERY_POWER

# Applying Reinforcement Learning Using Stable Baselines 3


In [None]:
# Number of forecast steps handed to the environment and to the observation wrapper
NUM_FORECAST_STEPS = 8
# Output directory for everything this notebook writes to disk
RESULT_PATH = 'rl_example/'

os.makedirs(RESULT_PATH, exist_ok=True)

# Only the samples before TEST_INDEX_START are used for training
load, price, generation = read_data()
load_train, price_train, generation_train = (
    series[:TEST_INDEX_START] for series in (load, price, generation)
)

# Create Training Environment
sim = BuildingSimulation(
    electricity_load_profile=load_train,
    solar_generation_profile=generation_train,
    electricity_price=price_train,
    max_battery_charge_per_timestep=BATTERY_POWER,
    battery_capacity=BATTERY_CAPACITY,
)

# The episode is NUM_FORECAST_STEPS shorter than the training data
# (presumably so the forecast window never runs past the end of the profiles)
max_train_timesteps = len(load_train) - NUM_FORECAST_STEPS
base_env = Environment(sim,
                       num_forecasting_steps=NUM_FORECAST_STEPS,
                       max_timesteps=max_train_timesteps)
# ObservationWrapper combines forecast of load and generation to one residual load forecast
env = ObservationWrapper(base_env, NUM_FORECAST_STEPS)

# Show the first observation as a quick sanity check
initial_obs, info = env.reset()
print(initial_obs)

In [None]:
# Monitor() writes a log of the training into RESULT_PATH
monitored_env = Monitor(env, filename=RESULT_PATH)
# VecNormalize() operates on vectorized environments, so wrap with DummyVecEnv() first.
# The factory captures a dedicated name so it does not depend on `env` being rebound below.
vec_env = DummyVecEnv([lambda: monitored_env])
# Normalize both observations and rewards
env = VecNormalize(vec_env, norm_obs=True, norm_reward=True)

In [None]:
# Train :-)
sac_settings = dict(verbose=1, gamma=0.95)
model = SAC("MlpPolicy", env, **sac_settings)
model.learn(total_timesteps=200_000)

# Persist the trained model together with the VecNormalize() statistics: the policy was
# trained on standardized observations/rewards, so the statistics are needed again later.
model_path = RESULT_PATH + 'model'
env_stats_path = RESULT_PATH + 'env.pkl'
model.save(model_path)
env.save(env_stats_path)

In [None]:
# Write the VecNormalize() statistics to disk once more (same target file as in the training cell)
normalization_stats_path = RESULT_PATH + 'env.pkl'
env.save(normalization_stats_path)

# Evaluation

In [None]:
# Plot the training process: column 'r' of the log written by Monitor()
monitor_csv = RESULT_PATH + 'monitor.csv'
# The first line of the file is skipped; the column header follows on the second line
training_log = pd.read_csv(monitor_csv, skiprows=1)
training_log['r'].plot()

In [None]:
# Evaluation uses the samples from TEST_INDEX_START onwards
load, price, generation = read_data()
load_eval, price_eval, generation_eval = (
    series[TEST_INDEX_START:] for series in (load, price, generation)
)

num_eval_timesteps = TEST_INDEX_END - TEST_INDEX_START

eval_sim = BuildingSimulation(
    electricity_load_profile=load_eval,
    solar_generation_profile=generation_eval,
    electricity_price=price_eval,
    max_battery_charge_per_timestep=BATTERY_POWER,
    battery_capacity=BATTERY_CAPACITY,
)

# Same wrapper stack as for training, minus the Monitor() logging
base_eval_env = Environment(eval_sim,
                            num_forecasting_steps=NUM_FORECAST_STEPS,
                            max_timesteps=num_eval_timesteps)
wrapped_eval_env = ObservationWrapper(base_eval_env, NUM_FORECAST_STEPS)
vec_eval_env = DummyVecEnv([lambda: wrapped_eval_env])
# It is important to load the normalization statistics saved during training here,
# because observations were standardized with running statistics while training.
eval_env = VecNormalize.load(RESULT_PATH + 'env.pkl', vec_eval_env)

In [None]:
# Do not update the running normalization statistics while evaluating
eval_env.training = False

actions, observations, electricity_consumption, rewards = [], [], [], []
# Separate name so the `price` series returned by read_data() is not overwritten
electricity_prices = []

done = False
obs = eval_env.reset()
while not done:
    # predict() returns (actions, state); the vectorized env holds exactly one environment
    action, _state = model.predict(obs, deterministic=True)
    obs, r, done, info = eval_env.step([action[0]])

    actions.append(action[0][0])
    # Store the un-normalized observation so the trajectory is in the original value range
    observations.append(eval_env.get_original_obs()[0])
    electricity_consumption.append(info[0]['electricity_consumption'])
    electricity_prices.append(info[0]['electricity_price'])
    # Store the un-normalized scalar reward; `r` is the normalized reward as a length-1 array
    rewards.append(eval_env.get_original_reward()[0])

# One row per evaluation step
trajectory = pd.DataFrame({
    'action': actions,
    'observations': observations,
    'electricity_consumption': electricity_consumption,
    'electricity_price': electricity_prices,
    'reward': rewards,
})

In [None]:
# Look at a 300-step window of the evaluation trajectory
plot_data = trajectory[200:500]
# Expand the per-step observation vectors into one column per observation entry
observation_df = plot_data['observations'].apply(pd.Series)
# Entry 1 of the observation is what this plot shows as residual load
residual_load = observation_df[1]
# Scale the action by BATTERY_POWER (instead of a hard-coded factor), so the plotted
# battery power is consistent with the augmented load
battery_power = plot_data['action'] * BATTERY_POWER
augmented_load = residual_load + battery_power
plt.rcParams["figure.figsize"] = (16, 10)

fig1, ax = plt.subplots()
ax.plot(residual_load, label='Residual Load')
ax.plot(augmented_load, label='Augmented Load')
ax.plot(plot_data['electricity_price'], '--', label='Price')
ax.plot(battery_power, label='Battery Power')
ax.set_ylabel('Load and Battery Power Applied (kW) & Price (Cent per kWh)')
ax.set_xlabel('Time Step')
ax.legend()
ax.grid()
plt.show()

# Compare to Baseline

In [None]:
# Do not update the running normalization statistics while evaluating
eval_env.training = False

# Cost per step = electricity consumption * electricity price, using the trained policy
step_costs = []
obs = eval_env.reset()
done = False
while not done:
    action = model.predict(obs, deterministic=True)
    obs, r, done, info = eval_env.step([action[0][0]])
    step_info = info[0]
    step_costs.append(step_info['electricity_consumption'] * step_info['electricity_price'])

# Total electricity cost over the evaluation episode
cost = sum(step_costs)

In [None]:
# Do not update the running normalization statistics while evaluating
eval_env.training = False

# Always taking noop as action. This is the electricity demand if there would be no battery
action = [0]

baseline_step_costs = []
obs = eval_env.reset()
done = False
while not done:
    obs, r, done, info = eval_env.step(action)
    step_info = info[0]
    baseline_step_costs.append(step_info['electricity_consumption'] * step_info['electricity_price'])

# Total electricity cost over the evaluation episode without using the battery
baseline_cost = sum(baseline_step_costs)

In [None]:
# Relative cost saving achieved by utilizing the battery:
# 0 means no saving compared to the baseline, 0.1 means 10 % lower cost
cost_ratio = cost / baseline_cost
1 - cost_ratio