# Reproducing the result of a single central agent on the CityLearn simulator.

In [None]:
# Get a fresh copy of CityLearn from GitHub.
# Any existing ./CityLearn/ directory is deleted first so the clone does not fail on a
# non-empty destination when this cell is re-run.
# NOTE(review): no branch, tag or commit is specified, so this clones whatever the
# repository's default branch currently points to.
!rm -rf ./CityLearn/
!git clone https://github.com/intelligent-environments-lab/CityLearn.git

Cloning into 'CityLearn'...
remote: Enumerating objects: 47, done.[K
remote: Counting objects: 100% (47/47), done.[K
remote: Compressing objects: 100% (45/45), done.[K
remote: Total 783 (delta 10), reused 27 (delta 2), pack-reused 736[K
Receiving objects: 100% (783/783), 36.62 MiB | 21.88 MiB/s, done.
Resolving deltas: 100% (439/439), done.


In [None]:
!pip install stable_baselines3

Collecting stable_baselines3
[?25l  Downloading https://files.pythonhosted.org/packages/76/7c/ec89fd9a51c2ff640f150479069be817136c02f02349b5dd27a6e3bb8b3d/stable_baselines3-0.10.0-py3-none-any.whl (145kB)
[K     |██▎                             | 10kB 23.2MB/s eta 0:00:01[K     |████▌                           | 20kB 27.7MB/s eta 0:00:01[K     |██████▊                         | 30kB 25.2MB/s eta 0:00:01[K     |█████████                       | 40kB 21.6MB/s eta 0:00:01[K     |███████████▎                    | 51kB 19.5MB/s eta 0:00:01[K     |█████████████▌                  | 61kB 15.4MB/s eta 0:00:01[K     |███████████████▊                | 71kB 16.0MB/s eta 0:00:01[K     |██████████████████              | 81kB 15.3MB/s eta 0:00:01[K     |████████████████████▏           | 92kB 15.0MB/s eta 0:00:01[K     |██████████████████████▌         | 102kB 15.4MB/s eta 0:00:01[K     |████████████████████████▊       | 112kB 15.4MB/s eta 0:00:01[K     |███████████████████████

In [None]:
# Mount Google Drive at /gdrive (only available when running on Google Colab) and list the
# mount point as a quick check. The results are later pickled under this mount.
from google.colab import drive
drive.mount('/gdrive')
!ls /gdrive

Mounted at /gdrive
MyDrive


In [None]:
# Loading libraries
import sys
sys.path.append("./CityLearn")

from citylearn import CityLearn
from reward_function import reward_function_ma
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
from agent import RL_Agents_Coord

import os
import gym
import numpy as np
from stable_baselines3 import SAC
from stable_baselines3.sac.policies import MlpPolicy as MlpPolicy_SAC
from stable_baselines3.common.callbacks import BaseCallback
import matplotlib.pyplot as plt
from pathlib import Path
import time

import pandas as pd
import seaborn as sns

In [None]:
# Environment configuration: choose the climate zone and build the data file paths from it.
climate_zone = 4
data_path = Path(f"./CityLearn/data/Climate_Zone_{climate_zone}")
building_attributes = data_path.joinpath('building_attributes.json')
weather_file = data_path.joinpath('weather_data.csv')
solar_profile = data_path.joinpath('solar_generation_1kW.csv')
building_state_actions = './CityLearn/buildings_state_action_space.json'

# "Building_1" through "Building_9".
building_ids = [f"Building_{idx}" for idx in range(1, 10)]

# Cost metrics handed to CityLearn through its cost_function argument.
objective_function = [
    'ramping',
    '1-load_factor',
    'average_daily_peak',
    'peak_demand',
    'net_electricity_consumption',
]

## Multiple agents

In [None]:
# Create the CityLearn environment with one agent per building (central_agent=False),
# simulating time steps 0 through 8759 (8760 = 365 * 24).
env = CityLearn(
    data_path,
    building_attributes,
    weather_file,
    solar_profile,
    building_ids,
    buildings_states_actions=building_state_actions,
    cost_function=objective_function,
    verbose=0,
    simulation_period=(0, 8760 - 1),
    central_agent=False,
)

# The spaces contain the lower and upper bounds of the states and actions
# (observations_spaces[i].low / .high); they are provided to the agent so it can
# normalize the variables between 0 and 1.
observations_spaces, actions_spaces = env.get_state_action_spaces()

  self.state = np.array(self.state)


In [None]:
# Provides information on building type, climate zone, annual DHW demand, annual cooling
# demand, annual electricity demand, solar capacity, and correlations among buildings.
# The result is passed to RL_Agents_Coord when the agents are instantiated.
building_info = env.get_building_information()

In [None]:
# Hyperparameters forwarded to RL_Agents_Coord as
# batch_size, tau, discount and lr respectively.
bs, tau, gamma, lr = 256, 5e-3, 0.99, 3e-4
# Forwarded as hidden_dim.
hid = [256] * 2

# Number of iterations of the outer episode loop.
n_episodes = 12

In [None]:
import warnings
# Installs a process-wide "ignore" filter: every Python warning is suppressed from here on,
# in this cell and in all cells executed afterwards.
warnings.filterwarnings("ignore")

# Instantiating the control agent(s)
# start_training / exploration_period / start_regression are expressed in multiples of 8760,
# the number of time steps in simulation_period=(0, 8760-1) used to build the environment.
agents = RL_Agents_Coord(building_ids,
                         building_state_actions,
                         building_info,
                         observations_spaces,
                         actions_spaces,
                         discount = gamma,
                         batch_size = bs,
                         replay_buffer_capacity = 1e5,
                         regression_buffer_capacity = 12*8760,
                         tau=tau,
                         lr=lr,
                         hidden_dim=hid,
                         start_training=8760*3,
                         exploration_period = 8760*3+1,
                         start_regression=8760,
                         information_sharing = False, # switched off experimentally
                         pca_compression = .95,
                         action_scaling_coef=0.5,
                         reward_scaling = 5.,
                         update_per_step = 1,
                         iterations_as = 2)

# One cost dictionary (as returned by env.cost()) is appended per episode.
cost_by_epoch = []
# The number of episodes can be replaced by a stopping criterion (i.e. convergence of the average reward)
start = time.time()
for e in range(n_episodes):
    # e > 7 is True from episode index 8 onward, i.e. the deterministic policy is used
    # after the first 8 episodes (the last 4 episodes when n_episodes = 12).
    is_evaluating = (e > 7)
    state = env.reset()
    done = False

    # The action for the current state is always selected one step ahead, because
    # add_to_buffer needs the coordination variables of both the current and the next step.
    action, coordination_vars = agents.select_action(state, deterministic=is_evaluating)
    while not done:
        next_state, reward, done, _ = env.step(action)
        action_next, coordination_vars_next = agents.select_action(next_state, deterministic=is_evaluating)
        agents.add_to_buffer(state, action, reward, next_state, done, coordination_vars, coordination_vars_next)

        # Shift the "next" values into the "current" slots for the following step.
        state = next_state
        coordination_vars = coordination_vars_next
        action = action_next

    cost = env.cost()
    cost_by_epoch.append(cost)
    # The reported time is cumulative since the start of the first episode.
    print('Loss -', cost, 'Simulation time (min) -',(time.time()-start)/60.0)

Loss - {'ramping': 1.1929684, '1-load_factor': 1.1250790754531381, 'average_daily_peak': 1.0954074, 'peak_demand': 1.1740308, 'net_electricity_consumption': 1.0374093, 'total': 1.1249789794201566} Simulation time (min) - 0.30750746726989747
Loss - {'ramping': 1.1937909, '1-load_factor': 1.108593114277481, 'average_daily_peak': 1.0990715, 'peak_demand': 1.2124575, 'net_electricity_consumption': 1.0370535, 'total': 1.1301933066170928} Simulation time (min) - 0.848811407883962
Loss - {'ramping': 1.1920284, '1-load_factor': 1.0958920520026365, 'average_daily_peak': 1.0836817, 'peak_demand': 1.2347693, 'net_electricity_consumption': 1.0371234, 'total': 1.1286989887849266} Simulation time (min) - 2.0703074097633363
Loss - {'ramping': 0.8590052, '1-load_factor': 1.018247733312952, 'average_daily_peak': 0.9775157, 'peak_demand': 1.2033315, 'net_electricity_consumption': 1.0018773, 'total': 1.0119954843914722} Simulation time (min) - 25.1030006925265
Loss - {'ramping': 0.7939093, '1-load_factor

In [None]:
# Display the per-episode cost dictionaries collected by the training loop above.
cost_by_epoch

[{'1-load_factor': 1.1250790754531381,
  'average_daily_peak': 1.0954074,
  'net_electricity_consumption': 1.0374093,
  'peak_demand': 1.1740308,
  'ramping': 1.1929684,
  'total': 1.1249789794201566},
 {'1-load_factor': 1.108593114277481,
  'average_daily_peak': 1.0990715,
  'net_electricity_consumption': 1.0370535,
  'peak_demand': 1.2124575,
  'ramping': 1.1937909,
  'total': 1.1301933066170928},
 {'1-load_factor': 1.0958920520026365,
  'average_daily_peak': 1.0836817,
  'net_electricity_consumption': 1.0371234,
  'peak_demand': 1.2347693,
  'ramping': 1.1920284,
  'total': 1.1286989887849266},
 {'1-load_factor': 1.018247733312952,
  'average_daily_peak': 0.9775157,
  'net_electricity_consumption': 1.0018773,
  'peak_demand': 1.2033315,
  'ramping': 0.8590052,
  'total': 1.0119954843914722},
 {'1-load_factor': 0.9240903198894663,
  'average_daily_peak': 0.87130195,
  'net_electricity_consumption': 0.99536526,
  'peak_demand': 1.0705855,
  'ramping': 0.7939093,
  'total': 0.931050466

In [None]:
import pickle

# Persist the per-episode costs to Google Drive (mounted at /gdrive).
# The climate zone in the file name comes from `climate_zone`, so the file is always labelled
# with the zone that produced the results; with climate_zone == 4 the path is exactly the
# previously hard-coded ".../cost_by_epoch_cz4.pkl".
with open(f"/gdrive/My Drive/cost_by_epoch_cz{climate_zone}.pkl", "wb") as f:
    pickle.dump(cost_by_epoch, f)

## Multiple agents with $r^3$ reward

In [None]:
# Confirm whether the reward function is r^3 or not by printing the source code of the
# reward_function_ma class that was actually imported.
import inspect
reward_source = inspect.getsource(reward_function_ma)
print(reward_source)

class reward_function_ma:
    def __init__(self, n_agents, building_info):
        self.n_agents = n_agents
        self.building_info = building_info

    # Electricity_demand contains negative values when the building consumes more electricity than it generates
    def get_rewards(self, electricity_demand):  
        #electricity_demand = np.float32(electricity_demand)
        #total_electricity_demand = 0
        #for e in electricity_demand:
        #    total_electricity_demand += -e
            
        #electricity_demand = np.array(electricity_demand)
        
        #return list(np.sign(electricity_demand)*0.01*(np.array(np.abs(electricity_demand))**2 * max(0, total_electricity_demand)))
        
        # Single-agent reward
        reward_ = np.array(electricity_demand)**3.0
        reward_[reward_>0] = 0
        return list(reward_)



In [None]:
# Environment configuration for the r^3-reward run (same values as the first experiment):
# choose the climate zone and build the data file paths from it.
climate_zone = 4
data_path = Path(f"./CityLearn/data/Climate_Zone_{climate_zone}")
building_attributes = data_path.joinpath('building_attributes.json')
weather_file = data_path.joinpath('weather_data.csv')
solar_profile = data_path.joinpath('solar_generation_1kW.csv')
building_state_actions = './CityLearn/buildings_state_action_space.json'

# "Building_1" through "Building_9".
building_ids = [f"Building_{idx}" for idx in range(1, 10)]

# Cost metrics handed to CityLearn through its cost_function argument.
objective_function = [
    'ramping',
    '1-load_factor',
    'average_daily_peak',
    'peak_demand',
    'net_electricity_consumption',
]

In [None]:
# Re-create the CityLearn environment with one agent per building (central_agent=False),
# simulating time steps 0 through 8759 (8760 = 365 * 24).
env = CityLearn(
    data_path,
    building_attributes,
    weather_file,
    solar_profile,
    building_ids,
    buildings_states_actions=building_state_actions,
    cost_function=objective_function,
    verbose=0,
    simulation_period=(0, 8760 - 1),
    central_agent=False,
)

# The spaces contain the lower and upper bounds of the states and actions
# (observations_spaces[i].low / .high); they are provided to the agent so it can
# normalize the variables between 0 and 1.
observations_spaces, actions_spaces = env.get_state_action_spaces()

In [None]:
# Provides information on building type, climate zone, annual DHW demand, annual cooling
# demand, annual electricity demand, solar capacity, and correlations among buildings.
# The result is passed to RL_Agents_Coord when the agents are instantiated.
building_info = env.get_building_information()

In [None]:
# Hyperparameters forwarded to RL_Agents_Coord as
# batch_size, tau, discount and lr respectively.
bs, tau, gamma, lr = 256, 5e-3, 0.99, 3e-4
# Forwarded as hidden_dim.
hid = [256] * 2

# Number of iterations of the outer episode loop.
n_episodes = 12

In [None]:
import warnings
# Installs a process-wide "ignore" filter: every Python warning is suppressed from here on,
# in this cell and in all cells executed afterwards.
warnings.filterwarnings("ignore")

# Instantiating the control agent(s)
# start_training / exploration_period / start_regression are expressed in multiples of 8760,
# the number of time steps in simulation_period=(0, 8760-1) used to build the environment.
agents = RL_Agents_Coord(building_ids,
                         building_state_actions,
                         building_info,
                         observations_spaces,
                         actions_spaces,
                         discount = gamma,
                         batch_size = bs,
                         replay_buffer_capacity = 1e5,
                         regression_buffer_capacity = 12*8760,
                         tau=tau,
                         lr=lr,
                         hidden_dim=hid,
                         start_training=8760*3,
                         exploration_period = 8760*3+1,
                         start_regression=8760,
                         information_sharing = False, # switched off experimentally
                         pca_compression = .95,
                         action_scaling_coef=0.5,
                         reward_scaling = 5.,
                         update_per_step = 1,
                         iterations_as = 2)

# One cost dictionary (as returned by env.cost()) is appended per episode.
# NOTE: this rebinds cost_by_epoch, so the in-memory results of the previous experiment are
# dropped here; they were pickled to Google Drive before this section.
cost_by_epoch = []
# The number of episodes can be replaced by a stopping criterion (i.e. convergence of the average reward)
start = time.time()
for e in range(n_episodes):
    # e > 7 is True from episode index 8 onward, i.e. the deterministic policy is used
    # after the first 8 episodes (the last 4 episodes when n_episodes = 12).
    is_evaluating = (e > 7)
    state = env.reset()
    done = False

    # The action for the current state is always selected one step ahead, because
    # add_to_buffer needs the coordination variables of both the current and the next step.
    action, coordination_vars = agents.select_action(state, deterministic=is_evaluating)
    while not done:
        next_state, reward, done, _ = env.step(action)
        action_next, coordination_vars_next = agents.select_action(next_state, deterministic=is_evaluating)
        agents.add_to_buffer(state, action, reward, next_state, done, coordination_vars, coordination_vars_next)

        # Shift the "next" values into the "current" slots for the following step.
        state = next_state
        coordination_vars = coordination_vars_next
        action = action_next

    cost = env.cost()
    cost_by_epoch.append(cost)
    # The reported time is cumulative since the start of the first episode.
    print('Loss -', cost, 'Simulation time (min) -',(time.time()-start)/60.0)

Loss - {'ramping': 1.1826104, '1-load_factor': 1.1163685108538957, 'average_daily_peak': 1.0879441, 'peak_demand': 1.2307191, 'net_electricity_consumption': 1.0370967, 'total': 1.1309477763437528} Simulation time (min) - 0.2943402926127116
Loss - {'ramping': 1.1815282, '1-load_factor': 1.1123980838999392, 'average_daily_peak': 1.0900203, 'peak_demand': 1.2342972, 'net_electricity_consumption': 1.0368078, 'total': 1.1310103050658156} Simulation time (min) - 0.8308295607566833
Loss - {'ramping': 1.1819001, '1-load_factor': 1.0967719192027132, 'average_daily_peak': 1.0869675, 'peak_demand': 1.1123391, 'net_electricity_consumption': 1.0372785, 'total': 1.1030514406108674} Simulation time (min) - 2.084572994709015
Loss - {'ramping': 0.8548236, '1-load_factor': 1.045004621681809, 'average_daily_peak': 0.99767196, 'peak_demand': 1.1908945, 'net_electricity_consumption': 1.0028597, 'total': 1.018250873791623} Simulation time (min) - 21.59046080907186
Loss - {'ramping': 0.8125561, '1-load_facto

In [None]:
# # Plotting winter operation
# interval = range(5000,5200)
# plt.figure(figsize=(16,5))
# plt.plot(env.net_electric_consumption_no_pv_no_storage[interval])
# plt.plot(env.net_electric_consumption_no_storage[interval])
# plt.plot(env.net_electric_consumption[interval], '--')
# plt.xlabel('time (hours)')
# plt.ylabel('kW')
# plt.legend(['Electricity demand without storage or generation (kW)', 
#             'Electricity demand with PV generation and without storage(kW)', 
#             'Electricity demand with PV generation and using SAC for storage(kW)'])

## Appendix:
* Climate zone 4: without information sharing

![image](https://user-images.githubusercontent.com/56372825/107677087-dc657500-6c67-11eb-95b1-017fb426fb6a.png)

* Climate zone 4: without information sharing and reward = $r^3$
    * In this case, the agents are independent of each other. One difference from the single central agent is the dimension of the input state (observation):
    * Multi-agent, no information sharing, $r^3$ reward: $26 \times (\text{PCA dimension-reduction rate})$ for each policy network (there are 9 policy networks)
    * Single central agent: $81$