In [7]:
import os
from stable_baselines3.common.callbacks import BaseCallback,StopTrainingOnRewardThreshold,EvalCallback
log_path = os.path.join('Training','Logs')

In [4]:


from gym.core import RewardWrapper
import numpy as np
import gym
from gym import error, spaces, utils
from gym.utils import seeding
from gym.spaces import MultiDiscrete, Box
import numpy as np
from numpy.core.fromnumeric import mean
from numpy.core.numeric import True_
from numpy.lib.function_base import average
from stable_baselines3 import DDPG,A2C,SAC,TD3,PPO
from stable_baselines3.common.env_checker import check_env


from stable_baselines3.common.evaluation import evaluate_policy
import matplotlib.pyplot as plt
from stable_baselines3.common.noise import ActionNoise
#from torch._C import DoubleTensor
#from torch._C import TreeView





class SimpleEnv(gym.Env):

    """
     Description:
         The agent (control signal to G number of generators) is given a scalar value D at each timestep.
         At each timestep the agent can choose how much energy each generator shall generate, with the goal
         to match the demand at the given timestep.
     Observation:
         Type: Box(1)
         Num    Observation               Min            Max
         0      Demand                    0          0.07
     Actions:
         Type: Box(G)
         Num    Action                    Min            Max
         0      Generation of gen 1..     p_low          p_high
         1      ...
         .
         .
         G      ...
     Reward:
         Reward is calculated as the sum of generation costs of all generators and the difference
         in power generation and demand.
         Generation costs are calculated according to the quadratic cost function:
         C_gen = a*p^2 + b*p + c for each generator, and difference is calculated as:
         C_diff = k * abs( sum(p) - D ) where k is a coefficient. (more elaborate cost can be implemented later)
         The reward is then equal to the negative cost:
         R = -(C_gen + C_diff) at each timestep
     Starting State:
         The state evolution follows the given time-series of demands D.
     Episode Termination:
         When the end of the time series of demands is reached, the episode terminates.
     """

    def __init__(self, gen_args=[[200, 600, 0.002, 10, 500, 50]], D=[800, 850, 880, 900, 860, 930, 950]):
        self.D = D
        self.gen_list = []
        for args in gen_args:
            self.gen_list.append(Generator(*args))
        self.action_space = Box(low=np.array([gen.p_low for gen in self.gen_list]), high=np.array(
            [gen.p_high for gen in self.gen_list]))
        self.observation_space = Box(
            low=np.array([0]), high=np.array([1000]),)
        self.state_length = len(self.D)
        self.index = 0


    def step(self, action):
        """
        Input: action (tuple)
        """
        gen_reward = 0
        # Sum all actions (generation of each generator) and compare to demand
        total_p = 0
        for i, p in enumerate(action):
            total_p += p
            # Extract parameters of current generator
            gen = self.gen_list[i]
            # Also add costs of generation to rewards
            gen_reward -= gen.a*p**2 + gen.b*p + gen.c

        # Cost constants
        k1 = 1 # 0.001  # generation 
        k2 = 1  # difference
        k21 = 1  # overproduction
        k22 = 1  # underproduction
        diff = total_p-self.D[self.index]
        if diff >= 0:
            diff_reward = k21*diff
        else: 
            diff_reward = -k22*diff
        reward = -k1*gen_reward - k2*diff_reward

        if self.index == self.state_length-1:
            done = True
        else:
            done = False
            # Increment index
            self.index += 1
            self.state = np.array([self.D[self.index]], dtype=np.float32)
        info = {}
        return self.state, reward, done, info

    def reset(self):
        self.index = 0
        self.state = np.array([self.D[self.index]], dtype=np.float32)
        return self.state

    # Not needed
    # def render(self, mode='human'):
        ...
    # def close(self):
        ...


class Generator:
    def __init__(self, p_low, p_high, a, c, b, p_r):
        self.p_low = p_low
        self.p_high = p_high
        self.a = a
        self.c = c
        self.b = b
        self.p_r = p_r

def main():

    arg1 = [200, 600, 0.002, 10, 500, 50]
    arg2 = [100, 400, 0.0025, 8, 300, 50]
    arg3 = [100, 300, 0.0050, 6, 100, 50]
    arg4 = [50, 200, 0.0060,  5, 90, 50]
    gen_args = [arg1, arg2, arg3, arg4]
    env = SimpleEnv(gen_args)
    
    MODELS = [DDPG] #,A2C,TD3,PPO,SAC
    total_timesteps=20000
    model_rewards = []
    


    for m in MODELS:
        print(m)
        callback =CustomCallback(model=m)
        model= m('MlpPolicy', env, verbose=0,tensorboard_log=log_path).learn(total_timesteps=total_timesteps,callback=callback)
        print(callback.episode_rewards)
        done = False 
        obs = env.reset()
        current_reward = 0
        try:
            mean_reward , standard_deviation = evaluate_policy(model, env ,n_eval_episodes=10,render=False)
            print (mean_reward, standard_deviation)
        except:
            pass
        while not done:
            action,_states = model.predict(obs)
            obs,reward,done,info =env.step(action)
            print(action) 
            current_reward += reward
            
        
        model_rewards.append(current_reward)
        obs = env.reset()  
        del model
        
        print('------------------------------')
        



    
    
    

class CustomCallback(BaseCallback):
    """
    A custom callback that derives from ``BaseCallback``.

    :param verbose: (int) Verbosity level 0: not output 1: info 2: debug
    """
    def __init__(self,model, verbose=0):
        self.model= model
        super(CustomCallback, self).__init__(verbose)
        self.episode_rewards =[]
        self.episode_count = 0
        self.episode_lenght = 0
        self.current_rewards = 0

    def _on_step(self):
        """
        This method will be called by the model after each call to `env.step()`.

        For child callback (of an `EventCallback`), this will be called
        when the event is triggered.

        :return: (bool) If the callback returns False, training is aborted early.
        """ 
        if self.model is A2C:
            self.current_rewards += self.locals['rewards']
            if self.locals['done']:
                self.episode_rewards.append(self.current_rewards)
                self.episode_lenght += 1 

        #else :
            #f self.locals['done']:
                #self.episode_rewards.append(self.locals['episode_rewards'])
                #self.episode_lenght += 1 

        return True
        
    def _on_rollout_end(self):
        # Not needed 
        pass
    def _on_training_end(self):
        # for debugging 
        '''
        print(self.locals)
        print(self.globals)
        print(self.model)
        try:
            print(self.locals['done'])
        except:
            print(self.locals['dones'])
        '''
        pass

    


if __name__ == "__main__":
    test()
    main()

# env = SimpleEnv()
# action = env.action_space.sample()
# print(action)



<class 'stable_baselines3.ddpg.ddpg.DDPG'>
[]
1741303.0 0.0
[200. 400. 100. 200.]
[200. 400. 100. 200.]
[200. 400. 100. 200.]
[200. 400. 100. 200.]
[200. 400. 100. 200.]
[200. 400. 100. 200.]
[200. 400. 100. 200.]
------------------------------




In [5]:
training_log_path =os.path.join(log_path,'DDPG_1')

In [None]:
!pip install tensorboard

In [6]:
!tensorboard --logdir={training_log_path}
# http://localhost:6006/

^C


In [None]:
save_path =os.path.join('Training','SavedModels')

In [None]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)
eval_callback = EvalCallback(env, callback_on_new_best = stop_callback,
                            eval_freq=10000,
                            best_model_save_path=save_path,
                            verbose =1 )

model =PPO('MlpPolicy',env, verbose=1, tensorboard_log=log_path)
model.learn(total_timesteps=20000,callback=eval_callback)