In [1]:
import pandapower as pp
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch
from torch.utils.data import Dataset
from sklearn.preprocessing import StandardScaler
import pickle
import gymnasium as gym
from gymnasium import spaces
from gymnasium.utils import seeding
from typing import Any
import numpy as np
import torch
from stable_baselines3 import PPO
import wandb
from stable_baselines3.common.callbacks import BaseCallback



In [2]:
class SimpleTwoBus:
 
    def __init__(self, V_ext, P, Q, G, B, V_init, theta_init):
        '''This class creates a simple 2-bus network.'''
        
        self.V_ext = V_ext
        self.P = P
        self.Q = Q
        self.G = G
        self.B = B
        self.V_init = V_init
        self.theta_init = theta_init
        self.net = pp.create_empty_network()
        self.create_two_bus_grid()
 
 
    def create_two_bus_grid(self):
   
        # Create two buses with initialized voltage and angle
        bus1 = pp.create_bus(self.net, vn_kv=20.0, name="Bus 1")
        bus2 = pp.create_bus(self.net, vn_kv=0.4, name="Bus 2")
   
        # Initialize voltage and angle for buses
        self.net.bus.loc[bus1, 'vm_pu'] = self.V_init[0]
        self.net.bus.loc[bus1, 'va_degree'] = self.theta_init[0]
        self.net.bus.loc[bus2, 'vm_pu'] = self.V_init[1]
        self.net.bus.loc[bus2, 'va_degree'] = self.theta_init[1]
   
        # create a line between the two buses
        pp.create_line_from_parameters(
            self.net,
            from_bus=0,
            to_bus=1,
            length_km=1.0,
            r_ohm_per_km=1/self.G,
            x_ohm_per_km=1/self.B,
            c_nf_per_km=0.0,
            g_us_per_km=0.0,
            max_i_ka=100.0,
        )
 
        # Create a transformer between the two buses
        # pp.create_transformer(self.net, bus1, bus2, std_type="0.25 MVA 20/0.4 kV")
   
        # Create a load at bus 2 with specified P and Q
        pp.create_load(self.net, bus2, p_mw=self.P, q_mvar=self.Q, name="Load")
   
        # Create an external grid connection at bus 1 with specified G and B
        pp.create_ext_grid(self.net, bus1, vm_pu=self.V_ext, name="Grid Connection")


In [4]:
class GridEnv(gym.Env):
    def __init__(self,V_ext = 1.02, G = 100, B = 0.1, k_limit = 4, termination_counter=10):

        self.observation_space = spaces.Box(low = np.array([-1e10,-1e10]), high = np.array([1e10, 1e10]), dtype=np.float32)
        self.action_space = spaces.Box(low=np.array([-0.02, -20]), high=np.array([0.02, 20]), dtype=np.float32)

        self.k_limit = k_limit
        self.termination_counter = termination_counter


        self.G = G
        self.B = B
        self.V_ext = V_ext

        #initialize network
        self.state, info = self.reset()

    def create_feasible_Ybusnet(self):
        YbusNet = SimpleTwoBus(self.V_ext,2,1,self.G,self.B,[1,1],[0,0])
        net = YbusNet.net

        return net


    
    def seed(self, seed=None):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self, seed=None):
        if seed is not None:
            self.seed(seed)

        self.counter = 0
        self.done = False
        self.terminated = False

        self.P = np.random.uniform(low= 0, high=20)
        self.Q = np.random.uniform(low = 0, high = 20)
        # self.P = 8 #keeping P and Q fixed for now
        # self.Q = 8
        # self.V_init = np.random.uniform(low = 0.9, high = 1.1, size=2)
        # self.theta_init = np.random.uniform(low = -20, high = 20, size=2)
        self.V = np.random.uniform(low = 0.85, high = 1.15, size=2)
        self.theta = np.random.uniform(low = -45, high = 45, size=2)
        self.V0 = np.random.uniform(low = -0.05, high = 0.05, size=2)
        self.theta0 = np.random.uniform(low = -30, high = 30, size=2)
        self.complex_V = self.calculate_complex_V(self.V, self.theta)

        Net = SimpleTwoBus(self.V_ext,self.P,self.Q,self.G,self.B,self.V,self.theta)
        self.net = Net.net

        self.Ybus = self.calculate_Ybus()

        initial_guesses = np.array([self.V0, self.theta0])

        self.state = self.calculate_first_residual(initial_guesses)

        return self.state, {}



    def calculate_first_residual(self,initial_guesses):

        # net = self.net.deepcopy()  # Keep the network unchanged

        self.update_V(initial_guesses)

        term2 = self.Ybus@self.complex_V
        term2_complex_conj = np.conj(term2)

        term1 = self.complex_V@term2_complex_conj

        F = self.P + 1j*self.Q - term1

        delta_P = np.real(F)
        delta_Q = np.imag(F)

        residual = np.array([delta_P, delta_Q])

        return residual


    def calculate_Ybus(self):

        # net_copy = SimpleTwoBus(self.V_ext,self.P,self.Q,self.G,self.B,self.V,self.theta)
        Ybusnet = self.create_feasible_Ybusnet()
        pp.runpp(Ybusnet, max_iteration = 50, tolerance_mva=1e-5)
        Ybus = Ybusnet._ppc["internal"]["Ybus"]




        # print(f"{Ybus[1,1]=}")
        # print(f"{Ybus[1]=}")
        # Ybus = self.G + 1j*self.B
        # Ybus_matrix = np.eye(2,2) * Ybus

        return Ybus


    def calculate_complex_V(self, V, theta):
        complex_V = V*np.exp(1j*theta)

        return complex_V
    
    def update_V(self, action):

        # print(f"old V = {self.V}")
        # print(f"old theta = {self.theta}")
        new_V = self.V - action[0]
        new_theta = self.theta - action[1]


        # maybe try different way of scaling the actions back when they exceed the limits?
        if new_V[0] > 1.15:
            new_V[0] = 1.15 
        if new_V[1] > 1.15:
            new_V[1] = 1.15 
        
        if new_V[0] < 0.85:
            new_V[0] = 0.85
        if new_V[1] < 0.85:
            new_V[1] = 0.85

        if new_theta[0] > 45:
            new_theta[0] = 45
        if new_theta[1] > 45:
            new_theta[1] = 45
        if new_theta[0] < -45:
            new_theta[0] = -45
        if new_theta[1] < -45:
            new_theta[1] = -45



        self.complex_V = self.calculate_complex_V(new_V, new_theta)



        self.V = new_V
        self.theta = new_theta

        # print(f"new V = {self.V}")
        # print(f"new theta = {self.theta}")


        





 
    # def compute_residual_torch(self, V_mag, V_ang, Ybus, S):
    #     V_ang = torch.deg2rad(V_ang)
    #     complex_v = V_mag*(torch.exp(V_ang*1j))
    #     current = Ybus@complex_v
    #     diag_V = torch.diag(complex_v)
    #     residual = diag_V@torch.conj(current) - S
    #     return residual 


    def calculate_residual(self, action):

        # net = self.net.deepcopy()  # Keep the network unchanged

        self.update_V(action)

        term2 = self.Ybus@self.complex_V
        term2_complex_conj = np.conj(term2)

        term1 = self.complex_V@term2_complex_conj

        F = self.P + 1j*self.Q - term1

        delta_P = np.real(F)
        delta_Q = np.imag(F)

        residual = np.array([delta_P, delta_Q])

        # pp.runpp(net, max_iteration = 1, tolerance_mva = np.inf) # not the correct function, this is just to let the environment loop be able to run
        # err = net._ppc['et']

        # residual = np.zeros(2)
        # residual[:] = err
        
        #needs a function!

        # print(f"{residual=}")

        return residual


    def perform_NR_step(self):

        net = self.net.deepcopy()  # Keep the network unchanged

        try:
            pp.runpp(net, max_iteration = 50, tolerance_mva = 1e-5)
        

            iterations = net._ppc["iterations"]
        except:
            iterations = 50

        return iterations
        


    def calculate_reward(self):

        iterations = self.perform_NR_step()

        reward = - iterations

        return reward
    

    def step(self, action):
        # print(f"{action=}")

        self.counter += 1
        # action = [delta_V, delta_theta]

        # perform action
        residual = self.calculate_residual(action)


        # calcualate reward
        reward = self.calculate_reward()


        #update state:
        self.state = residual

        if reward >= -self.k_limit:
            self.done = True
            return self.state, reward, self.done, self.terminated, {}

        

        if self.counter == self.termination_counter:
            self.terminated = True
            return self.state, reward, self.done, self.terminated, {}

        return self.state, reward, self.done, self.terminated, {}


  

    def render(self):
        pass

In [6]:
# Test run


env = GridEnv()


state = env.reset()
print("Initial State:")
# env.render()

# Define a sample action within the specified ranges
action = np.array([0.03, 15.0], dtype=np.float32)

# Take a step in the environment using the sample action
next_state, reward, done, info = env.step(action)

# Print the results
print("\nAction Taken:", action)
print("Next State:", next_state)
# env.render()
print("Reward:", reward)
print("Done:", done)

  gym.logger.warn(
  gym.logger.warn(


Initial State:


ValueError: too many values to unpack (expected 4)

In [5]:
class WandbCallback(BaseCallback):
    def __init__(self, verbose=0):
        super(WandbCallback, self).__init__(verbose)
        self.episode_rewards = []
        self.episode_lengths = []

    def _on_step(self) -> bool:
        # Check if the episode is done
        if self.locals["dones"][0]:
            # Log the episode return (sum of rewards)
            episode_reward = self.locals["infos"][0].get("episode", {}).get("r", 0)
            episode_length = self.locals["infos"][0].get("episode", {}).get("l", 0)
            self.episode_rewards.append(episode_reward)
            self.episode_lengths.append(episode_length)
            wandb.log({"episode_reward": np.mean(self.episode_rewards[:-100]), "episode_length": np.mean(self.episode_lengths[:-100])})
            print(f"{episode_reward=}")
        
        return True

In [None]:
# train an RL agent on the environment from above
wandb.init(project="grid-env-training")
env = GridEnv()
lr = 3e-3
total_timesteps = 1e6

model = PPO("MlpPolicy", env, verbose=1, learning_rate = lr)
model.learn(total_timesteps=total_timesteps, callback = WandbCallback())
model.save(f"saved_models/PPO_{lr=}_{total_timesteps=}")

wandb.finish()

# del model # remove to demonstrate saving and loading

# model = PPO.load("ppo_cartpole")

# obs = vec_env.reset()
# while True:
#     action, _states = model.predict(obs)
#     obs, rewards, dones, info = vec_env.step(action)
#     vec_env.render("human")


[34m[1mwandb[0m: Currently logged in as: [33mlindsayspoor[0m ([33mlindsayspoor-rlg[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


  gym.logger.warn(
  gym.logger.warn(


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-3.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-2.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-50.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-3.0
episode_reward=-4.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-3.0
episode_reward=-50.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-4.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-4.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-50.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-3.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-3.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-4.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-50.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-50.0
episode_reward=-3.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-3.0
episode_reward=-3.0
episode_reward=-4.0
episode_reward=-4.0
episode_reward=-4.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-4.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-3.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-50.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-4.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-4.0
episode_reward=-4.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-4.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-50.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-4.0
episode_reward=-3.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-3.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-4.0
episode_reward=-500.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode_reward=-500.0
episode_reward=-500.0
episode_reward=-4.0
episode_reward=-500.0
episode_reward=-500.0
episode_reward=-500.0
episode_reward=-3.0
episode_reward=-500.0
episode_reward=-500.0
episode_reward=-500.0
episode_reward=-4.0
episode_reward=-500.0
episode_reward=-500.0
episode_reward=-500.0
episode_reward=-500.0
episode_reward=-500.0
episode_reward=-4.0
episode_reward=-500.0
episode_reward=-500.0
episode_reward=-500.0
episode_reward=-3.0
episode_reward=-500.0
episode_reward=-4.0
episode_reward=-500.0
episode_reward=-4.0
episode_reward=-500.0
episode_reward=-4.0
episode_reward=-4.0
episode_reward=-500.0
episode_reward=-3.0
episode_reward=-500.0
episode_reward=-50.0
episode_reward=-500.0
episode_reward=-4.0
episode_reward=-500.0
episode_reward=-500.0
episode_reward=-4.0
episode_reward=-3.0
episode_reward=-500.0
episode_reward=-500.0
episode_reward=-500.0
episode_reward=-500.0
episode_reward=-500.0
episode_reward=-500.0
episode_reward=-500.0
episode_reward=-3.0
episode_reward=-5

In [None]:
def evaluate_model(model, num_evaluations, env):
    # idea: plot residual as a function of actions?
    raise NotImplementedError