In [5]:
%%capture
# Environment bootstrap: pins the gym 0.20 / Box2D stack and installs the
# system packages used for off-screen rendering. %%capture hides all output.
!pip install setuptools==65.5.0 "wheel<0.40.0"
!apt update
# -y answers apt's confirmation prompt; without it the install can stop at the
# prompt, and because the output is captured that failure would go unnoticed.
!apt-get install -y python3-opengl
!apt install xvfb -y
!pip install 'swig'
!pip install 'pyglet==1.5.27'
!pip install 'gym[box2d]==0.20.0'
!pip install 'pyvirtualdisplay==3.0'
# NOTE(review): 'box2d' and 'box2d-kengz' look like two builds of the same
# bindings; confirm both are really needed.
!pip install 'box2d'
!pip install 'box2d-kengz'

In [51]:
import gym
import random
import numpy as np
import sys
import os
import itertools
# import datetime
# import json
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
# import pandas as pd
from pyvirtualdisplay import Display
from IPython import display as disp
from torchsummary import summary
from torch.distributions import Normal
from torch.optim import Adam
from collections import deque, namedtuple

%matplotlib inline

In [52]:
# Start a virtual display (visible=0) sized 600x600, then pick the torch
# device: CUDA when available, CPU otherwise.
display = Display(visible=0, size=(600, 600))
display.start()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [53]:
#  https://github.com/honghaow/FORK/blob/master/TD3-FORK/TD3_FORK.py
# based on https://arxiv.org/pdf/2010.01652
#Actor nn
# maps states (s) to actions (a)
class Actor_Network(nn.Module):
    """Deterministic policy network: maps a batch of states to actions.

    Three fully connected layers with ReLU activations; the output passes
    through tanh, so every action component lies in [-1, 1].

    Args:
        obs_dim: size of one state vector.
        act_dim: size of one action vector.
        seed: passed to ``torch.manual_seed`` before the layers are created.
            This reseeds torch's global RNG, so two networks built with the
            same seed start with identical weights.
        l1_size: width of the first hidden layer.
        l2_size: width of the second hidden layer.
    """

    def __init__(self, obs_dim, act_dim, seed=42, l1_size=400, l2_size=300):
        super(Actor_Network, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.l1 = nn.Linear(obs_dim, l1_size)  # fully connected layers
        self.l2 = nn.Linear(l1_size, l2_size)
        self.l3 = nn.Linear(l2_size, act_dim)

    def forward(self, s):
        """Return the tanh-squashed action for state batch ``s``."""
        a = F.relu(self.l1(s))
        a = F.relu(self.l2(a))
        # Call torch.tanh directly rather than through ``F.torch``, which only
        # resolves because torch.nn.functional happens to import torch.
        return torch.tanh(self.l3(a))
# Critic nn
# (s,a) to Q values
class Critic_Network(nn.Module):
    """Twin Q-network: two independent MLP heads that each score (state, action).

    Both heads receive the concatenated state/action vector and output a
    single Q value per sample.
    """

    def __init__(self, obs_dim, act_dim, seed=42, l1_size=400, l2_size=300):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        in_features = obs_dim + act_dim

        # Layers are created in a fixed order (all of Q1, then all of Q2)
        # because their initial weights are drawn from the RNG seeded above.
        self.q1_l1 = nn.Linear(in_features, l1_size)
        self.q1_l2 = nn.Linear(l1_size, l2_size)
        self.q1_l3 = nn.Linear(l2_size, 1)

        self.q2_l1 = nn.Linear(in_features, l1_size)
        self.q2_l2 = nn.Linear(l1_size, l2_size)
        self.q2_l3 = nn.Linear(l2_size, 1)

    def forward(self, s, a):
        """Return ``(q1, q2)``, the two heads' estimates for the batch."""
        joint = torch.cat((s, a), dim=1)
        heads = (
            (self.q1_l1, self.q1_l2, self.q1_l3),
            (self.q2_l1, self.q2_l2, self.q2_l3),
        )
        q1, q2 = [
            last(F.relu(middle(F.relu(first(joint)))))
            for first, middle, last in heads
        ]
        return q1, q2
# System (dynamics) nn
# maps (s, a) to the predicted next state
class System_Network(nn.Module):
    """Forward model: predicts the next state from a (state, action) pair.

    Same three-layer MLP shape as the actor, but the output has ``obs_dim``
    components and no output activation. Unlike the actor and critic, this
    network does not reseed torch's RNG.
    """

    def __init__(self, obs_dim, act_dim, l1_size=400, l2_size=300):
        super().__init__()
        self.l1 = nn.Linear(obs_dim + act_dim, l1_size)
        self.l2 = nn.Linear(l1_size, l2_size)
        self.l3 = nn.Linear(l2_size, obs_dim)

    def forward(self, s, a):
        """Return the predicted next-state batch for states ``s`` and actions ``a``."""
        hidden = torch.cat((s, a), dim=1)
        for layer in (self.l1, self.l2):
            hidden = F.relu(layer(hidden))
        return self.l3(hidden)

In [54]:
# from https://github.com/ugurcanozalp/td3-sac-bipedal-walker-hardcore-v3/blob/main/noise.py
# https://open.metu.edu.tr/handle/11511/92170
# experimented with parameters
class AbstractNoise:
    """Base class for action-noise generators; every hook is a no-op."""

    def __init__(self):
        """Nothing to initialise in the base class."""

    def step_end(self):
        """Hook for the end of an environment step; does nothing here."""
        return None

    def episode_end(self):
        """Hook for the end of an episode; does nothing here."""
        return None
    
class GaussianNoise(AbstractNoise):
    """Independent Gaussian noise around ``mu``, optionally clipped.

    Attributes are set from the constructor arguments:
        mu: array giving the mean; its shape fixes the shape of each sample.
        sigma: standard deviation that scales the standard-normal draw.
        clip: if not None, the deviation from ``mu`` is limited to
            [-clip, +clip] before ``mu`` is added.
    """

    def __init__(self, mu, sigma, clip=None):
        self.mu, self.sigma, self.clip = mu, sigma, clip

    def __call__(self):
        """Draw one sample with the same shape as ``mu``."""
        deviation = self.sigma * np.random.normal(size=self.mu.shape)
        if self.clip is None:
            return self.mu + deviation
        return self.mu + deviation.clip(-self.clip, +self.clip)

class OrnsteinUhlenbeckNoise(AbstractNoise): # temporal noise
    """Temporally correlated noise: each sample depends on the previous one.

    Update rule per call:
        x = x_prev + theta * (mu - x_prev) * dt + sigma * sqrt(dt) * N(0, 1)

    The process starts at zero and is never reset by this class.
    """

    def __init__(self, mu, theta = 0.15, sigma = 0.2, dt=0.02):
        # Parameter sets noted by the author:
        # 5.0, 0.02, 1.0 # 1.0, 0.02, 0.25 # 7.5, 0.02, 1.4 # 5.0, 0.02, 0.7
        self.mu, self.theta, self.sigma, self.dt = mu, theta, sigma, dt
        self.x_prev = np.zeros_like(self.mu)

    def __call__(self):
        """Advance the process by one step and return the new value."""
        pull_to_mean = self.theta * (self.mu - self.x_prev) * self.dt
        random_kick = self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = self.x_prev + pull_to_mean + random_kick
        return self.x_prev

In [None]:
import gym
from collections import deque
import numpy as np
# from https://github.com/ugurcanozalp/td3-sac-bipedal-walker-hardcore-v3/blob/main/env_wrappers.py
# https://alexandervandekleut.github.io/gym-wrappers/
#env = gym.make('BipedalWalker-v3')

class EnvSkipWrapper(gym.Wrapper):
    '''
    Custom wrapper for BipedalWalker-v3 and BipedalWalkerHardcore-v3.

    Each call to step() repeats the same action for up to `skip` frames
    (stopping early when the episode ends) and returns the summed shaped
    reward. Shaping per frame: a raw reward of -100 becomes -5.0 and marks
    info["dead"] = True; any other reward is multiplied by 5.
    info["reward"] carries the mean raw reward over the frames actually run.
    # frame skipping is from https://github.com/ugurcanozalp/td3-sac-bipedal-walker-hardcore-v3/blob/main/env_wrappers.py
    # reward adjustment is from the original FORK paper referenced in the cells above
    '''
    def __init__(self, env, skip=2):
        super().__init__(env)
        self._skip = skip
        # Holds the most recent `skip` observations; nothing in this class reads it back.
        self._obs_buffer = deque(maxlen=skip)

    def step(self, action):
        shaped_return = 0
        raw_return = 0
        frames_run = 0
        for _ in range(self._skip):
            frames_run += 1
            obs, raw_reward, done, info = self.env.step(action)
            raw_return += raw_reward
            if raw_reward == -100:
                shaped_reward, info["dead"] = -5.0, True
            else:
                shaped_reward, info["dead"] = 5 * raw_reward, False

            self._obs_buffer.append(obs)
            shaped_return += shaped_reward
            if done:
                break
        info["reward"] = raw_return / frames_run
        return obs, shaped_return, done, info

    def reset(self):
        return self.env.reset()
# Alternative wrapper that steps the environment once per call (no frame skipping)
class EnvWrapper(gym.Wrapper):
    '''
    Custom wrapper for BipedalWalker-v3 and BipedalWalkerHardcore-v3 that
    reshapes the reward without skipping frames.

    The failure reward of -100 is replaced by -5.0 and every other reward is
    multiplied by 5. The unshaped reward is kept in info["reward"] and
    info["dead"] tells whether the step was a failure step.
    # reward adjustment is from the original FORK paper referenced in the cells above
    '''
    def __init__(self, env):
        super().__init__(env)

    def step(self, action):
        """Step the wrapped env once and return (obs, shaped_reward, done, info)."""
        obs, reward, done, info = self.env.step(action)
        # The shaping and the return statement were previously commented out,
        # which made step() return None and broke any caller that unpacks
        # the usual 4-tuple.
        info["reward"] = reward
        if reward == -100:
            reward = -5.0
            info["dead"] = True
        else:
            reward = 5 * reward
            info["dead"] = False
        return obs, reward, done, info

    def reset(self):
        """Reset the wrapped environment and return its first observation."""
        return self.env.reset()



In [88]:
class Agent:
    """TD3-FORK agent: TD3 with twin critics plus a learned system (dynamics) model.

    The system network predicts the next state from (state, action). Whenever
    its batch loss is below a threshold, the actor loss is extended with critic
    values of model-predicted future states (see ``learn``).

    Args:
        env: environment exposing ``observation_space`` and ``action_space``
            with ``shape``, ``high`` and ``low``.

    Keyword Args (all optional, read with ``kwargs.get``):
        device: torch device for networks and batches (default: CUDA if available).
        seed: seed handed to the actor/critic constructors (default 42).
        load_weight: stored as ``self.load_weights``; not acted on in this class.
        lr_critic, lr_actor, lr_system: Adam learning rates (default 3e-4 each).
        batch_size: replay mini-batch size (default 100).
        gamma: discount factor (default 0.99).
        tau: soft-update interpolation factor (default 0.02).
        policy_freq: the actor is updated on every ``policy_freq``-th iteration (default 2).
        ou_noise_theta, ou_noise_sigma, ou_dt: Ornstein-Uhlenbeck parameters
            (defaults 4.0, 0.2, 0.04).
        noise_sigma: std used for BOTH Gaussian noises when given; when absent
            the target noise uses 0.1 and the exploration noise 0.2.
        noise_clip: clip of the Gaussian target noise (default 0.5).
        noise_comb: "GG" (Gaussian target + Gaussian exploration), "GOU"
            (Gaussian target + OU exploration, the default) or "OUOU" (OU for both).
        buffer_capacity: maximum length of the replay deques (default 1_000_000).
        iteration: run number used by ``load_weight`` to build file names (default 0).
    """

    # Value returned by ``learn`` when no gradient step was taken.
    _NO_UPDATE_LOSS = 1

    def __init__(self, env, **kwargs):

        self.env = env
        self.device = kwargs.get("device", torch.device("cuda" if torch.cuda.is_available() else "cpu"))
        self.seed = kwargs.get("seed", 42)
        # NOTE(review): this flag is stored but nothing in the class calls load_weight().
        self.load_weights = kwargs.get("load_weight", False)

        self.obs_dim = env.observation_space.shape[0]
        self.act_dim = env.action_space.shape[0]
        # Only the first component of each bound is kept, i.e. the bounds are
        # treated as identical across dimensions.
        self.act_upper_bound = self.env.action_space.high[0]
        self.act_lower_bound = self.env.action_space.low[0]
        self.obs_upper_bound = self.env.observation_space.high[0]
        self.obs_lower_bound = self.env.observation_space.low[0]

        # Every network lives on self.device because all batches are moved
        # there in learn()/policy(); leaving the networks on the CPU raises a
        # device-mismatch error as soon as CUDA is selected.
        self.critic = Critic_Network(self.obs_dim, self.act_dim, seed=self.seed).to(self.device)
        self.critic_target = Critic_Network(self.obs_dim, self.act_dim, seed=self.seed).to(self.device)

        self.actor = Actor_Network(self.obs_dim, self.act_dim, seed=self.seed).to(self.device)
        self.actor_target = Actor_Network(self.obs_dim, self.act_dim, seed=self.seed).to(self.device)

        self.system = System_Network(self.obs_dim, self.act_dim).to(self.device)

        self.critic_optim = optim.Adam(self.critic.parameters(), lr=kwargs.get("lr_critic", 3e-4))
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=kwargs.get("lr_actor", 3e-4))
        self.system_optim = optim.Adam(self.system.parameters(), lr=kwargs.get("lr_system", 3e-4))

        self.batch_size = kwargs.get("batch_size", 100)
        self.gamma = kwargs.get("gamma", 0.99)
        self.tau = kwargs.get("tau", 0.02)
        self.policy_freq = kwargs.get("policy_freq", 2)
        # hyperparameters for below are taken from https://github.com/ugurcanozalp/td3-sac-bipedal-walker-hardcore-v3/blob/main/env_wrappers.py and original FORK paper - specifically noise clip and volatility of the noise from FORK paper
        self.noise_generator = OrnsteinUhlenbeckNoise(mu=np.zeros(self.act_dim), theta=kwargs.get("ou_noise_theta", 4.0), sigma=kwargs.get("ou_noise_sigma", 0.2), dt=kwargs.get("ou_dt", 0.04))
        # NOTE(review): both Gaussian noises read the same "noise_sigma" key,
        # so passing it overrides both defaults at once.
        self.target_noise = GaussianNoise(mu=np.zeros(self.act_dim), sigma=kwargs.get("noise_sigma",0.1), clip=kwargs.get("noise_clip", 0.5))
        self.policy_noise = GaussianNoise(mu=np.zeros(self.act_dim), sigma=kwargs.get("noise_sigma",0.2))
        self.noise_comb = kwargs.get("noise_comb", "GOU")
        print(self.noise_comb)

        self.replay_memory_buffer = deque(maxlen = kwargs.get("buffer_capacity", 1000000))
        # Second buffer; not used anywhere else in this class.
        self.replay_memory_bufferd_dis = deque(maxlen = kwargs.get("buffer_capacity", 1000000))

        self.set_weights()
        self.iteration = kwargs.get("iteration", 0)

    def load_weight(self):
        """Load all five networks from ``/weights/hardcore/<iteration>*.pth``.

        NOTE(review): the directory is a hard-coded absolute path.
        """
        self.actor.load_state_dict(torch.load(f'/weights/hardcore/{self.iteration}actor.pth', map_location=self.device))
        self.critic.load_state_dict(torch.load(f'/weights/hardcore/{self.iteration}critic.pth', map_location=self.device))
        self.actor_target.load_state_dict(torch.load(f'/weights/hardcore/{self.iteration}actor_t.pth', map_location=self.device))
        self.critic_target.load_state_dict(torch.load(f'/weights/hardcore/{self.iteration}critic_t.pth', map_location=self.device))
        self.system.load_state_dict(torch.load(f'/weights/hardcore/{self.iteration}sysmodel.pth', map_location=self.device))

    def set_weights(self):
        """Copy the online actor/critic weights into their target networks."""
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

    def add_to_replay_buffer(self, t, buffer):
        """Append transition tuple ``t`` to ``buffer``."""
        buffer.append(t)

    def sample_replay_buffer(self, buffer):
        """Return ``self.batch_size`` transitions drawn without replacement from ``buffer``."""
        sample = random.sample(buffer, self.batch_size)
        return sample

    def learn(self, training_iterations, weight, train):
        """Run up to ``training_iterations`` update steps on replayed transitions.

        training_iterations:Int -> how many times to train the networks
        weight:Float -> adaptive weight of the model-based (FORK) actor terms
        train:Bool -> should the actor be trained (critic and system model
            are updated on every iteration regardless)

        Returns the system-model loss of the last iteration, or 1 when no
        update happened (fewer than 1e4 stored transitions, or zero iterations).
        """
        if len(self.replay_memory_buffer) < 1e4:
            return self._NO_UPDATE_LOSS

        # Defined before the loop so that training_iterations == 0 (an episode
        # that ended on its first step) returns cleanly instead of raising
        # UnboundLocalError.
        last_system_loss = self._NO_UPDATE_LOSS

        for it in range(training_iterations):
            mini_batch = self.sample_replay_buffer(self.replay_memory_buffer)
            # Transition layout: (state, action, reward, add_reward, next_state, done)
            state_batch = torch.from_numpy(np.vstack([i[0] for i in mini_batch])).float().to(self.device)
            action_batch = torch.from_numpy(np.vstack([i[1] for i in mini_batch])).float().to(self.device)
            reward_batch = torch.from_numpy(np.vstack([i[2] for i in mini_batch])).float().to(self.device)
            add_reward_batch = torch.from_numpy(np.vstack([i[3] for i in mini_batch])).float().to(self.device)
            next_state_batch = torch.from_numpy(np.vstack([i[4] for i in mini_batch])).float().to(self.device)
            done_batch = torch.from_numpy(np.vstack([i[5] for i in mini_batch]).astype(np.uint8)).float().to(self.device)

            #  train critic double q network
            # The bootstrap target is a constant for the critic update, so it
            # is built without tracking gradients.
            with torch.no_grad():
                target_actions = self.actor_target(next_state_batch)
                #  noise already clipped in the noise class
                # NOTE(review): one noise vector of length act_dim is drawn and
                # broadcast over the whole batch rather than one per sample.
                if self.noise_comb in ["GOU", "GG"]:
                    offset_noises = torch.from_numpy(self.target_noise()).float().to(self.device)
                else:
                    offset_noises = torch.from_numpy(self.noise_generator()).float().to(self.device)
                    # 0.5 set as default clip should be added to hyperparameters later
                    offset_noises = offset_noises.clamp(-0.5, 0.5)
                # below as in original implementation
                target_actions = (target_actions + offset_noises).clamp(self.act_lower_bound, self.act_upper_bound)

                Q_targets1, Q_targets2 = self.critic_target(next_state_batch, target_actions)
                Q_targets = torch.min(Q_targets1, Q_targets2)
                Q_targets = reward_batch + self.gamma * Q_targets * (1 - done_batch)

            current_Q1, current_Q2 = self.critic(state_batch, action_batch)

            critic_loss = F.mse_loss(current_Q1, Q_targets) + F.mse_loss(current_Q2, Q_targets)

            self.critic_optim.zero_grad()
            critic_loss.backward()
            self.critic_optim.step()

            self.soft_update_target(self.critic, self.critic_target)

            #  predict new state with system network
            # Terminal transitions are zeroed on both sides so they do not
            # contribute to the model loss.
            predict_next_state = self.system(state_batch, action_batch) * (1-done_batch)
            next_state_batch = next_state_batch * (1 -done_batch)
            # train system network
            system_loss = F.mse_loss(predict_next_state, next_state_batch.detach())

            self.system_optim.zero_grad()
            system_loss.backward()
            self.system_optim.step()

            last_system_loss = system_loss.item()
            # Use the model-based actor terms only when the model fits this batch well.
            s_flag = 1 if last_system_loss < 0.020  else 0

            if it % self.policy_freq == 0 and train == True:
                actions = self.actor(state_batch)
                actor_loss1,_ = self.critic_target(state_batch, actions)
                actor_loss1 =  actor_loss1.mean()
                actor_loss =  - actor_loss1
                if s_flag == 1:
                    # Roll the learned model forward two steps and add the
                    # critic's value of the predicted states to the objective.
                    p_actions = self.actor(state_batch)
                    p_next_state = self.system(state_batch, p_actions).clamp(self.obs_lower_bound,self.obs_upper_bound)

                    p_actions2 = self.actor(p_next_state.detach()) * self.act_upper_bound
                    actor_loss2,_ = self.critic_target(p_next_state.detach(), p_actions2)
                    actor_loss2 = actor_loss2.mean()

                    p_next_state2= self.system(p_next_state.detach(), p_actions2).clamp(self.obs_lower_bound,self.obs_upper_bound)
                    p_actions3 = self.actor(p_next_state2.detach()) * self.act_upper_bound
                    actor_loss3,_ = self.critic_target(p_next_state2.detach(), p_actions3)
                    actor_loss3 = actor_loss3.mean()

                    actor_loss_final =  actor_loss - weight * (actor_loss2) - 0.5 *  weight * actor_loss3
                else:
                    actor_loss_final =  actor_loss

                self.actor_optim.zero_grad()
                actor_loss_final.backward()
                self.actor_optim.step()

                self.soft_update_target(self.actor, self.actor_target)
        return last_system_loss

    def soft_update_target(self,local_model,target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        The interpolation parameter τ is read from ``self.tau``.
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau*local_param.data + (1.0-self.tau)*target_param.data)

    def policy(self,state):
        """Return an exploratory action: online actor output plus noise, clipped to the action bounds."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.actor.eval()
        with torch.no_grad():
            actions = self.actor(state).cpu().data.numpy()
        self.actor.train()
        # Adding noise to action
        if self.noise_comb in ["OUOU", "GOU"]:
            shift_action = self.noise_generator()
        else:
            shift_action = self.policy_noise()
        sampled_actions = (actions + shift_action)
        # We make sure action is within bounds
        legal_action = np.clip(sampled_actions,self.act_lower_bound,self.act_upper_bound)
        return np.squeeze(legal_action)

    def select_action(self,state):
        """Return the noise-free action of the TARGET actor for ``state``."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        with torch.no_grad():
            actions = self.actor_target(state).cpu().data.numpy()
        return np.squeeze(actions)


    def eval_policy(self, env_name, seed, eval_episodes):
        """Run ``eval_episodes`` noise-free episodes and return the mean episode reward.

        ``env_name`` is an environment instance (despite its name); it is
        seeded with ``seed + 100``. The raw reward in ``info["reward"]`` is
        used when a wrapper provides it, otherwise the step reward.
        """
        eval_env = env_name
        eval_env.seed(seed + 100)

        avg_reward = 0.
        for _ in range(eval_episodes):
            state, done = eval_env.reset(), False
            while not done:
                action = self.select_action(np.array(state))
                state, reward, done, info = eval_env.step(action)
                try:
                    avg_reward += info["reward"]
                except (KeyError, TypeError):
                    # No wrapper-provided raw reward: fall back to the step reward.
                    avg_reward += reward
        avg_reward /= eval_episodes

        print("---------------------------------------")
        print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
        print("---------------------------------------")
        return avg_reward
    

In [89]:
def initialise_dirs(meta_f, env_type, model_type, dirs):
    """Bump the run counter stored in ``meta_f`` and create this run's output folders.

    Args:
        meta_f: path of a text file holding the last used run number.
        env_type: environment label used as the first sub-folder.
        model_type: model label used as the second sub-folder.
        dirs: iterable of base directories (e.g. logs, plots, videos, models).

    Returns:
        ``(paths, iteration_number)`` where ``paths`` maps each entry of
        ``dirs`` to ``"<dir>/<env_type>/<model_type>/<iteration_number>"``
        (created if missing) and ``iteration_number`` is the new run number.
    """
    iteration_number = 0
    if os.path.exists(meta_f):
        with open(meta_f, "r") as file:
            try:
                iteration_number = int(file.read())
            except ValueError:
                # Empty or corrupted counter file: restart numbering from 0.
                iteration_number = 0
    iteration_number += 1
    with open(meta_f, "w") as file:
        file.write(str(iteration_number))

    paths = {}
    for d in dirs:
        run_dir = f'{d}/{env_type}/{model_type}/{iteration_number}'
        os.makedirs(run_dir, exist_ok=True)
        paths[d] = run_dir
    return paths,iteration_number

In [115]:
# Output locations and run bookkeeping, for easy organisation during experimentation.
log_dir = "./logs"
plot_dir = "./plots"
video_dir = "./videos"
models_dir = "./models"

meta_f = "./metadata.txt"   # stores the last used run number
logs_f = "/agent-log.txt"   # appended to the run's log directory

env_type = "normal"         # "hardcore" selects BipedalWalkerHardcore-v3
model_type = "td3-fork"
wrapper = False             # True wraps the env in EnvSkipWrapper

_paths, iteration_number = initialise_dirs(meta_f, env_type, model_type, [log_dir, plot_dir, video_dir, models_dir])

if env_type != "hardcore":
    env = gym.make('BipedalWalker-v3')
else:
    env = gym.make('BipedalWalkerHardcore-v3')

plot_every = 25
video_every = 25

# Record a video on every `video_every`-th episode.
env = gym.wrappers.Monitor(env, _paths['./videos'], video_callable=lambda ep_id: ep_id%video_every == 0, force=True)
if wrapper:
    env = EnvSkipWrapper(env)
# Opening in "w" mode already creates/truncates the file, so the separate
# open-and-close that used to precede this line was redundant.
log_f = open(_paths['./logs'] + logs_f, "w")

exploration_steps = 0       # number of initial steps taken with random actions
total_episodes = 10000
max_steps = 2000            # step cap per episode in the training loop
total_steps =0
add_experience_count = 0
save_every = 1000           # NOTE(review): not referenced in this cell
train = 0
saved_times = 0
time_start = time.time()
totest =0                   # set to 1 once the final evaluation has passed

ep_reward_list = []
avg_reward_list = []
data = []
agent = Agent(env,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    seed=42,
    load_weight=False,
    lr_critic=3e-4,
    lr_actor=3e-4,
    lr_system=3e-4,
    batch_size=100,
    gamma=0.99,
    tau=0.02,
    policy_freq=2,
    ou_noise_theta=4,
    ou_noise_sigma=0.2,
    ou_dt=0.04,
    noise_sigma=0.1,
    noise_clip=0.5,
    noise_comb="GG",
    buffer_capacity=1000000,
    iteration=iteration_number
)

# Main training loop: one iteration per episode. Each episode collects
# transitions into a temporary list, decides at episode end whether to commit
# them to the replay buffer, then trains, logs, plots and checkpoints.
if __name__ == '__main__':
    for ep in range(total_episodes):
        state = env.reset()
        ep_reward = 0
        t = int(0)
        temp_replay_buffer = []

        for st in range(max_steps):

            # Select action randomly or according to policy
            if total_steps < exploration_steps:
                action = env.action_space.sample()
            else:
                action = agent.policy(state)

            # Receive state and reward from environment.
            next_state, reward, done, info = env.step(action)
            # Reward shaping: -100 becomes -5 and every other reward is multiplied by 5.
            # With `wrapper` set the env has already shaped `reward`; the raw
            # reward and the failure flag are read from `info` instead.
            if wrapper:
                ep_reward += info["reward"]
                if info["dead"]:
                    add_reward = -1
                    add_experience_count += 1
                else:
                    add_reward = 0
            else:
                ep_reward += reward
                if reward == -100:
                    add_reward = -1
                    reward = -5
                    add_experience_count += 1
                else:
                    add_reward = 0
                    reward = 5 * reward

            temp_replay_buffer.append((state, action, reward, add_reward, next_state, done))

            # End this episode when `done` is True
            if done:
                # Episodes that ended in a failure or scored below 250 are always
                # stored; other episodes are stored with probability 0.5 while
                # add_experience_count is positive (each such store lowers it by 10).
                if add_reward == -1 or ep_reward < 250:
                    train = 1
                    for temp in temp_replay_buffer:
                        agent.add_to_replay_buffer(temp, agent.replay_memory_buffer)
                elif add_experience_count > 0 and np.random.rand() > 0.5:
                    train = 1
                    add_experience_count -= 10
                    for temp in temp_replay_buffer:
                        agent.add_to_replay_buffer(temp, agent.replay_memory_buffer)
                break
            state = next_state
            # Note: the terminal step breaks out above, so it is not counted
            # in `t` or `total_steps`.
            t += int(1)
            total_steps += 1
            # agent.step_end() # let decaying ou know the end of the step
        # if ep_reward > 150:
        #     print(info["c"])
        ep_reward_list.append(ep_reward)
        # Mean of last 100 episodes
        avg_reward = np.mean(ep_reward_list[-100:])
        avg_reward_list.append(avg_reward)
        # agent.episode_end()

        # Once the last 5 episodes average above 300, run a 10-episode test and,
        # if that passes, a 100-episode test; on success save the final weights.
        # NOTE(review): evaluation reuses the training `env` object.
        if np.mean(ep_reward_list[-5:]) > 300 and totest == 0:
            test_reward = agent.eval_policy(env, seed=42, eval_episodes=10)
            if test_reward > 300:
                final_test_reward = agent.eval_policy(env, seed=42, eval_episodes=100)
                if final_test_reward > 300:

                    torch.save(agent.actor.state_dict(), f'{_paths["./models"]}/final-actor.pth')
                    torch.save(agent.critic.state_dict(), f'{_paths["./models"]}/final-critic.pth')
                    torch.save(agent.actor_target.state_dict(), f'{_paths["./models"]}/final-actor_t.pth')
                    torch.save(agent.critic_target.state_dict(), f'{_paths["./models"]}/final-critic_t.pth')
                    torch.save(agent.system.state_dict(), f'{_paths["./models"]}/final-sysmodel.pth')

                    print("===========================")
                    print('Task Solved')
                    print("===========================")
                    # break
                    totest = 1

        # Elapsed wall-clock seconds since the run started.
        s = (int)(time.time() - time_start)


        # Adaptive weight of the model-based actor terms: 1 when the 100-episode
        # mean is <= 0, falling linearly to 0 as that mean reaches 300.
        weight =  1 - np.clip(np.mean(ep_reward_list[-100:])/300, 0, 1)
        # When new experience was stored, train for as many iterations as the
        # episode had counted steps; otherwise run 100 iterations with train == 0
        # (Agent.learn skips the actor update in that case).
        if train == 1:
            sys_loss = agent.learn(t, weight, train)
        else:
            sys_loss = agent.learn(100, weight, train)
        train = 0

            # do NOT change this logging code - it is used for automated marking!
        log_f.write('episode: {}, reward: {}\n'.format(ep, ep_reward))
        log_f.flush()

        # Every `plot_every` episodes: plot the 100-episode mean +/- one std
        # and save the figure into the run's plot directory.
        if ep % plot_every == 0:
            data.append([ep, np.array(ep_reward_list[-100:]).mean(), np.array(ep_reward_list[-100:]).std(), t])
            # ep_reward_list = []
            # plt.rcParams['figure.dpi'] = 100
            plt.plot([x[0] for x in data], [x[1] for x in data], '-', color='tab:grey')
            plt.fill_between([x[0] for x in data], [x[1]-x[2] for x in data], [x[1]+x[2] for x in data], alpha=0.2, color='tab:grey')
            plt.xlabel('Episode number')
            plt.ylabel('Episode reward')
            plt.savefig(_paths['./plots']+'/plot_ep_{}.png'.format(ep))
            plt.show()
            disp.clear_output(wait=True)

        print('episode {}, timestep {},  ep.timesteps {}, reward: {:.2f}, moving avg. reward: {:.2f}, time: {:02}:{:02}:{:02}'
                .format(ep, total_steps, t,
                      ep_reward, avg_reward, s//3600, s%3600//60, s%60))
        ep_reward = 0

        # NOTE(review): checkpoints are written when this episode's step count
        # `t` is a multiple of 500 (including 0), not every N episodes, and the
        # module-level `save_every` is not used here - confirm this is intended.
        if t % 500 == 0:

            torch.save(agent.actor.state_dict(), f'{_paths["./models"]}/{saved_times}-actor.pth')
            torch.save(agent.critic.state_dict(), f'{_paths["./models"]}/{saved_times}-critic.pth')
            torch.save(agent.actor_target.state_dict(), f'{_paths["./models"]}/{saved_times}-actor_t.pth')
            torch.save(agent.critic_target.state_dict(), f'{_paths["./models"]}/{saved_times}-critic_t.pth')
            torch.save(agent.system.state_dict(), f'{_paths["./models"]}/{saved_times}-sysmodel.pth')
            print("===========================")
            print('Saving Successfully!')
            print("===========================")
            saved_times +=1
      

# Final summary plot: 100-episode moving average of the reward per episode.
plt.plot(avg_reward_list)
plt.xlabel("Episode")
plt.ylabel("Avg. Epsiodic Reward")
plt.show()
# Release the environment and the episode log opened during setup; the log
# handle was previously left open.
env.close()
log_f.close()

episode 6250, timestep 5271947,  ep.timesteps 795, reward: 315.51, moving avg. reward: 315.06, time: 02:29:26
episode 6251, timestep 5272750,  ep.timesteps 803, reward: 315.16, moving avg. reward: 315.06, time: 02:29:27
episode 6252, timestep 5273553,  ep.timesteps 803, reward: 314.50, moving avg. reward: 315.06, time: 02:29:28
episode 6253, timestep 5274348,  ep.timesteps 795, reward: 314.27, moving avg. reward: 315.05, time: 02:29:28
episode 6254, timestep 5275145,  ep.timesteps 797, reward: 315.38, moving avg. reward: 315.06, time: 02:29:29
episode 6255, timestep 5275949,  ep.timesteps 804, reward: 314.34, moving avg. reward: 315.05, time: 02:29:30
episode 6256, timestep 5276738,  ep.timesteps 789, reward: 315.06, moving avg. reward: 315.06, time: 02:29:30
episode 6257, timestep 5277533,  ep.timesteps 795, reward: 316.04, moving avg. reward: 315.07, time: 02:29:31
episode 6258, timestep 5278323,  ep.timesteps 790, reward: 315.47, moving avg. reward: 315.08, time: 02:29:32
episode 62

KeyboardInterrupt: 

: 