# BipedalWalkerHardcore-v3

---
In this notebook, you will implement a TD3 agent with OpenAI Gym's BipedalWalkerHardcore-v3 environment.


#### 1. Import the Necessary Packages

In [1]:
import torch
import gym
import numpy as np
from collections import deque
from collections import namedtuple
import time
from PrioritizedReplayBuffer import PrioritizedReplayBuffer
from ReplayBuffer import ReplayBuffer
from Agent import TD3
from IPython import display
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

#### 2. Instantiate the Environment
Initialize the environment in the code cell below.

In [2]:
env_name = 'BipedalWalkerHardcore-v3'
#env_name = 'BipedalWalker-v3'
random_seed = 0
save_every = 500            # safe trained models after interval
print_every = 10
score_to_solve = 300.0
directory = "./preTrained/" # save trained models
filename = "TD3_{}_{}".format(env_name, random_seed)
continue_training = True

max_episodes = 20000        # max num of episodes
max_timesteps = 2000        # max timesteps in one episode

exploration_noise = 0.1

In [3]:
env = gym.make(env_name)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

env.seed(random_seed)
torch.manual_seed(random_seed)
np.random.seed(random_seed)



#### 3. Train the Agent with TD3
Run the code cell below to train the agent from scratch. You are welcome to amend the supplied values of the parameters in the function, to try to see if you can get better performance!

In [None]:
def train(buffer_prefill=10000):
    policy = TD3(state_dim, action_dim, max_action)
    
    if continue_training:        
        policy.load(directory, "TD3_BipedalWalker-v3_0_solved")
    
    #replay_buffer = PrioritizedReplayBuffer()
    replay_buffer = ReplayBuffer()
    
    # Prefill
    state = env.reset()
    while not len(replay_buffer) < buffer_prefill:
        action = env.action_space.sample()
        action = action.clip(env.action_space.low, env.action_space.high)
        next_state, reward, done, _ = env.step(action)        
        replay_buffer.add((state, action, reward, next_state, float(done)))              
        if done:
            state = env.reset()            
    print('Buffer prefilled')

    scores = []
    avg_reward = 0    
    episode_rewards = []
    
    ep_rewards_deque = deque(maxlen=100)

    for i_episode in range(1, max_episodes+1):
        state = env.reset()
        
        ep_reward = 0
        
        timestep = time.time()
        
        for t in range(max_timesteps):
            # select action and add exploration noise:
            action = policy.select_action(state)
            action = action + np.random.normal(0, exploration_noise, size=env.action_space.shape[0])
            action = action.clip(env.action_space.low, env.action_space.high)

            # take action in env:
            next_state, reward, done, _ = env.step(action)
            
            replay_buffer.add((state, action, reward, next_state, float(done)))
            
            avg_reward += reward                       
            ep_reward += reward
            state = next_state

            # if i_episode is done then update policy:            
            if (done or t==(max_timesteps-1)):
                policy.update(replay_buffer, t)
                break           
                
        episode_rewards.append(ep_reward)
        ep_rewards_deque.append(ep_reward)        
        avg_rewards = (avg_reward / print_every)
        
        if np.mean(ep_rewards_deque) >= score_to_solve:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, avg_rewards))
            policy.save(directory, filename + '_solved')
            break        
        
        if i_episode % print_every == 0:
            min_rewards = np.min(episode_rewards)
            max_rewards = np.max(episode_rewards)            
            print('\rEpisode {}, Average Score: {:.2f}, Max: {:.2f}, Min: {:.2f}, Time: {:.2f}'\
                  .format(i_episode, avg_rewards, max_rewards, min_rewards, time.time() - timestep), end="\n")
            
            avg_reward = 0
            episode_rewards = []
            
        if i_episode % save_every == 0:
            policy.save(directory, filename)
        
        scores.append(ep_reward)
            
    return scores


scores = train()

fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(scores)+1), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

Buffer prefilled
Episode 10, Average Score: -95.98, Max: 14.20, Min: -189.05, Time: 23.48
Episode 20, Average Score: -122.28, Max: -80.10, Min: -205.49, Time: 3.49


#### 4. Watch a Smart Agent!
In the next code cell, you will load the trained weights from file to watch a smart agent!

In [None]:
def test(preTrained=True, max_timesteps=1500):        
    n_episodes = 2    
    
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])    
    
    if preTrained:
        filename = "TD3_{}_{}".format(env_name, random_seed)
        filename += '_solved'    
        directory = "./preTrained/"
        policy = TD3(state_dim, action_dim, max_action)    
        policy.load_actor(directory, filename)    
        
    for ep in range(1, n_episodes+1):
        state = env.reset()
        for t in range(max_timesteps):
            if preTrained:
                action = policy.select_action(state)
            else:
                action = env.action_space.sample()
            state, reward, done, _ = env.step(action)
            env.render()            
            if done:
                break
            
    env.close()
        
test()    