In [1]:
import torch
import gym
import numpy as np
from collections import deque
import time
from ReplayBuffer import ReplayBuffer
from Model import TD3
from IPython import display
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#env_name = 'BipedalWalker-v2'
env_name = 'BipedalWalkerHardcore-v2'
random_seed = 0
save_every = 500            # safe trained models after interval
print_every = 10
directory = "./preTrained/" # save trained models
filename = "TD3_{}_{}".format(env_name, random_seed)

# Hyperparameters
max_episodes = 20000         # max num of episodes
max_timesteps = 1000000      # max timesteps in one episode

gamma = 0.99                # discount for future rewards
batch_size = 100            # num of transitions sampled from replay buffer
exploration_noise = 0.1 
polyak = 0.995              # target policy update parameter (1-tau)
policy_noise = 0.2          # target policy smoothing noise
noise_clip = 0.5
policy_delay = 2            # delayed policy updates parameter

In [3]:
env = gym.make(env_name)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

env.seed(random_seed)
torch.manual_seed(random_seed)
np.random.seed(random_seed)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


  result = entry_point.load(False)


### Train

In [None]:
def train():
    policy = TD3(state_dim, action_dim, max_action)
    replay_buffer = ReplayBuffer()

    scores = []
    avg_reward = 0    
    episode_rewards = []

    for i_episode in range(1, max_episodes+1):
        state = env.reset()
        
        ep_reward = 0
        
        for t in range(max_timesteps):
            timestep = time.time()

            # select action and add exploration noise:
            action = policy.select_action(state)
            action = action + np.random.normal(0, exploration_noise, size=env.action_space.shape[0])
            action = action.clip(env.action_space.low, env.action_space.high)

            # take action in env:
            next_state, reward, done, _ = env.step(action)
            replay_buffer.add((state, action, reward, next_state, float(done)))
            avg_reward += reward                       
            ep_reward += reward
            state = next_state

            # if i_episode is done then update policy:
            if done or t==(max_timesteps-1):
                policy.update(replay_buffer, t, batch_size, gamma, polyak, policy_noise, noise_clip, policy_delay)
                break
        
        episode_rewards.append(ep_reward)
        
        avg_rewards = (avg_reward / print_every)        
        if avg_rewards >= 320.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, avg_rewards))
            policy.save(directory, filename + '_solved')
            break        
        
        if i_episode % print_every == 0:
            min_rewards = np.min(episode_rewards)
            max_rewards = np.max(episode_rewards)            
            print('\rEpisode {}, Average Score: {:.2f}, Max: {:.2f}, Min: {:.2f}, Time: {:.2f}'\
                  .format(i_episode, avg_rewards, max_rewards, min_rewards, time.time() - timestep), end="\n")
            
            avg_reward = 0
            episode_rewards = []
            
        if i_episode % save_every == 0:
            policy.save(directory, filename)
        
        scores.append(ep_reward)
            
    return scores

scores = train()

fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(scores)+1), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

Episode 10, Average Score: -127.46, Max: -116.60, Min: -179.45, Time: 1.65
Episode 20, Average Score: -116.99, Max: -112.14, Min: -119.52, Time: 1.08
Episode 30, Average Score: -110.16, Max: -104.76, Min: -117.75, Time: 2.10
Episode 40, Average Score: -115.48, Max: -107.55, Min: -121.12, Time: 1.67
Episode 50, Average Score: -116.45, Max: -104.44, Min: -152.81, Time: 1.34
Episode 60, Average Score: -121.08, Max: -114.08, Min: -136.40, Time: 3.97
Episode 70, Average Score: -115.16, Max: -98.87, Min: -159.27, Time: 3.82
Episode 80, Average Score: -107.78, Max: -98.76, Min: -134.29, Time: 1.62
Episode 90, Average Score: -118.00, Max: -99.31, Min: -184.95, Time: 2.10
Episode 100, Average Score: -111.03, Max: -99.84, Min: -191.53, Time: 2.06
Episode 110, Average Score: -123.41, Max: -92.03, Min: -219.38, Time: 4.55
Episode 120, Average Score: -119.13, Max: -101.11, Min: -138.47, Time: 2.87
Episode 130, Average Score: -116.03, Max: -98.68, Min: -145.45, Time: 2.94
Episode 140, Average Score:

Episode 1100, Average Score: -153.70, Max: -103.51, Min: -195.62, Time: 48.59
Episode 1110, Average Score: -99.75, Max: -68.49, Min: -137.78, Time: 48.53
Episode 1120, Average Score: -115.66, Max: -83.17, Min: -137.55, Time: 48.82
Episode 1130, Average Score: -98.12, Max: -69.78, Min: -119.70, Time: 48.44
Episode 1140, Average Score: -114.82, Max: -47.94, Min: -158.58, Time: 48.34
Episode 1150, Average Score: -121.28, Max: -76.44, Min: -157.94, Time: 48.37
Episode 1160, Average Score: -109.51, Max: -75.99, Min: -132.29, Time: 48.70
Episode 1170, Average Score: -121.40, Max: -88.66, Min: -156.69, Time: 23.61
Episode 1180, Average Score: -116.77, Max: -88.64, Min: -150.01, Time: 48.43
Episode 1190, Average Score: -100.45, Max: -75.35, Min: -141.68, Time: 48.38
Episode 1200, Average Score: -99.84, Max: -58.93, Min: -129.86, Time: 48.50
Episode 1210, Average Score: -93.84, Max: -53.16, Min: -143.71, Time: 48.70
Episode 1220, Average Score: -93.17, Max: -56.98, Min: -135.10, Time: 48.60
Epi

Episode 2180, Average Score: -54.39, Max: -39.03, Min: -95.10, Time: 48.29
Episode 2190, Average Score: -60.91, Max: -44.58, Min: -84.33, Time: 48.25
Episode 2200, Average Score: -85.44, Max: -61.94, Min: -127.23, Time: 48.14
Episode 2210, Average Score: -66.15, Max: -33.71, Min: -106.34, Time: 48.12
Episode 2220, Average Score: -74.06, Max: -36.56, Min: -123.40, Time: 48.26
Episode 2230, Average Score: -86.69, Max: -62.66, Min: -139.49, Time: 48.03
Episode 2240, Average Score: -61.40, Max: -38.69, Min: -111.86, Time: 48.15
Episode 2250, Average Score: -65.64, Max: -39.90, Min: -118.44, Time: 48.29
Episode 2260, Average Score: -67.39, Max: -56.13, Min: -80.67, Time: 48.18
Episode 2270, Average Score: -69.83, Max: -40.61, Min: -94.79, Time: 51.59
Episode 2280, Average Score: -72.16, Max: -38.11, Min: -99.59, Time: 52.82
Episode 2290, Average Score: -66.00, Max: -41.53, Min: -88.96, Time: 49.92
Episode 2300, Average Score: -58.29, Max: -43.80, Min: -72.50, Time: 53.23
Episode 2310, Avera

Episode 3270, Average Score: -93.11, Max: -64.47, Min: -139.73, Time: 50.14
Episode 3280, Average Score: -66.70, Max: -51.44, Min: -80.60, Time: 50.30
Episode 3290, Average Score: -90.58, Max: -47.98, Min: -191.10, Time: 50.37
Episode 3300, Average Score: -77.65, Max: -51.98, Min: -99.27, Time: 50.11
Episode 3310, Average Score: -72.67, Max: -52.08, Min: -106.21, Time: 50.17
Episode 3320, Average Score: -80.72, Max: -53.03, Min: -106.46, Time: 49.91
Episode 3330, Average Score: -74.64, Max: -45.03, Min: -100.21, Time: 50.00
Episode 3340, Average Score: -68.41, Max: -51.76, Min: -83.67, Time: 49.80
Episode 3350, Average Score: -63.44, Max: -36.54, Min: -132.58, Time: 49.41
Episode 3360, Average Score: -67.59, Max: -46.22, Min: -118.86, Time: 49.37
Episode 3370, Average Score: -71.20, Max: -40.69, Min: -100.44, Time: 50.38
Episode 3380, Average Score: -55.83, Max: -32.34, Min: -101.74, Time: 50.00
Episode 3390, Average Score: -80.00, Max: -57.13, Min: -117.32, Time: 49.79
Episode 3400, A

### Test

In [None]:
def test():        
    n_episodes = 3
    max_timesteps = 2000    
    
    filename = "TD3_{}_{}".format(env_name, random_seed)
    filename += '_solved'    
    directory = "./preTrained/"
    
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    
    policy = TD3(state_dim, action_dim, max_action)
    
    policy.load_actor(directory, filename)
    
    for ep in range(1, n_episodes+1):
        state = env.reset()
        for t in range(max_timesteps):
            action = policy.select_action(state)
            state, reward, done, _ = env.step(action)
            env.render()            
            if done:
                break
            
        env.close()
        
test()    