In [1]:
import torch
import gym
import numpy as np
from collections import deque
import time
from ReplayBuffer import ReplayBuffer
from Model import TD3
from IPython import display
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# ---- Run identification and checkpointing ----
env_name = 'BipedalWalker-v2'
random_seed = 0
directory = "./preTrained/"  # folder the trained models are saved to
filename = "_".join(("TD3", env_name, str(random_seed)))  # base name of saved models
save_every = 500             # save the trained model every `save_every` episodes
print_every = 10             # report progress every `print_every` episodes

# ---- Training length ----
max_episodes = 20000         # upper bound on the number of training episodes
max_timesteps = 1000000      # upper bound on the number of steps in one episode

# ---- TD3 hyperparameters ----
gamma = 0.99                 # discount factor for future rewards
batch_size = 100             # transitions sampled from the replay buffer per update
exploration_noise = 0.1      # std of the Gaussian noise added to actions while training
polyak = 0.995               # target policy update parameter (1-tau)
policy_noise = 0.2           # target policy smoothing noise
noise_clip = 0.5             # passed to TD3.update with policy_noise; presumably the clip bound on that noise
policy_delay = 2             # delayed policy updates parameter

In [3]:
# Create the environment and read the sizes that TD3 is constructed with.
env = gym.make(env_name)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
# Only the first component's upper bound is used as the action scale
# (assumes all action components share the same bound -- TODO confirm).
max_action = float(env.action_space.high[0])

# Seed the environment, PyTorch and NumPy with the same seed for repeatable runs.
env.seed(random_seed)
torch.manual_seed(random_seed)
np.random.seed(random_seed)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


  result = entry_point.load(False)


### Train

In [None]:
def train():
    """Train a TD3 policy on the notebook-level ``env`` and return the episode scores.

    Relies on names defined in earlier cells: ``env``, ``state_dim``,
    ``action_dim``, ``max_action`` and the configuration constants
    (``max_episodes``, ``max_timesteps``, ``print_every``, ``save_every``,
    ``directory``, ``filename`` and the TD3 hyperparameters).

    Each episode collects transitions with Gaussian exploration noise and the
    policy is updated once, when the episode ends.  Training stops early when
    the mean return of the last ``print_every`` episodes reaches 300; the model
    is then saved under ``filename + '_solved'``.

    Returns:
        list: the total reward of every episode that was run, including the
        episode on which training was declared solved.
    """
    policy = TD3(state_dim, action_dim, max_action)
    replay_buffer = ReplayBuffer()

    scores = []
    # Rolling window holding the returns of the most recent `print_every` episodes.
    recent_rewards = deque(maxlen=print_every)

    for i_episode in range(1, max_episodes+1):
        # Started once per episode so the reported time covers the whole
        # episode (rollout + update), not just its final step.
        episode_start = time.time()
        state = env.reset()
        ep_reward = 0

        for t in range(max_timesteps):
            # select action and add exploration noise:
            action = policy.select_action(state)
            action = action + np.random.normal(0, exploration_noise, size=env.action_space.shape[0])
            action = action.clip(env.action_space.low, env.action_space.high)

            # take action in env:
            next_state, reward, done, _ = env.step(action)
            # NOTE(review): `done` is stored as-is, so an episode cut short by a
            # step limit is recorded as terminal -- confirm this is intended.
            replay_buffer.add((state, action, reward, next_state, float(done)))
            ep_reward += reward
            state = next_state

            # if the episode is done then update policy:
            if done or t==(max_timesteps-1):
                # `t` is the index of the last step taken; presumably TD3.update
                # uses it as the number of iterations -- verify in Model.py.
                policy.update(replay_buffer, t, batch_size, gamma, polyak, policy_noise, noise_clip, policy_delay)
                break

        # Record the episode before any early exit so the returned scores
        # include the episode that solved the environment.
        scores.append(ep_reward)
        recent_rewards.append(ep_reward)
        avg_rewards = np.mean(recent_rewards)

        # Only a full window is a meaningful average to compare against the threshold.
        if len(recent_rewards) == print_every and avg_rewards >= 300.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, avg_rewards))
            policy.save(directory, filename + '_solved')
            break

        if i_episode % print_every == 0:
            # At this point the window holds exactly the episodes since the last report.
            print('\rEpisode {}, Average Score: {:.2f}, Max: {:.2f}, Min: {:.2f}, Time: {:.2f}'\
                  .format(i_episode, avg_rewards, np.max(recent_rewards), np.min(recent_rewards),
                          time.time() - episode_start), end="\n")

        if i_episode % save_every == 0:
            policy.save(directory, filename)

    return scores

# Run the full training loop and keep the score of every episode.
scores = train()

# Score per episode, with episodes numbered from 1.
fig, ax = plt.subplots()
ax.plot(np.arange(1, len(scores)+1), scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

Episode 10, Average Score: -116.61, Max: -92.33, Min: -173.37, Time: 2.14
Episode 20, Average Score: -121.93, Max: -99.96, Min: -164.53, Time: 0.92
Episode 30, Average Score: -119.44, Max: -110.86, Min: -124.92, Time: 1.07
Episode 40, Average Score: -119.95, Max: -114.45, Min: -124.00, Time: 1.10
Episode 50, Average Score: -113.57, Max: -102.66, Min: -134.67, Time: 1.66
Episode 60, Average Score: -123.43, Max: -100.11, Min: -174.96, Time: 2.69
Episode 70, Average Score: -105.36, Max: -98.47, Min: -110.86, Time: 1.40
Episode 80, Average Score: -108.22, Max: -100.08, Min: -121.34, Time: 1.70


### Test

In [None]:
def test():        
    n_episodes = 3
    max_timesteps = 2000    
    
    filename = "TD3_{}_{}".format(env_name, random_seed)
    filename += '_solved'    
    directory = "./preTrained/"
    
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    
    policy = TD3(state_dim, action_dim, max_action)
    
    policy.load_actor(directory, filename)
    
    for ep in range(1, n_episodes+1):
        state = env.reset()
        for t in range(max_timesteps):
            action = policy.select_action(state)
            state, reward, done, _ = env.step(action)
            env.render()            
            if done:
                break
            
        env.close()
        
# Run the rendered evaluation episodes with the saved '_solved' actor.
test()    