In [1]:
import torch
import gym
import numpy as np
from collections import deque
import time
from ReplayBuffer import ReplayBuffer
from Model import TD3
from IPython import display
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
env_name = 'BipedalWalker-v2'
random_seed = 0
log_interval = 10           # print avg reward after interval
save_every = 500            # safe trained models after interval
print_every = 1
directory = "./preTrained/" # save trained models
filename = "TD3_{}_{}".format(env_name, random_seed)

# Hyperparameters
max_episodes = 1000         # max num of episodes
max_timesteps = 2000        # max timesteps in one episode

gamma = 0.99                # discount for future rewards
batch_size = 100            # num of transitions sampled from replay buffer
exploration_noise = 0.1 
polyak = 0.995              # target policy update parameter (1-tau)
policy_noise = 0.2          # target policy smoothing noise
noise_clip = 0.5
policy_delay = 2            # delayed policy updates parameter

In [3]:
env = gym.make(env_name)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

env.seed(random_seed)
torch.manual_seed(random_seed)
np.random.seed(random_seed)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


  result = entry_point.load(False)


In [None]:
def train():
    policy = TD3(state_dim, action_dim, max_action)
    replay_buffer = ReplayBuffer()

    scores = []

    for i_episode in range(1, max_episodes+1):
        state = env.reset()
        
        episode_rewards = []
        
        for t in range(max_timesteps):
            timestep = time.time()

            # select action and add exploration noise:
            action = policy.select_action(state)
            action = action + np.random.normal(0, exploration_noise, size=env.action_space.shape[0])
            action = action.clip(env.action_space.low, env.action_space.high)

            # take action in env:
            next_state, reward, done, _ = env.step(action)
            replay_buffer.add((state, action, reward, next_state, float(done)))            
            episode_rewards.append(reward)            
            state = next_state

            # if i_episode is done then update policy:
            if done or t==(max_timesteps-1):
                policy.update(replay_buffer, t, batch_size, gamma, polyak, policy_noise, noise_clip, policy_delay)
                break
        
        if i_episode % save_every == 0:
            policy.save(directory, filename)
        
        avg_rewards = np.mean(episode_rewards)
        if avg_rewards >= 300.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, score_average))
            policy.save(directory, filename + '_solved')
            break        
        
        if i_episode % print_every == 0:
            min_rewards = np.min(episode_rewards)
            max_rewards = np.max(episode_rewards)
            
            print('\rEpisode {}, Average Score: {:.2f}, Max: {:.2f}, Min: {:.2f}, Time: {:.2f}'\
                  .format(i_episode, avg_rewards, max_rewards, min_rewards, time.time() - timestep), end="\n")
            
            episode_rewards = []
        
        scores.append(avg_rewards)
            
    return scores

scores = train()

fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(scores)+1), scores)
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()

Episode 1, Average Score: -1.16, Max: 0.15, Min: -100.00, Time: 2.48
Episode 2, Average Score: -0.96, Max: 0.13, Min: -100.00, Time: 2.70
Episode 3, Average Score: -0.11, Max: 0.03, Min: -0.42, Time: 38.83
Episode 4, Average Score: -0.79, Max: -0.00, Min: -100.00, Time: 3.61
Episode 5, Average Score: -0.83, Max: -0.00, Min: -100.00, Time: 3.38
Episode 6, Average Score: -2.67, Max: -0.05, Min: -100.00, Time: 0.95
Episode 7, Average Score: -0.87, Max: -0.01, Min: -100.00, Time: 3.15
Episode 8, Average Score: -0.80, Max: 0.00, Min: -100.00, Time: 3.45


In [None]:
'''
for ep in range(1, n_episodes+1):
        ep_reward = 0
        state = env.reset()
        for t in range(max_timesteps):
            action = policy.select_action(state)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                env.render()
            if done:
                break
            
        print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
        ep_reward = 0
        env.close()        
'''        

In [None]:
'''
env = gym.make(env_name)
env.reset()

img = plt.imshow(env.render(mode='rgb_array')) # only call this once

for _ in range(1000):
    img.set_data(env.render(mode='rgb_array')) # just update the data
    display.display(plt.gcf())
    display.clear_output(wait=True)
    env.step(env.action_space.sample()) # take a random action
'''