In [None]:
import os
import time
import random
import collections

import gym
from gym import wrappers
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [None]:
from ddpg import DDPG
from td3 import TD3

In [None]:
import matplotlib.pyplot as plt
from IPython.display import clear_output

%matplotlib inline

def live_plot(data_dict, figsize=(15,5)):
    """Clear the cell output and draw one line per entry of ``data_dict``.

    Args:
        data_dict: mapping of legend label -> sequence of values to plot.
        figsize: (width, height) handed to matplotlib for the new figure.
    """
    # Drop whatever the cell displayed before, so repeated calls from a
    # loop replace the chart instead of stacking new ones underneath.
    clear_output(wait=True)

    _, axes = plt.subplots(figsize=figsize)
    for series_name in data_dict:
        axes.plot(data_dict[series_name], label=series_name)
    axes.legend(loc='lower left')

    plt.show()
    

---

##### Environment

In [None]:
# Environment id handed to gym.make when the environment is created.
env_name = 'LunarLanderContinuous-v2'
# NOTE(review): not read by any cell visible in this notebook — presumably
# carried over from a training notebook; confirm before removing.
save_models = True
# Seed value used when seeding the environment, PyTorch and NumPy.
seed = 13

##### Initialize environment and set seeds

In [None]:
# Create the environment named in the configuration cell.
env = gym.make(env_name)

# Seed every random number source this notebook has in scope with the same
# value so evaluation runs are repeatable: the environment, Python's
# built-in `random` module (imported by this notebook), PyTorch and NumPy.
env.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

In [None]:
# Sizes and bounds read from the environment's spaces; these three values
# are the arguments given to the agent constructor.
state_dim = env.observation_space.shape[0]  # first entry of the observation-space shape
action_dim = env.action_space.shape[0]  # first entry of the action-space shape
# Upper bound of the first action component, converted to a plain float.
# NOTE(review): assumes every action component shares this bound — confirm.
max_action = float(env.action_space.high[0])

##### Choose saved model

In [None]:
# Show the contents of the model directory so a saved model name can be chosen.
os.listdir('./pytorch_models')

In [None]:
# Name passed to agent.load as its first argument.
# NOTE(review): looks like <policy>_<env_name>_<seed> — confirm against the
# entries listed from ./pytorch_models.
file_name = 'TD3_LunarLanderContinuous-v2_13'

##### Initialize agent

In [None]:
# Instantiate the TD3 agent (class imported from the local `td3` module)
# from the sizes and action bound derived from the environment.
agent = TD3(
    state_dim, 
    action_dim, 
    max_action,
)


##### Load policy

In [None]:
# Load the saved agent, passing the chosen name and the model directory.
# NOTE(review): how these two arguments map to files on disk is defined by
# TD3.load, which is not visible in this notebook.
agent.load(file_name, './pytorch_models')

##### Define testing parameters

In [None]:
# Step limit checked inside each episode of the testing loop.
max_steps = 1000
# Episode-count setting that bounds the testing loop.
episodes = 10
# Reporting interval: the loop prints a summary when `episode % report == 0`
# and averages the last `report` episode rewards.
report = 1

# Maps a series label to its list of recorded values; the testing loop
# appends to 'episode_reward' and passes the whole mapping to live_plot.
reward_trace = collections.defaultdict(list)

##### Testing:

In [None]:
# Roll out the loaded policy for `episodes` rendered episodes, recording the
# total reward of each one in reward_trace['episode_reward'].
# Episodes are numbered 1..episodes inclusive, so exactly `episodes` run.
for episode in range(1, episodes + 1):

    obs = env.reset()
    done = False
    episode_reward = 0.0
    steps = 0

    while not done:

        action = agent.select_action(np.array(obs))
        obs, reward, done, info = env.step(action)

        # Count the step just taken *before* testing the cap, so an episode
        # is cut off after exactly `max_steps` environment steps.
        steps += 1
        episode_reward += reward
        done = done or steps >= max_steps

        env.render()

    reward_trace['episode_reward'].append(episode_reward)

    # Redraw the reward curve; this clears the cell's previous output.
    live_plot(reward_trace)

    if episode % report == 0:

        print('Episode:', episode)
        print('Average score:', np.mean(reward_trace['episode_reward'][-report:]))
        # Pause, presumably so the printed summary stays readable before the
        # next episode's plot replaces the cell output.
        time.sleep(1.5)
 