In [None]:
import gym
import numpy as np
import torch
import time
from collections  import deque
from parallelEnv import parallelEnv
from envs import make_vec_envs
from model import Policy
from utils import get_render_func, get_vec_normalize

# --- Evaluation setup: device, environments and the policy network ---

# Everything in this notebook is placed on the CPU.
device = torch.device("cpu")
print('device: ', device)
# Directory that load_test() reads the saved weight files from.
model_path = "dir_save_test"
seed = 0 

## model Policy uses MLPBase
# In the cells shown here, `envs` is only queried for max_steps and for the
# observation/action spaces; the rollout itself uses `env_venv` below.
# n=8 presumably sets the number of parallel env copies -- TODO confirm in parallelEnv.
envs = parallelEnv('MountainCarContinuous-v0', n=8, seed=seed) ## weights created by n = 16

# Per-episode step cap; play_VecEnv() reads this global for its manual cutoff.
max_steps = envs.max_steps
print('max_steps: ', max_steps)

# Non-recurrent policy sized from the env's observation shape and action space.
policy = Policy(envs.observation_space.shape, envs.action_space,\
        base_kwargs={'recurrent': False})

print('policy: ', policy)
policy.to(device)

# Vectorized env with a single process, seeded with seed + 1000; this is the
# env that play_VecEnv() steps and renders.
# NOTE(review): the meaning of the positional None, None, False arguments is
# defined by make_vec_envs in envs.py -- confirm against its signature.
num_processes = 1
env_venv = make_vec_envs('MountainCarContinuous-v0', \
                    seed + 1000, num_processes,
                    None, None, False, device=device, allow_early_resets=False)

# Diagnostic prints of the observation and action space shapes.
print('envs.observation_space.shape: ', envs.observation_space.shape, \
      ', len(obs_shape): ', len(envs.observation_space.shape))
print('envs.action_space: ',  envs.action_space, \
      ', action_space.shape[0]: ', envs.action_space.shape[0])


def load_test(model):
    """Restore the pre-trained weights into `model` from the `model_path` directory.

    `model.base` and `model.dist` are replaced by the whole modules pickled in
    'we0_model_base_pre_train.pth' and 'we0_model_dist_pre_train.pth'; the
    actor's parameters are then overwritten from the state dict stored in
    'we0_actor_pre_train.pth'. The critic parts are not reloaded separately:
    they are whatever the pickled base module contains.

    Reads the module-level globals `model_path` and `device`.

    Args:
        model: object exposing `base` and `dist` attributes; mutated in place.

    Returns:
        None.
    """
    # map_location=device remaps tensor storages onto the notebook's device, so
    # checkpoints written on a GPU machine still load on a CPU-only one.
    #
    # weights_only=False is required for the two whole-module pickles: the
    # restricted loader (the default in newer PyTorch releases) rejects them.
    # SECURITY: full unpickling can execute arbitrary code -- only load
    # checkpoint files that you created yourself or otherwise trust.
    model.base = torch.load(model_path + '/we0_model_base_pre_train.pth',
                            map_location=device, weights_only=False)

    # A state dict is passed to load_state_dict, so the restricted loader suffices.
    actor_state = torch.load(model_path + '/we0_actor_pre_train.pth',
                             map_location=device, weights_only=True)
    model.base.actor.load_state_dict(actor_state)

    model.dist = torch.load(model_path + '/we0_model_dist_pre_train.pth',
                            map_location=device, weights_only=False)
    
# Populate the freshly built policy with the saved pre-trained weights.
load_test(policy)





device:  cpu
max_steps:  999
policy:  Policy(
  (base): MLPBase(
    (actor): Sequential(
      (0): Linear(in_features=2, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (critic): Sequential(
      (0): Linear(in_features=2, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (critic_linear): Linear(in_features=64, out_features=1, bias=True)
  )
  (dist): DiagGaussian(
    (fc_mean): Linear(in_features=64, out_features=1, bias=True)
    (logstd): AddBias()
  )
)
envs.observation_space.shape:  (2,) , len(obs_shape):  1
envs.action_space:  Box(-1.0, 1.0, (1,), float32) , action_space.shape[0]:  1


  model.base = torch.load(model_path + '/we0_model_base_final.pth')
  model.base.actor.load_state_dict(torch.load(model_path + '/we0_actor_final.pth'))
  model.dist = torch.load(model_path + '/we0_model_dist_final.pth')


In [2]:
## No CUDA, only CPU

def play_VecEnv(env, model, num_episodes, sleep_s=0.04, render=True):
    """Roll out `model` in the vectorized `env` and print a score line per episode.

    The env is reset once up front and is not reset again between episodes,
    so consecutive episodes rely on `env` continuing after it reports done.
    NOTE(review): this presumably depends on the vectorized env auto-resetting
    -- confirm in envs.make_vec_envs.

    Reads the module-level globals `num_processes` (length of the per-episode
    reward accumulator) and `max_steps` (manual per-episode step cap). The
    hidden-state and mask tensors are created with a leading dimension of 1,
    i.e. the function is written for a single env process.

    Args:
        env: vectorized env; `reset()` returns observations and `step(action)`
            returns `(obs, reward, done, info)`, with `reward` supporting
            `.detach().numpy()`.
        model: policy exposing `recurrent_hidden_state_size` and
            `act(obs, hidden, masks, deterministic=...)`.
        num_episodes: number of episodes to play.
        sleep_s: pause after every step, in seconds, to slow down the
            rendering; values <= 0 disable the pause.
        render: when False, no render function is looked up or called.

    Returns:
        None. Results are only printed.
    """
    obs = torch.Tensor(env.reset()).float()

    recurrent_hidden_states = torch.zeros(1, model.recurrent_hidden_state_size)
    masks = torch.zeros(1, 1)

    # Rolling window over the most recent 100 episode scores.
    scores_deque = deque(maxlen=100)

    # Guard against a missing render function (e.g. if get_render_func finds
    # nothing to render) instead of crashing on the first step.
    render_func = get_render_func(env) if render else None

    for i_episode in range(1, num_episodes + 1):
        time_start = time.time()
        total_reward = np.zeros(num_processes)
        timestep = 0

        while True:
            with torch.no_grad():
                _, action, _, recurrent_hidden_states = model.act(
                    obs, recurrent_hidden_states, masks, deterministic=False)

            if render_func is not None:
                render_func()

            obs, reward, done, _ = env.step(action.unsqueeze(1))
            obs = torch.Tensor(obs).float()

            # `done` may be a bool, a numpy array or a tensor; collapse it once.
            episode_over = bool(np.asarray(done).all())
            masks.fill_(0.0 if episode_over else 1.0)

            total_reward += np.mean(reward.detach().numpy())

            if sleep_s > 0:
                time.sleep(sleep_s)

            timestep += 1

            # `timestep` already counts the step just taken, so the cap is
            # reached at exactly max_steps steps. Stopping one step earlier
            # would leave the env on its final step, and the next episode
            # would then begin with that stale step.
            if episode_over or timestep >= max_steps:
                break

        s = int(time.time() - time_start)

        scores_deque.append(total_reward)
        avg_score = np.mean(scores_deque)

        print('Episode {} \tScore: {:.2f}, Avg.Score: {:.2f}, \tTime: {:02}:{:02}:{:02}'\
                  .format(i_episode, np.mean(total_reward), avg_score,  s//3600, s%3600//60, s%60))
    
# Watch the loaded policy for seven episodes in the single-process env.
play_VecEnv(env_venv, policy, num_episodes=7)



Episode 1 	Score: 276.45, Avg.Score: 276.45, 	Time: 00:00:14
Episode 2 	Score: 276.77, Avg.Score: 276.61, 	Time: 00:00:13
Episode 3 	Score: 276.26, Avg.Score: 276.49, 	Time: 00:00:13
Episode 4 	Score: 277.02, Avg.Score: 276.63, 	Time: 00:00:13
Episode 5 	Score: 277.04, Avg.Score: 276.71, 	Time: 00:00:14
Episode 6 	Score: 278.90, Avg.Score: 277.07, 	Time: 00:00:12
Episode 7 	Score: 277.60, Avg.Score: 277.15, 	Time: 00:00:12


In [3]:
# Shut down the evaluation env once the rollouts are finished.
# NOTE(review): the `envs` parallelEnv created in the first cell is never
# closed in the cells shown here -- confirm whether it needs an explicit close.
env_venv.close()