In [None]:
from IPython import display
%matplotlib inline
import os
from ddpg_reacher import *
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Pick the compute device: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Build the Reacher environment. `rand_init=False` is forwarded as a keyword
# argument to gym.make (presumably it disables a randomized initial
# configuration; confirm against modified_gym_env).
env = gym.make(
    "modified_gym_env:ReacherPyBulletEnv-v1",
    rand_init=False,
)

# Construct the DDPG agent on top of the environment. The action/state sizes
# and optimizer hyperparameters are passed explicitly as keyword arguments.
ddpg = DDPG(
    env,
    action_dim=2,
    state_dim=8,
    device=device,
    critic_lr=1e-3,
    actor_lr=1e-4,
    gamma=0.99,
    batch_size=100,
)

In [None]:
# Run DDPG training and keep the four series that `train` returns.
# NOTE(review): 1e2 is a float literal (100.0); presumably this is the number
# of training iterations -- confirm that `DDPG.train` accepts a float here.
(
    value_losses,
    policy_losses,
    validation_reward,
    validation_steps,
) = ddpg.train(1e2)

In [None]:
# Persist the evaluation curves, the trained networks and the raw validation
# metrics under ./results, with every file name tagged by SEED.

# exist_ok=True makes directory creation idempotent (safe to re-run the cell)
# and avoids the check-then-create race of os.path.exists() + os.makedirs().
os.makedirs("./results", exist_ok=True)

# Plot the validation curves. `plotting` is given (data, title, output path,
# x label, y label); presumably it writes the figure to the given path.
# NOTE(review): this file name uses "." before the seed ("eval_return.{}")
# while every other output uses "_" -- confirm whether that is intentional
# before changing it, since downstream tooling may depend on the name.
plotting(validation_reward, "Average Rewards for Evaluation",
         "./results/eval_return.{}.png".format(SEED), "Iterations", "Reward")
plotting(validation_steps, "Steps to Completion",
         "./results/eval_steps_{}.png".format(SEED), "Iterations", "Steps to Completion")

# The actor and critic objects themselves are serialized (not state_dicts).
# NOTE(review): per the PyTorch serialization docs, loading such a file
# requires the original class definitions to be importable; consider saving
# state_dict() instead if these checkpoints need to be portable.
torch.save(ddpg.actor, "./results/Actor_{}.pth".format(SEED))
torch.save(ddpg.critic, "./results/Critic_{}.pth".format(SEED))

# Raw validation series, saved for later analysis/plotting.
np.save("./results/validation_reward_{}.npy".format(SEED), validation_reward)
np.save("./results/validation_steps_{}.npy".format(SEED), validation_steps)

In [None]:
# Roll out a single episode with the trained actor, rendering every step and
# printing the step index, chosen action and reward.
state = env.reset()
step = 0
done = False
while True:
    # Convert the observation to a float tensor, add a leading dimension of
    # size 1 and move it to `device` before feeding it to the actor.
    state = torch.from_numpy(state).float().unsqueeze(0).to(device)
    actor_output = ddpg.actor(state)
    # Detach from autograd, drop size-1 dimensions and hand a NumPy array
    # to the environment.
    action = actor_output.detach().squeeze().cpu().numpy()

    next_state, reward, done, _ = env.step(action)
    env.render()
    # Pause between steps so the rendered motion can be followed.
    time.sleep(0.1)

    state = next_state
    step = step + 1
    print("Steps: {:4d}, Action: {}, Reward: {:.4f}".format(step, action, reward))

    # The episode ends as soon as the environment reports `done`.
    if done:
        break