In [15]:
import gym
import numpy as np
from torch.utils.tensorboard import SummaryWriter
import torch
import time
import os

import sys
sys.path.append("../utils/")
from normalize_action import NormalizeActions
from replay_buffer import ReplayBuffer
from DDPG import DDPG

In [2]:
## Hyperparameters and experiment setup
env_name = "MountainCarContinuous-v0"

# Seed NumPy (run_ddpg draws its exploration noise from np.random) and PyTorch
# so repeated runs are comparable.
# NOTE(review): the gym environment itself is not seeded here, so runs are not
# bit-for-bit reproducible yet.
seed = 0
np.random.seed(seed)
torch.manual_seed(seed)

# Every execution of this cell logs into its own timestamped directory.
current_time = time.strftime('%Y-%m-%d_%H:%M:%S',time.localtime(time.time()))
ROOT_DIR = "../test_log/DDPG_for_mountian_car/noise_and_step_{}".format(current_time)
model_dir = os.path.join(ROOT_DIR, "model")
plot_dir = os.path.join(ROOT_DIR, "tensorboard")
# exist_ok=True keeps the cell re-runnable when the timestamp has not changed yet.
os.makedirs(model_dir, exist_ok=True)
os.makedirs(plot_dir, exist_ok=True)

# Replay buffer / optimiser settings.
buffer_size = 1000000
batch_size = 128
learning_rate = 1e-3

# Episode and step budgets used by run_ddpg (train_*) and test_ddpg (test_*).
train_episodes = 200
train_steps = 1000
test_episodes = 100
test_steps = 100

# Sweep grid: `update_steps` is the number of environment steps between sign flips
# of the exploration noise, `noise_discounts` scales the noise magnitude.
update_steps = [50, 100, 150, 200, 250, 300]
noise_discounts = [0.1, 0.5, 1.0, 1.5, 2.0, 3.0]

env = NormalizeActions(gym.make(env_name))
in_dim = env.observation_space.shape[0]
out_dim = env.action_space.shape[0]

# Keep preferring the second GPU, but fall back to the first one on single-GPU
# hosts where "cuda:1" does not exist, and to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device("cuda:{}".format(gpu_index))
else:
    device = torch.device("cpu")

replay_buffer = ReplayBuffer(in_dim, batch_size, buffer_size, device)
writer = SummaryWriter(plot_dir)

ddpg = DDPG(in_dim, out_dim, replay_buffer, device, learning_rate)

In [3]:
def run_ddpg(ddpg, writer, update_step, noise_discount, current_model_dir):
    """Train `ddpg` on the global `env` using sign-alternating exploration noise.

    The noise added to the agent's action is the absolute value of a standard
    normal sample scaled by `noise_discount`; its sign flips every `update_step`
    steps within an episode (step 0 included). The sign flag is not reset
    between episodes.

    Args:
        ddpg: agent exposing get_action, store, buffer_size, train and save.
        writer: TensorBoard writer; receives one episode-return scalar per episode.
        update_step: number of steps between sign flips of the noise.
        noise_discount: multiplier applied to the noise magnitude.
        current_model_dir: location handed to `ddpg.save` after training.

    Reads the module-level `env`, `train_episodes`, `train_steps` and `batch_size`.
    """
    push_positive = True

    for episode in range(train_episodes):
        obs = env.reset()
        total_reward = 0
        for step in range(train_steps):
            greedy_action = ddpg.get_action(obs)

            # Reverse the noise direction at every multiple of `update_step`.
            if step % update_step == 0:
                push_positive = not push_positive
            magnitude = abs(np.random.randn(1)) * noise_discount
            if push_positive:
                perturbation = magnitude
            else:
                perturbation = -magnitude
            action = greedy_action + perturbation

            next_obs, reward, done, _ = env.step(action.flatten())

            ddpg.store(obs, action, next_obs.flatten(), reward, done)
            # Only start learning once the buffer holds more than one batch.
            if ddpg.buffer_size() > batch_size:
                _value_loss, _policy_loss = ddpg.train()

            obs = next_obs
            total_reward += reward

            if done:
                break

        # One chart per update_step, one curve per noise_discount.
        train_tag = "train_reward/update_step_{}".format(update_step)
        curve_name = "noise_discount_{}".format(noise_discount)
        writer.add_scalars(train_tag, {curve_name: total_reward}, episode)

    ddpg.save(current_model_dir)
    print("Train < update_step : {}, noise_discount : {} > finished !".format(update_step, noise_discount))

def test_ddpg(ddpg, writer, update_step, noise_discount, current_model_dir):
    """Evaluate the model saved at `current_model_dir` without exploration noise.

    Loads the saved model into `ddpg`, runs `test_episodes` episodes of at most
    `test_steps` steps each on the global `env`, and logs every episode's summed
    reward to `writer`.

    Args:
        ddpg: agent exposing load and get_action.
        writer: TensorBoard writer; receives one episode-return scalar per episode.
        update_step: sweep value, used only to build the TensorBoard tag and the
            final message.
        noise_discount: sweep value, used only to name the curve and in the
            final message.
        current_model_dir: location handed to `ddpg.load`.

    Reads the module-level `env`, `test_episodes` and `test_steps`.
    """
    ddpg.load(current_model_dir)
    for test_episode in range(test_episodes):
        state = env.reset()
        rewards = 0
        for _ in range(test_steps):
            action = ddpg.get_action(state.flatten())
            # Step with a flat action vector, exactly as run_ddpg does, so training
            # and evaluation feed the environment the same action shape.
            # NOTE(review): assumes get_action returns something np.asarray can
            # convert (e.g. a NumPy array) - confirm against DDPG.get_action.
            flat_action = np.asarray(action).flatten()
            next_state, reward, done, _info = env.step(flat_action)
            state = next_state
            rewards += reward
            if done:
                break
        # One chart per update_step, one curve per noise_discount.
        writer.add_scalars("test_reward/update_step_{}".format(update_step),
                           {"noise_discount_{}".format(noise_discount): rewards}, test_episode)
    print("Test  < update_step : {}, noise_discount : {} > finished !".format(update_step, noise_discount))

In [4]:
# torch.save(policy_net.state_dict(), "./model/DDPG_for_mountain_car.pth")

In [5]:
# policy_net_1 = PolicyNetwork(in_dim, out_dim).to(device)
# policy_net_1.load_state_dict(torch.load("./model/DDPG_for_mountain_car.pth"))
# policy_net_1.eval()

# import pdb
# import gym
# from IPython import display
# import matplotlib
# import matplotlib.pyplot as plt
# %matplotlib inline

# env = gym.make("MountainCarContinuous-v0")
# state = env.reset()
# img = plt.imshow(env.render(mode='rgb_array')) # only call this once
# for _ in range(1000):
#     img.set_data(env.render(mode='rgb_array')) # just update the data
#     display.display(plt.gcf())
#     display.clear_output(wait=True)
#     policy_net = policy_net.cpu()
    
#     action = policy_net(torch.FloatTensor(state)).detach().numpy()
#     # action = env.action_space.sample()
#     next_state, _, done, _ = env.step(action)
#     if done: 
#         state = env.reset()
#     state = next_state
    
# from gym import wrappers

# env = gym.make("MountainCarContinuous-v0")
# env = wrappers.Monitor(env, "./gym-results/DDPG_mountaincar/", force=True)

## Test Results

#### Plot using TensorBoard

In [6]:
# Full grid sweep: train and then evaluate one agent per (update_step, noise_discount)
# pair. This is the expensive cell of the notebook (one complete training run per pair).
for update_step in update_steps:
    for noise_discount in noise_discounts:
        current_model_dir = os.path.join(model_dir, "update_step_{}_noise_discount_{}"
                                         .format(update_step, noise_discount))

        # Start every configuration from a fresh replay buffer and a fresh agent so
        # that results are not influenced by weights or transitions left over from
        # the previous configuration.
        sweep_buffer = ReplayBuffer(in_dim, batch_size, buffer_size, device)
        sweep_agent = DDPG(in_dim, out_dim, sweep_buffer, device, learning_rate)

        # train
        run_ddpg(sweep_agent, writer, update_step, noise_discount, current_model_dir)

        # test (previously unreachable because of a stray `break` after training,
        # which also ended the inner loop after the first noise_discount)
        test_ddpg(sweep_agent, writer, update_step, noise_discount, current_model_dir)

# Make sure all pending scalars reach the event file on disk.
writer.flush()

KeyboardInterrupt: 

#### Plot using seaborn and matplotlib

In [None]:
# import pandas as pd

# train_reward = np.array([])
# test_reward = np.array([])

# for epoch in range(5):
#     # train
    
#     rewards = run_ddpg(epoch)
#     train_reward = np.concatenate((train_reward, rewards))
    
#     # test
    
#     rewards = test_ddpg(epoch)
#     test_reward = np.concatenate((test_reward, rewards))
    
# train_x = lambda : np.arange(1, train_episodes+1)
# train_list = np.stack((train_x() for _ in range(5)), axis=0).flatten()
    
# test_x = lambda : np.arange(1, test_episodes+1)
# test_list = np.stack((test_x() for _ in range(5)), axis=0).flatten()

# train_data = pd.DataFrame(dict(x=train_list, y=train_reward))
# test_data = pd.DataFrame(dict(x=test_list, y=test_reward))

# train_data.to_csv("./test/DDPG_OUnoise/trian_data.csv", index=False)
# test_data.to_csv("./test/DDPG_OUnoise/test_data.csv", index=False)

In [None]:
# import seaborn as sns; sns.set()
# import matplotlib.pyplot as plt
# import pandas as pd
# DDPG_NO_test_data = pd.read_csv("RL_notes_and_codes/algorithm_implement/test/DDPG_Normalnoise/test_data.csv")
# DDPG_NO_train_data = pd.read_csv("RL_notes_and_codes/algorithm_implement/test/DDPG_Normalnoise/trian_data.csv")
# DDPG_OU_test_data = pd.read_csv("RL_notes_and_codes/algorithm_implement/test/DDPG_OUnoise/test_data.csv")
# DDPG_OU_train_data = pd.read_csv("RL_notes_and_codes/algorithm_implement/test/DDPG_OUnoise/trian_data.csv")

In [None]:
# DDPG_NO_test_data["diff"] = "Normalnoise"
# DDPG_NO_train_data["diff"] = "Normalnoise"
# DDPG_OU_test_data["diff"] = "OUnoise"
# DDPG_OU_train_data["diff"] = "OUnoise"

In [None]:
# DDPG_test_data = pd.concat((DDPG_NO_test_data, DDPG_OU_test_data))
# DDPG_train_data = pd.concat((DDPG_NO_train_data, DDPG_OU_train_data))

In [None]:
# plt.figure(figsize=(16, 8))

# plt.subplot(211)
# ax = sns.lineplot(x="x", y="y", hue="diff", data=DDPG_test_data)
# plt.title("Test 5 times Reward of 100 steps for each episode, Avg: Normalnoise {}, OUnoise {}"
#          .format(round(DDPG_test_data[DDPG_test_data['diff']=="Normalnoise"]['y'].mean(), 3), 
#                 round(DDPG_test_data[DDPG_test_data['diff']=="OUnoise"]['y'].mean(), 3)))
# plt.xlabel("episodes")
# plt.ylabel("rewards")

# plt.subplot(212)
# ax = sns.lineplot(x="x", y="y", hue="diff", data=DDPG_train_data)
# plt.title("Train 5 times Reward of 1000000 steps for each episode, Avg: Normalnoise {}, OUnoise {}"
#          .format(round(DDPG_train_data[DDPG_train_data['diff']=="Normalnoise"]['y'].mean(), 3), 
#                 round(DDPG_train_data[DDPG_train_data['diff']=="OUnoise"]['y'].mean(), 3)))
# plt.xlabel("episodes")
# plt.ylabel("rewards")

# plt.savefig("RL_notes_and_codes/algorithm_implement/test/Noise_test.png")
# plt.show()

![ddpg_noise_test.png](../assets/ddpg_noise_test_1.png)