In [1]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
import time
import os
%matplotlib inline
# Prefer the second GPU (index 1) when the machine actually has one; otherwise
# fall back to the first GPU, and to the CPU when CUDA is unavailable.
# Selecting "cuda:1" purely on `is_available()` yields an invalid device on
# single-GPU machines.
if torch.cuda.is_available():
    device = torch.device("cuda:1" if torch.cuda.device_count() > 1 else "cuda:0")
else:
    device = torch.device("cpu")

In [2]:
import sys
sys.path.append("../utils/")
from replay_buffer import ReplayBuffer
from normalize_action import NormalizeActions
from noise import OUNoise

In [6]:
class ValueNetwork(nn.Module):
    """Critic network: maps a (state, action) pair to a single Q-value.

    The state and the action are embedded by separate 64-unit linear layers.
    The two embeddings are concatenated along dim 1 (action features first,
    state features second) and passed through a 32-unit hidden layer to a
    one-unit output.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        # Layer creation order determines how the RNG is consumed during
        # parameter initialisation, so it is kept stable.
        self.linear_state = nn.Linear(state_dim, 64)
        self.linear_action = nn.Linear(action_dim, 64)
        self.linear2 = nn.Linear(128, 32)
        self.linear3 = nn.Linear(32, 1)

    def forward(self, state, action):
        """Return the Q-value for each row of ``state`` / ``action``."""
        state_features = self.linear_state(state).relu()
        action_features = self.linear_action(action).relu()
        # Column order matters: linear2's weights expect action features in
        # columns 0..63 and state features in columns 64..127.
        joint = torch.cat([action_features, state_features], dim=1)
        return self.linear3(self.linear2(joint).relu())

class PolicyNetwork(nn.Module):
    """Deterministic actor: maps a state to an action squashed into [-1, 1].

    Architecture: in_dim -> 128 -> 64 -> out_dim with ReLU on the hidden layers
    and tanh on the output.
    """

    def __init__(self, in_dim, out_dim):
        super(PolicyNetwork, self).__init__()
        self.linear1 = nn.Linear(in_dim, 128)
        self.linear2 = nn.Linear(128, 64)
        self.linear3 = nn.Linear(64, out_dim)  # 64 -> out_dim

    def forward(self, state):
        """Return tanh-bounded actions for a batch of states."""
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))
        x = torch.tanh(self.linear3(x))
        return x

    def get_action(self, state):
        """Compute the action for a single state and return it as a NumPy array.

        Args:
            state: array-like holding one un-batched observation; a leading
                batch axis of size 1 is added before the forward pass.

        Returns:
            ``numpy.ndarray`` with the network output for that one-element
            batch (shape ``(1, out_dim)`` for a 1-D ``state``).
        """
        # Use the device the parameters actually live on instead of a
        # module-level global, so the method keeps working after the network
        # is moved with .cpu() / .to(...).
        module_device = next(self.parameters()).device
        # Inference only: no autograd graph is needed for action selection.
        with torch.no_grad():
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0).to(module_device)
            action = self.forward(state)
        return action.detach().cpu().numpy()

In [None]:
class DDPG:
    """Placeholder for an object-oriented DDPG agent (not implemented yet).

    The update and rollout logic currently lives in the module-level functions
    ``ddpg_train`` and ``run_ddpg``; this class only reserves the name.
    """

    def __init__(self):
        # TODO: move networks, optimizers and the replay buffer into the agent.
        # The explicit `pass` keeps the cell syntactically valid for Run All.
        pass

In [7]:
def _soft_update(target_net, source_net, tau):
    """Blend ``source_net`` parameters into ``target_net`` in place.

    Each target parameter becomes ``(1 - tau) * target + tau * source``.
    """
    for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)


def ddpg_train(batch_size, gamma=0.99, soft_tau=1e-2):
    """Run one DDPG update (critic, then actor, then target networks).

    Operates on the module-level ``replay_buffer``, ``value_net``,
    ``policy_net``, their target copies and the two optimizers.

    Args:
        batch_size: Unused; kept for backward compatibility. The batch is
            whatever ``replay_buffer.sample()`` returns.
        gamma: Discount factor applied to the bootstrapped target value.
        soft_tau: Interpolation weight for the target-network soft update.

    Returns:
        Tuple ``(value_loss, policy_loss)`` as Python floats.
    """
    samples = replay_buffer.sample()
    state, action, next_state = samples['current_state'], samples['action'], samples['next_state']
    reward, done = samples['reward'], samples['done']

    # Bootstrapped TD target. It is a constant for the critic loss, so it is
    # computed without building an autograd graph.
    # NOTE(review): assumes `reward` and `done` broadcast element-wise against
    # the (batch, 1) critic output - confirm the shapes ReplayBuffer returns.
    with torch.no_grad():
        next_action = target_policy_net(next_state)
        target_value = reward + (1.0 - done) * gamma * target_value_net(next_state, next_action)

    # Critic update.
    value = value_net(state, action)
    value_loss = ((value - target_value).pow(2)).mean()

    value_optimizer.zero_grad()
    value_loss.backward()
    value_optimizer.step()

    # Actor update. The loss graph is built AFTER the critic step: building it
    # before and back-propagating after would differentiate through critic
    # weights that the optimizer has since modified in place, which recent
    # PyTorch rejects with a RuntimeError (and older versions silently turn
    # into inconsistent gradients).
    policy_loss = -value_net(state, policy_net(state)).mean()

    policy_optimizer.zero_grad()
    # This also deposits gradients on value_net; they are discarded by
    # value_optimizer.zero_grad() at the start of the next critic update.
    policy_loss.backward()
    policy_optimizer.step()

    _soft_update(target_value_net, value_net, soft_tau)
    _soft_update(target_policy_net, policy_net, soft_tau)

    return value_loss.item(), policy_loss.item()

In [8]:
# Environment: MountainCarContinuous wrapped in the project's NormalizeActions
# wrapper (Pendulum-v0 is kept below as a commented-out alternative).
# env = NormalizeActions(gym.make("Pendulum-v0"))
env = NormalizeActions(gym.make("MountainCarContinuous-v0"))
# Not used by run_ddpg, which draws its own Gaussian noise.
ou_noise = OUNoise(env.action_space)

in_dim = env.observation_space.shape[0] # observation size; NOTE(review): the earlier "3" looks like a Pendulum-v0 leftover - confirm for this env
out_dim = env.action_space.shape[0] # 1, continuous action space

# Online critic and actor.
value_net = ValueNetwork(in_dim, out_dim).to(device)
policy_net = PolicyNetwork(in_dim, out_dim).to(device)

# Target networks start as exact copies of the online networks.
target_value_net = ValueNetwork(in_dim, out_dim).to(device)
target_policy_net = PolicyNetwork(in_dim, out_dim).to(device)
target_value_net.load_state_dict(value_net.state_dict())
target_policy_net.load_state_dict(policy_net.state_dict())

# The critic uses Adam's default learning rate; the actor uses 1e-4.
value_optimizer = optim.Adam(value_net.parameters())
policy_optimizer = optim.Adam(policy_net.parameters(), lr=1e-4)

# Episode / step budgets read by run_ddpg and test_ddpg.
train_episodes = 250
train_steps = 1000
test_episodes = int(train_episodes / 2)
test_steps = 100

# Replay buffer; presumably (state dim, sample batch size, capacity) - verify
# against the ReplayBuffer constructor.
buffer_size = 1000000
batch_size = 128
replay_buffer = ReplayBuffer(in_dim, batch_size, buffer_size)

# Module-level flag; run_ddpg defines its own local `test`, so it does not read this one.
test = True

In [9]:
def smooth_plot(factor, item, plot_decay):
    """Smooth a series with a trailing moving average and thin it for plotting.

    Point ``i`` of the smoothed series is the mean of ``item[i - factor : i + 1]``
    (clamped at the start of the series), so each point averages only itself
    and up to ``factor`` preceding values. One consistent trailing window is
    used for the whole series; this avoids a jump at ``i == factor + 1`` and
    keeps the tail of the curve smoothed.

    Args:
        factor: Number of preceding samples included in each average.
        item: Sequence of values to smooth.
        plot_decay: The series is halved (every second point kept) once for
            every ``plot_decay`` samples it contains.

    Returns:
        Tuple ``(item_x, item_smooth)``: a NumPy array of x positions and a
        list of smoothed values, both thinned the same way.
    """
    item_x = np.arange(len(item))
    item_smooth = [np.mean(item[max(0, i - factor):i + 1]) for i in range(len(item))]
    for _ in range(len(item) // plot_decay):
        item_x = item_x[::2]
        item_smooth = item_smooth[::2]
    return item_x, item_smooth
    
def plot(episode, rewards, value_losses, policy_losses, noise):
    """Redraw the four training curves (reward, critic loss, actor loss, noise).

    Clears the previous cell output, then draws each raw series in a light
    colour with its smoothed version from ``smooth_plot`` on top.

    Args:
        episode: Current episode index, shown in the first panel's title.
        rewards: Per-episode rewards (must be non-empty; the last smoothed
            value is shown in the title).
        value_losses: Critic losses, one per update.
        policy_losses: Actor losses, one per update.
        noise: Exploration-noise samples to display.
    """
    # Imported here because the notebook's import cell never brings
    # clear_output into scope; calling it unimported raises NameError.
    from IPython.display import clear_output

    clear_output(True)
    rewards_x, rewards_smooth = smooth_plot(10, rewards, 500)
    value_losses_x, value_losses_smooth = smooth_plot(10, value_losses, 10000)
    policy_losses_x, policy_losses_smooth = smooth_plot(10, policy_losses, 10000)
    noise_x, noise_smooth = smooth_plot(10, noise, 100)

    # (subplot position, title, raw series, smoothed x, smoothed y, raw label, smoothed label)
    panels = [
        (411, 'episode %s. reward: %s' % (episode, rewards_smooth[-1]),
         rewards, rewards_x, rewards_smooth, "Rewards", 'Smothed_Rewards'),
        (412, 'Value_Losses',
         value_losses, value_losses_x, value_losses_smooth,
         "Value_Losses", "Smoothed_Value_Losses"),
        (413, 'Policy_Losses',
         policy_losses, policy_losses_x, policy_losses_smooth,
         "Policy_Losses", "Smoothed_Policy_Losses"),
        (414, 'Noise',
         noise, noise_x, noise_smooth, "Noise", "Smoothed_Noise"),
    ]

    plt.figure(figsize=(18, 12))
    for position, title, raw, smooth_x, smooth_y, raw_label, smooth_label in panels:
        plt.subplot(position)
        plt.title(title)
        plt.plot(raw, label=raw_label, color='lightsteelblue', linewidth=1)
        plt.plot(smooth_x, smooth_y, label=smooth_label, color='darkorange', linewidth=3)
        plt.legend(loc='best')

    plt.show()

In [10]:
# value_losses = []
# policy_losses = []
# all_rewards = []
# updates = 0
# test = True

# for episode in range(train_episodes):
#     state = env.reset()
#     ou_noise.reset()
#     episode_reward = 0
#     noises = []
#     for step in range(train_steps):
#         action1 = policy_net.get_action(state)
        
# #         action = ou_noise.get_action(action1, step)
# #         noises.append(action[0][0]-action1[0][0])
        
#         # 200 update in 10
#         if step % 200 == 0:
#             test = not test
#         noise = abs(np.random.randn(1)) if test else -abs(np.random.randn(1))
#         action = action1 + noise
#         noises.append(noise)
        
#         next_state, reward, done, _ = env.step(action.flatten())

#         replay_buffer.store(state, action, next_state.flatten(), reward, done)
#         if len(replay_buffer) > batch_size :
#             value_loss, policy_loss = ddpg_train(batch_size)
#             value_losses.append(value_loss)
#             policy_losses.append(policy_loss)

#         state = next_state
#         episode_reward += reward

#         if done:
#             break
        
#         updates += 1
    
#     all_rewards.append(episode_reward)

#     plot(episode, all_rewards, value_losses, policy_losses, noises[:200])

In [11]:
def run_ddpg(time, writer, update_step, noise_discount):
    """Train the module-level DDPG networks and save the resulting policy.

    Exploration noise has magnitude ``|N(0, 1)| * noise_discount``. Its sign
    is flipped every ``update_step`` steps within an episode (including step
    0), and the sign state carries over from one episode to the next.

    Uses the module-level ``env``, ``policy_net``, ``replay_buffer``,
    ``batch_size``, ``train_episodes`` and ``train_steps``.

    Args:
        time: Run identifier string used in the output path. Note that inside
            this function the name shadows the ``time`` module.
        writer: Object exposing ``add_scalars`` (a TensorBoard SummaryWriter
            in this notebook); receives one training reward per episode.
        update_step: Number of steps between sign flips of the noise.
        noise_discount: Scale factor applied to the noise magnitude.
    """
    push_positive = True

    for episode in range(train_episodes):
        state = env.reset()
        episode_reward = 0
        for step in range(train_steps):
            greedy_action = policy_net.get_action(state)

            if step % update_step == 0:
                push_positive = not push_positive
            noise_sample = abs(np.random.randn(1)) * noise_discount
            noise = noise_sample if push_positive else -noise_sample
            action = greedy_action + noise

            next_state, reward, done, _ = env.step(action.flatten())

            replay_buffer.store(state, action, next_state.flatten(), reward, done)
            # Start learning once the buffer holds more than one batch.
            if len(replay_buffer) > batch_size:
                ddpg_train(batch_size)

            state = next_state
            episode_reward += reward

            if done:
                break

        writer.add_scalars("train_reward/update_step_{}".format(update_step),
                           {"noise_discount_{}".format(noise_discount): episode_reward}, episode)

    model_path = "./test/Continue_time_{}/model/update_step_{}_noise_discount_{}.pth".format(
        time, update_step, noise_discount)
    # Create the target directory if needed so a completed training run is not
    # lost to a FileNotFoundError at save time.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    torch.save(policy_net.state_dict(), model_path)
    print("Train < update_step : {}, noise_discount : {} > finished !".format(update_step, noise_discount))

def test_ddpg(time, writer, update_step, noise_discount):
    """Evaluate the policy saved by ``run_ddpg`` without exploration noise.

    Loads the checkpoint for ``(time, update_step, noise_discount)`` into a
    fresh ``PolicyNetwork`` and logs the total reward of each test episode.

    Uses the module-level ``env``, ``in_dim``, ``out_dim``, ``device``,
    ``test_episodes`` and ``test_steps``.

    Args:
        time: Run identifier string used to locate the checkpoint.
        writer: Object exposing ``add_scalars``; receives one reward per
            test episode.
        update_step: Value used in the checkpoint name and the log tag.
        noise_discount: Value used in the checkpoint name and the log key.
    """
    model_path = "./test/Continue_time_{}/model/update_step_{}_noise_discount_{}.pth".format(
        time, update_step, noise_discount)
    policy_net_1 = PolicyNetwork(in_dim, out_dim).to(device)
    # map_location lets a checkpoint saved on another device load onto the current one.
    policy_net_1.load_state_dict(torch.load(model_path, map_location=device))
    for test_episode in range(test_episodes):
        state = env.reset()
        rewards = 0
        for _ in range(test_steps):
            action = policy_net_1.get_action(state.flatten())
            # Flatten the batched network output before stepping, the same way
            # run_ddpg does, so the environment receives a 1-D action.
            next_state, reward, done, _ = env.step(action.flatten())
            state = next_state
            rewards += reward
            if done:
                break
        writer.add_scalars("test_reward/update_step_{}".format(update_step),
                           {"noise_discount_{}".format(noise_discount): rewards}, test_episode)
    print("Test  < update_step : {}, noise_discount : {} > finished !".format(update_step, noise_discount))

In [12]:
# torch.save(policy_net.state_dict(), "./model/DDPG_for_mountain_car.pth")

In [13]:
# policy_net_1 = PolicyNetwork(in_dim, out_dim).to(device)
# policy_net_1.load_state_dict(torch.load("./model/DDPG_for_mountain_car.pth"))
# policy_net_1.eval()

# import pdb
# import gym
# from IPython import display
# import matplotlib
# import matplotlib.pyplot as plt
# %matplotlib inline

# env = gym.make("MountainCarContinuous-v0")
# state = env.reset()
# img = plt.imshow(env.render(mode='rgb_array')) # only call this once
# for _ in range(1000):
#     img.set_data(env.render(mode='rgb_array')) # just update the data
#     display.display(plt.gcf())
#     display.clear_output(wait=True)
#     policy_net = policy_net.cpu()
    
#     action = policy_net(torch.FloatTensor(state)).detach().numpy()
#     # action = env.action_space.sample()
#     next_state, _, done, _ = env.step(action)
#     if done: 
#         state = env.reset()
#     state = next_state
    
# from gym import wrappers

# env = gym.make("MountainCarContinuous-v0")
# env = wrappers.Monitor(env, "./gym-results/DDPG_mountaincar/", force=True)

## Test Results

#### Plot using TensorBoard

In [14]:
# Timestamp that namespaces this run's TensorBoard logs and saved models.
# It gets its own name: assigning it to `time` would replace the imported
# `time` module with a string and make this cell fail when re-run.
run_timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime())
writer = SummaryWriter(log_dir="test/Continue_time_{}/tensorboard/".format(run_timestamp))

# Create the checkpoint directory (and any missing parents); a no-op if it exists.
dirName = "./test/Continue_time_{}/model".format(run_timestamp)
os.makedirs(dirName, exist_ok=True)

# Grid of exploration settings: steps between noise sign flips x noise scale.
update_steps = [50, 100, 150, 200, 250, 300]
noise_discounts = [0.1, 0.5, 1.0, 1.5, 2.0, 3.0]

# NOTE(review): run_ddpg trains the module-level networks and replay buffer,
# which are created once; every configuration therefore continues from the
# previous one's weights and data. Confirm this is intended before comparing
# configurations.
try:
    for update_step in update_steps:
        for noise_discount in noise_discounts:
            # train
            run_ddpg(run_timestamp, writer, update_step, noise_discount)
            # test
            test_ddpg(run_timestamp, writer, update_step, noise_discount)
finally:
    # Flush pending events even if the sweep is interrupted.
    writer.close()

Train < update_step : 50, noise_discount : 0.1 > finished !
Test  < update_step : 50, noise_discount : 0.1 > finished !
Train < update_step : 50, noise_discount : 0.5 > finished !
Test  < update_step : 50, noise_discount : 0.5 > finished !
Train < update_step : 50, noise_discount : 1.0 > finished !
Test  < update_step : 50, noise_discount : 1.0 > finished !
Train < update_step : 50, noise_discount : 1.5 > finished !
Test  < update_step : 50, noise_discount : 1.5 > finished !
Train < update_step : 50, noise_discount : 2.0 > finished !
Test  < update_step : 50, noise_discount : 2.0 > finished !
Train < update_step : 50, noise_discount : 3.0 > finished !
Test  < update_step : 50, noise_discount : 3.0 > finished !
Train < update_step : 100, noise_discount : 0.1 > finished !
Test  < update_step : 100, noise_discount : 0.1 > finished !
Train < update_step : 100, noise_discount : 0.5 > finished !
Test  < update_step : 100, noise_discount : 0.5 > finished !
Train < update_step : 100, noise_dis

KeyboardInterrupt: 

#### Plot using seaborn and matplotlib

In [None]:
# import pandas as pd

# train_reward = np.array([])
# test_reward = np.array([])

# for epoch in range(5):
#     # train
    
#     rewards = run_ddpg(epoch)
#     train_reward = np.concatenate((train_reward, rewards))
    
#     # test
    
#     rewards = test_ddpg(epoch)
#     test_reward = np.concatenate((test_reward, rewards))
    
# train_x = lambda : np.arange(1, train_episodes+1)
# train_list = np.stack((train_x() for _ in range(5)), axis=0).flatten()
    
# test_x = lambda : np.arange(1, test_episodes+1)
# test_list = np.stack((test_x() for _ in range(5)), axis=0).flatten()

# train_data = pd.DataFrame(dict(x=train_list, y=train_reward))
# test_data = pd.DataFrame(dict(x=test_list, y=test_reward))

# train_data.to_csv("./test/DDPG_OUnoise/trian_data.csv", index=False)
# test_data.to_csv("./test/DDPG_OUnoise/test_data.csv", index=False)

In [None]:
# import seaborn as sns; sns.set()
# import matplotlib.pyplot as plt
# import pandas as pd
# DDPG_NO_test_data = pd.read_csv("RL_notes_and_codes/algorithm_implement/test/DDPG_Normalnoise/test_data.csv")
# DDPG_NO_train_data = pd.read_csv("RL_notes_and_codes/algorithm_implement/test/DDPG_Normalnoise/trian_data.csv")
# DDPG_OU_test_data = pd.read_csv("RL_notes_and_codes/algorithm_implement/test/DDPG_OUnoise/test_data.csv")
# DDPG_OU_train_data = pd.read_csv("RL_notes_and_codes/algorithm_implement/test/DDPG_OUnoise/trian_data.csv")

In [None]:
# DDPG_NO_test_data["diff"] = "Normalnoise"
# DDPG_NO_train_data["diff"] = "Normalnoise"
# DDPG_OU_test_data["diff"] = "OUnoise"
# DDPG_OU_train_data["diff"] = "OUnoise"

In [None]:
# DDPG_test_data = pd.concat((DDPG_NO_test_data, DDPG_OU_test_data))
# DDPG_train_data = pd.concat((DDPG_NO_train_data, DDPG_OU_train_data))

In [None]:
# plt.figure(figsize=(16, 8))

# plt.subplot(211)
# ax = sns.lineplot(x="x", y="y", hue="diff", data=DDPG_test_data)
# plt.title("Test 5 times Reward of 100 steps for each episode, Avg: Normalnoise {}, OUnoise {}"
#          .format(round(DDPG_test_data[DDPG_test_data['diff']=="Normalnoise"]['y'].mean(), 3), 
#                 round(DDPG_test_data[DDPG_test_data['diff']=="OUnoise"]['y'].mean(), 3)))
# plt.xlabel("episodes")
# plt.ylabel("rewards")

# plt.subplot(212)
# ax = sns.lineplot(x="x", y="y", hue="diff", data=DDPG_train_data)
# plt.title("Train 5 times Reward of 1000000 steps for each episode, Avg: Normalnoise {}, OUnoise {}"
#          .format(round(DDPG_train_data[DDPG_train_data['diff']=="Normalnoise"]['y'].mean(), 3), 
#                 round(DDPG_train_data[DDPG_train_data['diff']=="OUnoise"]['y'].mean(), 3)))
# plt.xlabel("episodes")
# plt.ylabel("rewards")

# plt.savefig("RL_notes_and_codes/algorithm_implement/test/Noise_test.png")
# plt.show()

![ddpg_noise_test.png](../assets/ddpg_noise_test_1.png)