In [1]:
import numpy as np
import torch
import torch.nn as nn
import gym
from copy import deepcopy

In [2]:
class ReplayBuffer:
    def __init__(self, state_dim, max_size, batch_size, action_dim=1):
        self.max_size = max_size
        self.state_dim = state_dim
        self.other_dim = action_dim + 1 + 1
        self.batch_size = batch_size
        
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        self.state = np.empty((self.max_size, self.state_dim), dtype=np.float32)
        self.other = np.empty((self.max_size, self.other_dim), dtype=np.float32)
        self.size = 0
        self.current_index = 0
    
    def store(self, state, action, reward, done):
        self.state[self.current_index] = state
        self.other[self.current_index] = [action, reward, done]
        self.current_index = (self.current_index + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)
    
    def sample_batch(self):
        ptr = np.random.choice(self.size, self.batch_size)
        return (torch.FloatTensor(self.state[ptr]).to(self.device),
                torch.FloatTensor(self.state[ptr + 1]).to(self.device),
                    # TODO remove reshape
                torch.LongTensor(self.other[ptr, 0:1].reshape(-1, 1)).to(self.device),
                torch.FloatTensor(self.other[ptr, 1:2].reshape(-1, 1)).to(self.device),
                torch.FloatTensor(self.other[ptr, 2:].reshape(-1, 1)).to(self.device))
    
    def __len__(self):
        return self.size

In [3]:
class QNetwork(nn.Module):
    def __init__(self, state_dim, mid_dim, action_dim):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(state_dim, mid_dim),
            nn.ReLU(),
            nn.Linear(mid_dim, mid_dim),
            nn.ReLU(),
            nn.Linear(mid_dim, mid_dim),
            nn.ReLU(),
            nn.Linear(mid_dim, action_dim)
        )
    
    def forward(self, state):
        return self.network(state)

In [15]:
class Agent(object):
    def __init__(self, env_name, mid_dim = 256):
        self.learning_rate = 1e-4
        self.gamma = 0.99
        self.soft_update_tau = 5e-3
        self.episode_num = 10000
        self.update_step = 300
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        self.env = gym.make(env_name)
        self.state_dim = self.env.observation_space.shape[0]
        self.mid_dim = mid_dim
        self.action_dim = self.env.action_space.n
        self.max_size = 100000
        self.batch_size = 256
        
        self.network = QNetwork(self.state_dim, self.mid_dim, self.action_dim).to(self.device)
        self.target_network = deepcopy(self.network)
        self.optimizer = torch.optim.Adam(self.network.parameters(), lr=self.learning_rate)
        self.criterion = torch.nn.MSELoss()
        
        self.replay_buffer = ReplayBuffer(self.state_dim, self.max_size, self.batch_size)
    
    def claculate_epsilon(self, episode):
        min_epsilon = 0.05
        max_epsilon = 1
        epsilon_decay = 800
        epsilon_episode = lambda episode : min_epsilon + np.exp(-episode / epsilon_decay)*0.95
        
        return epsilon_episode(episode)
        
    def select_action(self, episode, state):
        if np.random.random_sample() > self.claculate_epsilon(episode):
            return self.network(torch.FloatTensor(state).to(self.device)).argmax().detach().cpu().numpy()
        else:
            return self.env.action_space.sample()
    
    def update(self):
        for _ in range(self.update_step):
            with torch.no_grad():
                state, next_state, action, reward, done = self.replay_buffer.sample_batch()
                next_Q = self.target_network(next_state).max(dim = 1, keepdim=True)[0]
                target = reward + done * next_Q * self.gamma

            current_Q = self.network(state).gather(1, action)
            loss = self.criterion(current_Q, target)
            
            self.optimizer.zero_gard()
            loss.backward()
            self.optimizer.step()
            
            self.soft_update(self.target_network, self.network, self.soft_update_tau)
        
        return loss.item()
    
    @staticmethod
    def soft_update(target_net, current_net, tau):
        for tar, cur in zip(target_net.parameters(), current_net.parameters()):
            tar.data.copy_(cur.data.__mul__(tau) + tar.data.__mul__(1 - tau))
        
    def load_model(self):
        pass
    
    def save_model(self):
        pass
    

In [16]:
class Plot:
    def __init__(self, plot_num):
        self.plot_num = plot_num
        
    def smooth_plot(self, item, factor=10, plot_decay=350):
        item_x = np.arange(len(item))
        item_smooth = [np.mean(item[i:i+factor]) if i > factor else np.mean(item[0:i+1])
                      for i in range(len(item))]
        for i in range(len(item) // plot_decay):
            item_x = item_x[::2]
            item_smooth = item_smooth[::2]
        return item_x, item_smooth
    
    def plot_fig(self, episode, **kwargs):
        clear_output(True)
        plt.figure(figsize=(18, 4 * self.plot_num))
        
        for index, key in enumerate(kwargs.keys()):
            data = kwargs[key]
            data_x, data_smooth = self.smooth_plot(data)
            
            plt.subplot(self.plot_num, 1, index + 1)
            plt.title('episode {}. {}: {}'.format(episode, key, data_smooth[-1]))
            plt.plot(data, label=key, color='lightsteelblue', linewidth='1')
            plt.plot(data_x, data_smooth, label='Smothed_{}'.format(key), color='darkorange', linewidth='3')
            plt.legend(loc='best')
            
        plt.show()

In [17]:
env_name = "CartPole-v0"
agent = Agent(env_name)
plot = Plot(3)

In [19]:
all_rewards, all_losses, episode_steps = [], [], []

for episode in range(agent.episode_num):
    state, rewards = agent.env.reset(), 0
    losses = []
    for i in range(10000):
        action = agent.select_action(episode, state)
        next_state, reward, done, _ = agent.env.step(action)
        agent.replay_buffer.store(state, action, reward, done)
        state = next_state
        rewards += reward
        if done:
            episode_steps.append(i)
            break
        if len(agent.replay_buffer) > agent.batch_size:
            losses.append(agent.update())
    all_losses.append(sum(losses)/len(losses))
    all_rewards.append(rewards)
    
    if episode % 20 == 0:
        kwargs = {
            "Losses": all_losses,
            "Rewards": all_rewards,
            "Episode_Steps": episode_steps
        }
        plot.plot_fig(episode, **kwargs)

ZeroDivisionError: division by zero

In [18]:
env = gym.make(env_name)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n - 1
# replay_buffer = ReplayBuffer()
print(state_dim, action_dim)

4 1
