In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import gym, time
import numpy as np
from torch.autograd import Variable
import pandas as pd
from collections import deque

In [2]:
env = gym.make('MountainCarContinuous-v0')
env.seed(1234)
np.random.seed(1234)

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt

def running_mean(x, N):
    cumsum = np.cumsum(np.insert(x, 0, 0)) 
    return (cumsum[N:] - cumsum[:-N]) / N

In [4]:
class QNetwork(nn.Module):
    def __init__(self, learning_rate=0.01, state_size=2, 
                 action_size=1, hidden_size=10, batch_size=20,
                 name='QNetwork'):
        nn.Module.__init__(self)
        self.fc1 = nn.Linear(state_size+action_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, 1)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.output(x)
        return x

In [5]:
class PolicyNetwork(nn.Module):
    def __init__(self, learning_rate=0.01, state_size=2, 
                 action_size=1, hidden_size=10, batch_size=20,
                 name='PolicyNetwork'):
        nn.Module.__init__(self)
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, action_size)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.output(x)
        x = torch.tanh(x)
        return x

In [6]:
from collections import deque
class Memory():
    def __init__(self, max_size = 1000):
        self.buffer = deque(maxlen=max_size)
    
    def add(self, experience):
        self.buffer.append(experience)
            
    def sample(self, batch_size):
        idx = np.random.choice(np.arange(len(self.buffer)), 
                               size=batch_size, 
                               replace=False)
        return [self.buffer[ii] for ii in idx]
    
    def length(self):
        return len(self.buffer)

In [7]:
train_episodes = 500          # max number of episodes to learn from
max_steps = 1000                # max steps in an episode
gamma = 0.99                   # future reward discount

action_size = 1

# Exploration parameters
explore_start = 0.2            # exploration probability at start
explore_stop = 0.001            # minimum exploration probability 
decay_rate = 0.0001            # exponential decay rate for exploration prob

# Network parameters
hidden_size = 16               # number of units in each Q-network hidden layer
learning_rate = 0.01         # Q-network learning rate

# Memory parameters
memory_size = 200000            # memory capacity
batch_size = 512                # experience mini-batch size
pretrain_length = batch_size   # number experiences to pretrain the memory

In [8]:
memory = Memory(memory_size)
q_network = QNetwork(name='main', hidden_size=hidden_size, learning_rate=learning_rate,batch_size=batch_size)
policy_network = PolicyNetwork(name='main', hidden_size=hidden_size, learning_rate=learning_rate,batch_size=batch_size)

In [None]:
# Now train with experiences
#saver = tf.train.Saver()
rewards_list = []
step = 0
opt_q = optim.Adam(q_network.parameters(), learning_rate/5.0)
opt_policy = optim.Adam(policy_network.parameters(), learning_rate)

outputs = deque(maxlen=10000)

count_stop = 0
for ep in range(1, train_episodes):
    total_reward = 0
    # Start new episode
    state = env.reset()
            
    for t in range(max_steps):
        action = policy_network(Variable(torch.FloatTensor(state))).data.numpy()
        epsilon = max(explore_stop, explore_start*(50.0 - ep)/50.0)
        if np.random.rand() < explore_start:
            action += 0.2*np.random.rand()
        action = np.clip(action, -1, 1)
            
        result = np.hstack((state, action))
        outputs.append(result)

        # Take action, get new state and reward
        next_state, reward, done, _ = env.step(action)
        total_reward += reward
        # Add experience to memory
        memory.add((state, action, reward, next_state, done))
        
        state = next_state

        if len(memory.buffer) >= batch_size:
            # Sample mini-batch from memory
            batch = memory.sample(batch_size)
            states = np.array([each[0] for each in batch])
            ### ポイント！！！
            # actionはスカラーなのでベクトルにする
            # actionsはベクトルでなく、statesと同じ行列
            actions = np.array([each[1] for each in batch])
            ### ポイント終わり
            rewards = np.array([[each[2]] for each in batch])
            next_states = np.array([each[3] for each in batch])
            dones = np.array([[each[4]] for each in batch])

            # Train network
            #non_final_mask = torch.tensor(tuple(map(lambda s: s==False, dones)), dtype=torch.uint8)
            # 終端状態のQ値はその後の報酬が存在しないためゼロとする
            #target_maxQs = torch.zeros(batch_size)
            #target_maxQs[non_final_mask] = q_network(Variable(torch.FloatTensor(next_states)[non_final_mask])).max(1)[0].detach()

            #tutorial way
            next_actions = policy_network(Variable(torch.FloatTensor(next_states))).detach()
            next_Qs = q_network(torch.cat([torch.FloatTensor(next_states), next_actions], -1)).detach().numpy()
            targets = (torch.FloatTensor(rewards) + gamma * torch.FloatTensor(next_Qs*(1-dones)))

            current_q_values = q_network(torch.cat([torch.FloatTensor(states), torch.FloatTensor(actions)], -1))

            critic_loss = torch.nn.SmoothL1Loss()(current_q_values, targets)
            # backpropagation of loss to NN
            # 勾配を初期化
            opt_q.zero_grad()
            critic_loss.backward()
            opt_q.step()

            #print(loss)

            actor_loss = -q_network(torch.cat([torch.FloatTensor(states), policy_network(Variable(torch.FloatTensor(states)))], -1)).mean()
            opt_policy.zero_grad()
            actor_loss.backward()
            opt_policy.step()

            #print(loss)
        
        if done:
            break
    # the episode ends so no next state
    print('Episode: {}'.format(ep),
          'Total reward: {}'.format(total_reward),
          'Training loss: {:.4f}'.format(actor_loss.data.numpy()))
    rewards_list.append((ep, total_reward))
df = pd.DataFrame(outputs)
df.to_csv('result.csv')

Episode: 1 Total reward: -0.38817920569993836 Training loss: -0.0145
Episode: 2 Total reward: -0.374665107816222 Training loss: -0.0148
Episode: 3 Total reward: -0.4253035545667357 Training loss: -0.0088
Episode: 4 Total reward: -0.6754640003929695 Training loss: 0.0117


In [None]:
memory.buffer

In [None]:
next_Qs

In [None]:
q_network(torch.cat([torch.FloatTensor(states), policy_network(Variable(torch.FloatTensor(states)))], -1))

In [None]:
memory.buffer.size