In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import gym, time
import numpy as np
import pandas as pd

In [2]:
# Create the environment and seed the environment, NumPy and PyTorch RNGs
# so that repeated runs start from the same random state.
env = gym.make('MountainCarContinuous-v0')
env.seed(1234)
np.random.seed(1234)
# Network weights are initialised from torch's RNG, so it needs a seed too.
torch.manual_seed(1234)

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt

def running_mean(x, N):
    """Return the moving average of ``x`` over a sliding window of ``N`` items.

    Computed from differences of a zero-prefixed cumulative sum, so the result
    has ``len(x) - N + 1`` entries (one per full window).
    """
    # Prefix a zero so that totals[k] is the sum of the first k elements.
    totals = np.cumsum(np.insert(x, 0, 0))
    window_sums = totals[N:] - totals[:-N]
    return window_sums / N

In [4]:
class QNetwork(nn.Module):
    """Action-value network: maps a concatenated (state, action) vector to one scalar.

    The network is two ReLU hidden layers of ``hidden_size`` units followed by
    a linear output unit. ``learning_rate``, ``batch_size`` and ``name`` are
    accepted for call-site compatibility but are not used by this class.
    """

    def __init__(self, learning_rate=0.01, state_size=2, 
                 action_size=1, hidden_size=10, batch_size=20,
                 name='QNetwork'):
        super(QNetwork, self).__init__()
        # The input is the state and the action side by side.
        input_width = state_size + action_size
        self.fc1 = nn.Linear(input_width, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return the scalar value estimate for each row of ``x``."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.output(hidden)

In [5]:
class PolicyNetwork(nn.Module):
    """Deterministic policy network: maps a state vector to an action vector.

    The network is a ReLU hidden layer, a tanh hidden layer, then a linear
    output of width ``action_size``. ``learning_rate``, ``batch_size`` and
    ``name`` are accepted for call-site compatibility but are not used by
    this class.
    """

    def __init__(self, learning_rate=0.01, state_size=2, 
                 action_size=1, hidden_size=10, batch_size=20,
                 name='PolicyNetwork'):
        nn.Module.__init__(self)
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, action_size)
        
    def forward(self, x):
        """Return the action produced for each state row in ``x``."""
        x = F.relu(self.fc1(x))
        # torch.tanh replaces the deprecated F.tanh; the values are identical.
        x = torch.tanh(self.fc2(x))
        x = self.output(x)
        return x

In [6]:
from collections import deque
class Memory():
    """Bounded experience store backed by a ``deque``.

    Once ``max_size`` items are held, appending another one discards the
    oldest entry.
    """

    def __init__(self, max_size = 1000):
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Append one experience to the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored experiences chosen at random."""
        positions = np.arange(len(self.buffer))
        # replace=False: no experience appears twice within one batch.
        chosen = np.random.choice(positions, size=batch_size, replace=False)
        picked = []
        for position in chosen:
            picked.append(self.buffer[position])
        return picked

    def length(self):
        """Return the number of experiences currently stored."""
        return len(self.buffer)

In [7]:
# --- Training schedule ---
train_episodes = 300    # upper bound on the number of training episodes
max_steps = 200         # cap on environment steps per episode
gamma = 0.99            # discount factor applied to future rewards

# Width of the action vector
action_size = 1

# --- Exploration schedule ---
explore_start = 0.2     # initial exploration probability
explore_stop = 0.01     # floor for the exploration probability
decay_rate = 1e-4       # rate of the exponential decay of exploration

# --- Networks ---
hidden_size = 16        # units per hidden layer
learning_rate = 1e-4    # learning rate handed to the optimisers

# --- Replay memory ---
memory_size = 200000    # maximum number of stored experiences
batch_size = 512        # experiences per mini-batch
pretrain_length = batch_size   # experiences gathered before training starts

In [8]:
# Build the Q-network (scores a state-action pair) and the policy network
# (proposes an action for a state) from the shared hyperparameters.
q_network = QNetwork(name='main', hidden_size=hidden_size, learning_rate=learning_rate,batch_size=batch_size)
policy_network = PolicyNetwork(name='main', hidden_size=hidden_size, learning_rate=learning_rate,batch_size=batch_size)
# Alias for the previous misspelling of this variable, kept so that any code
# still using the old name keeps working.
policy_netowork = policy_network

In [9]:
# Estimate per-dimension observation statistics by stepping the environment
# with randomly sampled actions. The resulting `means` and `stds` are the
# values passed to make_state() to standardise observations.
env.reset()
values = []
for i in range(100000):
    observation, reward, done, _ = env.step(env.action_space.sample())
    values.append(observation)
    if done:
        # Episode finished: start a new one. The observation returned by
        # reset() itself is not recorded.
        env.reset()

# Column-wise mean and standard deviation over all collected observations.
values_array = np.array(values)
means = values_array.mean(axis=0)
stds = values_array.std(axis=0)
print(means)
print(stds)

[-5.22442716e-01 -3.38304673e-05]
[0.17662558 0.01380657]


In [10]:
def make_state(observation, means, stds):
    """Standardise ``observation`` element-wise: subtract ``means``, divide by ``stds``."""
    centered = np.asarray(observation) - means
    return centered / stds

In [11]:
# Pre-fill the replay memory with transitions produced by random actions.
# The loop below stores pretrain_length - 1 experiences, each a tuple
# (state, action, reward, next_state, done) with standardised states.

# Initialize the simulation
env.reset()
# Take one random step so the episode starts from a non-initial state
observation, reward, done, _ = env.step(env.action_space.sample())
state = make_state(observation, means, stds)

memory = Memory(max_size=memory_size)

# Make a bunch of random actions and store the experiences
for ii in range(pretrain_length - 1):
    # Uncomment the line below to watch the simulation
    # env.render()

    # Make a random action
    action = env.action_space.sample()
    next_observation, reward, done, _ = env.step(action)
    next_state = make_state(next_observation, means, stds)

    if done:
        # The episode ended: store an all-zero placeholder as the next state
        next_state = np.zeros(state.shape)
        # Add experience to memory
        memory.add((state, action, reward, next_state, done))
        
        # Start new episode
        env.reset()
        # Take one random step so the episode starts from a non-initial state
        observation, reward, done, _ = env.step(env.action_space.sample())
        state = make_state(observation, means, stds)
    else:
        # Add experience to memory
        memory.add((state, action, reward, next_state, done))
        state = next_state

In [None]:
 Now train with experiences
#saver = tf.train.Saver()
rewards_list = []
step = 0
opt_q = optim.Adam(q_network.parameters(), learning_rate)
opt_policy = optim.Adam(policy_network.parameters(), learning_rate)

outputs = np.empty([1,6])

count_stop = 0
for ep in range(1, train_episodes):
    total_reward = 0
    t = 0
    # Start new episode
    env.reset()
    # Take one random step to get the pole and cart moving
    observation, reward, done, _ = env.step(env.action_space.sample())
    state = make_state(observation, means, stds)
            
    for t in range(max_steps):
        step += 1

        action = policy_network(Variable(torch.FloatTensor(state))).data.numpy()
            
        result = np.hstack((state, action))
        outputs = np.vstack((outputs, result))

        # Take action, get new state and reward
        next_observation, reward, done, _ = env.step(action)
        next_state = make_state(next_observation, means, stds)

        total_reward += reward
        
        # Add experience to memory
        memory.add((state, action, reward, next_state, done))
        
        state = next_state

        # Sample mini-batch from memory
        batch = memory.sample(batch_size)
        states_actions = np.array([each[0] for each in batch])
        ### ポイント！！！
        # actionはスカラーなのでベクトルにする
        # actionsはベクトルでなく、statesと同じ行列
        actions = np.array([[each[1]] for each in batch])
        ### ポイント終わり
        rewards = np.array([each[2] for each in batch])
        next_states = np.array([each[3] for each in batch])
        dones = np.array([each[4] for each in batch])

        # Train network
        non_final_mask = torch.tensor(tuple(map(lambda s: s==False, dones)), dtype=torch.uint8)
        # 終端状態のQ値はその後の報酬が存在しないためゼロとする
        target_maxQs = torch.zeros(batch_size)
        target_maxQs[non_final_mask] = mainQN(Variable(torch.FloatTensor(next_states)[non_final_mask])).max(1)[0].detach()

        #tutorial way
        next_actions = policy_network(Variable(torch.FloatTensor(next_states))).detach()
        next_Qs = q_network(torch.cat([torch.FloatTensor(next_states), next_actions], -1))
        targets = (torch.FloatTensor(rewards) + gamma * q_network(Variable(torch.FloatTensor(torch.cat())))).unsqueeze(1)

        current_q_values = q_network(Variable(torch.FloatTensor(states_actions)))
        loss = torch.nn.SmoothL1Loss()(current_q_values, targets)
        # backpropagation of loss to NN
        # 勾配を初期化
        opt.zero_grad()
        loss.backward()
        opt.step()
        
        if done:
            # the episode ends so no next state
            print('Episode: {}'.format(ep),
                  'Total reward: {}'.format(total_reward),
                  'Training loss: {:.4f}'.format(loss.data.numpy()),
                  'Explore P: {:.4f}'.format(explore_p))
            rewards_list.append((ep, total_reward))
            break
df = pd.DataFrame(outputs)
df.to_csv('result.csv')

In [20]:
# Scratch check: build a float tensor from Python booleans, and an empty one.
a = torch.FloatTensor([True, False, False, True])
b = torch.FloatTensor([])

tensor([1., 0., 0., 1.])

In [24]:
# Scratch check: a small integer array to experiment with.
a = np.array([1, 2, 3])

In [28]:
# np.append returns a new array; `a` itself is not modified.
np.append(a, 4)

array([1, 2, 3, 4])

In [23]:
# Display the current value of `a`.
# NOTE(review): the execution counts of these scratch cells are out of order
# (20, 24, 28, 23) and the recorded output is a plain list, not the array
# assigned above - re-run from a fresh kernel or remove these cells.
a

[1, 2, 3, 4]