In [1]:
import gym
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

import optuna

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

def running_mean(x, N):
    """Return the moving average of ``x`` over a sliding window of ``N`` samples.

    The result has ``len(x) - N + 1`` entries; entry ``i`` is the mean of
    ``x[i:i + N]``.  It is computed from differences of a prefix-sum array.
    """
    # Prepend a zero so that prefix[k] is the sum of the first k samples.
    prefix = np.cumsum(np.insert(x, 0, 0))
    window_totals = prefix[N:] - prefix[:-N]
    return window_totals / N

In [3]:
class QNetwork(nn.Module):
    """Small fully connected network that maps a state vector to per-action values.

    Architecture: ``state_size -> hidden_size -> hidden_size -> action_size`` with
    ReLU after each of the two hidden layers and no activation on the output.

    ``learning_rate`` and ``name`` are accepted for interface compatibility but are
    not used by the module itself.
    """

    def __init__(self, learning_rate=0.01, state_size=4, 
                 action_size=2, hidden_size=10,
                 name='QNetwork'):
        super().__init__()
        # Layers are created in the order fc1, fc2, output.
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, action_size)

    def forward(self, x):
        """Return the network output for input ``x`` (last dim = ``state_size``)."""
        activation = x
        for hidden_layer in (self.fc1, self.fc2):
            activation = F.relu(hidden_layer(activation))
        return self.output(activation)

In [4]:
from collections import deque
class Memory():
    """Fixed-capacity experience buffer.

    Backed by a ``deque`` with ``maxlen=max_size``: once full, appending a new
    experience discards the oldest one.
    """

    def __init__(self, max_size = 1000):
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Append one experience to the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored experiences chosen at random.

        Uses the global NumPy RNG.  ``np.random.choice`` raises ``ValueError``
        when ``batch_size`` exceeds the number of stored experiences, because the
        draw is without replacement.
        """
        positions = np.arange(len(self.buffer))
        chosen = np.random.choice(positions, size=batch_size, replace=False)
        picked = []
        for position in chosen:
            picked.append(self.buffer[position])
        return picked

    def length(self):
        """Return the number of experiences currently stored."""
        return len(self.buffer)

In [5]:
"""
train_episodes = 300          # max number of episodes to learn from
max_steps = 200                # max steps in an episode
gamma = 0.99                   # future reward discount

action_size = 2

# Exploration parameters
explore_start = 0.2            # exploration probability at start
explore_stop = 0.01            # minimum exploration probability 
decay_rate = 0.0001            # exponential decay rate for exploration prob

# Network parameters
hidden_size = 16               # number of units in each Q-network hidden layer
learning_rate = 0.0001         # Q-network learning rate

# Memory parameters
memory_size = 200000            # memory capacity
batch_size = 512                # experience mini-batch size
pretrain_length = batch_size   # number experiences to pretrain the memory
"""

'\ntrain_episodes = 300          # max number of episodes to learn from\nmax_steps = 200                # max steps in an episode\ngamma = 0.99                   # future reward discount\n\naction_size = 2\n\n# Exploration parameters\nexplore_start = 0.2            # exploration probability at start\nexplore_stop = 0.01            # minimum exploration probability \ndecay_rate = 0.0001            # exponential decay rate for exploration prob\n\n# Network parameters\nhidden_size = 16               # number of units in each Q-network hidden layer\nlearning_rate = 0.0001         # Q-network learning rate\n\n# Memory parameters\nmemory_size = 200000            # memory capacity\nbatch_size = 512                # experience mini-batch size\npretrain_length = batch_size   # number experiences to pretrain the memory\n'

In [6]:
"""
env.reset()
values = []
for i in range(100000):
    state, reward, done, _ = env.step(env.action_space.sample())
    values.append(state)
    if done:
        env.reset()

values_array = np.array(values)
means = values_array.mean(axis=0)
stds = values_array.std(axis=0)
print(means)
print(stds)

corr_matrix = np.corrcoef(((values_array-means)/stds).T)
print(corr_matrix)
eigen_values,eigen_vectors = np.linalg.eig(corr_matrix)
print(eigen_values)
print(eigen_vectors)
"""
# Module-level normalisation constants (four values each): a mean and a standard
# deviation per observation component.
# Presumably these are the numbers printed by the disabled snippet kept in the
# string literal above -- TODO confirm.
# NOTE(review): fit() declares identical local lists, so these globals are not
# read by any code visible in this notebook.
means = [-0.00029514, -0.00455431, -0.00026341, 0.00259509]
stds = [0.09655688, 0.5670828, 0.10286357, 0.85574364]

In [7]:
def make_state(observation, means, stds):
    """Standardise an observation element-wise: ``(observation - means) / stds``.

    ``observation`` is converted to a NumPy array first; the result is an array.
    """
    # Disabled alternative that additionally projected onto the eigenvectors:
    # np.dot((np.array(observation) - means)/stds, eigen_vectors.T)/eigen_values
    centred = np.array(observation) - means
    return centred / stds

In [8]:
def fit(env, train_episodes = 300, max_steps = 200, gamma = 0.99, action_size = 2,
        explore_start = 0.2, explore_stop = 0.01, decay_rate = 0.0001,
        hidden_size = 16, learning_rate = 0.0001,
        memory_size = 200000,batch_size = 512):
    """Train a Q-network on ``env`` with experience replay and return the episode count.

    The replay memory is first filled with ``batch_size - 1`` random-action
    transitions.  Training then runs episode by episode with an epsilon-greedy
    policy; after every environment step one mini-batch is sampled and one Adam
    update on a Smooth-L1 TD loss is applied.  Training stops early once the mean
    total reward of the last (up to) five recorded episodes exceeds 195.

    Parameters
    ----------
    env : environment exposing ``reset()``, ``step(action)`` returning a
        4-tuple ``(observation, reward, done, info)``, and ``action_space.sample()``.
    train_episodes : upper bound for the episode loop.  The loop is
        ``range(1, train_episodes)``, so at most ``train_episodes - 1`` episodes run
        (kept as-is so results stay comparable with earlier runs).
    max_steps : maximum number of training steps per episode.
    gamma : discount factor applied to the bootstrapped next-state value.
    action_size : accepted for interface compatibility; not used in the body.
    explore_start, explore_stop, decay_rate : epsilon decays exponentially in the
        global step count from ``explore_start`` towards ``explore_stop``.
    hidden_size : width of both hidden layers of the Q-network.
    learning_rate : Adam learning rate.
    memory_size : capacity of the replay memory.
    batch_size : mini-batch size (also determines the amount of pre-filling).

    Returns
    -------
    int
        Number of episodes recorded before training stopped.
    """
    # Normalisation constants for the four observation components.
    means = [-0.00029514, -0.00455431, -0.00026341, 0.00259509]
    stds = [0.09655688, 0.5670828, 0.10286357, 0.85574364]

    memory = Memory(max_size=memory_size)

    def _begin_episode():
        """Reset the env, take one random step to get things moving, return the state."""
        env.reset()
        observation, _, _, _ = env.step(env.action_space.sample())
        return make_state(observation, means, stds)

    # --- Pre-fill the replay memory with random-action transitions -------------
    state = _begin_episode()
    for _ in range(batch_size - 1):
        # Uncomment the line below to watch the simulation
        # env.render()
        action = env.action_space.sample()
        next_observation, reward, done, _ = env.step(action)

        # NOTE(review): the stored reward is always the literal 0 (here and in the
        # training loop), not the reward returned by env.step.  Left unchanged
        # because altering it would change every training result -- confirm intent.
        if done:
            # The simulation failed, so there is no next state: store zeros.
            memory.add((state, action, 0, np.zeros(state.shape), done))
            state = _begin_episode()
        else:
            next_state = make_state(next_observation, means, stds)
            memory.add((state, action, 0, next_state, done))
            state = next_state

    mainQN = QNetwork(name='main', hidden_size=hidden_size, learning_rate=learning_rate)
    opt = optim.Adam(mainQN.parameters(), learning_rate)
    loss_fn = torch.nn.SmoothL1Loss()

    # --- Train with experience replay -----------------------------------------
    rewards_list = []
    step = 0
    for _episode in range(1, train_episodes):
        total_reward = 0
        state = _begin_episode()

        for _t in range(max_steps):
            step += 1

            # Explore or exploit (epsilon-greedy with exponentially decaying epsilon).
            explore_p = explore_stop + (explore_start - explore_stop)*np.exp(-decay_rate*step)
            if explore_p > np.random.rand():
                action = env.action_space.sample()
            else:
                # Greedy action from the Q-network; no gradient is needed here.
                with torch.no_grad():
                    Qs = mainQN(torch.FloatTensor(state)).numpy()
                action = int(np.argmax(Qs))

            # Take the action, observe the new state and reward.
            next_observation, reward, done, _ = env.step(action)
            next_state = make_state(next_observation, means, stds)
            total_reward += reward

            memory.add((state, action, 0, next_state, done))
            state = next_state

            # Sample a mini-batch from memory.
            batch = memory.sample(batch_size)
            states = np.array([each[0] for each in batch])
            # Each action is a scalar; wrap it in a list so that `actions` is a
            # (batch, 1) matrix, which is the index shape `gather` needs below.
            actions = np.array([[each[1]] for each in batch])
            rewards = np.array([each[2] for each in batch])
            next_states = np.array([each[3] for each in batch])
            dones = np.array([each[4] for each in batch], dtype=bool)

            # Bootstrapped value of the next state.  Terminal transitions have no
            # future reward, so their value is forced to zero by multiplying with a
            # 0/1 mask (avoids indexing with a deprecated uint8 mask tensor).
            not_done = torch.FloatTensor((~dones).astype(np.float32))
            with torch.no_grad():
                next_max_q = mainQN(torch.FloatTensor(next_states)).max(1)[0]
            target_maxQs = next_max_q * not_done

            targets = (torch.FloatTensor(rewards) + gamma * target_maxQs).unsqueeze(1)

            # Q-value of the action actually taken in each sampled transition.
            current_q_values = mainQN(torch.FloatTensor(states)).gather(1, torch.LongTensor(actions))
            loss = loss_fn(current_q_values, targets)

            # Reset gradients, back-propagate the loss, and update the network.
            opt.zero_grad()
            loss.backward()
            opt.step()

            if done:
                break

        # Record every episode, including one cut off by max_steps, so the
        # returned count and the early-stop average reflect all episodes run.
        rewards_list.append(total_reward)

        if np.array(rewards_list[-5:]).mean() > 195:
            break

    return len(rewards_list)

In [9]:
def objective(trial):
    """Optuna objective: train with sampled hyperparameters and return ``fit``'s result.

    Sampled parameters:
      * ``learning_rate`` -- log-uniform in [1e-5, 1e-2]
      * ``hidden_size``   -- integer exponent in [2, 7]; the network width is 2**exponent
      * ``batch_size``    -- integer exponent in [5, 10]; the batch size is 2**exponent

    The value returned is the number of episodes reported by ``fit`` (lower is
    better when the study minimises).
    """
    env = gym.make('CartPole-v0')

    # Use the same seeds for every trial so that trials differ only in their
    # hyperparameters.  torch must be seeded too, otherwise the network
    # initialisation is different on every call.
    env.seed(1234)
    np.random.seed(1234)
    torch.manual_seed(1234)
    # Random actions are drawn from env.action_space; seed it as well when this
    # gym version exposes a seed method on the space.
    seed_action_space = getattr(env.action_space, 'seed', None)
    if callable(seed_action_space):
        seed_action_space(1234)

    # Let Optuna propose the hyperparameters; the two integer suggestions are
    # exponents of two.
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)
    hidden_size = trial.suggest_int('hidden_size', 2, 7)
    hidden_size = 2**hidden_size
    batch_size = trial.suggest_int('batch_size', 5, 10)
    batch_size = 2**batch_size

    return fit(env, train_episodes = 300, max_steps = 200, gamma = 0.99, action_size = 2,
        explore_start = 0.2, explore_stop = 0.01, decay_rate = 0.0001,
        hidden_size = hidden_size, learning_rate = learning_rate,
        memory_size = 200000, batch_size = batch_size)

In [12]:
# Create the study on first use so this cell also works on a fresh kernel.
# Re-running the cell keeps the existing study and adds further trials to it.
if "study" not in globals():
    study = optuna.create_study()

# The first argument of optimize is the objective function; n_trials is the
# number of trials to run in this call.
study.optimize(objective, n_trials=20)

# Show the best hyperparameters found so far.
print("params_{}".format(study.best_params))
# Show the objective value (return value) obtained with those hyperparameters.
print("value_{}".format(study.best_value))

[I 2019-10-06 08:00:40,444] Finished trial#20 resulted in value: 281.0. Current best value is 86.0 with parameters: {'hidden_size': 6, 'learning_rate': 3.566430055143271e-05, 'batch_size': 10}.
[I 2019-10-06 08:06:15,237] Finished trial#21 resulted in value: 219.0. Current best value is 86.0 with parameters: {'hidden_size': 6, 'learning_rate': 3.566430055143271e-05, 'batch_size': 10}.
[I 2019-10-06 08:07:41,760] Finished trial#22 resulted in value: 67.0. Current best value is 67.0 with parameters: {'hidden_size': 6, 'learning_rate': 2.6265809914139044e-05, 'batch_size': 10}.
[I 2019-10-06 08:09:37,798] Finished trial#23 resulted in value: 124.0. Current best value is 67.0 with parameters: {'hidden_size': 6, 'learning_rate': 2.6265809914139044e-05, 'batch_size': 10}.
[I 2019-10-06 08:15:48,282] Finished trial#24 resulted in value: 299.0. Current best value is 67.0 with parameters: {'hidden_size': 6, 'learning_rate': 2.6265809914139044e-05, 'batch_size': 10}.
[I 2019-10-06 08:17:21,172] 

params_{'hidden_size': 6, 'learning_rate': 8.106471739895323e-05, 'batch_size': 10}
value_25.0
