In [None]:
import gym
import torch
import numpy as np

In [None]:
# Create the environment that the agent is trained and demonstrated on.
env = gym.make('MountainCar-v0')

In [None]:
# Constants and hyperparameters.
# NOTE(review): neither name is read by any cell visible in this notebook; the
# agent class keeps its own episode / step limits (2000 and 200). Confirm
# whether these values were meant to be passed to the agent or can be removed.
num_episode = 100
num_step = 100

In [None]:
# Reset the environment once and show the initial observation followed by its
# Python type, one per line.
observation = env.reset()
print(observation, type(observation), sep='\n')

In [None]:
class MountrainCarAgent:
    """Linear true online Sarsa(lambda) agent for a 2-observation, 3-action task.

    The action value is approximated as ``q_hat(s, a) = weight . x(s, a)``.
    ``x(s, a)`` is a constant bias feature followed by one slot of
    ``observation_dim`` entries per action; only the slot belonging to the
    chosen action holds the raw observation, every other slot is zero.

    The spelling of the class name is kept unchanged so existing callers
    continue to work.
    """

    def __init__(self, num_episode=2000, num_max_step=200, decay_rate=0.8,
                 learning_rate=0.05, gamma=0.99):
        """Set up the hyperparameters and randomly initialise the weights.

        Args:
            num_episode: number of episodes ``train`` runs.
            num_max_step: maximum number of environment steps per episode.
            decay_rate: lambda, the eligibility-trace decay rate.
            learning_rate: alpha, the step size of the weight update.
            gamma: discount factor.
        """
        self.observation_dim = 2  # length of the observation vector
        self.action_dim = 3  # number of discrete actions (0 .. action_dim - 1)
        # One bias feature plus one observation-sized slot per action.
        self.state_dim = 1 + self.observation_dim * self.action_dim
        self.num_episode = num_episode
        self.num_max_step = num_max_step
        self.decay_rate = decay_rate  # lambda - the rate at which the trace decays
        self.learning_rate = learning_rate
        self.gamma = gamma  # discount factor

        self.weight = torch.randn(self.state_dim)

    def formulate_state(self, observation, action):
        """Build the feature vector x(s, a) for an observation/action pair.

        Args:
            observation: array-like with ``observation_dim`` numeric entries.
            action: integer action index in ``range(action_dim)``.

        Returns:
            A float32 tensor of length ``state_dim``: element 0 is the bias
            (always 1.0) and the slot of ``action`` holds the observation.
        """
        observation = torch.as_tensor(np.asarray(observation, dtype=np.float32))
        features = torch.zeros(self.state_dim)
        # The bias must be a real constant; an uninitialised tensor would feed
        # arbitrary memory contents into every value estimate.
        features[0] = 1.0
        start = 1 + int(action) * self.observation_dim
        features[start:start + self.observation_dim] = observation
        return features

    def select_action(self, observation, epsilon=0):
        """Pick an action for ``observation`` epsilon-greedily w.r.t. q_hat.

        Args:
            observation: current environment observation.
            epsilon: probability of taking a uniformly random action.

        Returns:
            The chosen action index as a Python ``int``.
        """
        action_values = [self._q_hat(self.formulate_state(observation, action))
                         for action in range(self.action_dim)]
        return self._epsilon_greedy(action_values, epsilon)

    def train(self, env):
        """Train the agent on ``env`` with true online Sarsa(lambda).

        ``env`` must provide ``reset() -> observation`` and
        ``step(action) -> (observation, reward, done, info)``.

        Exploration uses ``epsilon = min(0.1, 1 / (episode + 1))`` for every
        action of an episode, including the first one.
        """
        for i_episode in range(self.num_episode):
            epsilon = min(0.1, 1 / (i_episode + 1))
            observation = env.reset()
            action = self.select_action(observation, epsilon)
            state = self.formulate_state(observation, action)
            z = torch.zeros(self.state_dim)  # eligibility (dutch) trace
            q_old = 0.0
            for _ in range(self.num_max_step):
                observation_prime, reward, done, info = env.step(action)
                action_prime = self.select_action(observation_prime, epsilon)
                state_prime = self.formulate_state(observation_prime, action_prime)

                # A terminal state has value 0, so its features are zeroed.
                # NOTE(review): assumes a time-limit cut-off is reported via
                # info['TimeLimit.truncated'] (then we keep bootstrapping);
                # without that key every ``done`` is treated as terminal.
                truncated = isinstance(info, dict) and bool(
                    info.get('TimeLimit.truncated', False))
                if done and not truncated:
                    state_prime = torch.zeros(self.state_dim)

                q = self._q_hat(state)
                q_prime = self._q_hat(state_prime)
                td_error = float(reward) + self.gamma * q_prime - q
                # Dutch trace: z <- gamma*lambda*z + (1 - alpha*gamma*lambda*z.x) x
                trace_scale = self.gamma * self.decay_rate
                z = trace_scale * z + \
                    (1 - self.learning_rate * trace_scale * torch.dot(z, state)) * state
                self.weight += self.learning_rate * (
                    (td_error + q - q_old) * z - (q - q_old) * state)

                q_old = q_prime
                observation = observation_prime
                state = state_prime
                action = action_prime

                if done:
                    break

    def _epsilon_greedy(self, action_values, epsilon=0.2):
        """Return the argmax index with probability 1 - epsilon, else a random one.

        Args:
            action_values: list of scalar action values (floats or 0-dim tensors).
            epsilon: probability of a uniformly random choice.

        Returns:
            The selected index as a Python ``int``.
        """
        values = [float(value) for value in action_values]
        is_greedy = np.random.binomial(n=1, p=1 - epsilon)
        if is_greedy:
            return int(np.argmax(values))
        return int(np.random.randint(len(values)))

    def _q_hat(self, state):
        """Return the approximated action value: the dot product weight . state."""
        return torch.dot(self.weight, state)

In [None]:
# Seed the global RNGs the agent draws from so that a fresh-kernel run repeats:
# torch.randn initialises the weights and np.random drives the epsilon-greedy
# exploration.
# NOTE(review): the environment presumably has its own RNG for start states; it
# is not seeded here because the seeding call differs between gym versions.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

agent = MountrainCarAgent()
agent.train(env)

In [None]:
# Demo: roll out the greedy policy (epsilon=0) for at most 200 steps and render
# every frame.
try:
    observation = env.reset()
    for i in range(200):
        env.render()
        action = agent.select_action(observation, epsilon=0)
        observation, reward, done, info = env.step(action)
        if done:
            break
finally:
    # Always release the environment, even if rendering or stepping raises.
    env.close()