In [23]:
import gym
import random
import numpy as np
import time
from gym.envs.registration import register
from IPython.display import clear_output


In [24]:
env_name = 'CartPole-v1'
env = gym.make(env_name)
print("Observation space", env.observation_space)
print("Action space:", env.action_space)


Observation space Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)
Action space: Discrete(2)


In [25]:
class Agent():
    def __init__(self,env):
        self.action_size = env.action_space.n
        print("Action size:", self.action_size)
        
    def get_action(self, state):
        action = random.choice(range(self.action_size))
        
        return action

In [26]:
class QAgent(Agent):
    def __init__(self, env, discount_rate=0.97, learning_rate=0.01):
        super().__init__(env)
        self.state_size = env.observation_space.shape[0]
        print("State size:", self.state_size)
        
        self.eps = 1.0
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self):
        self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
        
    def get_action(self, state):
        q_state = self.q_table[state]
        action_greedy = np.argmax(q_state)
        action_random = super().get_action(state)
        return action_random if random.random() < self.eps else action_greedy
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        
        q_next = self.q_table[next_state]
        q_next = np.zeros([self.action_size]) if done else q_next
        q_target = reward + self.discount_rate * np.max(q_next)
        
        q_update = q_target - self.q_table[state,action]
        self.q_table[state,action] += self.learning_rate * q_update
        
        if done:
            self.eps = self.eps * 0.99
        
agent = QAgent(env)

Action size: 2
State size: 4


In [27]:
t_reward = 0

for ep in range(200):
    state = env.reset()
    done = False
    while not done:
        print(state)
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        agent.train((state, action, next_state, reward, done))
        state = next_state
        t_reward += reward
        
        print("s:", state, "a:", action)
        print("Episode: {}, Total reward: {}, eps: {}".format(ep,t_reward,agent.eps))
        env.render()
        print(agent.q_table)
        time.sleep(0.05)
        clear_output(wait=True)

[ 0.02474036 -0.02728559 -0.03744895 -0.04866615]


IndexError: arrays used as indices must be of integer (or boolean) type