In [1]:
import gym
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt

In [2]:
# Discretize the continuous state space into a fixed number of bins
def bins(clip_min, clip_max, num):
    """Return the interior bin edges that split a range into equal intervals.

    The closed range [clip_min, clip_max] is divided into ``num`` equal-width
    intervals; the two outer boundaries are dropped so that only the
    ``num - 1`` interior edges remain. Passing these edges to ``np.digitize``
    therefore yields indices in ``0 .. num - 1``, with out-of-range values
    falling into the first or last bin.

    Args:
        clip_min: Lower end of the range.
        clip_max: Upper end of the range.
        num: Number of intervals to split the range into.

    Returns:
        1-D ``numpy.ndarray`` holding the ``num - 1`` interior edges.
    """
    # num intervals need num + 1 boundaries, including both end points.
    boundaries = np.linspace(clip_min, clip_max, num + 1)
    # Strip the end points; only the interior edges are used for digitizing.
    interior_edges = boundaries[1:-1]
    return interior_edges

In [3]:
def digitize_state(observation):
    """Map a 4-component observation to a single discrete state index.

    Each component is bucketed into one of 4 bins over a fixed clip range
    (values outside the range land in the outermost bins). The four bin
    indices are then combined as the digits of a base-4 number, with the
    first component as the least significant digit, giving an index in
    ``0 .. 4**4 - 1``.

    Args:
        observation: Iterable of exactly four numbers, unpacked as
            (cart position, cart velocity, pole angle, pole velocity).

    Returns:
        Integer state index suitable for indexing the first axis of a
        ``(4**4, n_actions)`` Q-table.
    """
    cart_pos, cart_v, pole_angle, pole_v = observation

    # (value, clip_min, clip_max) for every component, in digit order.
    features = (
        (cart_pos, -2.4, 2.4),
        (cart_v, -3.0, 3.0),
        (pole_angle, -0.5, 0.5),
        (pole_v, -2.0, 2.0),
    )

    state_index = 0
    for digit_pos, (value, low, high) in enumerate(features):
        bin_index = np.digitize(value, bins(low, high, 4))
        # Each component contributes one base-4 digit.
        state_index += bin_index * (4 ** digit_pos)
    return state_index

In [4]:
def get_action(state, action, observation, reward, episode):
    """Pick the next action epsilon-greedily and update the Q-table in place.

    The new observation is discretized, the next action is chosen with an
    exploration rate that decays geometrically with the episode number, and
    the entry ``q_table[state, action]`` is moved towards
    ``reward + gamma * q_table[next_state, next_action]``. Because the target
    uses the action that is actually returned (not the greedy maximum), this
    is an on-policy, SARSA-style update.

    Relies on the module-level ``q_table`` array and mutates it.

    Args:
        state: Discrete index of the state the agent was in.
        action: Action that was taken in ``state``.
        observation: Raw observation received after taking ``action``.
        reward: Reward received for the transition.
        episode: Zero-based episode counter; controls the exploration decay.

    Returns:
        Tuple ``(next_state, next_action)``.
    """
    learning_rate = 0.2
    discount = 0.99
    # Exploration probability: starts at 0.5 and shrinks by 1% per episode.
    exploration_rate = 0.5 * (0.99 ** episode)

    next_state = digitize_state(observation)

    # Draw the exploration decision first, then (only if exploring) the action,
    # so the random stream is consumed in a fixed order.
    explore = np.random.uniform(0, 1) < exploration_rate
    next_action = np.random.choice([0, 1]) if explore else np.argmax(q_table[next_state])

    # Blend the old estimate with the one-step bootstrapped target.
    q_table[state, action] = (1 - learning_rate) * q_table[state, action] + learning_rate * (
        reward + discount * q_table[next_state, next_action]
    )
    return next_state, next_action

In [5]:
# Train a tabular agent on CartPole until the moving average of the episode
# return reaches the goal, or the episode budget is exhausted.
# NOTE(review): this uses the classic Gym API (reset() returns the observation,
# step() returns a 4-tuple) -- confirm the installed gym version matches.
env = gym.make('CartPole-v0')

# Maximum number of steps per episode
max_num_step = 200
# Average return over the window that counts as "solved"
goal_avg_step = 195

# Number of most recent episodes averaged for the success check
num_iter_compute = 100
# Sliding window with the returns of the last num_iter_compute episodes
reward_stack = np.zeros(num_iter_compute)

# Maximum number of episodes
max_iter = 5000
# Rendering every step is slow; set to False for faster (or headless) training
render_each_step = True
# q_table stores a value for every (discrete state, action) pair
q_table = np.random.uniform(-1, 1, size=(4**4, env.action_space.n))

try:
    for episode in range(max_iter):
        observation = env.reset()
        state = digitize_state(observation)
        action = np.argmax(q_table[state])
        # Accumulated (shaped) reward for this episode
        expect_reward = 0
        for t in range(max_num_step):
            if render_each_step:
                env.render()
            observation, reward, done, info = env.step(action)
            if done and t < max_num_step - 1:
                # Episode ended before the step limit: apply a large penalty.
                reward -= 200
            state, action = get_action(state, action, observation, reward, episode)
            expect_reward += reward
            if done:
                break

        # Report the window mean *before* this episode is pushed into it.
        print('%d Episode finished after %f time steps / mean %f' % (episode, t + 1, reward_stack.mean()))
        # Drop the oldest return and append the newest one.
        reward_stack = np.hstack((reward_stack[1:], [expect_reward]))

        # The window only changes once per episode, so check success here and
        # leave the *outer* loop so training really stops.
        if reward_stack.mean() >= goal_avg_step:
            print('Episode %d train agent successfuly!' % episode)
            break
    else:
        # Reached only when the episode budget ran out without a success break.
        print('Failed!')
finally:
    # Release the environment (and its render window) even on interruption.
    env.close()

0 Episode finished after 41.000000 time steps / mean 0.000000
1 Episode finished after 21.000000 time steps / mean -1.590000
2 Episode finished after 45.000000 time steps / mean -3.380000
3 Episode finished after 24.000000 time steps / mean -4.930000
4 Episode finished after 10.000000 time steps / mean -6.690000
5 Episode finished after 53.000000 time steps / mean -8.590000
6 Episode finished after 23.000000 time steps / mean -10.060000
7 Episode finished after 47.000000 time steps / mean -11.830000
8 Episode finished after 17.000000 time steps / mean -13.360000
9 Episode finished after 23.000000 time steps / mean -15.190000
10 Episode finished after 21.000000 time steps / mean -16.960000
11 Episode finished after 13.000000 time steps / mean -18.750000
12 Episode finished after 25.000000 time steps / mean -20.620000
13 Episode finished after 33.000000 time steps / mean -22.370000
14 Episode finished after 37.000000 time steps / mean -24.040000
15 Episode finished after 21.000000 time s