In [1]:
import gym
import math
import time
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
np.random.seed(0)

In [50]:
?env.env

[1;31mType:[0m        CartPoleEnv
[1;31mString form:[0m <CartPoleEnv<CartPole-v1>>
[1;31mFile:[0m        c:\users\taha\anaconda3\lib\site-packages\gym\envs\classic_control\cartpole.py
[1;31mDocstring:[0m  
Description:
    A pole is attached by an un-actuated joint to a cart, which moves along
    a frictionless track. The pendulum starts upright, and the goal is to
    prevent it from falling over by increasing and reducing the cart's
    velocity.

Source:
    This environment corresponds to the version of the cart-pole problem
    described by Barto, Sutton, and Anderson

Observation:
    Type: Box(4)
    Num     Observation               Min                     Max
    0       Cart Position             -4.8                    4.8
    1       Cart Velocity             -Inf                    Inf
    2       Pole Angle                -0.418 rad (-24 deg)    0.418 rad (24 deg)
    3       Pole Angular Velocity     -Inf                    Inf

Actions:
    Type: Discrete(2)
   

In [17]:
# Create the environment with openai gym and set a seed to work in
env = gym.make('CartPole-v1')
env.seed(0)

[0]

In this notebook all algorithms that I reference are from the textbook: Reinforcement Learning: An Introduction by Sutton and Barto.

https://github.com/openai/gym/wiki/CartPole-v0

This link has all the information regarding the specifics of the CartPole environment, such as the action space, how rewards are disitributed, and what it means to solve this problem.

Solved requirements: "Considered solved when the average reward is greater than or equal to 195.0 over 100 consecutive trials." 

## Q-Learning

We will first attempt to solve this problem with the Q-learning algorithm, which is an off-policy temporal difference control algorithm.

In [24]:
# Create the environment with openai gym and set a seed to work in
env = gym.make('CartPole-v1')
env.seed(0)

[0]

In [137]:
n_bins = (6, 12, 6 , 12)
lower_bounds = [env.observation_space.low[0], env.observation_space.low[1], env.observation_space.low[2], env.observation_space.low[3]]
upper_bounds = [env.observation_space.high[0], env.observation_space.high[1], env.observation_space.high[2], env.observation_space.high[3]]

def discretizer(pos, cart_velocity, angle, pole_velocity):
    est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
    est.fit([lower_bounds, upper_bounds])
    return tuple(map(int,est.transform([[pos, cart_velocity, angle, pole_velocity]])[0]))

In [138]:
Q_table = np.zeros(n_bins + (env.action_space.n,))
Q_table.shape

(6, 12, 6, 12, 2)

In [139]:
tic = time.time()
Q_table = np.zeros(n_bins + (env.action_space.n,))
reward_list = []
total_reward = 0
success = False
for i in range(1, 10001):
    current = discretizer(*env.reset())
    done = False
    while not done:
        lr = max(0.01, min(1.0, 1.0 - math.log10((i + 1) / 25)))
        epsilon =  max(0.05, min(1, 1.0 - math.log10((i  + 1) / 25)))
        action = np.argmax(Q_table[current])
        if np.random.random() < epsilon:
            action = env.action_space.sample()
        observation, reward, done, info = env.step(action) 
        next_state = discretizer(*observation)
        Q_table[current][action] += lr*(reward + np.max(Q_table[next_state]) - Q_table[current][action])
        current = next_state
        total_reward += reward
    if i % 100 == 0:
        reward_list.append(total_reward/100)
        if reward_list[-1] >= 195.0:
                toc = time.time()
                success = True
                print(f'Took {toc-tic:.2f} seconds and achieved average reward of {reward_list[-1]} in {(reward_list.index(reward_list[-1]))*100} episodes')
                break
        total_reward = 0
toc = time.time()
if not success:
    print(f'Took {toc-tic:.2f} seconds and failed')
    print(f'Max average reward achieved: {np.max(reward_list):.2f}')
env.close()

Took 113.08 seconds and failed
Max average reward achieved: 30.84


## Sarsa

In [2]:
env = gym.make('CartPole-v1')
env.seed(0)

[0]

In [130]:
tic = time.time()
reward_list = []
Q_table = np.zeros(n_bins + (env.action_space.n,))
policy = np.zeros(n_bins)
total_reward = 0
for i in range(1, 10001):
    current, done = discretizer(*env.reset()), False
    while not done:
        lr = max(0.01, min(1.0, 1.0 - math.log10((i + 1) / 25)))
        epsilon =  max(0.05, min(1, 1.0 - math.log10((i + 1) / 25)))
        action = int(policy[current])
        observation, reward, done, info = env.step(action)
        next_state = discretizer(*observation)
        Q_table[current][action] += lr * (reward + Q_table[next_state][int(policy[next_state])] - Q_table[current][action])
        current = next_state
        if np.random.random() < epsilon:
            policy[current] = env.action_space.sample()
        else:
            policy[current] = np.argmax(Q_table[current])
        total_reward += reward
    if i % 100 == 0:
        reward_list.append(total_reward/100)
        if reward_list[-1] >= 195.0:
            toc = time.time()
            print(f'Took {toc-tic:.2f} seconds and achieved average reward of {reward_list[-1]} in {(reward_list.index(reward_list[-1]))*100} episodes')
        total_reward = 0
toc = time.time()
if not success:
    print(f'Took {toc-tic:.2f} seconds and failed')
    print(f'Max average reward achieved: {np.max(reward_list):.2f}')
env.close()

SyntaxError: 'return' outside function (<ipython-input-130-bb2cfbb5acfe>, line 27)

## TESTING

In [3]:
n_bins = (6 , 12)
lower_bounds = [env.observation_space.low[2], -math.radians(50)]
upper_bounds = [env.observation_space.high[2], math.radians(50)]

def discretizer(_, __, angle, pole_velocity):
    est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
    est.fit([lower_bounds, upper_bounds])
    return tuple(map(int,est.transform([[angle, pole_velocity]])[0]))

In [None]:
tic = time.time()
Q_table = np.zeros(n_bins + (env.action_space.n,))
reward_list = []
total_reward = 0
for i in range(1, 10001):
    current, done = discretizer(*env.reset()), False
    while not done:
        lr = max(0.01, min(1.0, 1.0 - math.log10((i + 1) / 25)))
        epsilon =  max(0.05, min(1, 1.0 - math.log10((i  + 1) / 25)))
        action = np.argmax(Q_table[current])
        if np.random.random() < epsilon:
            action = env.action_space.sample()
        observation, reward, done, info = env.step(action) 
        next_state = discretizer(*observation)
        Q_table[current][action] += lr*(reward + np.max(Q_table[next_state]) - Q_table[current][action])
        current = next_state
        total_reward += reward
        if i % 100:
            env.render()
    if i % 100 == 0:
        print(total_reward/100)
        reward_list.append(total_reward/100)
        if reward_list[-1] >= 195.0:
                toc = time.time()
                success = True
                print(f'Took {toc-tic:.2f} seconds and achieved average reward of {reward_list[-1]} in {(reward_list.index(reward_list[-1]) + 1)*100} episodes')
                env.close()
                break
        total_reward = 0
toc = time.time()
if not success:
    print(f'Took {toc-tic:.2f} seconds and failed')
    print(f'Max average reward achieved: {np.max(reward_list):.2f}')
env.close()

37.26
164.94


In [108]:
Q_table[discretizer(*env.reset())]

array([525.61625198, 504.92241404])