In [23]:
import gym
import math
import time
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
np.random.seed(0)

In this notebook all algorithms that I reference are from the textbook: Reinforcement Learning: An Introduction by Sutton and Barto.

https://github.com/openai/gym/wiki/CartPole-v0

This link has all the information regarding the specifics of the CartPole environment, such as the action space, how rewards are disitributed, and what it means to solve this problem.

Solved requirements: "Considered solved when the average reward is greater than or equal to 195.0 over 100 consecutive trials." 

## Q-Learning

We will first attempt to solve this problem with the Q-learning algorithm, which is an off-policy temporal difference control algorithm.

In [24]:
# Create the environment with openai gym and set a seed to work in
env = gym.make('CartPole-v1')
env.seed(0)

[0]

In [25]:
n_bins = (6, 12, 6 , 12)
lower_bounds = [-4.8, env.observation_space.low[1], env.observation_space.low[2], -42]
upper_bounds = [4.8, env.observation_space.high[1], env.observation_space.high[2], 42]

def discretizer(pos, cart_velocity, angle, pole_velocity):
    est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
    est.fit([lower_bounds, upper_bounds])
    return tuple(map(int,est.transform([[pos, cart_velocity, angle, pole_velocity]])[0]))

In [26]:
Q_table = np.zeros(n_bins + (env.action_space.n,))
Q_table.shape

(6, 12, 6, 12, 2)

In [27]:
def qlearn():
    tic = time.time()
    Q_table = np.zeros(n_bins + (env.action_space.n,))
    reward_list = []
    total_reward = 0
    for i in range(1, 10001):
        current, done = discretizer(*env.reset()), False
        while not done:
            lr = max(0.01, min(1.0, 1.0 - math.log10((i + 1) / 25)))
            epsilon =  max(0.05, min(1, 1.0 - math.log10((i  + 1) / 25)))
            action = np.argmax(Q_table[current])
            if np.random.random() < epsilon:
                action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            next_state = discretizer(*observation)
            Q_table[current][action] += lr*(reward + np.max(Q_table[next_state]) - Q_table[current][action])
            current = next_state
            total_reward += reward
        if i % 100 == 0:
            reward_list.append(total_reward/100)
#            print(total_reward/100)
            for index in range(len(reward_list)):
                if reward_list[index] >= 195.0:
                    toc = time.time()
                    first_avg_r = index
                    return f'Took {toc-tic:.2f} seconds and achieved average reward of {reward_list[first_avg_r]} in {(first_avg_r+1)*100} episodes'
            total_reward = 0
    toc = time.time()
    print(f'Took {toc-tic:.2f} seconds and failed')
    print(f'Max average reward achieved: {np.max(reward_list):.2f}')
    env.close()

In [28]:
qlearn()

Took 55.99 seconds and failed
Max average reward achieved: 19.89


In [29]:
# Create the environment with openai gym and set a seed to work in
env = gym.make('CartPole-v1')
env.seed(0)

[0]

## Sarsa

In [30]:
def sarsa():
    tic = time.time()
    reward_list = []
    Q_table = np.zeros(n_bins + (env.action_space.n,))
    policy = np.zeros(n_bins)
    total_reward = 0
    for i in range(1, 10001):
        current, done = discretizer(*env.reset()), False
        while not done:
            lr = max(0.01, min(1.0, 1.0 - math.log10((i + 1) / 25)))
            epsilon =  max(0.05, min(1, 1.0 - math.log10((i  + 1) / 25)))
            action = int(policy[current])
            observation, reward, done, info = env.step(action)
            next_state = discretizer(*observation)
            Q_table[current][action] += lr*(reward + Q_table[next_state][int(policy[next_state])] - Q_table[current][action])
            current = next_state
            if np.random.random() < epsilon:
                policy[current] = env.action_space.sample()
            else:
                policy[current] = np.argmax(Q_table[current])
            total_reward += reward
        if i % 100 == 0:
#            print(total_reward/100)
            reward_list.append(total_reward/100)
            for index in range(len(reward_list)):
                if reward_list[index] >= 195.0:
                    toc = time.time()
                    first_avg_r = index
                    return f'Took {toc-tic:.2f} seconds and achieved average reward of {reward_list[first_avg_r]} in {(first_avg_r+1)*100} episodes'
            total_reward = 0
    toc = time.time()
    print(f'Took {toc-tic:.2f} seconds and failed')
    print(f'Max average reward achieved: {np.max(reward_list):.2f}')
    env.close()

In [31]:
sarsa()

Took 55.41 seconds and failed
Max average reward achieved: 19.56
