In [1]:
# Select the TensorFlow 1.x runtime (the cell output below confirms the selection).
# NOTE(review): no visible cell imports or uses TensorFlow -- confirm this magic is still needed.
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [None]:
!apt-get install -y xvfb x11-utils
!pip install gym[all]==0.17.* pyvirtualdisplay==0.2.* PyOpenGL==3.1.* PyOpenGL-accelerate==3.1.*

In [3]:
%matplotlib inline
from pyvirtualdisplay import Display
import gym
import numpy as np
import random
import math
from time import sleep
import matplotlib.pyplot as plt

In [None]:
# Start a hidden 400x300 virtual display so env.render() has a screen to draw on.
# NOTE(review): the name `display` shadows IPython's display() helper for the
# rest of the session; kept unchanged in case other cells refer to it.
display = Display(size=(400, 300), visible=False)
display.start()

In [5]:
## Initialize the "Cart-Pole" environment
# Module-level environment handle: the constants cell reads its action and
# observation spaces, and simulate() steps, renders and resets it.
env = gym.make('CartPole-v0')

In [6]:
## Environment-related constants

# How many buckets each observation dimension is discretised into,
# in the order (x, dx, theta, dtheta). A single bucket means the
# dimension is effectively ignored by the Q-table.
NUM_BUCKETS = (1, 1, 6, 3)
# Position of the action axis in the Q-table (it follows the state axes).
ACTION_INDEX = len(NUM_BUCKETS)
# Size of the discrete action set (left, right).
NUM_ACTIONS = env.action_space.n

# (low, high) limits per observation dimension, taken from the environment...
STATE_BOUNDS = list(zip(env.observation_space.low, env.observation_space.high))
# ...except for the two velocity terms, which are overridden with hand-picked
# finite ranges: dx in [-0.5, 0.5] and dtheta in [-50 deg, +50 deg] (radians).
STATE_BOUNDS[1] = [-0.5, 0.5]
STATE_BOUNDS[3] = [math.radians(-50), math.radians(50)]

In [None]:
## Learning-related constants (lower limits of the decaying schedules)
MIN_EXPLORE_RATE = 0.01
MIN_LEARNING_RATE = 0.1

## Simulation-related constants
NUM_EPISODES = 300   # maximum number of training episodes
MAX_T = 250          # maximum number of steps per episode
STREAK_TO_END = 20   # training stops once the success streak exceeds this
SOLVED_T = 199       # an episode counts as solved when its last step index reaches this
DEBUG_MODE = False   # print per-step details inside simulate()

# Total reward of each finished episode, appended to by simulate()
rList = list()

## Q-table with one zero-initialised entry per (state bucket..., action) combination
q_table = np.zeros((*NUM_BUCKETS, NUM_ACTIONS))
print(q_table)
print(q_table.shape)

In [8]:
def get_explore_rate(t):
    """Return the exploration probability to use after episode index ``t``.

    For ``t`` below 24 the rate is fixed at 1.0 (always explore). From 24
    onwards it decays as ``1 - log10((t + 1) / 25)``, clamped to the range
    ``[MIN_EXPLORE_RATE, 1]``.
    """
    if t >= 24:
        decayed = 1.0 - math.log10((t+1)/25)
        capped = min(1, decayed)
        return max(MIN_EXPLORE_RATE, capped)
    return 1.0

In [9]:
def get_learning_rate(t):
    """Return the Q-learning step size to use after episode index ``t``.

    For ``t`` below 24 the rate is 1.0. From 24 onwards it follows
    ``1 - log10((t + 1) / 25)``, clamped to ``[MIN_LEARNING_RATE, 0.5]``.

    NOTE(review): the warm-up value 1.0 is larger than the 0.5 cap applied
    from ``t == 24`` on, so the rate drops abruptly there -- confirm intended.
    """
    if t >= 24:
        decayed = 1.0 - math.log10((t+1)/25)
        capped = min(0.5, decayed)
        return max(MIN_LEARNING_RATE, capped)
    return 1.0

In [10]:
def state_to_bucket(state):
    """Discretise a continuous observation into a tuple of bucket indices.

    Each component ``state[i]`` is mapped onto ``0 .. NUM_BUCKETS[i] - 1``
    using the limits in ``STATE_BOUNDS[i]``. Values at or beyond a limit are
    clamped to the first or last bucket; values in between are scaled
    linearly and rounded to the nearest bucket index.

    Returns:
        tuple of int, one index per component of ``state``; usable directly
        as an index into ``q_table``.
    """
    indices = []
    for dim, value in enumerate(state):
        bounds = STATE_BOUNDS[dim]
        if value <= bounds[0]:
            # At or below the lower limit: first bucket.
            idx = 0
        elif value >= bounds[1]:
            # At or above the upper limit: last bucket.
            idx = NUM_BUCKETS[dim] - 1
        else:
            # Linear map sending the lower limit to 0 and the upper limit
            # to the last bucket index.
            top = NUM_BUCKETS[dim] - 1
            width = bounds[1] - bounds[0]
            offset = top*bounds[0]/width
            scaling = top/width
            idx = int(round(scaling*value - offset))
        indices.append(idx)
    return tuple(indices)

In [11]:
def simulate():
    """Train the module-level ``q_table`` on ``env`` with tabular Q-learning.

    Runs at most ``NUM_EPISODES`` episodes of at most ``MAX_T`` steps each.
    Actions are chosen epsilon-greedily; the exploration and learning rates
    are refreshed after every episode from ``get_explore_rate`` and
    ``get_learning_rate``. Training stops early once more than
    ``STREAK_TO_END`` consecutive episodes have counted as solved.

    Side effects:
        * updates ``q_table`` in place;
        * appends the total reward of every episode to ``rList`` (episodes
          that use up ``MAX_T`` steps without ``done`` are recorded too);
        * renders each step and prints one line per finished episode.

    Returns:
        None.
    """

    ## Instantiating the learning related parameters
    learning_rate = get_learning_rate(0)
    explore_rate = get_explore_rate(0)
    discount_factor = 0.99  # since the world is unchanging

    # Number of consecutive episodes that counted as solved
    num_streaks = 0

    for episode in range(NUM_EPISODES):
        # Reset the environment and discretise the initial observation
        obv = env.reset()
        state = state_to_bucket(obv)
        rAll = 0

        for t in range(MAX_T):
            env.render()

            # Epsilon-greedy action selection: explore with probability
            # explore_rate, otherwise take the action with the highest q
            if random.random() < explore_rate:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state])

            # Execute the action and observe the result
            new_obv, reward, done, info = env.step(action)
            new_state = state_to_bucket(new_obv)

            # Update Q-Table with new knowledge using learning rate.
            # NOTE(review): the target bootstraps from new_state even when
            # `done` is set; left unchanged to keep the training dynamics.
            q_target = reward + discount_factor * np.amax(q_table[new_state])
            q_table[state + (action,)] += learning_rate * (q_target - q_table[state + (action,)])

            # Setting up for the next iteration
            state = new_state
            rAll += reward

            # Print data (note: `state` already holds the new state here)
            if DEBUG_MODE:
                print("\nEpisode = %d" % episode)
                print("t = %d" % t)
                print("Action: %d" % action)
                print("State: %s" % str(state))
                print("Reward: %f" % reward)
                print("Streaks: %d" % num_streaks)
                print("")

            if done:
                # t is the zero-based index of the last step, so t + 1 steps
                # were actually taken in this episode.
                print("Episode %d finished after %d time steps" % (episode, t + 1))
                # Solved when the last step index reaches SOLVED_T,
                # i.e. the episode lasted at least SOLVED_T + 1 steps.
                if t >= SOLVED_T:
                    num_streaks += 1
                else:
                    num_streaks = 0

                rList.append(rAll)
                break
        else:
            # The step budget ran out before the environment reported `done`:
            # still record the reward so rList has one entry per episode.
            # The streak counter is left untouched in this case.
            rList.append(rAll)

        # It's considered done when it's solved over STREAK_TO_END times consecutively
        if num_streaks > STREAK_TO_END:
            break

        # Update parameters for the next episode
        explore_rate = get_explore_rate(episode)
        learning_rate = get_learning_rate(episode)

In [None]:
if __name__ == "__main__":
    # Run the training loop; the finally clause makes sure the environment is
    # closed even if training raises or the cell is interrupted.
    try:
        simulate()
    finally:
        env.close()

In [None]:
# Bar chart of the total reward collected in each recorded episode,
# in the order the episodes were appended to rList.
fig, ax = plt.subplots()
ax.bar(range(len(rList)), rList, color="blue")
ax.set(
    title="Total reward per episode",
    xlabel="Episode",
    ylabel="Total reward",
)
plt.show()