# Cart Pole: An explicit policy

In [12]:
import gym
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

import time

## Setup environment

In [13]:
# Instantiate the task once; the cells below all act on this shared `env` object.
ENV_ID = 'CartPole-v1'
env = gym.make(ENV_ID)

In [14]:
# Readable names for the two discrete actions the policy can return.
LEFT, RIGHT = 0, 1

In [15]:
# Prime a fresh episode at notebook scope: initial observation, nothing
# collected yet, and not finished. (run_episode() does its own reset as well.)
state, total_reward, done = env.reset(), 0, False

## Policy

If the pole's angular velocity is in the same direction it is leaning, the cart should move in that same direction so that it gets back underneath the pole; e.g., if the pole is leaning left and falling left, the cart should move left. If the pole's angular velocity is in the opposite direction of its angle (e.g., leaning left with a positive velocity), the cart should move in the direction of the velocity of the pole in order to slow it down.

A small cushion on the velocity threshold (±0.02) improves the behavior of the cart by briefly delaying its change of direction. I chose 0.02 after brief experimentation, starting at 0.01 and increasing it until the reward reached a maximum.

In [50]:
def get_action(state, vel_cushion=0.02):
    """Pick a push direction from the pole's lean and angular velocity.

    The cart is pushed toward the side the pole is leaning to, unless the
    pole is already swinging back toward the other side faster than
    ``vel_cushion``; in that case the cart follows the pole's velocity.

    Parameters
    ----------
    state : sequence of four numbers
        ``(cart_pos, cart_vel, pole_angle, pole_vel)``. Only the pole angle
        and pole velocity are used.
    vel_cushion : float, optional
        Half-width of the dead band on the pole velocity. Within it the cart
        keeps pushing toward the lean, which delays its change of direction.
        Defaults to 0.02, the value previously hard-coded here.

    Returns
    -------
    int
        ``LEFT`` or ``RIGHT``.
    """
    # Cart position and cart velocity are unpacked but do not affect the decision.
    _cart_pos, _cart_vel, pole_angle, pole_vel = state

    if pole_angle <= 0:
        # Leaning left (or exactly upright): keep pushing left until the pole
        # is clearly swinging back to the right.
        return LEFT if pole_vel <= vel_cushion else RIGHT

    # Leaning right: mirror image of the case above. This is a plain
    # fall-through rather than `elif pole_angle > 0`, so the function always
    # returns an action instead of an implicit None.
    return RIGHT if pole_vel >= -vel_cushion else LEFT

## Run!

In [51]:
def run_episode():
    """Play one episode on the global ``env`` using ``get_action``.

    The environment is reset, then stepped with the policy's action until
    ``env.step`` reports that the episode is done. The result of
    ``env.reset()`` is passed straight to ``get_action`` and ``env.step`` is
    unpacked as a 4-tuple ``(state, reward, done, info)``.

    Returns
    -------
    The sum of the per-step rewards collected during the episode.
    """
    observation = env.reset()
    episode_return = 0

    # At least one step is always taken; stop as soon as the env says so.
    while True:
        action = get_action(observation)
        observation, step_reward, finished, _ = env.step(action)
        episode_return += step_reward
        if finished:
            break

    return episode_return

In [52]:
num_episodes = 5

# Play the episodes back to back, echoing each score as soon as it is known.
rewards = []
for _ in range(num_episodes):
    rewards.append(run_episode())
    print(rewards[-1])

# Mean episode reward over the whole batch.
avg_reward = np.mean(rewards)
print("Averaged {} over {} episodes".format(avg_reward, num_episodes))

500.0
500.0
500.0
500.0
500.0
Averaged 500.0 over 5 episodes
