# Setup Environment

In [8]:
import gym
import numpy as np
from gym import spaces
env = gym.make("CartPole-v0")
env.reset()

array([-0.03135789,  0.02763279, -0.03062558,  0.02469429])

In [37]:
qvals = {a: Q[env.reset(), a] for a in actions}
qvals
actions

range(0, 2)

# Warpper

Most gym environment have space in multi-dimension (Box format). To save memory, we apply a wrapper to discretize the observation. 

env.unwrapped will give back the internal original environment object.

In [33]:
class DiscretizedObservationWrapper(gym.ObservationWrapper):
    """This wrapper converts a Box observation into a single integer.
    """
    def __init__(self, env, n_bins=10, low=None, high=None):
        super().__init__(env)
        assert isinstance(env.observation_space, gym.spaces.Box)

        low = self.observation_space.low if low is None else low
        high = self.observation_space.high if high is None else high

        self.n_bins = n_bins
        self.val_bins = [np.linspace(l, h, n_bins + 1) for l, h in
                         zip(low, high)]
        self.observation_space = gym.spaces.Discrete(n_bins ** len(low))

    def _convert_to_one_number(self, digits):
        return sum([d * ((self.n_bins + 1) ** i) for i, d in enumerate(digits)])

    def observation(self, observation):
        digits = [np.digitize([x], bins)[0]
                  for x, bins in zip(observation, self.val_bins)]
        return self._convert_to_one_number(digits)


env = DiscretizedObservationWrapper(
    env, 
    n_bins=0, 
    low=[0,0,0,0], 
    high=[1,1,1,1]
)

AssertionError: 

In [36]:
env.reset()
env.reset()

4

In [25]:
low=[0,0,0,0], 
high=[1,1,1,1]
val_bins = [np.linspace(l, h, 1 + 1) for l, h in
                         zip(low, high)]
val_bins


[array([[0., 0., 0., 0.],
        [1., 1., 1., 1.]])]

In [29]:

for x, bins in zip(env.reset(), val_bins):
    np.digitize([x], bins)[0]
 

ValueError: object too deep for desired array

In [28]:
from collections import defaultdict

Q = defaultdict(float)

gamma = 0.99  # Discounting factor
alpha = 0.5  # soft update param
actions = range(env.action_space.n) # all possible actions

# Calculate Q Value    
def update_Q(s, r, a, s_next, done):
    # s: observations
    # r: reward 
    # a: actions
    # s_next: same as s
    # done: bool
    
    max_q_next = max([Q[s_next, a] for a in actions])  # get max Q value of (s', a')
    
    # Do not include the next state's value if currently at the terminal state.
    Q[s, a] += alpha * (r + gamma * max_q_next * (1.0 - done) - Q[s, a]) # the Bellman equation to update Q value. 
    # If done is True, Q(s,a) = (1-alpha)*Q(s,a)



## Pick Action 

Action is decided based on max Q value and we use ε-greedy to force exploration.

In [5]:
epsilon = 0.1  # 10% chances to apply a random action

def act(ob): 
    if np.random.random() < epsilon:
        # action_space.sample() is a convenient function to get a random action
        # that is compatible with this given action space.
        return env.action_space.sample()

    # Pick the action with highest q value.
    qvals = {a: Q[ob, a] for a in actions}
    max_q = max(qvals.values())  # get best Q value 
    # In case multiple actions have the same maximum q value.
    actions_with_max_q = [a for a, q in qvals.items() if q == max_q] # range(2); max_q
    return np.random.choice(actions_with_max_q)



## Result

Compare the total reward using Naive Q-learning and random play.

In [12]:
NUM_EPISODES = 10000
rewards = []
for e in range(NUM_EPISODES):
    
    ob = env.reset()
     # reward for each round 
    reward = 0.0 # total reward in one round

    while True:
        a = act(ob)
        ob_next, r, done, _ = env.step(a)
        update_Q(ob, r, a, ob_next, done)
        reward += r
        ob = ob_next
        
        if done:
            #print(done)
            break
    #print(1)     
    rewards.append(reward)   

    


In [15]:
rewards_r = [] # reward for each round 
for e in range(NUM_EPISODES):
    
    ob = env.reset()
    
    reward = 0.0 # total reward in one round

    while True:
    #env.render()
        ob_next, r, done, _ = env.step(env.action_space.sample())
        reward += r
        ob = ob_next
        
        if done:
            break
            
            
    
    rewards_r.append(reward)
env.close()

In [None]:
'''
n_steps = 10000

ob = env.reset()
rewards_r = [] # reward for each round 
reward = 0.0 # total reward in one round
for step in range(n_steps):
    #env.render()
    
    ob_next, r, done, _ = env.step(env.action_space.sample())
    reward += r
    if done:
        rewards_r.append(reward)
        reward = 0.0
        ob = env.reset()
    else:
        ob = ob_next
        
env.close()
'''

In [None]:
import seaborn as sns
sns.boxplot(x=rewards).set_title('reward for naive_q')

In [None]:
sns.boxplot(x=rewards_r).set_title('reward for random')

In [16]:
import pandas as pd
df = pd.DataFrame(rewards) 
df.to_csv('Q.csv')  
df = pd.DataFrame(rewards_r) 
df.to_csv('Random.csv') 


As we can see from the plot, there is a significant increase on reward after applying Naive Q_learning method. 