In [1]:
import gym
import numpy as np


In [2]:
# Create the MountainCar environment and put it into its initial state.
env = gym.make("MountainCar-v0")
env.reset()

# Q-learning hyperparameters.
LEARNING_RATE = 0.1   # weight given to the new estimate in each Q update
DISCOUNT = 0.95       # weight given to the best future Q value
EPISODES = 25_000     # total number of training episodes
SHOW_EVERY = 2_000    # render and log one episode out of this many

# Epsilon-greedy exploration schedule: epsilon is reduced by a fixed step
# per episode, sized so that it reaches ~0 over the decay window.
epsilon = 0.5
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES // 2
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

# Query the environment for the possible range of each observation component.
for bound in ("high", "low"):
    print(f"env.observation_space.{bound}", getattr(env.observation_space, bound))





env.observation_space.high [0.6  0.07]
env.observation_space.low [-1.2  -0.07]


In [3]:
env.action_space 
# Three discrete actions, indexed from 0: action 0 pushes the car left, action 1 does nothing, action 2 pushes the car right

Discrete(3)

 
` For the value at index 0, we can see the high value is 0.6 and the low is -1.2; for the value at index 1, the high is 0.07 and the low is -0.07. Okay, so these are the ranges, but from one of the above observation states that we output: [-0.27508804 -0.00268013], we can see that these numbers can become quite granular. Can you imagine the size of a Q Table if we were going to have a value for every combination of these ranges out to 8 decimal places? That'd be huge! And, more importantly, it'd be useless. We don't need that much granularity. So, instead, what we want to do is convert these continuous values to discrete values. Basically, we want to bucket/group the ranges into something more manageable.`

In [4]:
# Number of buckets per observation dimension. The observation has
# len(env.observation_space.high) == 2 components, so this is [20, 20].
DISCRETE_OS_SIZE = [20 for _ in env.observation_space.high]

# Width of a single bucket along each observation dimension.
discrete_os_win_size = np.divide(
    env.observation_space.high - env.observation_space.low,
    DISCRETE_OS_SIZE,
)

print(discrete_os_win_size)

[0.09  0.007]


`This tells us how large each bucket is, i.e. how much to increment the range by for each bucket.`

## Creating the Q-table

In [5]:
# One axis per discretized observation dimension, plus a final axis with one
# entry per action; with the settings above this is (20, 20, 3).
q_table_shape = tuple(DISCRETE_OS_SIZE) + (env.action_space.n,)

# Start from random Q values drawn uniformly from [-2, 0).
q_table = np.random.uniform(low=-2, high=0, size=q_table_shape)

print("q_table.shape", q_table.shape)

q_table.shape (20, 20, 3)


`So, this is a 20x20x3 shape, which has initialized random Q values for us. The 20 x 20 bit is every combination of the bucket slices of all possible states. The x3 bit is for every possible action we could take.`

`So these values are random, and the choice to be between -2 and 0 is also a variable. Each step is a -1 reward, and the flag is a 0 reward, so it seems to make sense to make the starting point of random Q values all negative.`

## Next, we need a quick helper-function that will convert our environment "state," which currently contains continuous values that would wind up making our Q-Table absolutely gigantic and take forever to learn.... to a "discrete" state instead:

In [6]:
def get_discrete_state(state):
    """Map a continuous observation to a tuple of Q-table bucket indices.

    Each component is shifted by the observation space's lower bound and
    divided by the bucket width (``discrete_os_win_size``), then truncated
    to an integer.

    A component sitting exactly on the upper bound would otherwise produce
    index ``DISCRETE_OS_SIZE[i]``, one past the last bucket, and raise an
    ``IndexError`` when used on the Q-table. The scaled value is therefore
    clipped to ``[0, DISCRETE_OS_SIZE[i] - 1]`` before truncation; in-range
    observations are unaffected.

    Args:
        state: array-like observation with one entry per observation dimension.

    Returns:
        Tuple of ``np.int32`` bucket indices, usable directly as a Q-table index.
    """
    scaled = (state - env.observation_space.low) / discrete_os_win_size
    # Highest valid bucket index along each dimension.
    last_bucket = np.asarray(DISCRETE_OS_SIZE) - 1
    bucketed = np.clip(scaled, 0, last_bucket)
    return tuple(bucketed.astype(np.int32))

# Reset the environment and bucket the observation it returns.
initial_observation = env.reset()
discrete_state = get_discrete_state(initial_observation)

print(discrete_state)

(6, 10)


In [None]:
# Train the Q-table over EPISODES episodes using an epsilon-greedy policy.
# The environment is closed in a `finally` so that interrupting this
# long-running cell does not leave the render window open.
try:
    for episode in range(EPISODES):
        # Render (and log the episode number) only once every SHOW_EVERY episodes.
        render = episode % SHOW_EVERY == 0
        if render:
            print(episode)

        discrete_state = get_discrete_state(env.reset())

        done = False
        while not done:
            if np.random.random() > epsilon:
                # Exploit: take the action with the highest Q value for this state.
                action = np.argmax(q_table[discrete_state])
            else:
                # Explore: take a random action.
                action = np.random.randint(0, env.action_space.n)

            # Every step returns the new observation (position and velocity),
            # the reward, and whether the episode has ended.
            new_state, reward, done, info = env.step(action)
            new_discrete_state = get_discrete_state(new_state)

            if render:
                env.render()

            state_action = discrete_state + (action,)
            if not done:
                # Episode still running: standard Q-learning update.
                # Maximum possible Q value in the next step (for the new state).
                max_future_q = np.max(q_table[new_discrete_state])
                # Current Q value (for the current state and performed action).
                current_q = q_table[state_action]
                # Blend the old estimate with reward + discounted best future value.
                new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
                q_table[state_action] = new_q
            elif new_state[0] >= env.goal_position:
                # Episode ended because the goal position was reached:
                # set the Q value directly to 0.
                q_table[state_action] = 0
            # If the episode ended for any other reason, the table is left unchanged.

            # Advance to the new state for the next step.
            discrete_state = new_discrete_state

        # Decay epsilon only while the episode index lies inside the decay window.
        # (The previous check compared the two constants with each other, which is
        # always true, so epsilon kept decreasing on every episode and went negative.)
        if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
            epsilon -= epsilon_decay_value
finally:
    env.close()


0
2000
4000
6000
8000


In [None]:
! git add learning_MountainCar.ipynb