### This program solves the Mountain Car problem using tabular Q-learning (reinforcement learning)
 - MountainCar-v0 defines "solving" as reaching an average reward of -110.0 over 100 consecutive trials
 
 - Q-learning formula -

 <img src= "q_learning.svg" align="left">

In [8]:
import gym
import numpy as np

In [9]:
# Create the MountainCar-v0 environment used by every cell below.
env = gym.make("MountainCar-v0")

In [10]:
# Number of discrete actions the environment exposes; it is also used as the
# size of the last axis of the Q-table.
action_size = env.action_space.n
print(action_size)

3


In [11]:
# Print the upper and lower bounds of the observation space, followed by the
# goal position that the training loop compares against to detect success.
print(env.observation_space.high)
print(env.observation_space.low)
print(env.goal_position)

[0.6  0.07]
[-1.2  -0.07]
0.5


In [12]:
# Discretisation granularity: every observation dimension is split into N
# equal-width buckets.
N = 20
# Per-dimension bucket width = (upper bound - lower bound) / N.
windth_of_buckets = np.subtract(env.observation_space.high, env.observation_space.low) / N
print(windth_of_buckets)

[0.09  0.007]


In [13]:
# Given a continuous state, transform it into the corresponding discrete bucket indices.
def getDiscreteValue(state, low=None, bucket_width=None, num_buckets=None):
    """Map a continuous observation onto integer bucket indices.

    Each dimension is shifted by its lower bound, divided by the bucket width
    and truncated to an integer. The result is clipped to
    ``[0, num_buckets - 1]`` so that an observation sitting exactly on the
    upper bound (or slightly outside the bounds) still yields a valid
    Q-table index instead of ``num_buckets``.

    Args:
        state: array-like observation, one entry per dimension.
        low: per-dimension lower bound; defaults to ``env.observation_space.low``.
        bucket_width: per-dimension bucket width; defaults to ``windth_of_buckets``.
        num_buckets: number of buckets per dimension; defaults to ``N``.

    Returns:
        Integer numpy array with one bucket index per dimension.
    """
    # Fall back to the notebook-level configuration when not overridden.
    if low is None:
        low = env.observation_space.low
    if bucket_width is None:
        bucket_width = windth_of_buckets
    if num_buckets is None:
        num_buckets = N

    # The former "% N" wrapped the offset by the bucket *count*, which is not
    # a meaningful operation on a position/velocity offset; it was dropped.
    indices = ((np.asarray(state) - low) / bucket_width).astype(int)
    return np.clip(indices, 0, num_buckets - 1)

In [14]:
# Q-table with one entry per (position bucket, velocity bucket, action),
# i.e. shape N x N x action_size, filled with uniform random values in [-2, 0).
q_table = np.random.uniform(-2, 0, (N, N, action_size))

In [19]:
# --- Q-learning hyper-parameters ---
# alpha: learning rate, gemma: discount factor
alpha, gemma = 0.1, 0.99

# --- Training length ---
num_of_episode = 7000

# --- Epsilon-greedy exploration schedule ---
max_epsilon = 1.0   # exploration probability at the start of training
min_epsilon = 0.01  # floor for the exploration probability
decay_rate = 0.01   # exponential decay rate applied per episode
epsilon = 1.0       # current exploration rate

In [20]:
# Train the agent with tabular Q-learning and an epsilon-greedy behaviour policy.
# NOTE(review): relies on the classic gym API, where reset() returns only the
# observation and step() returns a 4-tuple.
for episode in range(num_of_episode):

    # Reset the environment and discretize the initial state
    state = env.reset()
    position, velocity = getDiscreteValue(state)

    done = False

    # Render the last 10 episodes. ">=" is required: with ">" only the last
    # 9 episodes were rendered.
    render = episode >= (num_of_episode - 10)

    while not done:

        if render:
            env.render()

        # Choose an action according to the epsilon-greedy policy.
        # If the random number is below epsilon, explore with a random action.
        if np.random.uniform(0, 1) < epsilon:
            # randint's upper bound is exclusive, so pass the full action
            # count; the former randint(0, 2) could never pick action 2.
            action = np.random.randint(0, env.action_space.n)

        # Otherwise exploit: take the action with the highest Q-value
        else:
            action = np.argmax(q_table[position, velocity, :])

        # Take a step
        new_state, reward, done, info = env.step(action)

        # Discretize the new state
        new_position, new_velocity = getDiscreteValue(new_state)

        # Update Q(s,a) := Q(s,a) + alpha * [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        if not done:
            current_q = q_table[position, velocity, action]
            best_next_q = np.max(q_table[new_position, new_velocity])
            q_table[position, velocity, action] = current_q + alpha * (
                reward + gemma * best_next_q - current_q
            )

        # Episode ended because the goal was reached: pin the Q-value to 0
        elif new_state[0] >= env.goal_position:
            q_table[position, velocity, action] = 0

        # The new state becomes the current state
        position = new_position
        velocity = new_velocity

    # Decay epsilon exponentially with the episode index, bounded below by min_epsilon
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

print("End Training")

End Training


In [18]:
# See how the agent performs after training: run greedy episodes (no
# exploration, no Q-table updates) and report the average reward per episode.
num_of_episode_after_training = 100
total_reward = 0

for episode in range(num_of_episode_after_training):

    state = env.reset()
    position, velocity = getDiscreteValue(state)

    done = False

    while not done:

        # Take the action with the maximum expected future reward for this state
        action = np.argmax(q_table[position, velocity, :])

        new_state, reward, done, info = env.step(action)

        # The discretized new state becomes the current state
        position, velocity = getDiscreteValue(new_state)

        total_reward += reward

# Report with the configured episode count rather than a hard-coded "100"
print("Average Reward of {} games is {}".format(
    num_of_episode_after_training, total_reward / num_of_episode_after_training))
env.close()

Average Reward of 100 games is -142.96
