<a href="https://colab.research.google.com/github/xanasa14/MLImplementations/blob/master/Q_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gym
import random

# One seed shared by every random source the notebook draws from, so that
# Restart & Run All reproduces the same training run.
SEED = 1234
random.seed(SEED)  # drives the random.uniform() explore/exploit draw during training

streets = gym.make("Taxi-v3").env #New versions keep getting released; if -v3 doesn't work, try -v2 or -v4
# Seeding Python's `random` module does not reach Gym: the environment (start
# states chosen by reset()) and the action space (sample()) each keep their own
# generator, so they have to be seeded explicitly as well.
# Uses the pre-0.26 Gym API, matching the 4-tuple step() calls used below.
streets.seed(SEED)
streets.action_space.seed(SEED)
streets.render()

+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |B: |
+---------+



In [2]:
# Build the integer id of one hand-picked situation and force the environment
# into it, so the cells below can inspect a known state.
# Judging by the rendering: taxi at row 2 / column 3, passenger waiting at
# location 2, destination at location 0 (presumably R, G, Y, B are numbered 0-3
# -- confirm against the Taxi documentation).
streets.s = initial_state = streets.encode(2, 3, 2, 0)

# Draw the grid for the state we just set.
streets.render()

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



In [3]:
# Show the environment's transition table for the hand-picked state: one entry
# per action id, each holding a list of outcome tuples. The fields look like
# (probability, next_state, reward, done) -- confirm against the Gym docs.
streets.P[initial_state]

{0: [(1.0, 368, -1, False)],
 1: [(1.0, 168, -1, False)],
 2: [(1.0, 288, -1, False)],
 3: [(1.0, 248, -1, False)],
 4: [(1.0, 268, -10, False)],
 5: [(1.0, 268, -10, False)]}

In [5]:
import numpy as np

# Tabular Q-learning: one row per state, one column per action, all values
# starting at zero.
q_table = np.zeros([streets.observation_space.n, streets.action_space.n])

learning_rate = 0.1     # share of each new estimate blended into the table
discount_factor = 0.6   # weight of the best value reachable from the next state
exploration = 0.1       # probability of taking a random action
epochs = 10000          # number of training episodes

for taxi_run in range(epochs):
    state, done = streets.reset(), False

    while not done:
        # Epsilon-greedy choice: occasionally sample a random action,
        # otherwise take the action with the highest current q-value.
        explore = random.uniform(0, 1) < exploration
        action = streets.action_space.sample() if explore else np.argmax(q_table[state])

        next_state, reward, done, info = streets.step(action)

        # Move the stored value towards the observed reward plus the
        # discounted best value of the state we landed in.
        target = reward + discount_factor * np.max(q_table[next_state])
        q_table[state, action] = (1 - learning_rate) * q_table[state, action] + learning_rate * target

        state = next_state
        
        

In [7]:
# Learned q-values for the hand-picked state, one per action id; the greedy
# policy used later picks the column with the largest value.
q_table[initial_state]

array([-2.41763488, -2.42364715, -2.4246418 , -2.3639511 , -7.13926775,
       -6.85431135])

In [32]:
from IPython.display import clear_output
import time

# Replay the learned greedy policy for ten trips, redrawing the grid after every
# step and recording how long each trip took in wall-clock seconds.
MAX_STEPS_PER_TRIP = 25   # give up on a trip that has not finished by then
PAUSE_BETWEEN_TRIPS = 2   # seconds the final frame of a trip stays on screen

stepsTime = []  # one "<trip number> <elapsed seconds>" string per trip

for tripnum in range(1, 11):
    state = streets.reset()

    done = False
    trip_length = 0
    start = time.time()

    while not done and trip_length < MAX_STEPS_PER_TRIP:
        # Always exploit: take the best-known action for the current state.
        action = np.argmax(q_table[state])
        next_state, reward, done, info = streets.step(action)

        # Replace the previous frame with the freshly rendered grid.
        clear_output(wait=True)
        print("Trip number " + str(tripnum) + " Step " + str(trip_length))
        print(streets.render(mode='ansi'))

        state = next_state
        trip_length += 1

    # The measured time covers rendering and output clearing, not just the
    # policy lookups and environment steps.
    elapsed_time_fl = (time.time() - start)
    print(elapsed_time_fl)
    stepsTime.append(str(tripnum) +  " " + str(elapsed_time_fl))

    # Only `time` is imported, so the call must be qualified; a bare sleep()
    # raises NameError on a fresh kernel.
    time.sleep(PAUSE_BETWEEN_TRIPS)

Trip number 10 Step 14
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)

0.02913188934326172


In [31]:
# Display the per-trip timings collected by the replay loop, one
# "<trip number> <elapsed seconds>" string per trip.
stepsTime

['1 0.052512407302856445',
 '2 0.012914657592773438',
 '3 0.02830672264099121',
 '4 0.031193256378173828',
 '5 0.05662202835083008',
 '6 0.023981809616088867',
 '7 0.03397560119628906',
 '8 0.014972925186157227',
 '9 0.03475522994995117',
 '10 0.022643327713012695']