In [1]:
import gym
import numpy as np


In [2]:
# Build the Taxi environment and start a first episode.
env = gym.make("Taxi-v2")
# reset() hands back the starting state of the new episode.
state = env.reset()

[2017-07-13 20:43:05,678] Making new env: Taxi-v2


In [3]:
# Show the state returned by reset() next to the environment's own rendering of it.
print(state)
env.render()

249
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



<h1>Possible Actions</h1>

down (0), up (1), right (2), left (3), pick-up (4), and drop-off (5)

In [4]:
# Sizes of the discrete observation and action spaces; these give the
# dimensions of the Q-table built later in the notebook.
n_states = env.observation_space.n
n_actions = env.action_space.n

<h1>How well does a completely random agent do?</h1>

In [5]:
# Random baseline: keep sampling actions uniformly from the action space
# until a step returns a reward of 20, counting steps and summing rewards.
state = env.reset()
counter, g = 0, 0
while True:
    random_action = env.action_space.sample()
    state, reward, done, info = env.step(random_action)
    counter = counter + 1
    g = g + reward
    if reward == 20:
        # The step that pays 20 is the one this notebook treats as "solved".
        break
print("Solved in {} Steps with a total reward of {}".format(counter, g))


Solved in 3142 Steps with a total reward of -12229


<h1>Let's look at just one episode and see how the Q-values change after each step, using the formula below</h1>

In [6]:
# Q-table: one row per state, one column per action, initialised to zero.
Q = np.zeros([n_states, n_actions])

# Per-step snapshots of the Q-table. They are collected in a list and stacked
# once after the loop; calling np.dstack on the growing history inside the loop
# would re-copy every earlier snapshot on every step (quadratic in step count).
q_snapshots = []

episodes = 1
G = 0
alpha = 0.618  # step size applied to each temporal-difference update

for episode in range(1, episodes + 1):
    done = False
    G, reward = 0, 0
    state = env.reset()
    firstState = state
    print("Initial State = {}".format(state))
    # Run until a step returns a reward of 20 (the notebook's "solved" signal).
    while reward != 20:
        # Greedy action for the current state; ties resolve to the lowest index.
        action = np.argmax(Q[state])
        state2, reward, done, info = env.step(action)
        # Undiscounted Q-learning update toward reward + best next-state value.
        Q[state, action] += alpha * (reward + np.max(Q[state2]) - Q[state, action])
        G += reward
        state = state2

        # Store a copy: Q itself keeps being mutated in place.
        q_snapshots.append(Q.copy())

# History of Q-values with shape (n_states, n_actions, n_steps).
if q_snapshots:
    Q_hist = np.dstack(q_snapshots)
else:
    Q_hist = np.zeros([n_states, n_actions, 0])
finalState = state
print("Final State = {}".format(finalState))


Initial State = 321
Final State = 97


<img src="qlearn.png">

<h1>Let's look at the first step:</h1>

In [7]:
# Q-values of the episode's starting state as recorded in the first snapshot
# (index 0 along the history axis), i.e. right after the first update.
print(firstState)
Q_hist[firstState,:,0]

321


array([-0.618,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ])

<h1>Let's look at the final step:</h1>

In [8]:
# (n_states, n_actions, number of recorded steps)
Q_hist.shape

(500, 6, 491)

In [9]:
# Q-values of the state the episode ended in, read from the most recent
# snapshot along the history axis.
print(finalState)
Q_hist[finalState, :, -1]

97


array([ -0.618,  -0.618,  -0.618,  -0.618,  -6.18 ,  12.36 ])

<h1>Do you expect the Q-table to have a good value for the second-to-last step?</h1>

In [10]:
# Second-to-last snapshot of the Q-table. Reducing over axis 0 (the state axis)
# yields one entry per action: the index of the state whose Q-value for that
# action is the largest.
Q_hist[..., Q_hist.shape[2] - 2].argmax(axis=0)


array([0, 0, 0, 0, 0, 0])

<h1>No, but it definitely knows which actions it thinks are the worst</h1>

In [11]:
# Same second-to-last snapshot, now reduced with argmin over axis 0 (the state
# axis): for each action, the index of the state holding its smallest Q-value.
Q_hist[..., Q_hist.shape[2] - 2].argmin(axis=0)

array([401, 401, 301,   1,  17,   1])

<h1>Let's run over multiple episodes so that we can converge on an optimal policy</h1>

In [12]:
# Continue training the existing Q-table over many episodes.
episodes = 2000
# Total reward of every episode, in order. It is appended to at the end of each
# episode so the learning curve can be inspected after training.
rewardTracker = []

G = 0
alpha = 0.618  # step size applied to each temporal-difference update

for episode in range(1, episodes + 1):
    done = False
    G, reward = 0, 0
    state = env.reset()
    # One episode: act greedily on Q until the environment reports done.
    while done != True:
        action = np.argmax(Q[state])
        state2, reward, done, info = env.step(action)
        # Undiscounted Q-learning update toward reward + best next-state value.
        Q[state, action] += alpha * (reward + np.max(Q[state2]) - Q[state, action])
        G += reward
        state = state2

    rewardTracker.append(G)

    # Progress report every 100 episodes.
    if episode % 100 == 0:
        print('Episode {} Total Reward: {}'.format(episode, G))
    


Episode 100 Total Reward: -69
Episode 200 Total Reward: -46
Episode 300 Total Reward: 9
Episode 400 Total Reward: -18
Episode 500 Total Reward: 11
Episode 600 Total Reward: 8
Episode 700 Total Reward: 8
Episode 800 Total Reward: 8
Episode 900 Total Reward: 14
Episode 1000 Total Reward: 7
Episode 1100 Total Reward: 7
Episode 1200 Total Reward: 9
Episode 1300 Total Reward: 8
Episode 1400 Total Reward: 9
Episode 1500 Total Reward: 9
Episode 1600 Total Reward: 8
Episode 1700 Total Reward: 9
Episode 1800 Total Reward: 9
Episode 1900 Total Reward: 6
Episode 2000 Total Reward: 6


<h1>Now that we have learned the optimal Q-values, we have an optimal policy and no longer need to train the agent</h1>

In [13]:
# Roll out one episode with the learned table: no updates, just follow the
# highest-valued action in each state and render every step.
state = env.reset()
done = False

while not done:
    # Greedy choice: the column with the largest Q-value for this state.
    action = np.argmax(Q[state])
    state, reward, done, info = env.step(action)
    env.render()

+---------+
|R: | : :G|
| : : : : |
| :[43m [0m: : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|R: | : :G|
| : : : : |
| : : :[43m [0m: |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : |[43m [0m: |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1m[43mB[0m[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[42mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : |[42m_[0m: |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :G|
| : : : : |
| : : :[42m_[0m: |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :G|
| : : : : |
| : :[42m_[0