# OpenAI `gym` env `Taxi-v2`

In [1]:
import gym

In [2]:
env = gym.make('Taxi-v2')

[2017-10-18 15:02:35,690] Making new env: Taxi-v2


In [3]:
print('Current state: {:,}'.format(env.reset()))

Current state: 307


In [4]:
print('Possible states: {:,}'.format(env.observation_space.n))
print('Possible actions: {:,}'.format(env.action_space.n))

Possible states: 500
Possible actions: 6


In [5]:
env.render()

+---------+
|R: | : :[34;1mG[0m|
| : : : : |
| : : : : |
|[43m [0m| : | : |
|Y| : |[35mB[0m: |
+---------+



In [6]:
# setting the current state to 114
env.env.s = 114

In [7]:
env.render()

+---------+
|R: | : :G|
|[43m [0m: : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+



### Possible actions

* 0 - down (south)
* 1 - up (north)
* 2 - right (east)
* 3 - left (west) 
* 4 - pickup
* 5 - dropoff

In [8]:
env.step(1)

(14, -1, False, {'prob': 1.0})

In [9]:
env.render()

+---------+
|[43mR[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (North)


In [10]:
env.step(0)  # move down
env.step(4)  # pick up passanger

(114, -10, False, {'prob': 1.0})

In [11]:
env.render()

+---------+
|R: | : :G|
|[43m [0m: : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (Pickup)


### Brute force solution

In [12]:
state = env.reset()
total_rewards, reward = 0, None
counter = 0

while reward != 20:
    state, reward, done, info = env.step(env.action_space.sample())
    counter += 1  # increment the counter
    total_rewards += reward  # accumulated reward
    if counter >= 5000:   # terminate after 5000 episodes
        print('Giving up after {:,} episodes.'.format(counter))
        break

env.render()

+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35m[42mY[0m[0m| : |B: |
+---------+
  (Dropoff)


## Q Learning

In [13]:
import numpy as np

In [14]:
Q = np.zeros(shape=[env.observation_space.n, env.action_space.n])
print(Q.shape)

(500, 6)


In [15]:
G = 0  # total accumulated reward
alpha = 0.618  # learning rate (Golden ratio)

### Basic Q Learning Algorithm

In [16]:
episodes = 1000
for episode in range(1, episodes):
    done = None
    G, reward = 0, None
    state = env.reset()
    while done != True:  # as long as we ain't done
        action = np.argmax(Q[state])   # select action from Q
        state2, reward, done, info = env.step(action)  # take that action
        Q[state, action] += alpha * (reward + np.max(Q[state2]) - Q[state, action])
        G += reward
        state = state2
    if episode % 50 == 0:
        print('Episode {:,}\tAccumulated reward = {:,}'.format(episode, G))
env.render()

Episode 50	Accumulated reward = -200
Episode 100	Accumulated reward = -7
Episode 150	Accumulated reward = -1
Episode 200	Accumulated reward = 5
Episode 250	Accumulated reward = 11
Episode 300	Accumulated reward = 3
Episode 350	Accumulated reward = 12
Episode 400	Accumulated reward = 11
Episode 450	Accumulated reward = 9
Episode 500	Accumulated reward = 15
Episode 550	Accumulated reward = 5
Episode 600	Accumulated reward = 6
Episode 650	Accumulated reward = 9
Episode 700	Accumulated reward = 10
Episode 750	Accumulated reward = 7
Episode 800	Accumulated reward = 9
Episode 850	Accumulated reward = 6
Episode 900	Accumulated reward = 5
Episode 950	Accumulated reward = 10
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (Dropoff)


In [18]:
Q = np.zeros(shape=[env.observation_space.n, env.action_space.n])
alpha = 0.618
episodes = 1000
for episode in range(1, episodes+1):
    done = False
    G, reward = 0, None
    state = env.reset()
    while done != True:
        action = np.argmax(Q[state])
        state2, reward, done, info = env.step(action)
        Q[state, action] += alpha * (reward + np.max(Q[state2]) - Q[state, action])
        G += reward
        state2 = state
    if episode % 50 == 0:
        print('Episode: {:,}\tAccumulated reward = {:,}'.format(episode, G))
env.render()

Episode: 50	Accumulated reward = -218
Episode: 100	Accumulated reward = -218
Episode: 150	Accumulated reward = -200
Episode: 200	Accumulated reward = -200
Episode: 250	Accumulated reward = -218
Episode: 300	Accumulated reward = -200
Episode: 350	Accumulated reward = -200
Episode: 400	Accumulated reward = -200
Episode: 450	Accumulated reward = -209
Episode: 500	Accumulated reward = -200
Episode: 550	Accumulated reward = -200
Episode: 600	Accumulated reward = -200
Episode: 650	Accumulated reward = -218
Episode: 700	Accumulated reward = -200
Episode: 750	Accumulated reward = -200
Episode: 800	Accumulated reward = -200
Episode: 850	Accumulated reward = -200
Episode: 900	Accumulated reward = -200
Episode: 950	Accumulated reward = -200
Episode: 1,000	Accumulated reward = -200
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)
