In [11]:
# Import libraries

import numpy as np # arrays/matrices (used for the Q-table)
import gym # environment

In [12]:
# Create the environment and inspect its observation/action spaces

env = gym.make('Taxi-v2') # build the Taxi environment
obs_space, act_space = env.observation_space, env.action_space # the two spaces we want to look at
print('Observation space:', obs_space) # shows the space of possible observations (states)
print('Number of possible actions:', act_space) # shows the action space; its size is the number of actions

[2018-12-10 14:32:12,842] Making new env: Taxi-v2


Observation space: Discrete(500)
Number of possible actions: Discrete(6)


In [13]:
env.reset() # initializes the environment; returns the initial state (displayed as the cell output)

109

In [14]:
env.render() # draws the environment’s current state as a text grid
env.close() # author's note: prevents crashing (the reason is not evident from this notebook)

+---------+
|R: | : :[35mG[0m|
|[43m [0m: : : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



In [15]:
random_action = env.action_space.sample() # draw a random action from the action space
env.step(random_action) # perform it; returns (state, reward, whether the environment is terminated, info)

(109, -10, False, {'prob': 1.0})

In [16]:
env.render() # redraw the grid after the step; the last action taken is shown in parentheses below it
env.close() # same render-then-close pairing as used earlier in the notebook

+---------+
|R: | : :[35mG[0m|
|[43m [0m: : : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (Dropoff)


In [17]:
# Implement Q learning algorithm

n_states, n_actions = env.observation_space.n, env.action_space.n # table dimensions
Q = np.zeros((n_states, n_actions)) # Q action-value table: one row per state, one column per action, all zeros

In [18]:
alpha = 0.618 # learning rate: how far each update moves a Q value (0.618 chosen as an example)
G = 0 # running total of reward; starts at 0

In [19]:
for episode in range(1, 1001): # 1000 training episodes (games), numbered from 1
    state = env.reset() # every episode begins from a fresh initial state
    G, reward = 0, 0 # episode total and latest step reward both start at 0
    done = False # set by env.step once the episode is over
    while not done: # keep acting until the episode finishes
        action = np.argmax(Q[state]) # greedy choice: the action with the highest Q value in this state
        state2, reward, done, info = env.step(action) # execute it; get resulting state, reward, done flag, info
        target = reward + np.max(Q[state2]) # reward plus the best Q value of the resulting state
                                            # (no discount factor is applied)
        Q[state, action] += alpha * (target - Q[state, action]) # move the estimate toward the target by a
                                                                # fraction alpha (Bellman-style update)
        G += reward # accumulate this step's reward into the episode total
        state = state2 # the resulting state becomes the current state
    if episode % 50 == 0: # report progress every 50th episode
        print('Episode {} Total Reward: {}'.format(episode, G))

Episode 50 Total Reward: -200
Episode 100 Total Reward: -87
Episode 150 Total Reward: 4
Episode 200 Total Reward: 3
Episode 250 Total Reward: -43
Episode 300 Total Reward: 9
Episode 350 Total Reward: 5
Episode 400 Total Reward: 15
Episode 450 Total Reward: 6
Episode 500 Total Reward: 10
Episode 550 Total Reward: 5
Episode 600 Total Reward: 3
Episode 650 Total Reward: 9
Episode 700 Total Reward: 6
Episode 750 Total Reward: 7
Episode 800 Total Reward: 8
Episode 850 Total Reward: 3
Episode 900 Total Reward: 8
Episode 950 Total Reward: 6
Episode 1000 Total Reward: 14


In [20]:
# Play one episode with the learned Q-table, rendering every step
state = env.reset() # fresh initial state
G, reward = 0, 0 # episode total and latest step reward
done = False
env.render() # show the starting grid before any action is taken
while not done:
    action = np.argmax(Q[state]) # follow the greedy policy from the Q-table (no learning here)
    state2, reward, done, info = env.step(action)
    env.render() # show the grid after the action
    print('Reward: {}'.format(reward))
    G += reward
    state = state2 # advance to the resulting state
print('Total Reward: {}'.format(G))

+---------+
|R: | : :[34;1mG[0m|
| : : : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[35mB[0m: |
+---------+

+---------+
|R: | : :[34;1mG[0m|
| : : : : |
| : : : :[43m [0m|
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
Reward: -1
+---------+
|R: | : :[34;1mG[0m|
| : : : :[43m [0m|
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
Reward: -1
+---------+
|R: | : :[34;1m[43mG[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
Reward: -1
+---------+
|R: | : :[42mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (Pickup)
Reward: -1
+---------+
|R: | : :G|
| : : : :[42m_[0m|
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (South)
Reward: -1
+---------+
|R: | : :G|
| : : :[42m_[0m: |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
Reward: -1
+---------+
|R: | : :G|
| : : : : |
| : : :[42m_[0m: |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (South)
Rewa