In [28]:
import numpy as np
import gym
import warnings
from IPython.display import clear_output
import time
warnings.filterwarnings('ignore')


In [2]:
# Build the Taxi environment. `.env` takes the inner environment object of
# whatever gym.make returns -- presumably to drop gym's wrapper (e.g. its
# per-episode step limit); verify against the installed gym version.
env = gym.make('Taxi-v2').env

In [3]:
# Start a new episode and show the state id twice: once as returned by
# reset(), once as read back from the environment's `s` attribute.
print(env.reset(), env.s, sep='\n')

# Draw the grid for the current state.
env.render()

301
301
+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : : : : |
| : : : : |
|[43m [0m| : | : |
|Y| : |B: |
+---------+



In [4]:
# Report how many discrete actions and states the environment exposes.
print(
    'Action Space: {}'.format(env.action_space.n),
    'State Space: {}'.format(env.observation_space.n),
    sep='\n',
)


Action Space: 6
State Space: 500


In [5]:
# Transition table for state 1.
#
# env.P[state] maps every action id to a list of possible outcomes:
#   {action: [(probability, next_state, reward, done), ...]}
#
# Action ids:
#   0 = south
#   1 = north
#   2 = east
#   3 = west
#   4 = pickup
#   5 = dropoff

env.P[1]

{0: [(1.0, 101, -1, False)],
 1: [(1.0, 1, -1, False)],
 2: [(1.0, 21, -1, False)],
 3: [(1.0, 1, -1, False)],
 4: [(1.0, 17, -1, False)],
 5: [(1.0, 1, -10, False)]}

In [6]:
# Draw one action id at random from the action space.
env.action_space.sample()

4

In [7]:
# Advance the environment by one step using action 1 (north).
# The result is (next_state, reward, done, info); the last element is a dict
# that carries the transition probability under the key 'prob'.
env.step(1)

(201, -1, False, {'prob': 1.0})

In [54]:
class QLearning():
    """Tabular Q-learning agent for an environment with discrete states and actions.

    One Q-value is stored per (state, action) pair in ``q_table``. During
    training the agent acts epsilon-greedily and applies the one-step
    Q-learning update after every transition; outside training it acts
    greedily and leaves the table untouched.

    The environment object must provide ``observation_space.n``,
    ``action_space.n``, ``action_space.sample()``, ``reset()`` returning a
    state index, ``step(action)`` returning
    ``(next_state, reward, done, info)`` and ``render(mode='ansi')``.
    """

    # Default step cap for greedy (non-training) episodes. Greedy action
    # selection is deterministic and the table is not updated outside
    # training, so a table that has not converged can send the agent round a
    # cycle of states forever (an all-zero table always picks action 0).
    # Pass ``max_steps=None`` to remove the cap.
    MAX_GREEDY_STEPS = 200

    def __init__(self, env, alpha=0.1, gamma=0.6, epsilon=0.1):
        """Create an agent with an all-zero Q-table sized from ``env``.

        Args:
            env: environment to interact with (see class docstring).
            alpha: learning rate of the Q-update.
            gamma: discount factor applied to the next state's best Q-value.
            epsilon: probability of taking a random action while training.
        """
        self.env = env
        self.alpha = alpha # learning rate
        self.gamma = gamma # discount factor
        self.epsilon = epsilon # explore or exploit
        self.q_table = np.zeros([env.observation_space.n, env.action_space.n])
        self.timesteps = []  # steps taken by each training episode, in order

    def run_episode(self, training=False, max_steps=None, record_frames=True):
        """Run one episode from ``env.reset()`` until ``done`` or the step cap.

        Args:
            training: if True, act epsilon-greedily and update ``q_table``
                after every step; if False, act greedily without learning.
            max_steps: optional upper bound on the number of steps. ``None``
                means unbounded, i.e. the loop only ends when the
                environment reports ``done``.
            record_frames: if True, call ``env.render(mode='ansi')`` after
                every step and collect the results. Callers that throw the
                frames away should pass False to skip the rendering work.

        Returns:
            ``(timestep, frames)``: the number of steps taken and the list of
            rendered frames (empty when ``record_frames`` is False).
        """
        timestep = 0
        frames = []
        state = self.env.reset()
        done = False

        while not done and (max_steps is None or timestep < max_steps):
            # The random draw only happens while training, so greedy runs
            # never consume numbers from numpy's global RNG.
            if training and np.random.uniform(0, 1) < self.epsilon:
                action = self.env.action_space.sample() # Explore action space
            else:
                action = np.argmax(self.q_table[state]) # Exploit learned values

            next_state, reward, done, _info = self.env.step(action)

            if training:
                # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a'))
                current_val = self.q_table[state, action]
                next_max = np.max(self.q_table[next_state, :])
                td_target = reward + self.gamma * next_max
                self.q_table[state, action] = (1 - self.alpha) * current_val + self.alpha * td_target

            state = next_state
            timestep += 1
            if record_frames:
                frames.append(self.env.render(mode='ansi'))
        return timestep, frames

    def train(self, iterations=1000, print_every=1000, max_steps=None):
        """Run ``iterations`` training episodes, logging progress periodically.

        The length of every episode is appended to ``self.timesteps``.

        Args:
            iterations: number of training episodes.
            print_every: print the episode length every this many episodes;
                0 or None disables printing.
            max_steps: optional per-episode step cap (``None`` = unbounded).
        """
        for i in range(iterations):
            # Frames are never used while training, so skip rendering.
            timestep, _ = self.run_episode(training=True, max_steps=max_steps, record_frames=False)
            self.timesteps.append(timestep)
            if print_every and (i + 1) % print_every == 0:
                print('iter %d. timestep: %.2f' % (i + 1, timestep))

    def eval(self, episodes=100, print_every=10, max_steps=MAX_GREEDY_STEPS):
        """Run greedy episodes and return the mean number of steps per episode.

        Args:
            episodes: number of evaluation episodes; must be non-zero
                (0 raises ZeroDivisionError).
            print_every: print the episode length every this many episodes;
                0 or None disables printing.
            max_steps: per-episode step cap. An episode that hits the cap is
                counted with ``max_steps`` steps. ``None`` removes the cap.

        Returns:
            Total steps divided by ``episodes``.
        """
        total_timestep = 0

        for i in range(episodes):
            timestep, _ = self.run_episode(max_steps=max_steps, record_frames=False)
            total_timestep += timestep
            if print_every and (i + 1) % print_every == 0:
                print('iter %d. timestep: %.2f' % (i + 1, timestep))
        return total_timestep/episodes

    def run_sample(self, delay=.5, max_steps=MAX_GREEDY_STEPS):
        """Play one greedy episode and animate its frames in the notebook output.

        Args:
            delay: seconds to pause after each frame.
            max_steps: step cap for the episode (``None`` = unbounded).
        """
        _, frames = self.run_episode(max_steps=max_steps)
        for frame in frames:
            clear_output(wait=True)
            # A frame is printed via getvalue() when it is a StringIO-like
            # object and as-is otherwise (assumption: some gym versions hand
            # back a plain string from render(mode='ansi') -- verify).
            print(frame.getvalue() if hasattr(frame, 'getvalue') else frame)
            time.sleep(delay)
                

In [57]:
# Fresh agent with the default hyper-parameters, trained for 30000 episodes.
agent = QLearning(env=env)
agent.train(iterations=30000)

iter 1000. timestep: 19.00
iter 2000. timestep: 14.00
iter 3000. timestep: 18.00
iter 4000. timestep: 13.00
iter 5000. timestep: 18.00
iter 6000. timestep: 11.00
iter 7000. timestep: 14.00
iter 8000. timestep: 21.00
iter 9000. timestep: 18.00
iter 10000. timestep: 10.00
iter 11000. timestep: 14.00
iter 12000. timestep: 16.00
iter 13000. timestep: 11.00
iter 14000. timestep: 15.00
iter 15000. timestep: 15.00
iter 16000. timestep: 15.00
iter 17000. timestep: 18.00
iter 18000. timestep: 10.00
iter 19000. timestep: 15.00
iter 20000. timestep: 11.00
iter 21000. timestep: 19.00
iter 22000. timestep: 17.00
iter 23000. timestep: 14.00
iter 24000. timestep: 14.00
iter 25000. timestep: 13.00
iter 26000. timestep: 13.00
iter 27000. timestep: 10.00
iter 28000. timestep: 16.00
iter 29000. timestep: 9.00
iter 30000. timestep: 15.00


In [58]:
# Greedy evaluation over 100 episodes. eval() returns a float mean, so it is
# printed with two decimals; '%d' would silently truncate e.g. 12.9 to 12.
avg_timesteps = agent.eval(100)
print('Average timesteps: %.2f' % avg_timesteps)

iter 10. timestep: 8.00
iter 20. timestep: 16.00
iter 30. timestep: 10.00
iter 40. timestep: 16.00
iter 50. timestep: 11.00
iter 60. timestep: 14.00
iter 70. timestep: 11.00
iter 80. timestep: 13.00
iter 90. timestep: 11.00
iter 100. timestep: 14.00
Average timesteps: 12


In [60]:
# Replay one greedy episode frame by frame; the output cell is cleared between
# frames, so only the final frame remains once the cell finishes.
agent.run_sample()

+---------+
|R: | : :[35m[42mG[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

