In [47]:
import numpy as np
import gym
import random
env = gym.make("Taxi-v3")
env.reset()

#Info about space
print("Action space:", env.action_space)
# Action space: Discrete(6)
print("Observation space:", env.observation_space)
# Observation space: Discrete(500)
#500 states

action_size = env.action_space.n
print("Action size ", action_size)

state_size = env.observation_space.n
print("State size ", state_size)

#Q table
qtable = np.zeros((state_size, action_size))
print(qtable)

#  hyperparameters
total_episodes = 50000        # Total episodes
total_test_episodes = 100     # Total test episodes
max_steps = 99                # Max steps per episode

learning_rate = 0.7           # Learning rate
gamma = 0.618                 # Discounting rate

# Exploration parameters
global epsilon 
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 0.01             # Exponential decay rate for exploration prob

def learn():
    epsilon = 1.0                 # Exploration rate
    for episode in range(total_episodes):
        # Reset the environment
        state = env.reset()
        step = 0
        done = False

        for step in range(max_steps):
            # 3. Choose an action a in the current world state (s)
            ## First we randomize a number
            exp_exp_tradeoff = random.uniform(0,1)

            ## If this number > greater than epsilon --> exploitation (taking the biggest Q value for this state)
            if exp_exp_tradeoff > epsilon:
                action = np.argmax(qtable[state,:])

            # Else doing a random choice --> exploration
            else:
                action = env.action_space.sample()

            # Take the action (a) and observe the outcome state(s') and reward (r)
            new_state, reward, done, info = env.step(action)

            # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
            qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * 
                                        np.max(qtable[new_state, :]) - qtable[state, action])

            # Our new state is state
            state = new_state

            # If done : finish episode
            if done == True: 
                break
    
        # Reduce epsilon (because we need less and less exploration)
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode) 



    
def play_with_q_table():
    env.reset()
    rewards = []
    for episode in range(total_test_episodes):
        state = env.reset()
        step = 0
        done = False
        total_rewards = 0
        #print("****************************************************")
        #print("EPISODE ", episode)

        for step in range(max_steps):
            # UNCOMMENT IT IF YOU WANT TO SEE OUR AGENT PLAYING
            #env.render()
            # Take the action (index) that have the maximum expected future reward given that state
            action = np.argmax(qtable[state,:])

            new_state, reward, done, info = env.step(action)

            total_rewards += reward

            if done:
                rewards.append(total_rewards)
                #print ("Score", total_rewards)
                break
            state = new_state
    print ("Score over time with q table: " +  str(sum(rewards)/total_test_episodes))
        

def play_with_random_agent():
    env.reset()
    rewards = []
    for episode in range(total_test_episodes):
        state = env.reset()
        step = 0
        done = False
        total_rewards = 0
        #print("****************************************************")
        #print("EPISODE ", episode)

        for step in range(max_steps):
            # UNCOMMENT IT IF YOU WANT TO SEE OUR AGENT PLAYING
            #env.render()
            # Take random action
            action = env.action_space.sample()
            new_state, reward, done, info = env.step(action)

            total_rewards += reward

            if done:
                rewards.append(total_rewards)
                #print ("Score", total_rewards)
                break
            state = new_state
    print ("Score over time with random agent: " +  str(sum(rewards)/total_test_episodes))





def random_agent_example():
    #random agent
    for t in range(1):
        env.render()
        #get random action
        action = env.action_space.sample()
        print("action: {}".format(action))
        observation, reward, done, info = env.step(action)
        print("observation:", observation, " reward:", reward, " done:", done, "info:", info)
        #print(observation, reward, done, info)
        if done:
                print("Episode finished after {} timesteps".format(t+1))
                break


# learn()  
# play_with_q_table()
# play_with_random_agent()
                
                
                
# env.close()




Action space: Discrete(6)
Observation space: Discrete(500)
Action size  6
State size  500
[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [48]:
print(epsilon)
learn()  
play_with_q_table()
play_with_random_agent()
                
                
                
env.close()


1.0
Score over time with q table: 8.4
Score over time with random agent: -1.33


In [5]:
dir(env)


['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_elapsed_steps',
 '_max_episode_steps',
 'action_space',
 'class_name',
 'close',
 'compute_reward',
 'env',
 'metadata',
 'observation_space',
 'render',
 'reset',
 'reward_range',
 'seed',
 'spec',
 'step',
 'unwrapped']

action = env.action_space.sample()
env.action_space.to_jsonable(4)