### OpenAI Taxi-V2 Player via Q-Learning

#### Load Environment

In [12]:
# IMPORT MODULES
# Standard library first, then the third-party packages (Gym, NumPy)
import random

import gym
import numpy as np
print('Import Modules')

Import Modules


In [31]:
# CREATE ENVIRONMENT
# Build the Taxi-V2 environment and draw its starting grid.
Env = gym.make("Taxi-v2")
Env.render()

# Legend for the rendered grid plus the reward scheme, emitted as a single
# newline-joined print so the text output matches one print per line.
print('\n'.join([
    'Load Taxi-V2 Environment',
    'In this Environment the Yellow Square represents the Taxi, the (“|”) represents a Wall.',
    'The Blue Letter represents the Pick-Up Location, and the Purple letter is the Drop-Off Location.',
    'The Taxi will turn Green when it has a Passenger Aboard.',
    'The Environment gives a -1 Reward for each Step in order for the Agent to try and find the quickest solution.',
    'The Environment gives a -10 Reward if Agent incorrectly Picks Up or Drops Off a Passenger.',
    'The Environment gives a 20 Reward on Success',
]))

+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | :[43m [0m|
|[35mY[0m| : |[34;1mB[0m: |
+---------+

Load Taxi-V2 Environment
In this Environment the Yellow Square represents the Taxi, the (“|”) represents a Wall.
The Blue Letter represents the Pick-Up Location, and the Purple letter is the Drop-Off Location.
The Taxi will turn Green when it has a Passenger Aboard.
The Environment gives a -1 Reward for each Step in order for the Agent to try and find the quickest solution.
The Environment gives a -10 Reward if Agent incorrectly Picks Up or Drops Off a Passenger.
The Environment gives a 20 Reward on Success


In [15]:
# LOAD ENVIRONMENT
# Explore Environment: read the number of discrete actions and states.
# These two sizes are the dimensions used to build the Q-Table.
ActionSize=Env.action_space.n
StateSize=Env.observation_space.n

# Build each message as a single string: print("label", value) is rendered
# as a tuple, e.g. ('Action Size ', 6), when the kernel runs Python 2.
print("Action Size "+str(ActionSize))
print("State Size "+str(StateSize))

('Action Size ', 6)
('State Size ', 500)


#### Initialization

In [16]:
# INITIALIZATION
# Q-Table: one row per state, one column per action, every entry starting at zero
QTable = np.zeros(shape=(StateSize, ActionSize))
print(QTable)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [23]:
# INITIALIZATION
# Episode budget
# Number of training episodes
TotalEpisodes = 50000
# Number of evaluation episodes played with the learned table
TotalTestEpisodes = 1
# Upper bound on steps within a single episode
MaxSteps = 99

# Bellman update
# Learning rate applied to each temporal-difference correction
LearningRate = 0.7
# Discount factor on the best next-state value
Gamma = 0.618

# Epsilon-greedy exploration schedule
# Exploration probability at the start of training
MaxEpsilon = 1.0
# Floor that the exploration probability decays towards
MinEpsilon = 0.01
# Exponential decay rate of the exploration probability
DecayRate = 0.01
# Current exploration rate; begins at the maximum
Epsilon = MaxEpsilon
print('Add Hyper-Parameters')

Add Hyper-Parameters


#### Q-Learning

In [22]:
# Q-LEARNING
# Tabular Q-Learning with an epsilon-greedy behaviour policy.
# Reads Env, QTable and the hyper-parameters defined in the earlier cells and
# updates QTable in place; the final table is printed at the end.
for Episode in range(TotalEpisodes):
    # Derive Epsilon from the episode index instead of carrying it over from
    # the previous iteration (or a previous run of this cell). Re-running the
    # cell therefore restarts exploration at MaxEpsilon rather than at whatever
    # decayed value was left in the kernel. The schedule is the same
    # exponential decay: MaxEpsilon at episode 0, approaching MinEpsilon.
    Epsilon=MinEpsilon+(MaxEpsilon-MinEpsilon)*np.exp(-DecayRate*Episode)

    # Reset the Environment
    State=Env.reset()
    Done=False

    # Perform Temporal Difference Learning for each Step
    for Step in range(MaxSteps):
        # Choose an Action (A) in Current World State (S):
        # draw a number in [0, 1]; above Epsilon -> exploit, otherwise explore
        ExploreExploitTradeoff=random.uniform(0,1)

        if ExploreExploitTradeoff > Epsilon:
            # Exploitation (Take the Biggest Q-Value for this State)
            Action=np.argmax(QTable[State,:])
        else:
            # Exploration (Perform a Random Action)
            Action=Env.action_space.sample()

        # Take the Action (A) and Observe the Outcome State (S') and Reward (R)
        NewState,Reward,Done,Info=Env.step(Action)

        # Update Q(S,A):= Q(S,A) + Learning Rate * [R(S,A) + Gamma * Max Q(S',A') - Q(S,A)]
        QTable[State,Action]=QTable[State,Action]+LearningRate*(Reward+Gamma*np.max(QTable[NewState,:])-QTable[State,Action])

        # Update State
        State=NewState

        # Stop as soon as the Environment reports the Episode is Finished
        if Done:
            break

# Print Final Q-Table
print(QTable)

[[  0.           0.           0.           0.           0.
    0.        ]
 [ -1.89494435  -1.44813002  -1.89494435  -1.44813002  -0.72512948
  -10.44813001]
 [ -0.72512972   0.4447743   -0.72516003   0.44477417   2.33782249
   -8.55522572]
 ...
 [  1.04882142   5.40100727   1.01684373  -2.11715326 -10.50803626
   -6.68950693]
 [ -1.94725029  -1.54576149  -2.47610015  -1.44813002 -10.88646532
   -7.        ]
 [ 12.65461466  -0.7         16.66099906  31.35602094   0.
    6.56461466]]


#### Test

In [25]:
# TEST
# Test the Q-Learning via Playing: follow the greedy policy from QTable
# (no exploration, no learning) and report the score of every episode.
Rewards=[]

# Run Player for each Episode
for Episode in range(TotalTestEpisodes):
    State=Env.reset()
    Done=False
    TotalRewards=0
    print("")
    # Single string so the line is not rendered as a tuple under Python 2
    print("Episode "+str(Episode))
    for Step in range(MaxSteps):
        Env.render()
        # Take the Action (A) that have the Maximum Expected Future Reward given that State
        Action=np.argmax(QTable[State,:])

        # Apply the Action and Observe the Outcome
        NewState,Reward,Done,Info=Env.step(Action)

        # Update Rewards
        TotalRewards+=Reward

        # Check if Task Completed
        if Done:
            break

        # Update State
        State=NewState

    # Record the score of every episode, including ones that ran out of
    # MaxSteps without finishing; otherwise such episodes would be missing
    # from Rewards and the average below would be overstated.
    Rewards.append(TotalRewards)
    print("Score "+str(TotalRewards))

# Close Environment
Env.close()

# Average over the episodes actually played. float() avoids integer floor
# division under Python 2, and the guard avoids dividing by zero when
# TotalTestEpisodes is 0.
AverageScore=sum(Rewards)/float(len(Rewards)) if Rewards else 0.0
print("Score Over Time: "+str(AverageScore))


('Episode ', 0)
+---------+
|R: | : :[34;1mG[0m|
|[43m [0m: : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+

+---------+
|R: | : :[34;1mG[0m|
| :[43m [0m: : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (East)
+---------+
|R: | : :[34;1mG[0m|
| : :[43m [0m: : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (East)
+---------+
|R: | : :[34;1mG[0m|
| : : :[43m [0m: |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (East)
+---------+
|R: | :[43m [0m:[34;1mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :[34;1m[43mG[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (East)
+---------+
|R: | : :[42mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (Pickup)
+---------+
|R: | :[42m_[0m:G|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (West)
+---------+
|R: |[42m_[0m: :G|
|