### Frozen-Lake Player

#### Load Environment

In [2]:
# IMPORT MODULES
# Import Numpy, Gym etc
import numpy as np
import gym
import random
print('Import Modules')

Import Modules


In [32]:
# LOAD ENVIRONMENT
# Load Environment from OpenAI Gym and Render 
print('Load Environment')
Env=gym.make('FrozenLake-v0')
Env.render()
print('')
print('Note that there is a Wind which occasionally blows the Agent onto a State they didn’t choose.')
print('Hence, Perfect Performance every single time is impossible.')
print('However, Learning to avoid the Holes and reach the Goal is certainly still doable.')
print('The Reward at every Step is 0, except for entering the Goal, which provides a Reward of 1.')

Load Environment

[41mS[0mFFF
FHFH
FFFH
HFFG

Note that there is a Wind which occasionally blows the Agent onto a State they didn’t choose.
Hence, Perfect Performance every single time is impossible.
However, Learning to avoid the Holes and reach the Goal is certainly still doable.
The Reward at every Step is 0, except for entering the Goal, which provides a Reward of 1.


In [33]:
# LOAD ENVIRONMENT
# Explore Environment
ActionSize=Env.action_space.n
print("Action Size ",ActionSize)
StateSize=Env.observation_space.n
print("State Size ",StateSize)

('Action Size ', 4)
('State Size ', 16)


#### Initialization

In [34]:
# INITIALIZATION
# Initialize Q-Table
QTable=np.zeros((StateSize,ActionSize))
print(QTable)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [37]:
# INITIALIZATION
# Add Hyper-Parameters Episodes
TotalEpisodes=5000         # Total Episodes
TotalTestEpisodes=100      # Total Test Episodes
MaxSteps=99                # Max Steps per Episode

# Add Hyper-Parameters Bellman Equation
LearningRate=0.8           # Learning Rate
Gamma=0.95                 # Discounting Rate

# Add Exploration Parameters
Epsilon=0.0                # Exploration rate
MaxEpsilon=1.0             # Exploration probability at start
MinEpsilon=0.01            # Minimum exploration probability 
DecayRate=0.01             # Exponential decay rate for exploration prob
print('Add Hyper-Parameters')

Add Hyper-Parameters


#### Q-Learning

In [36]:
# Q-LEARNING
# Perform Learning for each Episode
for Episode in range(TotalEpisodes):
    # Reset the Environment
    State=Env.reset()
    Step=0
    Done=False
    
    # Perform Temporal Difference Learning for each Step
    for Step in range(MaxSteps):
        # Choose an Action (A) in Current World State (S)
        Action=np.argmax(QTable[State,:]+np.random.randn(1,Env.action_space.n)*(1./(Episode+1)))
        
        # Take the Action (A) and Observe the Outcome State(S') and Reward (R)
        NewState,Reward,Done,_=Env.step(Action)

        # Update Q(S,A):= Q(S,A) + Learning Rate * [R(S,A) + Gamma * Max Q(S',A') - Q(S,A)]
        QTable[State,Action]=QTable[State,Action]+LearningRate*(Reward+Gamma*np.max(QTable[NewState,:])-QTable[State,Action])
                
        # Update State
        State=NewState
        
        # Check if Episode is Finished
        if Done==True: 
            break
            
    # Increment Episode    
    Episode+=1
    
    # Reduce Epsilon (We Need Less and Less Exploration after each Episode)
    Epsilon=MinEpsilon+(MaxEpsilon-MinEpsilon)*np.exp(-DecayRate*Episode)

# Print Final Q-Table
print(QTable)

[[5.19948688e-01 8.23319735e-03 1.79867399e-02 7.82866567e-03]
 [4.60763320e-05 4.69635942e-05 1.84485150e-04 2.58371205e-01]
 [2.47662960e-03 3.59950291e-03 5.49790126e-03 2.03679434e-01]
 [1.37698027e-03 5.21093765e-05 8.40868385e-06 8.14831456e-02]
 [4.15984748e-01 6.90482617e-04 3.15055572e-03 1.08535089e-03]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.40550556e-02 4.68121415e-06 1.27882882e-05 3.24711799e-07]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [2.92806460e-03 5.71251977e-05 1.15993774e-03 4.37512129e-01]
 [2.02421131e-03 5.08271963e-01 1.43972753e-03 0.00000000e+00]
 [8.22284701e-01 0.00000000e+00 0.00000000e+00 1.28870879e-04]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [4.51033598e-03 2.23150357e-03 6.48571670e-01 5.48181690e-04]
 [0.00000000e+00 9.35324886e-01 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.000000

#### Test

In [40]:
# TEST
# Test the Q-Learning via Playing
Env.reset()
Rewards=[]

# Run Player for each Episode
for Episode in range(TotalTestEpisodes):
    State=Env.reset()
    Step=0
    Done=False
    TotalRewards=0
    print("")
    print("Episode ",Episode)
    for Step in range(MaxSteps):
        # Take the Action (A) that have the Maximum Expected Future Reward given that State
        Action=np.argmax(QTable[State,:])
        
        # Update State
        NewState,Reward,Done,Info=Env.step(Action)
        
        # Update Rewards
        TotalRewards +=Reward
        
        # Check if Task Completed
        if Done:
            Rewards.append(TotalRewards)
            print("Score ",TotalRewards)
            print("Steps ", Step)
            print("You Win!")
            break
        
        # Update State
        State=NewState
        
# Close Environment        
Env.close()
print("Percentage Wins: "+str(sum(Rewards)*100/TotalTestEpisodes))


('Episode ', 0)
('Score ', 1.0)
('Steps ', 45)
You Win!

('Episode ', 1)
('Score ', 0.0)
('Steps ', 45)
You Win!

('Episode ', 2)

('Episode ', 3)
('Score ', 1.0)
('Steps ', 9)
You Win!

('Episode ', 4)
('Score ', 1.0)
('Steps ', 17)
You Win!

('Episode ', 5)

('Episode ', 6)
('Score ', 1.0)
('Steps ', 36)
You Win!

('Episode ', 7)
('Score ', 0.0)
('Steps ', 18)
You Win!

('Episode ', 8)
('Score ', 1.0)
('Steps ', 52)
You Win!

('Episode ', 9)
('Score ', 1.0)
('Steps ', 36)
You Win!

('Episode ', 10)
('Score ', 1.0)
('Steps ', 26)
You Win!

('Episode ', 11)
('Score ', 1.0)
('Steps ', 29)
You Win!

('Episode ', 12)
('Score ', 0.0)
('Steps ', 17)
You Win!

('Episode ', 13)
('Score ', 1.0)
('Steps ', 79)
You Win!

('Episode ', 14)

('Episode ', 15)
('Score ', 1.0)
('Steps ', 17)
You Win!

('Episode ', 16)
('Score ', 1.0)
('Steps ', 16)
You Win!

('Episode ', 17)
('Score ', 1.0)
('Steps ', 77)
You Win!

('Episode ', 18)
('Score ', 0.0)
('Steps ', 60)
You Win!

('Episode ', 19)
('Score ', 