In [1]:
from gym import Env
from gym.spaces import Discrete, Box
import gym
import numpy as np
import random
import time
from IPython.display import clear_output
from matplotlib import pyplot as plt

In [8]:
#create environment where the observation space is a 5 elements list
#action space is 2 discrete, 0 leads to cell 0 in the list with reward +2, and 1 leads to one step forward. When we reach
#the 5th cell in the list we get +10 reward
#landing on any other state leads to reward=0
#each episode ends after 60 steps
#applying action 0 in state 0 leads to staying in state 0
#applying action 1 in state 5 leads to staying in state 1
class DarkDungeon(Env):
    
    def __init__(self):
        self.observation_space=Discrete(5)
        self.action_space=Discrete(2)
        self.state = self.observation_space.sample()
        self.steps=60
        
    
    def step(self, action):
        reward=0
        if action==0:
            new_state = 0
            reward += 2
        else:
            new_state = self.state+1
            
        self.steps -= 1
        
            
        if new_state>=4:
            reward += 10
            new_state=4
            
        if self.steps==0:
            done=True
        else:
            done=False
            
        info={}
        return new_state, reward, done, info
    
    def render(self):
        pass
    
    def reset(self):
        self.state= self.observation_space.sample()
        self.steps=60
        return self.state
        

In [9]:
#create q table containing 0 values
env = DarkDungeon()
env.reset()
env.render()

In [10]:
q_table = np.zeros((env.observation_space.n, env.action_space.n))

In [11]:
num_episodes=10000
epsilon=1
gamma = 0.95
learning_rate = 0.1

In [12]:
#Q learning
rewards= 0
rewardsAcrossEpisodes = []
#go through episodes
#at each episode we reset the environment, and don't forget to set done to False
for episode in range(num_episodes):
    state = env.reset()
    done = False
    r = 0
    
    while(True):
        #apply epsilon-greedy strategy
        num = random.uniform(0,1)
        if num < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state,:])
        #apply the chosen action to the environment
        new_state, reward, done, info = env.step(action)
        #update the value of the q_table according to: old_q = old_q + learning_rate*(reward + discount_factor*max(new_q) - old_q)
        q_table[state, action] += learning_rate*(reward + gamma * np.max(q_table[new_state, :]) - q_table[state, action])
        
        #update state to become next state
        #update reward
        state = new_state
        r += reward
        rewards += reward
        
        if done:
            rewardsAcrossEpisodes.append(r)
            epsilon = epsilon*0.9
            break
            

In [7]:
#learning_rate = 0.1
#epsilon=0.017
q_table

array([[ 40.        ,  39.92172721],
       [ 40.        ,  29.16199156],
       [ 40.        ,  22.89602341],
       [ 40.        ,  19.855799  ],
       [ 54.15349765, 200.        ]])

In [66]:
#epsilon=1
#decay *0.8
q_table

array([[ 200.        ,  199.92364052],
       [ 200.        ,  161.14544863],
       [ 200.        ,  169.69356421],
       [ 200.        ,  197.8396305 ],
       [ 581.95146909, 1000.        ]])

In [215]:
#epsilon=1
#decay *0.9
q_table

array([[ 40.        ,  39.94262726],
       [ 40.        ,  29.70858818],
       [ 40.        ,  14.98555348],
       [ 41.88458789, 200.        ],
       [ 79.34358404, 200.        ]])

In [82]:
#learning_rate = 0.1
#epsilon=1 decay 0.99
q_table

array([[ 40.        ,  39.66091185],
       [ 40.        ,  39.89858087],
       [ 40.        ,  39.89914819],
       [ 40.35017992, 200.        ],
       [ 61.00768426, 200.        ]])

In [72]:
#epsilon=1
#decay *0.99
q_table

array([[ 200.        ,  199.76342597],
       [ 200.        ,  199.51399788],
       [ 200.        ,  199.5170842 ],
       [ 200.40249308, 1000.        ],
       [ 554.51272084, 1000.        ]])

In [67]:
#extract optimal policy
optimal_policy = []
for state in range(env.observation_space.n):
    optimal_policy.append(np.argmax(q_table[state, :]))
    
optimal_policy

[0, 0, 0, 0, 1]