# Importing libraries

In [13]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output
from tqdm import tqdm

In [14]:
env = gym.make("FrozenLake-v0")           # FrozenLake environment (the default, smaller map)

# Alternative: uncomment to train on the 8x8 map instead.
#env = gym.make("FrozenLake8x8-v0")

  result = entry_point.load(False)


# Our Q Learning Agent Code

In [15]:
# Q-Learning agent here
class QLearningAgent():
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    Usage: call ``agent_init`` once, then for every episode call
    ``agent_start`` with the first state, ``agent_step`` after each
    non-terminal transition and ``agent_end`` after the terminal one.
    """

    def agent_init(self, agent_init_info):
        """Setup for the agent called when the experiment first starts.

        Args:
        agent_init_info (dict), the parameters used to initialize the agent. The dictionary contains:
        {
            num_states (int): The number of states,
            num_actions (int): The number of actions,
            epsilon (float): The epsilon parameter for exploration,
            step_size (float): The step-size,
            discount (float): The discount factor,
            seed (int, optional): Seed for the agent's random generator;
                when absent the generator is seeded non-deterministically,
        }

        """
        # Store the parameters provided in agent_init_info.
        self.num_actions = agent_init_info["num_actions"]
        self.num_states = agent_init_info["num_states"]
        self.epsilon = agent_init_info["epsilon"]
        self.step_size = agent_init_info["step_size"]
        self.discount = agent_init_info["discount"]
        # Take the seed from the argument rather than from a notebook-level
        # global, so the agent works with whatever dict the caller passes in.
        self.rand_generator = np.random.RandomState(agent_init_info.get("seed"))

        # Action-value estimates, one row per state and one column per action,
        # initialised to zero.
        self.q = np.zeros((self.num_states, self.num_actions))

    def _select_action(self, state):
        """Pick an action for `state` with the epsilon-greedy rule.

        Args:
            state (int): index of the state to act in.
        Returns:
            action (int): a uniformly random action with probability epsilon,
                otherwise a greedy action (ties broken at random).
        """
        current_q = self.q[state, :]
        if self.rand_generator.rand() < self.epsilon:
            return self.rand_generator.randint(self.num_actions)
        return self.argmax(current_q)

    def agent_start(self, state):
        """The first method called when the episode starts, called after
        the environment starts.
        Args:
            state (int): the first state of the episode, as returned by
                the environment.
        Returns:
            action (int): the first action the agent takes.
        """
        action = self._select_action(state)
        # Remember the pair so the next call can update its value estimate.
        self.prev_state = state
        self.prev_action = action
        return action

    def agent_step(self, reward, state):
        """A step taken by the agent.
        Args:
            reward (float): the reward received for taking the last action taken
            state (int): the state from the
                environment's step based on where the agent ended up after the
                last step.
        Returns:
            action (int): the action the agent is taking.
        """
        # The next action is chosen before the update, from the current estimates.
        action = self._select_action(state)

        # Q-learning update: move Q(s, a) towards r + discount * max_a' Q(s', a').
        previous_estimate = self.q[self.prev_state, self.prev_action]
        target = reward + self.discount * np.max(self.q[state, :])
        self.q[self.prev_state, self.prev_action] = (
            previous_estimate + self.step_size * (target - previous_estimate)
        )

        self.prev_state = state
        self.prev_action = action
        return action

    def agent_end(self, reward):
        """Run when the agent terminates.
        Args:
            reward (float): the reward the agent received for entering the
                terminal state.
        """
        # Terminal update: there is no successor state, so the target is
        # the reward alone.
        previous_estimate = self.q[self.prev_state, self.prev_action]
        self.q[self.prev_state, self.prev_action] = (
            previous_estimate + self.step_size * (reward - previous_estimate)
        )

    def argmax(self, q_values):
        """argmax with random tie-breaking
        Args:
            q_values (Numpy array): the array of action-values
        Returns:
            action (int): an action with the highest value
        """
        top = float("-inf")
        ties = []

        for i in range(len(q_values)):
            # A strictly better value discards every tie collected so far.
            if q_values[i] > top:
                top = q_values[i]
                ties = []

            if q_values[i] == top:
                ties.append(i)

        return self.rand_generator.choice(ties)

In [16]:
# Problem dimensions reported by the environment.
state_space_size = env.observation_space.n            # number of states
action_space_size = env.action_space.n                # number of actions in each state

# Everything the agent needs: table dimensions plus its learning hyper-parameters.
agent_info = {
    "num_actions": action_space_size,
    "num_states": state_space_size,
    "epsilon": 0.1,
    "step_size": 0.01,
    "discount": 1,
    "seed": 0,
}

# Build the Q-learning agent and initialise it with the settings above.
current_agent = QLearningAgent()
current_agent.agent_init(agent_info)

# Training budget.
num_episodes = 70000
max_steps_per_episode = 5000

P.S. See my other project on GitHub, in which I use an Expected SARSA agent.

In [17]:
# Train the Q-learning agent on the OpenAI Gym environment.
rewards_all_episodes = []          # total reward collected in each episode
for episode in tqdm(range(num_episodes)):
    state = env.reset()
    rewards_current_episode = 0

    action = current_agent.agent_start(state)
    # Exactly one environment step per iteration, so an episode never exceeds
    # max_steps_per_episode steps and the terminal update cannot be skipped
    # when the episode ends on the last allowed step.
    for step in range(max_steps_per_episode):
        state, reward, done, info = env.step(action)
        rewards_current_episode += reward
        if done:
            # Terminal transition: update without bootstrapping, then stop.
            current_agent.agent_end(reward)
            break
        # Non-terminal transition: update and pick the next action.
        action = current_agent.agent_step(reward, state)
    rewards_all_episodes.append(rewards_current_episode)

100%|███████████████████████████████████████████████████████████████████████████| 70000/70000 [02:11<00:00, 532.14it/s]


In [18]:
# Calculate and print the average reward per block of 1000 episodes
episodes_per_block = 1000
rewards_array = np.array(rewards_all_episodes)
# Slice rather than np.split: this needs no float section count and still
# works when the number of episodes is not a multiple of the block size
# (the final block is then simply shorter).
rewards_per_thousand_episodes = [
    rewards_array[start:start + episodes_per_block]
    for start in range(0, len(rewards_array), episodes_per_block)
]
count = 0

print("********Average reward per thousand episodes********\n")
for r in rewards_per_thousand_episodes:
    count += len(r)          # label each row with the last episode it covers
    # Mean over the block's real length, so a short final block is not
    # under-reported by dividing by a fixed 1000.
    print(count, ": ", str(r.mean()))

********Average reward per thousand episodes********

1000 :  0.014000000000000005
2000 :  0.017000000000000008
3000 :  0.017000000000000008
4000 :  0.016000000000000007
5000 :  0.02000000000000001
6000 :  0.016000000000000007
7000 :  0.035000000000000024
8000 :  0.02900000000000002
9000 :  0.07200000000000005
10000 :  0.07600000000000005
11000 :  0.06200000000000005
12000 :  0.06800000000000005
13000 :  0.046000000000000034
14000 :  0.06600000000000004
15000 :  0.06600000000000004
16000 :  0.07300000000000005
17000 :  0.08900000000000007
18000 :  0.060000000000000046
19000 :  0.06700000000000005
20000 :  0.08000000000000006
21000 :  0.10200000000000008
22000 :  0.10000000000000007
23000 :  0.08900000000000007
24000 :  0.08300000000000006
25000 :  0.08600000000000006
26000 :  0.10800000000000008
27000 :  0.10100000000000008
28000 :  0.08800000000000006
29000 :  0.09600000000000007
30000 :  0.12000000000000009
31000 :  0.12800000000000009
32000 :  0.11900000000000009
33000 :  0.12100000

In [19]:
# Show the learned action-value table (one row per state, one column per action)
print("\n\n********Q-table********\n", current_agent.q, sep="\n")



********Q-table********

[[0.76131572 0.72154676 0.71523068 0.727399  ]
 [0.47640302 0.43534265 0.39271261 0.6534174 ]
 [0.53944246 0.40335159 0.36540187 0.45228239]
 [0.06581023 0.2274506  0.04938301 0.09341621]
 [0.7626714  0.50777255 0.50179235 0.51212646]
 [0.         0.         0.         0.        ]
 [0.350699   0.21193628 0.41917845 0.15153827]
 [0.         0.         0.         0.        ]
 [0.51291181 0.50556576 0.46625105 0.76656245]
 [0.53596611 0.77567405 0.48044763 0.4922064 ]
 [0.71502504 0.54315217 0.42706361 0.36887915]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.50839219 0.55173777 0.83831227 0.54305716]
 [0.83800002 0.91975289 0.87921099 0.86324902]
 [0.         0.         0.         0.        ]]


# Behaving Using learned Target Policy

In [20]:
for episode in range(3):
    # Start a fresh episode.
    state = env.reset()
    done = False
    print("*****EPISODE ", episode+1, "*****\n\n\n\n")
    time.sleep(1)

    for step in range(max_steps_per_episode):
        # Redraw the current state of the environment on screen.
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)

        # Act greedily: take the action with the highest Q-value for this state.
        action = np.argmax(current_agent.q[state,:])
        new_state, reward, done, info = env.step(action)
        state = new_state

        if not done:
            continue

        # Episode over: show the final frame and report the outcome.
        clear_output(wait=True)
        env.render()
        reached_goal = reward == 1
        if reached_goal:
            print("****You reached the goal!****")
        else:
            print("****You fell through a hole!****")
        time.sleep(3)
        if not reached_goal:
            clear_output(wait=True)
        break
env.close()

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
****You reached the goal!****


# Checking in how many episodes our agent reaches the goal when following the target policy learned with Q-learning

In [21]:
# Tally how the greedy policy fares over many episodes.
win = 0     # episodes that finished with reward 1
lose = 0    # episodes that finished with any other reward
for episode in range(10000):
    # Start a fresh episode.
    state = env.reset()
    done = False

    for step in range(max_steps_per_episode):
        # Act greedily with respect to the learned Q-table.
        action = np.argmax(current_agent.q[state,:])
        new_state, reward, done, info = env.step(action)
        state = new_state

        if not done:
            continue

        # Episode over: record the outcome and move on to the next episode.
        if reward == 1:
            win += 1
        else:
            lose += 1
        break
env.close()
print("win",win,"Lose",lose)

win 7323 Lose 2677
