# Importing libraries

In [156]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output
import agent
from tqdm import tqdm

In [157]:
# Build the environment once; the `env` object created here is reused by the
# training, demo and evaluation cells further down.
env = gym.make("FrozenLake-v0")           # Frozen Lake environment

# Alternative: uncomment the line below to use the 8x8 environment instead.
#env = gym.make("FrozenLake8x8-v0")

# Our Q Learning Agent Code

In [158]:
# Q-Learning agent here
class QLearningAgent(agent.BaseAgent):
    """Tabular Q-learning agent that behaves epsilon-greedily.

    Action values live in ``self.q``, an array of shape
    ``(num_states, num_actions)``. The previous state/action pair is
    remembered between calls so each new observation can be used to update
    the value of the transition that led to it.
    """

    def agent_init(self, agent_init_info):
        """Setup for the agent called when the experiment first starts.

        Args:
        agent_init_info (dict), the parameters used to initialize the agent. The dictionary contains:
        {
            num_states (int): The number of states,
            num_actions (int): The number of actions,
            epsilon (float): The epsilon parameter for exploration,
            step_size (float): The step-size,
            discount (float): The discount factor,
            seed (int, optional): Seed for the agent's random generator.
                When absent, NumPy seeds the generator itself.
        }

        """
        # Store the parameters provided in agent_init_info.
        self.num_actions = agent_init_info["num_actions"]
        self.num_states = agent_init_info["num_states"]
        self.epsilon = agent_init_info["epsilon"]
        self.step_size = agent_init_info["step_size"]
        self.discount = agent_init_info["discount"]
        # Read the seed from the argument, not from a notebook-level global,
        # so the agent works with whatever dictionary it is actually given.
        self.rand_generator = np.random.RandomState(agent_init_info.get("seed"))

        # Create an array for action-value estimates and initialize it to zero.
        self.q = np.zeros((self.num_states, self.num_actions))

    def _epsilon_greedy(self, state):
        """Pick an action for ``state``: random with probability epsilon, else greedy."""
        if self.rand_generator.rand() < self.epsilon:
            return self.rand_generator.randint(self.num_actions)
        return self.argmax(self.q[state, :])

    def agent_start(self, state):
        """The first method called when the episode starts, called after
        the environment starts.
        Args:
            state (int): the first state of the episode, as provided by the
                environment.
        Returns:
            action (int): the first action the agent takes.
        """
        action = self._epsilon_greedy(state)
        self.prev_state = state
        self.prev_action = action
        return action

    def agent_step(self, reward, state):
        """A step taken by the agent.
        Args:
            reward (float): the reward received for taking the last action taken
            state (int): the state from the
                environment's step based on where the agent ended up after the
                last step.
        Returns:
            action (int): the action the agent is taking.
        """
        # The behaviour action is chosen before the table is modified, so the
        # choice is based on the values as they were when `state` was reached.
        action = self._epsilon_greedy(state)

        # Q-learning update: bootstrap from the best action value in `state`,
        # regardless of which action the behaviour policy just selected.
        td_target = reward + (self.discount * np.max(self.q[state, :]))
        td_error = td_target - self.q[self.prev_state, self.prev_action]
        self.q[self.prev_state, self.prev_action] += self.step_size * td_error

        self.prev_state = state
        self.prev_action = action
        return action

    def agent_end(self, reward):
        """Run when the agent terminates.
        Args:
            reward (float): the reward the agent received for entering the
                terminal state.
        """
        # No successor state to bootstrap from: the target is the reward alone.
        td_error = reward - self.q[self.prev_state, self.prev_action]
        self.q[self.prev_state, self.prev_action] += self.step_size * td_error

    def argmax(self, q_values):
        """argmax with random tie-breaking
        Args:
            q_values (Numpy array): the array of action-values
        Returns:
            action (int): an action with the highest value
        """
        q_values = np.asarray(q_values)
        # Indices of every entry equal to the maximum; one is drawn uniformly
        # so that ties are not always resolved in favour of the lowest index.
        ties = np.flatnonzero(q_values == np.max(q_values))
        return self.rand_generator.choice(ties)

In [161]:
# Table dimensions come straight from the environment's discrete spaces.
state_space_size = env.observation_space.n      # number of states
action_space_size = env.action_space.n          # number of actions in each state

# Everything the agent needs at initialisation: table sizes, exploration rate,
# learning rate, discount factor and the seed for its random generator.
agent_info = {
    "num_actions": action_space_size,
    "num_states": state_space_size,
    "epsilon": 0.1,
    "step_size": 0.01,
    "discount": 1,
    "seed": 0,
}

# Instantiate the Q-learning agent and hand it the configuration.
current_agent = QLearningAgent()
current_agent.agent_init(agent_info)

# Training budget: number of episodes and a cap on the steps inside each one.
num_episodes = 70000
max_steps_per_episode = 5000

P.S. See my other project on GitHub, in which I use an Expected Sarsa agent.

In [162]:
# Train the Q-learning agent against the Gym environment.
# Each episode's total reward is collected in rewards_all_episodes.
rewards_all_episodes = []
for episode in tqdm(range(num_episodes)):
    state = env.reset()

    # Opening transition: agent_start picks the first action of the episode.
    action = current_agent.agent_start(state)
    state, reward, done, info = env.step(action)
    rewards_current_episode = 0
    rewards_current_episode += reward

    for step in range(max_steps_per_episode):
        if done:
            # Episode finished: let the agent apply its final update.
            current_agent.agent_end(reward)
            break

        # Feed back the last reward/observation and take the next action.
        action = current_agent.agent_step(reward, state)
        state, reward, done, info = env.step(action)
        rewards_current_episode += reward

    rewards_all_episodes.append(rewards_current_episode)

100%|███████████████████████████████████████████████████████████████████████████| 70000/70000 [03:03<00:00, 381.75it/s]


In [163]:
# Calculate and print the average reward per thousand episodes.
# The rewards are sliced into consecutive chunks with integer indices, so this
# also works when the number of episodes is not a multiple of the chunk size
# (the last chunk is then shorter and is averaged over its actual length).
episodes_per_chunk = 1000
rewards_array = np.array(rewards_all_episodes)
rewards_per_thousand_episodes = [
    rewards_array[start:start + episodes_per_chunk]
    for start in range(0, len(rewards_array), episodes_per_chunk)
]

print("********Average reward per thousand episodes********\n")
count = 0
for r in rewards_per_thousand_episodes:
    count += len(r)  # label each row with the number of episodes seen so far
    print(count, ": ", str(r.mean()))

********Average reward per thousand episodes********

1000 :  0.05100000000000004
2000 :  0.058000000000000045
3000 :  0.04400000000000003
4000 :  0.05600000000000004
5000 :  0.047000000000000035
6000 :  0.04300000000000003
7000 :  0.057000000000000044
8000 :  0.08700000000000006
9000 :  0.09300000000000007
10000 :  0.07100000000000005
11000 :  0.11500000000000009
12000 :  0.11000000000000008
13000 :  0.1560000000000001
14000 :  0.20600000000000016
15000 :  0.23100000000000018
16000 :  0.2970000000000002
17000 :  0.3140000000000002
18000 :  0.32400000000000023
19000 :  0.3180000000000002
20000 :  0.33100000000000024
21000 :  0.4100000000000003
22000 :  0.4280000000000003
23000 :  0.4160000000000003
24000 :  0.3800000000000003
25000 :  0.3930000000000003
26000 :  0.4040000000000003
27000 :  0.4140000000000003
28000 :  0.4110000000000003
29000 :  0.4030000000000003
30000 :  0.4030000000000003
31000 :  0.3810000000000003
32000 :  0.3980000000000003
33000 :  0.4120000000000003
34000 :  0.4

In [164]:
# Print the learned action-value table held by the agent
# (one row per state, one column per action).
print("\n\n********Q-table********\n")
print(current_agent.q)



********Q-table********

[[0.40987954 0.38082047 0.38311356 0.38409398]
 [0.23310797 0.22937261 0.21953898 0.35671729]
 [0.29617451 0.28467442 0.27107701 0.30935657]
 [0.18684501 0.18040792 0.15595081 0.28666116]
 [0.42585448 0.30192443 0.28240295 0.25949214]
 [0.         0.         0.         0.        ]
 [0.23414336 0.13958608 0.26565235 0.10661849]
 [0.         0.         0.         0.        ]
 [0.3124551  0.31600368 0.31108977 0.46202819]
 [0.32874337 0.54247702 0.37009162 0.31296692]
 [0.56233744 0.4004389  0.36393346 0.25611765]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.37873504 0.51490529 0.64749893 0.47313953]
 [0.66742334 0.80544676 0.7751423  0.70475864]
 [0.         0.         0.         0.        ]]


# Behaving Using learned Target Policy

In [165]:
# Watch the greedy policy read off the learned Q-table play three episodes.
for episode in range(3):
    # initialize new episode params
    state = env.reset()
    done = False
    print("*****EPISODE ", episode+1, "*****\n\n\n\n")
    time.sleep(1)

    for step in range(max_steps_per_episode):
        # Redraw the current board in place of the previous frame.
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)

        # Act greedily: highest Q-value for the current state, no exploration.
        action = np.argmax(current_agent.q[state, :])
        state, reward, done, info = env.step(action)

        if done:
            # Show the final board together with the outcome message.
            clear_output(wait=True)
            env.render()
            if reward == 1:
                print("****You reached the goal!****")
                time.sleep(3)
            else:
                print("****You fell through a hole!****")
                time.sleep(3)
                clear_output(wait=True)
            break

# The environment is intentionally NOT closed here: the evaluation cell that
# follows keeps resetting and stepping the same `env`, and closes it when done.

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
****You reached the goal!****


# Checking how many episodes our agent reaches the goal in when following the target policy learned with Q-learning

In [167]:
# Play 10000 episodes with the greedy policy and tally the outcomes:
# an episode that ends with reward 1 is a win, any other ending is a loss.
win = 0
lose = 0
for episode in range(10000):
    # initialize new episode params
    state = env.reset()
    done = False

    for step in range(max_steps_per_episode):
        # Greedy action for the current state; no rendering, no exploration.
        action = np.argmax(current_agent.q[state, :])
        state, reward, done, info = env.step(action)

        if not done:
            continue

        if reward == 1:
            win += 1
        else:
            lose += 1
        break

env.close()
print("win",win,"Lose",lose)

win 7323 Lose 2677
