# Importing libraries

In [4]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output
import agent
from tqdm import tqdm

In [5]:
# Frozen lake environment (the small 4x4 map).
# For the 8x8 environment, use the id "FrozenLake8x8-v0" instead.
env_id = "FrozenLake-v0"
env = gym.make(env_id)

  result = entry_point.load(False)


# On-Policy Expected SARSA Agent Code

In [12]:
# Expected Sarsa agent here
class ExpectedSarsaAgent(agent.BaseAgent):
    """Tabular Expected SARSA agent that acts epsilon-greedily.

    The agent keeps a (num_states, num_actions) table of action-value
    estimates and, on every step, moves the estimate of the previous
    state/action pair towards

        reward + discount * E_pi[ Q(next_state, .) ]

    where the expectation is taken under the same epsilon-greedy policy
    that is used to select actions.
    """

    def agent_init(self, agent_init_info):
        """Setup for the agent called when the experiment first starts.

        Args:
        agent_init_info (dict), the parameters used to initialize the agent. The dictionary contains:
        {
            num_states (int): The number of states,
            num_actions (int): The number of actions,
            epsilon (float): The epsilon parameter for exploration,
            step_size (float): The step-size,
            discount (float): The discount factor,
            seed (int, optional): Seed for the agent's random generator.
                When missing, NumPy seeds the generator itself.
        }

        """
        # Store the parameters provided in agent_init_info.
        self.num_actions = agent_init_info["num_actions"]
        self.num_states = agent_init_info["num_states"]
        self.epsilon = agent_init_info["epsilon"]
        self.step_size = agent_init_info["step_size"]
        self.discount = agent_init_info["discount"]
        # Read the seed from the argument, not from a notebook-level global,
        # so the class works with whatever dictionary it is given.
        self.rand_generator = np.random.RandomState(agent_init_info.get("seed"))

        # Create an array for action-value estimates and initialize it to zero.
        self.q = np.zeros((self.num_states, self.num_actions))

    def _select_action(self, state):
        """Pick an action for `state` with the epsilon-greedy policy.

        With probability epsilon a uniformly random action is returned,
        otherwise a greedy action (ties broken at random by `argmax`).
        """
        if self.rand_generator.rand() < self.epsilon:
            return self.rand_generator.randint(self.num_actions)
        return self.argmax(self.q[state, :])

    def _expected_value(self, state):
        """Return the expected action value of `state` under the epsilon-greedy policy.

        Every action receives epsilon / num_actions probability; the
        remaining (1 - epsilon) is shared equally among the greedy actions.
        """
        q_row = self.q[state, :]
        greedy = q_row == q_row.max()
        probabilities = np.full(self.num_actions, self.epsilon / self.num_actions)
        probabilities[greedy] += (1 - self.epsilon) / greedy.sum()
        return np.sum(q_row * probabilities)

    def agent_start(self, state):
        """The first method called when the episode starts, called after
        the environment starts.
        Args:
            state (int): the state from the
                environment's env_start function.
        Returns:
            action (int): the first action the agent takes.
        """
        action = self._select_action(state)
        self.prev_state = state
        self.prev_action = action
        return action

    def agent_step(self, reward, state):
        """A step taken by the agent.
        Args:
            reward (float): the reward received for taking the last action taken
            state (int): the state from the
                environment's step based on where the agent ended up after the
                last step.
        Returns:
            action (int): the action the agent is taking.
        """
        # The next action is chosen before the table is updated, so the
        # choice is based on the estimates as they were when `state` was reached.
        action = self._select_action(state)

        # Expected SARSA update of the previous state/action pair.
        target = reward + self.discount * self._expected_value(state)
        previous = self.q[self.prev_state, self.prev_action]
        self.q[self.prev_state, self.prev_action] = previous + self.step_size * (target - previous)

        self.prev_state = state
        self.prev_action = action
        return action

    def agent_end(self, reward):
        """Run when the agent terminates.
        Args:
            reward (float): the reward the agent received for entering the
                terminal state.
        """
        # No bootstrap term: the update target is the final reward alone.
        previous = self.q[self.prev_state, self.prev_action]
        self.q[self.prev_state, self.prev_action] = previous + self.step_size * (reward - previous)

    def argmax(self, q_values):
        """argmax with random tie-breaking
        Args:
            q_values (Numpy array): the array of action-values
        Returns:
            action (int): an action with the highest value
        """
        top = float("-inf")
        ties = []

        for i in range(len(q_values)):
            if q_values[i] > top:
                # New maximum found: forget the indices collected so far.
                top = q_values[i]
                ties = []

            if q_values[i] == top:
                ties.append(i)

        return self.rand_generator.choice(ties)

In [13]:
# Experiment length: number of training episodes and the per-episode step cap.
num_episodes = 70000
max_steps_per_episode = 5000

# Sizes of the discrete action and state spaces reported by the environment.
action_space_size = env.action_space.n
state_space_size = env.observation_space.n

# Everything the agent needs to know: table dimensions plus its
# exploration rate, step-size, discount factor and random seed.
agent_info = {
    "num_actions": action_space_size,
    "num_states": state_space_size,
    "epsilon": 0.1,
    "step_size": 0.01,
    "discount": 1,
    "seed": 0,
}

# Build and initialise the Expected Sarsa agent.
current_agent = ExpectedSarsaAgent()
current_agent.agent_init(agent_info)

P.S. See my other project on GitHub, in which I use a Q-learning agent.

In [14]:
# Train the Expected Sarsa agent on the OpenAI Gym environment.
#
# Each episode: the agent picks its first action with agent_start, then
# every environment transition is fed back through agent_step (non-terminal)
# or agent_end (terminal). The undiscounted sum of rewards of each episode
# is collected in rewards_all_episodes.
rewards_all_episodes = []
for episode in tqdm(range(num_episodes)):
    state = env.reset()
    done = False
    rewards_current_episode = 0

    action = current_agent.agent_start(state)
    # A single loop performs every environment step, so the cap is exactly
    # max_steps_per_episode and a terminal transition always reaches
    # agent_end, even when it happens on the last permitted step.
    for step in range(max_steps_per_episode):
        state, reward, done, info = env.step(action)
        rewards_current_episode += reward
        if done:
            current_agent.agent_end(reward)
            break
        action = current_agent.agent_step(reward, state)
    rewards_all_episodes.append(rewards_current_episode)

100%|███████████████████████████████████████████████████████████████████████████| 70000/70000 [02:49<00:00, 412.74it/s]


In [15]:
# Calculate and print the average reward per thousand episodes
episodes_per_chunk = 1000
episode_rewards = np.array(rewards_all_episodes)
# Integer number of complete chunks; a trailing partial chunk (if any) is
# left out so that every printed value is an average over exactly
# episodes_per_chunk episodes.
num_chunks = len(episode_rewards) // episodes_per_chunk
rewards_per_thousand_episodes = np.split(
    episode_rewards[: num_chunks * episodes_per_chunk], num_chunks
)
count = episodes_per_chunk

print("********Average reward per thousand episodes********\n")
for r in rewards_per_thousand_episodes:
    # ndarray.mean avoids the accumulated rounding noise of summing
    # r/1000 element by element with the built-in sum.
    print(count, ": ", str(r.mean()))
    count += episodes_per_chunk

********Average reward per thousand episodes********

1000 :  0.024000000000000014
2000 :  0.02000000000000001
3000 :  0.022000000000000013
4000 :  0.03000000000000002
5000 :  0.017000000000000008
6000 :  0.022000000000000013
7000 :  0.04400000000000003
8000 :  0.04000000000000003
9000 :  0.04900000000000004
10000 :  0.06200000000000005
11000 :  0.07100000000000005
12000 :  0.08400000000000006
13000 :  0.10400000000000008
14000 :  0.11100000000000008
15000 :  0.1580000000000001
16000 :  0.18200000000000013
17000 :  0.25100000000000017
18000 :  0.33100000000000024
19000 :  0.3100000000000002
20000 :  0.3070000000000002
21000 :  0.2930000000000002
22000 :  0.2880000000000002
23000 :  0.3170000000000002
24000 :  0.32000000000000023
25000 :  0.3980000000000003
26000 :  0.4060000000000003
27000 :  0.4290000000000003
28000 :  0.3890000000000003
29000 :  0.3960000000000003
30000 :  0.3920000000000003
31000 :  0.4110000000000003
32000 :  0.3830000000000003
33000 :  0.3910000000000003
34000 :  

In [16]:
# Show the learned action-value table under a banner line.
q_table_banner = "\n\n********Q-table********\n"
print(q_table_banner, current_agent.q, sep="\n")



********Q-table********

[[0.40228217 0.38379747 0.38270601 0.37872023]
 [0.24683992 0.22555915 0.19801417 0.34424902]
 [0.2926299  0.2309869  0.22349642 0.2480392 ]
 [0.1392673  0.06974012 0.04440527 0.08551391]
 [0.41937304 0.28388277 0.30385142 0.2738895 ]
 [0.         0.         0.         0.        ]
 [0.25283855 0.22271445 0.22205421 0.07576774]
 [0.         0.         0.         0.        ]
 [0.27866125 0.33258665 0.30776647 0.45723094]
 [0.34705839 0.5216258  0.4131865  0.33447838]
 [0.52206365 0.46661646 0.37114987 0.25553435]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.37121508 0.50665051 0.65887584 0.448794  ]
 [0.66800414 0.82485241 0.78386452 0.73006513]
 [0.         0.         0.         0.        ]]


# Behaving Using the Learned Target Policy

In [17]:
# Watch the trained agent play three episodes, rendering every step.
for episode in range(3):
    # Start a fresh episode.
    state = env.reset()
    done = False
    print("*****EPISODE ", episode+1, "*****\n\n\n\n")
    time.sleep(1)

    for step in range(max_steps_per_episode):
        # Redraw the current state of the environment.
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)

        # Act with the same epsilon-greedy policy the agent was trained
        # with: explore with probability epsilon, otherwise act greedily.
        explore = current_agent.rand_generator.rand() < current_agent.epsilon
        action = (
            current_agent.rand_generator.randint(current_agent.num_actions)
            if explore
            else current_agent.argmax(current_agent.q[state, :])
        )

        # Apply the action and move to the resulting state.
        state, reward, done, info = env.step(action)
        if not done:
            continue

        # Episode finished: draw the final frame and report the outcome.
        clear_output(wait=True)
        env.render()
        reached_goal = reward == 1
        if reached_goal:
            print("****You reached the goal!****")
        else:
            print("****You fell through a hole!****")
        time.sleep(3)
        if not reached_goal:
            clear_output(wait=True)
        break
env.close()

  (Down)
SFFF
FHFH
FFFH
[41mH[0mFFG
****You fell through a hole!****


# Checking in how many episodes our agent reaches the goal using the target policy learned with the Expected SARSA algorithm

In [11]:
# Play 10000 episodes without rendering and count how many end at the goal.
win = 0
lose = 0
for episode in range(10000):
    # Start a fresh episode.
    state = env.reset()
    done = False

    for step in range(max_steps_per_episode):
        # Epsilon-greedy action from the learned action values.
        explore = current_agent.rand_generator.rand() < current_agent.epsilon
        if explore:
            action = current_agent.rand_generator.randint(current_agent.num_actions)
        else:
            action = current_agent.argmax(current_agent.q[state, :])

        state, reward, done, info = env.step(action)
        if done:
            break

    # Only finished episodes are tallied; a final reward of 1 counts as a
    # win, anything else as a loss.
    if done:
        if reward == 1:
            win += 1
        else:
            lose += 1
env.close()
print("win",win,"Lose",lose)

win 131 Lose 9869


The number of wins is very low because this environment is very small and because there is randomness in both our policy and the environment, so the chances of falling into a hole are greater than for a Q-learning agent, where the only randomness is in the environment and not in the target policy. That is why the chances of falling into a hole are lower with Q-learning.