In [38]:
# cell for training
#Code was made by me using the lunar landing and k-armed bandit problem as template
# You can change training map and choose desired one in the code below,currently it is set to random map generation
import gymnasium as gym
import random
import numpy as np
from gymnasium.envs.toy_text.frozen_lake import generate_random_map
class Agent:
    def __init__(self,expl,lr,gamma,size):
        self.Q=np.zeros((size*size,4))
        self.V=np.zeros(size*size)
        self.expl=expl
        self.initialexpl=expl
        self.gamma=gamma
        self.lr=lr
        self.size=size
    def upd_expl(self,no_episodes):
        self.expl-=9/(10*no_episodes)*self.initialexpl #decreasing tendency to explore as time progresses,to use more exploitation
    def upd_Q(self,prev_state,next_state,action,reward):
        self.Q[prev_state,action] = self.Q[prev_state,action]*(1-self.lr) + self.lr*(reward+self.gamma*np.max(self.Q[next_state]))
    def get_action(self,state):
        if np.random.random() < self.expl: # explore
            return np.random.randint(4)
        else: # exploit
            return np.random.choice(np.flatnonzero(self.Q[state] == self.Q[state].max()))
    def get_action1(self,state):#action used by trained agent doesnt require exploration
        return np.random.choice(np.flatnonzero(self.Q[state] == self.Q[state].max()))


def experiment(no_episodes,descr,mp_name,size):
    env = gym.make(
        'FrozenLake-v1',
        desc=descr,
        map_name=mp_name,
        is_slippery=False,
        reward_schedule=(reward_goal,reward_hole ,reward_move)
        
    )
    agent=Agent(expl,lr,gamma,size)
    for episode in range(no_episodes):
    # Get the first observation -> Initial State
        state1, info = env.reset()
        reward_10list=[]
        total_reward = 0
        run =True
        while (run):
            action=agent.get_action(state1)
            state2, reward, terminated, truncated, info = env.step(action)
            total_reward += reward
            agent.upd_Q(state1,state2,action,reward)
            state1=state2
            # If the episode has ended then we can reset to start a new episode
            if terminated or truncated:
                run = False
        reward_10list.append(total_reward)     
        total_reward=0
        agent.upd_expl(no_episodes)#decreasing tendency to explore as more episodes are completed,to use more exploitation
        #For checking progress intermitedly
        if((episode+1)%100==0): 
            print("Finished",episode+1,"episodes")
            print("Reward_avg for last 100=",np.sum(reward_10list)/len(reward_10list))
            reward_10list=[]
    env.close()
    return agent #Trained agent returned

# Rewards set up to minimize unnecessary movement and termination due to falling in holes
reward_move=-1
reward_hole=-30
reward_goal=100
expl=0.1
lr=0.8
gamma=1  # as total reward sums reward of all steps,and goal is to achieve the total reward so i believe discount factor here is not needed,so it is set to 1
no_episodes_to_train=1000 # can be increased for more fine tuning but seems to be sufficient
mp_name=None
descr=generate_random_map(size=8)# map can be changed to any map by manually creating or using mp_name
size=len(descr) # if we use a 4x4 map size is said to be 4
agent=experiment(no_episodes_to_train,descr,mp_name,size)#agent is returned from function after training


Finished 100 episodes
Reward_avg for last 100= 87.0
Finished 200 episodes
Reward_avg for last 100= 87.0
Finished 300 episodes
Reward_avg for last 100= 87.0
Finished 400 episodes
Reward_avg for last 100= 87.0
Finished 500 episodes
Reward_avg for last 100= 87.0
Finished 600 episodes
Reward_avg for last 100= 87.0
Finished 700 episodes
Reward_avg for last 100= 87.0
Finished 800 episodes
Reward_avg for last 100= 87.0
Finished 900 episodes
Reward_avg for last 100= 87.0
Finished 1000 episodes
Reward_avg for last 100= 87.0


In [None]:
#Cell for running final trained agent
env = gym.make(
      'FrozenLake-v1',
       desc=descr,
       map_name=mp_name,
       is_slippery=False,
       reward_schedule=(reward_goal,reward_hole ,reward_move),
       render_mode="human" 
    )
run=True
state1, info = env.reset()
total_reward = 0
while (run):
    action=agent.get_action1(state1)#getaction1 doesnt involve exploration
    state1, reward, terminated, truncated, info = env.step(action)
    total_reward += reward
    if terminated or truncated:
        run = False
print("Total reward=",total_reward)
if (total_reward==(reward_goal-2*size+3)):
    print("Goal Reached")
else :
    print("Mission failed....")
env.close()
      


Total reward= 87
Goal Reached
