In [80]:
import gymnasium as gym
from gymnasium import ObservationWrapper, RewardWrapper
import pandas as pd

In [75]:
# Base environment: slippery 4x4 FrozenLake, created with the "human" render mode.
env = gym.make(
    'FrozenLake-v1',
    desc=None,
    map_name="4x4",
    is_slippery=True,
    render_mode="human",
)

In [76]:
def interact_with_env(env, n_steps: int = 80, seed=42, policy=None) -> 'pd.DataFrame':
    """Roll out a policy in ``env`` and record every transition.

    The environment is reset once up front (with ``seed``) and again, without
    a seed, whenever a step reports ``terminated`` or ``truncated``; the
    rollout then continues in the new episode until ``n_steps`` steps have
    been taken in total.

    Note that only the environment reset is seeded here. When ``policy`` is
    None, actions come from ``env.action_space.sample()``, whose random state
    this function does not touch.

    Args:
        env: Object exposing ``reset(seed=...)`` returning ``(obs, info)``,
            ``step(action)`` returning
            ``(obs, reward, terminated, truncated, info)``, and, when
            ``policy`` is None, ``action_space.sample()``.
        n_steps: Total number of ``env.step`` calls to perform.
        seed: Seed forwarded to the initial ``env.reset``.
        policy: Optional callable mapping the current observation to an
            action. Defaults to sampling from ``env.action_space``.

    Returns:
        A DataFrame with one row per step and the columns ``obs``,
        ``action``, ``next_obs``, ``reward``, ``terminated``, ``truncated``.
    """
    if policy is None:
        # Random behaviour: ignore the observation and sample an action.
        def policy(_observation):
            return env.action_space.sample()

    log = {
        'obs': [],
        'action': [],
        'next_obs': [],
        'reward': [],
        'terminated': [],
        'truncated': [],
    }

    observation, info = env.reset(seed=seed)
    for _ in range(n_steps):
        action = policy(observation)
        next_observation, reward, terminated, truncated, info = env.step(action)

        log['obs'].append(observation)
        log['action'].append(action)
        log['next_obs'].append(next_observation)
        log['reward'].append(reward)
        log['terminated'].append(terminated)
        log['truncated'].append(truncated)

        if terminated or truncated:
            # Episode is over: the next row starts from a fresh initial state,
            # not from the terminal observation just recorded in next_obs.
            observation, info = env.reset()
        else:
            observation = next_observation

    return pd.DataFrame(log)

In [72]:
# Collect a rollout of sampled actions on the unwrapped env (integer state observations).
df = interact_with_env(env)

In [73]:
df.style

Unnamed: 0,obs,action,next_obs,reward,terminated,truncated
0,0,0,0,0.0,False,False
1,0,3,0,0.0,False,False
2,0,1,1,0.0,False,False
3,1,1,0,0.0,False,False
4,0,1,1,0.0,False,False
5,1,3,0,0.0,False,False
6,0,3,0,0.0,False,False
7,0,1,0,0.0,False,False
8,0,2,1,0.0,False,False
9,1,1,5,0.0,True,False


In [93]:
# env.close()  # keep disabled until the wrapper cells below have run; they reuse this env

In [30]:
env.observation_space

Discrete(16)

In [31]:
env.action_space

Discrete(4)

In [57]:
class CartesianObservationWrapper(ObservationWrapper):
    """Expose a grid world's flat state index as a ``(row, col)`` pair.

    The wrapped environment's observation is treated as a row-major index
    into a grid with ``n_cols`` columns: ``row = index // n_cols`` and
    ``col = index % n_cols``.

    Args:
        env: Environment whose observations are flat grid indices.
        n_rows: Number of grid rows; sizes the row component of the space.
        n_cols: Number of grid columns; used both for the index conversion
            and for the column component of the space.
    """

    def __init__(self, env: gym.Env, n_rows: int = 4, n_cols: int = 4):
        super().__init__(env)
        self.n_rows = n_rows
        self.n_cols = n_cols
        # Build the space once and assign it through the base wrapper's
        # settable attribute, so that every access returns the same object
        # (a per-access property would hand out a fresh, unseeded space).
        self.observation_space = gym.spaces.Tuple(
            (gym.spaces.Discrete(n_rows), gym.spaces.Discrete(n_cols))
        )

    def observation(self, observation):
        """Convert a flat index into its ``(row, col)`` coordinates."""
        row = observation // self.n_cols
        col = observation % self.n_cols
        return row, col

In [77]:
# Wrap the base env so that observations are reported as (row, col) pairs.
cartesian_env = CartesianObservationWrapper(env)

In [49]:
# Reset through the wrapper to inspect the converted initial observation and the info dict.
obs, info = cartesian_env.reset(seed=42)

In [50]:
info

{'prob': 1}

In [51]:
obs

(0, 0)

In [59]:
cartesian_env.observation_space

Tuple(Discrete(4), Discrete(4))

In [62]:
cartesian_env.observation_space.sample()

(np.int64(0), np.int64(3))

In [78]:
# Repeat the rollout through the Cartesian wrapper; this overwrites the earlier df.
df = interact_with_env(cartesian_env)

In [79]:
df.style

Unnamed: 0,obs,action,next_obs,reward,terminated,truncated
0,"(0, 0)",0,"(0, 0)",0.0,False,False
1,"(0, 0)",0,"(1, 0)",0.0,False,False
2,"(1, 0)",3,"(1, 0)",0.0,False,False
3,"(1, 0)",2,"(2, 0)",0.0,False,False
4,"(2, 0)",2,"(1, 0)",0.0,False,False
5,"(1, 0)",3,"(1, 0)",0.0,False,False
6,"(1, 0)",1,"(1, 1)",0.0,True,False
7,"(0, 0)",2,"(0, 1)",0.0,False,False
8,"(0, 1)",1,"(1, 1)",0.0,True,False
9,"(0, 0)",2,"(0, 1)",0.0,False,False


In [88]:
class ManhattanDistanceRewardWrapper(RewardWrapper):
    """Shape the reward with the Manhattan distance to a goal cell.

    The shaped reward is ``reward * goal_reward_scale - distance``, where
    ``distance`` is the Manhattan distance between the most recent
    observation and ``goal``. The wrapped environment must therefore emit
    observations that can be indexed as ``obs[0]`` (row) and ``obs[1]``
    (column).

    Args:
        env: Environment producing ``(row, col)``-style observations.
        goal: ``(row, col)`` of the target cell.
        goal_reward_scale: Multiplier applied to the wrapped env's reward
            before the distance penalty is subtracted.
    """

    def __init__(self, env: gym.Env, goal=(3, 3), goal_reward_scale: float = 100):
        super().__init__(env)
        self.goal = tuple(goal)
        self.goal_reward_scale = goal_reward_scale
        # Latest observation seen by reset()/step(); reward() reads it.
        self.current_obs = None

    def reward(self, reward):
        """Return the scaled reward minus the distance from ``current_obs`` to the goal."""
        # abs() keeps the distance correct for goals that are not in the
        # bottom-right corner; for the default goal it equals (3 - r) + (3 - c).
        manhattan_distance = (
            abs(self.goal[0] - self.current_obs[0])
            + abs(self.goal[1] - self.current_obs[1])
        )
        return reward * self.goal_reward_scale - manhattan_distance

    def reset(self, **kwargs):
        """Reset the wrapped env and remember the initial observation."""
        obs, info = self.env.reset(**kwargs)
        self.current_obs = obs
        return obs, info

    def step(self, action):
        """Step the wrapped env, then shape the reward using the new observation."""
        obs, reward, terminated, truncated, info = self.env.step(action)
        # Must be updated before reward() runs so the penalty reflects the
        # state the agent has just moved into.
        self.current_obs = obs
        return obs, self.reward(reward), terminated, truncated, info

In [89]:
# Stack the reward wrapper on the Cartesian wrapper: its reward() indexes the observation as (row, col).
md_env = ManhattanDistanceRewardWrapper(cartesian_env)

In [90]:
# Roll out once more with the shaped reward; this overwrites the earlier df.
df = interact_with_env(md_env)

In [91]:
df.style

Unnamed: 0,obs,action,next_obs,reward,terminated,truncated
0,"(0, 0)",3,"(0, 0)",-6.0,False,False
1,"(0, 0)",0,"(1, 0)",-5.0,False,False
2,"(1, 0)",0,"(2, 0)",-4.0,False,False
3,"(2, 0)",1,"(2, 0)",-4.0,False,False
4,"(2, 0)",2,"(1, 0)",-5.0,False,False
5,"(1, 0)",2,"(0, 0)",-6.0,False,False
6,"(0, 0)",0,"(1, 0)",-5.0,False,False
7,"(1, 0)",2,"(2, 0)",-4.0,False,False
8,"(2, 0)",0,"(2, 0)",-4.0,False,False
9,"(2, 0)",2,"(2, 1)",-3.0,False,False
