<a href="https://colab.research.google.com/github/yafo1948/Reinforcement_Learning_RUNI.TASHPAG/blob/main/Maze2D_Getting_the_transition_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installs

In [None]:
%%capture
# Refresh the apt package index, then install the system packages this notebook uses.
!sudo apt-get update
!sudo apt-get install -y xvfb ffmpeg freeglut3-dev
# Python packages (imageio is pinned to 2.4.0).
!pip install 'imageio==2.4.0'
!pip install gym
!pip install pygame
!apt-get install python-opengl -y
# NOTE(review): xvfb is already installed by the apt-get line above; kept as in the original.
!apt install xvfb -y
!pip install pyvirtualdisplay
# NOTE(review): possibly meant `pyglet` rather than `piglet` - confirm before changing.
!pip install piglet
!pip install -U --no-cache-dir gdown --pre
# Download the maze package archive by its Google Drive file ID.
# The ID is passed positionally: the `--id` flag is deprecated in gdown and
# rejected by recent releases, which `-U --pre` above always installs.
!gdown 1FeuIx5OVLmfCx0dxxwU-7Xn8gpPc-53D
# -o overwrites existing files without prompting, so re-running this cell does
# not stall on an overwrite prompt that %%capture would hide.
!unzip -o /content/maze_mid.zip

# Imports

In [None]:
import numpy as np
import gym
from gym import logger as gymlogger
from gym.utils import seeding
from gym import error, spaces, utils
gymlogger.set_level(40) # error only
import glob
import io
import base64
import os
import random
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
from pyvirtualdisplay import Display
from IPython.display import HTML
from IPython import display as ipythondisplay
import pygame
from maze_mid import *
from maze_mid.cust_maze import MazeEnvCast5x5, MazeEnvCast15x15, MazeEnvCast25x25
import pyvirtualdisplay
import imageio
import IPython
import time

In [None]:
def embed_mp4(filename):
  """Embed an mp4 file in the notebook as an inline HTML5 video.

  Args:
    filename: Path of the mp4 file to read.

  Returns:
    An ``IPython.display.HTML`` object wrapping a ``<video>`` tag whose
    source is the file's bytes encoded as a base64 ``data:`` URI.

  Raises:
    OSError: If the file cannot be opened or read.
  """
  # Read inside a context manager so the file handle is closed deterministically
  # (a bare open(...).read() leaves closing to the garbage collector).
  with open(filename, 'rb') as video_file:
    video = video_file.read()
  b64 = base64.b64encode(video)
  tag = '''
  <video width="640" height="480" controls>
    <source src="data:video/mp4;base64,{0}" type="video/mp4">
  Your browser does not support the video tag.
  </video>'''.format(b64.decode())

  return IPython.display.HTML(tag)
# Start a pyvirtualdisplay Display (visible=0, size 1400x900) and keep the result of start().
# NOTE(review): this binds the name `display`, which presumably shadows IPython's
# built-in display() helper in the notebook namespace - confirm nothing later relies on it.
display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()

**Helpers and utilities**
The next cell defines a mapping from action numbers to logical names, compass directions and movement offsets, followed by a function that returns the actions available from a given state.

In [None]:
# Action number -> (logical name, compass letter, movement offset).
nTa = dict(enumerate([
    ("UP", "N", (0, -1)),
    ("DOWN", "S", (0, 1)),
    ("RIGHT", "E", (1, 0)),
    ("LEFT", "W", (-1, 0)),
]))

def get_available_actions(state):
    """Return the actions that the maze reports as open from ``state``.

    Reads the module-level ``actions``, ``nTa`` and ``env``. Each action is
    checked by passing its direction letter (``nTa[action][1]``) to the maze's
    ``is_open``; the returned list keeps the order of ``actions``.
    """
    return [
        action
        for action in actions
        if env.unwrapped.maze_view.maze.is_open(state, nTa[action][1])
    ]

This section demonstrates how to artificially generate the transition model
for the Maze game, which does not explicitly supply it.
We also use our own custom logic to make the actions stochastic.

In [None]:
# Build the 5x5 maze environment.
env = MazeEnvCast5x5()

# ------------------------------------------------------------ #
# Basic quantities read from the environment's declared spaces.
# ------------------------------------------------------------ #

# Number of states: product of the per-dimension extents (high - low + 1).
number_of_states = np.prod(
    (env.observation_space.high - env.observation_space.low) + 1
)

# Terminal state: the upper bound of the observation space in both dimensions.
terminal_state = tuple(env.observation_space.high[axis] for axis in (0, 1))

# Action identifiers 0 .. n-1.
actions = np.arange(env.action_space.n)

# Maze rows and cols: upper bound of each dimension plus one.
rows, cols = (env.observation_space.high[axis] + 1 for axis in (0, 1))

In [None]:
# Pull the maze's cell matrix out of the unwrapped environment's view.
maze = env.unwrapped.maze_view.maze.maze_cells

# Show it so the layout can be inspected.
print("The maze is:\n", maze)

In [None]:
# R - the rewards: a float array shaped like the maze where every cell holds the
# same small negative value, -(0.1 / number_of_states) ...
R = np.full_like(maze, -(0.1 / number_of_states), dtype=float)
# ... except the terminal cell, which is worth 1.
R[terminal_state] = 1

print(R)

In [None]:
# Our custom transition probability: the chance that the chosen action is the
# one actually executed.
p = 0.9

# The remaining probability mass is split evenly over the other actions.
# With four actions this equals the previously hard-coded (1 - p) / 3;
# max(..., 1) avoids a division by zero if there is only a single action.
slip_probability = (1 - p) / max(len(actions) - 1, 1)

# Placeholder for the model P which mimics env.P:
#   P[state][action] -> list of [probability, next_state, reward, done]
P = {
    (i, j): {action: [] for action in actions}
    for i in range(rows)
    for j in range(cols)
}

# Create the model.
for state in P:
    # Actions that actually move the agent from this state.
    a_actions = get_available_actions(state)

    # The outcome of an executed action does not depend on which action was
    # chosen, so compute it once per state: move by the action's offset when
    # the move is available, otherwise stay in place.
    outcomes = []
    for actual_action in actions:
        if actual_action in a_actions:
            next_state = tuple(np.add(state, nTa[actual_action][2]))
        else:
            next_state = state
        outcomes.append((actual_action, next_state, R[next_state]))

    # The action the agent chose ...
    for chosen_action in actions:
        # ... versus the action the agent actually takes.
        for actual_action, next_state, reward in outcomes:
            probability = p if actual_action == chosen_action else slip_probability
            # NOTE(review): the done flag is always 0, including for transitions
            # into terminal_state - confirm this is intended by whatever consumes P.
            P[state][chosen_action].append([probability, next_state, reward, 0])

**Treating P as env.P**

We can now use P just like we would use env.P

In [None]:
# Uniform random policy: every state maps to an array giving each action the
# same probability, 1 / len(actions).
π = {state: np.full(actions.shape, 1 / len(actions)) for state in P}

# Inspect the model from the initial position.
state = (0, 0)

for action, action_probability in enumerate(π[state]):
    # Action index and its probability under the policy ...
    print(action, action_probability)
    # ... followed by every transition the model lists for that action.
    for transition_probability, next_state, reward, done in P[state][action]:
        print(transition_probability, next_state, reward, done)