In [1]:
import gym

# defining and showing my environment 

In [2]:
env = gym.make("FrozenLake-v1", is_slippery=False,render_mode='human')
env.reset()
env.render()

In [3]:
# no of our possible states
print(env.observation_space)
# print the states in the environment
print(env.observation_space.n)

# no of possible actions
print(env.action_space)
# what are the actions in the environment
print(env.action_space.n)


Discrete(16)
16
Discrete(4)
4


In [4]:
# analysing the transition probability and reward function
# it means that in state 0 what is the probability of taking a action 2
env.P[0][3]
# it returns (probability, nextstate, reward, done)

[(1.0, 0, 0.0, False)]

In [5]:
# lets take the first step
# env.reset()
env.step(1)
# it returns (nextstate, reward, done, _,info)
# info contains the probability of the action


(4, 0.0, False, False, {'prob': 1.0})


4x4=16 States 

Left, Right, Up, Down 4 actions.

In [6]:
import numpy as np
np.random.randint(0,2)
import time

## random action selection and also generating episodes


In [7]:
env.reset()
now=time.time()
while(1):
    if time.time()-now>10:
        break
    # random_action = env.action_space.sample()
    # print("our random_action is ", random_action)
    random_action=np.random.randint(1,3)
    print("our random_action is ", random_action)
    next_state, reward, done,_ ,info = env.step(random_action)
    if next_state in [5,7,11,12]:
        env.reset()
    if next_state==15:
        print("hurray we reached the goal")
        time.sleep(5)
        env.reset()  
    if done==True:
        print("we are done")
        time.sleep(5)
        env.reset()  
    print(f"the current state is  {next_state}")

# if next_state==15:
    

our random_action is  1
the current state is  4
our random_action is  2
we are done
the current state is  5
our random_action is  1
the current state is  4
our random_action is  1
the current state is  8
our random_action is  1
we are done


KeyboardInterrupt: 

# value iteration method

In [8]:
env.P[14]

{0: [(1.0, 13, 0.0, False)],
 1: [(1.0, 14, 0.0, False)],
 2: [(1.0, 15, 1.0, True)],
 3: [(1.0, 10, 0.0, False)]}

In [9]:
def value_iteration(env):
    num_iterations = 1000
    # break point 
    threshold=1e-20
    # discount factor
    gamma = 0.9
    
    # initialize the value function- it will equal to no of state possible
    value_function_table=np.zeros(env.observation_space.n)    
    for i in range(num_iterations):
        updated_value_function_table=np.copy(value_function_table)
        # computing the q_value for all possible action for each state
        for state in range(env.observation_space.n):
            # printing how the value function is changing

            q_value=[sum([prob*(reward+gamma*updated_value_function_table[next_state]) for prob,next_state,reward,_ in env.P[state][action]]) for action in range(env.action_space.n)]
            # update the value function
            value_function_table[state]=max(q_value)
        # check the convergence
        if (np.sum(np.fabs(updated_value_function_table-value_function_table))<threshold):
            print("converged at iteration ",i+1)
            print("final value function table is ",updated_value_function_table)
            break
    return value_function_table

# getting the optimal value function
optimal_value_function=value_iteration(env)

converged at iteration  7
final value function table is  [0.59049 0.6561  0.729   0.6561  0.6561  0.      0.81    0.      0.729
 0.81    0.9     0.      0.      0.9     1.      0.     ]


In [10]:
optimal_value_function

array([0.59049, 0.6561 , 0.729  , 0.6561 , 0.6561 , 0.     , 0.81   ,
       0.     , 0.729  , 0.81   , 0.9    , 0.     , 0.     , 0.9    ,
       1.     , 0.     ])

In [11]:
# # lets see the optimal policy
# env = gym.make("FrozenLake-v1", is_slippery=False,render_mode='human')
# env.reset()
# env.render()
# curr_state=1
# while(1):
#     # choosing the optimal_value -we will move where the value_function is max
#     if max(optimal_value_function[curr_state+1],optimal_value_function[curr_state+2],optimal_value_function[curr_state+3],optimal_value_function[curr_state+4])==optimal_value_function[curr_state+1] and curr_state<=15:
#         # move right
#         action=2
#     elif max(optimal_value_function[curr_state+1],optimal_value_function[curr_state+2],optimal_value_function[curr_state+3],optimal_value_function[curr_state+4])==optimal_value_function[curr_state+2]:
#         # move left
#         action=3
#     elif max(optimal_value_function[curr_state+1],optimal_value_function[curr_state+2],optimal_value_function[curr_state+3],optimal_value_function[curr_state+4])==optimal_value_function[curr_state+3]:
#         # move up
#         action=0
#     else:
#         # move down
#         action=1
#     next_state,reward,done,_,_=env.step(action)
#     curr_state=next_state
#     if curr_state==16:
#         print("hurray we reached the goal")
#         break


In [12]:
# using value function to generate the policy now
def extract_policy(value_function_table, gamma=1.0):
    # initialize the policy with zeros
    policy=np.zeros(env.observation_space.n)
    for state in range(env.observation_space.n):
        # initialize the q_table for each state
        q_table=np.zeros(env.action_space.n)
        # compute the q_value for all possible actions in that state
        for action in range(env.action_space.n):
            for next_sr in env.P[state][action]:
                # next_sr is a tuple of (probability, nextstate, reward, done)
                prob,next_state,reward,done=next_sr
                q_table[action]+=(prob*(reward+gamma*value_function_table[next_state]))
        # select the action which has the maximum q_value
        policy[state]=np.argmax(q_table)
    return policy


In [13]:
optimum_policy=extract_policy(optimal_value_function)

In [14]:
optimum_policy

array([1., 2., 1., 0., 1., 0., 1., 0., 2., 1., 1., 0., 0., 2., 1., 0.])

In [15]:
i=0
optimum_policy[14]=2

In [16]:
curr_state=env.reset()[0]

In [17]:
optimum_policy

array([1., 2., 1., 0., 1., 0., 1., 0., 2., 1., 1., 0., 0., 2., 2., 0.])

In [20]:
env = gym.make("FrozenLake-v1", is_slippery=False,render_mode='human')
env.reset()
env.render()
while(1):
    next_state,reward,done,_,_=env.step(int(optimum_policy[curr_state]))
    # next_state,reward,done,_,_=env.step(2)
    curr_state=next_state
    print(curr_state)
    if curr_state==15:
        print("hurray we reached the goal")
        break



0
4
8
9
13
14
15
hurray we reached the goal


#### policy_iteration

In [None]:
def policy_iteration(env):
    num_iterations = 1000
    threshold = 1e-20
    gamma = 0.9
    
    policy_table = np.zeros(env.observation_space.n)
    value_function_table = np.zeros(env.observation_space.n)
    
    for i in range(num_iterations):
        # policy evaluation
        for state in range(env.observation_space.n):
            q_value = [sum([prob * (reward + gamma * value_function_table[next_state]) for prob, next_state, reward, _ in env.P[state][policy_table[state]]])]
            value_function_table[state] = q_value
        
        # policy improvement
        policy_stable = True
        for state in range(env.observation_space.n):
            old_action = policy_table[state]
            q_value = [sum([prob * (reward + gamma * value_function_table[next_state]) for prob, next_state, reward, _ in env.P[state][action]]) for action in range(env.action_space.n)]
            policy_table[state] = np.argmax(q_value)
            if old_action != policy_table[state]:
                policy_stable = False
        
        if policy_stable:
            print("converged at iteration ", i + 1)
            print("final value function table is ", value_function_table)
            break
            
    return policy_table

# getting the optimal policy function
optimal_policy_function = policy_iteration(env)
