# Environment

In [1]:
import numpy as np
import copy
from gym import Env
import datetime

class FrozenLake(Env):
    def __init__(self,random_num:int=256, nonStationary = False):
        self.random_num = random_num
        self.nonStationary = nonStationary
        
        np.random.seed(self.random_num)
        self.beginMap = make_map(self.random_num) #*2
        self.beginMap[self.beginMap>1] = 1
        self.endMap = make_map(self.random_num + 100)
        
        self.changeDir = self.endMap - self.beginMap
        self.changeDir *= 1/11000

        self.fixedMap = self.beginMap

        np.random.seed(datetime.datetime.now().microsecond)
        
        self.map = copy.deepcopy(self.fixedMap)
        self.time = 0
        self.reset()

    def reset(self):
        self.NSreset()
        if not self.nonStationary:
            self.map = copy.deepcopy(self.fixedMap)
            self.time = 0

        return self.state

    def NSreset(self):
        self.time += 1
        self.map += self.changeDir

        self.map[self.map>0.95]=0.95
        self.map[self.map<0.0]=0.0

        self.state = (0,0)
        self.done = False
        return self.state
    
    def states_transitions(self, state, action):
        x = state[0]
        y = state[1]
        states = np.array([[x,y-1], [x,y+1], [x-1 ,y], [x+1,y] ])


        if action == UP:
            selected = states[2]
        if action == DOWN:
            selected = states[3]
        if action == RIGHT:
            selected = states[1]
        if action == LEFT:
            selected = states[0]

        zero = np.zeros((4,2)).astype(int)
        three = (3 * np.ones((4,2))).astype(int)
        output = np.maximum(np.minimum(states, three),zero)
        output, indices = np.unique(output, axis = 0, return_counts= True)

        
        selected = np.maximum(np.minimum(selected, three[0]), zero[0])
        probs = indices * 0.025
        probs[np.argmax(np.sum(selected == output, axis = 1))] += 0.9

        return list(zip(output[:,0],output[:,1])), probs
    
    def possible_consequences(self,action:int,state_now=None):

        if state_now==None:
            state_now = self.state

        state = [state_now[0],state_now[1]]
        states, probs = self.states_transitions(state, action)
        aa = np.array(states) 
        fail_probs = self.map[(aa[:,0]),(aa[:,1])]
        dones = np.sum(aa == 3, axis = 1) == 2
        return states, probs, fail_probs,dones
    
    def step(self, a:int):
        if not (a in range(4)):
            raise Exception("action is not available!!!")
        
        states, probs, fail_probs,dones = self.possible_consequences(a)
        
        next_idx = np.random.choice(np.arange(len(states)), p = probs)
        next_state = states[next_idx]
        self.state = tuple(next_state)
        
        self.done = dones[next_idx]

        r = -1

        if self.done:
            r += 60
        elif np.random.rand()< fail_probs[next_idx]:
            r -= 15
            self.done = True

        return (self.state, r, self.done, {})

    def render(self,state=None):
        if state == None:
            state = self.state

        out = ""
        for i in range(4):
            out += "\n------------------------------\n| "
            for j in range(4):
                if (i,j) == state:
                    out += "\033[44m{:.3f}\033[0m | ".format(self.map[i,j])
                else :
                    out += "{:.3f} | ".format(self.map[i,j])

        out += "\n------------------------------"
        print(out)

    def environment_states(self):
        env_states = []
        for state_index in range(16):
            s0 = state_index % 4
            s1 = state_index//4
            env_states.append((s0,s1))
        return env_states

        
def set_max_min(var,maximum,minimum):
    return min(max(var,minimum),maximum)

def make_map(random_num):
    np.random.seed(random_num)  
    move = np.zeros(6)
    idx = np.random.choice(range(6),size=3,replace=False)
    move[idx] = 1

    point = [0,0]
    lowprobs = [tuple(point)]

    for m in move:
        if m:
            point[0] += 1
        else:
            point[1] += 1
        lowprobs.append(tuple(point))
    
    map = np.random.rand(4,4)
    idx = np.array(lowprobs)

    map[idx[:,0],idx[:,1]] = 0.001 
    map[0,0] = 0.0
    map[3,3] = 0.0 

    return map

# HyperParameters

In [2]:
#%% allowed actions
LEFT = 0
DOWN = 1
RIGHT = 2
UP = 3

ACTIONS = [LEFT,DOWN,RIGHT,UP]

#%% hyperparameters
EPISODES = 10000
EPSILON = 0.1
LEARNING_RATE = 0.1
DISCOUNT = 0.9

## Map of environment

In [3]:
RAND_NUM = 2022

In [4]:
environment = FrozenLake(random_num=RAND_NUM)

print("Environment with fail probabilities :")
environment.render()

Environment with fail probabilities :

------------------------------
| [44m0.000[0m | 0.179 | 0.053 | 0.079 | 
------------------------------
| 0.001 | 0.001 | 0.441 | 0.774 | 
------------------------------
| 0.302 | 0.001 | 0.879 | 0.328 | 
------------------------------
| 0.138 | 0.001 | 0.001 | 0.000 | 
------------------------------


## <h2><font color=indigo> Agent Implementation
Implement your q-learning (off-policy TD) agent here. You need to utilize the step function provided in the Environment class to interact with frozen lake environment.

In [5]:
import sys
import itertools
import random

class Q_Learning:
    def __init__(self, id, environment, discount , learning_rate = 0.1 , epsilon = 0.1 ,episodes=10000):

        self.environment = environment
        self.discount = discount
        self.episodes = episodes
        self.learning_rate = learning_rate
        self.environment = environment
        self.epsilon = epsilon
        self.n_actions = 4
    
    def qLearning(self):
        # List of rewards
        rewards = []
        max_epsilon = 1.0             # Exploration probability at start
        min_epsilon = 0.01            # Minimum exploration probability 
        decay_rate = 0.005            # Epsilon decay rate
        
        # create q table 16 in 4
        qtable = np.zeros((len(environment.environment_states()), self.n_actions))


        # For life or until learning is stopped
        for episode in range(self.episodes):
            # Reset the environment
            state = 0
            step = 0
            done = False
            total_rewards = 0
            max_step = 300

            for step in range(max_step):
                # Choose an action a in the current world state (s)
                # First we randomize a number
                exp_tradeoff = random.uniform(0, 1)

                # If this number > greater than epsilon --> exploitation (taking the biggest Q value for this state)
                if exp_tradeoff > self.epsilon:
                    action = np.argmax(qtable[state,:])
                  
                # Else doing a random choice for action(0 to 3) --> exploration
                else:
                    action = random.randint(0, self.n_actions-1)
           

                # Take the action (a) and observe the outcome state(s') and reward (r)
                new_state, reward, done, info = self.environment.step(action)
                  
                    
                # convert state(i,j) to state k
                # k is 0 to 15
                k = -1   
                while k < 16:
                    for i in range(4):
                        for j in range(4):
                            k = k + 1
                            if new_state == (i,j):
                                new_state = k               
                    
                    
                # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
                # qtable[new_state,:] : all the actions we can take from new state
                qtable[state, action] = qtable[state, action] + self.learning_rate * (reward + self.discount * np.max(qtable[new_state, :]) - qtable[state, action])

                total_rewards += reward

                # Our new state is state
                state = new_state

                # If done (if we're dead) : finish episode
                if done == True: 
                    break

            # Reduce epsilon (because we need less and less exploration)
            epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode) 
            rewards.append(total_rewards)

            
        print("Q table is:\n\n", qtable)
        print ("\nScore over time: " +  str(sum(rewards)/self.episodes))
        
        
        policy = []
        for state in range(16):  
            action = np.argmax(qtable[state,:])
            if action == 0:
                policy.append('LEFT')
            if action == 1:
                policy.append('DOWN')
            if action == 2:
                policy.append('RIGHT')
            if action == 3:
                policy.append('UP')
                

        return qtable,policy

## <h2><font color=indigo> Q Values
Return the Q values that your agent learns in here:

In [6]:
agent = Q_Learning('zahrasadat sajjadi', environment, 0.9 , learning_rate = 0.5 , epsilon = 0.1 ,episodes= 10000)
Q , policy = agent.qLearning()

Q table is:

 [[ 37.63600323  49.50180441  30.2027829   37.1977268 ]
 [ 23.34894751  10.91455897  -1.6918125   -1.61965082]
 [  5.65388265  -8.87        -5.035625    -1.8549375 ]
 [-13.39803125 -14.2875     -17.62142748 -17.27408109]
 [  1.67394865  -1.28573195  16.68799591  10.00324472]
 [ -1.34792828  37.34677353  14.52889747  -1.4825    ]
 [ 28.10131613  -8.          -8.25        -0.975     ]
 [ 16.40130057  37.91685756  -8.          -4.6664375 ]
 [ -8.          -1.0875      -1.15962969   8.94468287]
 [ -1.70323321  45.05354138  17.30236464  14.66798277]
 [ 32.65298904  47.77168971   0.           0.        ]
 [ 28.70350642  38.32810555  32.78340658  15.95327612]
 [ -0.975       -8.          24.75022394  -0.93204172]
 [ -1.38125     16.86419735  51.92113822  35.23543774]
 [ 40.32527456  47.02678837  55.28093077  27.99798611]
 [  0.           0.           0.           0.        ]]

Score over time: 55.6269


## <h2><font color=darkcyan> Policy
Return the optimal policy that your agent learns in here:

In [7]:
policy

['DOWN',
 'LEFT',
 'LEFT',
 'LEFT',
 'RIGHT',
 'DOWN',
 'LEFT',
 'DOWN',
 'UP',
 'DOWN',
 'DOWN',
 'DOWN',
 'RIGHT',
 'RIGHT',
 'RIGHT',
 'LEFT']

In [8]:
# policy in table shape
for i in range(len(policy)):
    if i % 4!=0:
        print(policy[i],'',end = '')
        
    else:
        print('\n',policy[i],'',end = '')
        


 DOWN LEFT LEFT LEFT 
 RIGHT DOWN LEFT DOWN 
 UP DOWN DOWN DOWN 
 RIGHT RIGHT RIGHT LEFT 