In [1]:
import sys
sys.path.append("./python") 
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import Environment

## Agent Environment Loops for SuttonSimplest and CliffWalking

#### Sutton Simplest MDP environment
States = {A,B} cells ids <br>
Actions = {1 in pyhton 0,  2 in python 1} <br>
<img style="float: left;" src="imgs/SuttonSimple.png" width="320">

In [2]:
env = Environment.Environment("SuttonSimplest")
print("Initial State: %d"%env.reset())

isteps, sum_reward, done = 0, 0, False
observation = env.reset()
while not done and isteps < 10:
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    print("New State: " + str(observation) + "      Reward: " + str(reward))
    sum_reward += reward
    isteps += 1
print("Total obtained reward was: ", sum_reward)
env.close()

Initial State: 0
New State: 0      Reward: 10
New State: 0      Reward: -10
New State: 0      Reward: -10
New State: 0      Reward: 10
New State: 1      Reward: -10
New State: 0      Reward: 20
New State: 1      Reward: -10
New State: 1      Reward: 20
New State: 0      Reward: 20
New State: 1      Reward: -10
Total obtained reward was:  30


#### Cliff Environment from the Sutton and Barto Book:  cliff_walking.py file
States = {0,...,33} cells ids <br>
Actions = {UP 0,RIGHT 1,DOWN 2,LEFT 3} <br>
<img style="float: left;" src="imgs/cliff_env.png" width="320">

In [3]:
env = Environment.Environment("CliffWalking")
print("Initial State: %d"%env.reset())

sum_reward, done = 0, False
observation = env.reset()
while not done:
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    print("New State: " + str(observation) + "      Reward: " + str(reward))
    sum_reward += reward
    
print("Total obtained reward was: ", sum_reward)
env.close()

Initial State: 36
New State: 37      Reward: -100.0
Total obtained reward was:  -100.0


## TD policy evaluation - Value function V

In [4]:
class V_Class():  
    """ Class to store the state Value function
        V(s) = expected future discounted reward from s onwards (the return Gt)
        Stores it as a dictionnary and adds states as encounteded (get method)
        Two methods: get and set
    """
    def __init__(self):       
        self.f = {}
        
    def get(self, s):        
        if(s not in self.f):
            self.f[s] = 0             
        return self.f[s]
    
    def set(self, s, y):
        self.f[s] = y  

In [5]:
env = Environment.Environment("CliffWalking")
V = V_Class()

iepisode = 0
while iepisode < 1000:
    obs = env.reset()
    done = False
    while not done:
        action = env.action_space.sample()
        new_obs, reward, done, info = env.step(action)
        new_Vs = V.get(obs) + 0.3*(reward + 0.9*V.get(new_obs) - V.get(obs)) 
        V.set(obs,new_Vs)
        obs = new_obs
        
    iepisode += 1

print("")
print("State Value Function evaluation for Random Policy")

for key, value in sorted(V.f.items()):
    print(key, round(value,2))


State Value Function evaluation for Random Policy
0 -22.55
1 -24.28
2 -25.1
3 -23.86
4 -24.79
5 -23.47
6 -22.02
7 -26.36
8 -19.14
9 -10.58
10 -5.37
11 -1.83
12 -25.03
13 -36.7
14 -36.35
15 -50.21
16 -32.67
17 -31.31
18 -30.67
19 -48.81
20 -20.64
21 -14.88
22 -13.52
23 -4.04
24 -51.58
25 -60.79
26 -68.98
27 -76.73
28 -55.45
29 -51.78
30 -70.05
31 -73.55
32 -65.45
33 -30.43
34 -47.26
35 -15.59
36 -76.24
37 0
38 0
39 0
40 0
41 0
42 0
43 0
44 0
45 0
46 0
47 0


## TD policy evaluation (state, action) Value function Q

In [6]:
class Q_Class():  
    """ Class to store the state action Value function Q(s,a)
        Q(s,a) = expected future discounted reward from s taking a onwards 
        Stores it as a dictionnary and adds states as encounteded with all its actions
        Two methods: get and set
    """
    
    def __init__(self, env):       
        self.nactions = env.action_space.n
        self.f = {}
        
    def get(self, s, a=None):        
        if(s not in self.f):
            self.f[s] = [0 for _ in range(self.nactions)]  
        return self.f[s] if a is None else self.f[s][a]
    
    def set(self, s, a, y):
        self.f[s][a] = y  

In [7]:
env = Environment.Environment("CliffWalking")
Q = Q_Class(env)

iepisode = 0
while iepisode < 1000:
    s = env.reset()
    done = False
    a = env.action_space.sample()
    while not done:        
        s_new, reward, done, info = env.step(a)

        a_new = env.action_space.sample()
        #print(s_new, a_new, Q.get(s_new,a_new))

        new_Qa = Q.get(s,a) + 0.3*(reward + 0.9*Q.get(s_new,a_new) - Q.get(s,a))         
        Q.set(s,a,new_Qa)
        
        s,a = s_new,a_new
        
    iepisode += 1

print("")
print("State Value Function evaluation for Random Policy")

for key, value in sorted(Q.f.items()):
    print(key, [round(v,2) for v in value])


State Value Function evaluation for Random Policy
0 [-26.7, -24.11, -36.01, -24.81]
1 [-26.71, -23.95, -32.11, -26.43]
2 [-25.3, -31.71, -29.26, -23.63]
3 [-29.99, -18.41, -53.76, -23.69]
4 [-14.31, -13.84, -23.25, -22.41]
5 [-10.2, -6.4, -16.04, -17.82]
6 [-8.42, -2.13, -11.27, -10.17]
7 [-2.73, -1.35, -11.18, -3.09]
8 [-0.94, -1.24, -1.55, -1.59]
9 [-0.95, -1.84, -0.94, -0.41]
10 [-0.3, -0.51, -3.74, -0.54]
11 [-0.3, 0, -0.46, -0.44]
12 [-25.82, -42.79, -45.97, -33.6]
13 [-25.54, -35.19, -49.58, -33.66]
14 [-23.88, -39.63, -44.26, -29.6]
15 [-19.42, -31.4, -76.65, -31.54]
16 [-18.57, -22.82, -40.8, -27.92]
17 [-11.73, -18.26, -43.54, -28.41]
18 [-6.78, -7.88, -57.38, -13.27]
19 [-0.75, -1.92, -20.13, -21.46]
20 [-1.12, -2.21, -0.65, -1.42]
21 [-1.43, -0.72, -17.06, -1.49]
22 [-0.97, -0.65, -20.2, -1.67]
23 [0, -0.74, 0, -1.16]
24 [-31.17, -37.1, -80.33, -33.05]
25 [-30.36, -58.43, -100.0, -36.83]
26 [-32.08, -56.11, -100.0, -54.06]
27 [-38.4, -42.69, -100.0, -62.42]
28 [-26.16, -64.

## SARSA Optimal Policy through epsilon - Greedy Policy 

In [8]:
class egreedy_Class():
    """ Class for the epsilon greedy policy
        Gets at initialization:
        -environment to store the number of actions in initialization 
        -The Q function
        
        Has method act: 
        -gets state and epsilon as input: acts randomly 
        -
    """
    
    def __init__(self, env, Q):       
        self.nactions = env.action_space.n
        self.Q = Q
    
    def act(self, s, epsilon):                
        if np.random.uniform(0, 1) < epsilon:
            return env.action_space.sample()
        else:
            q_values = self.Q.get(s)
            action = np.argmax(q_values)
            return action


In [9]:
env = Environment.Environment("CliffWalking")
Q = Q_Class(env)
policy = egreedy_Class(env,Q) 

iepisode, epsilon = 0,1
while iepisode < 1000:
    s = env.reset()
    done = False
    a = policy.act(s,epsilon)
    while not done:        
        s_new, reward, done, info = env.step(a)        
        a_new = policy.act(s_new,epsilon)
        new_Qa = Q.get(s,a) + 0.3*(reward + 0.9*Q.get(s_new,a_new) - Q.get(s,a))         
        Q.set(s,a,new_Qa)
        s,a = s_new,a_new
        
    iepisode += 1
    epsilon = max(0.1, epsilon - 1/1000.0)

print("")
print("State Value Function evaluation for Random Policy")

for key, value in sorted(Q.f.items()):
    print(key, [round(v,2) for v in value])


State Value Function evaluation for Random Policy
0 [-8.93, -8.37, -10.01, -9.21]
1 [-8.79, -7.97, -9.19, -9.09]
2 [-8.95, -7.64, -9.05, -9.07]
3 [-8.4, -7.32, -8.66, -8.51]
4 [-8.2, -7.06, -10.55, -8.38]
5 [-7.42, -6.83, -11.29, -7.99]
6 [-6.93, -6.37, -7.42, -7.96]
7 [-6.34, -5.89, -13.39, -7.63]
8 [-6.05, -5.01, -7.53, -6.74]
9 [-5.6, -4.29, -6.56, -6.58]
10 [-5.14, -3.5, -5.0, -5.9]
11 [-4.3, -4.07, -2.72, -5.55]
12 [-8.74, -9.17, -10.76, -10.24]
13 [-8.49, -10.77, -20.0, -10.89]
14 [-8.27, -9.89, -16.38, -11.06]
15 [-7.92, -10.26, -15.96, -11.41]
16 [-8.42, -11.97, -43.84, -11.25]
17 [-9.13, -11.75, -23.59, -11.84]
18 [-6.73, -12.11, -52.92, -12.97]
19 [-6.35, -12.24, -23.73, -12.66]
20 [-9.12, -5.74, -36.59, -15.76]
21 [-7.83, -4.09, -11.14, -11.43]
22 [-7.58, -3.24, -12.71, -8.55]
23 [-4.08, -3.09, -1.9, -4.44]
24 [-8.94, -15.6, -16.58, -9.83]
25 [-8.99, -27.02, -100.0, -13.05]
26 [-12.15, -15.22, -100.0, -29.78]
27 [-11.57, -36.37, -100.0, -16.92]
28 [-17.13, -21.31, -99.89, -

## Q-Learning - Off-policy through 1 step look ahead - max operation 

In [10]:
#Notes about Q learning.
#Q-learning means that the maximum future reward for an action is the immediate reward plus
#the maximum future reward for the next state.
#The Q table is initialized randomly, the agent then interacts with the environment.
#From interacting with the environment the agent will observe the reward that came from its action 
#and the transition that took place in the state.
#The agent will then compute the observed Q- value and update its own estimate of Q. 

env = Environment.Environment("CliffWalking")

#The Q table is initialized randomly.
Q = Q_Class(env)
policy = egreedy_Class(env,Q) 

#the agent then interacts with the environment
iepisode,epsilon = 0,1
while iepisode < 1000:
    s = env.reset()
    done = False
    
    #From interacting with the environment the agent will observe the reward that came from its action 
    #and the transition that took place in the state.
    while not done:        
        a = policy.act(s,epsilon)
        s_new, reward, done, info = env.step(a)

        #The agent will then compute the observed Q- value and update its own estimate of Q. 
        new_Qa = Q.get(s,a) + 0.3*(reward + 0.9*Q.get(s_new,a_new) - Q.get(s,a))         
        Q.set(s,a,new_Qa)
        
        s,a = s_new,a_new
        
    iepisode += 1
    epsilon = max(0.1, epsilon - 1/1000.0)

print("")
print("State Value Function evaluation for Random Policy")

for key, value in sorted(Q.f.items()):
    print(key, [round(v,2) for v in value])


State Value Function evaluation for Random Policy
0 [-9.54, -9.53, -9.55, -9.54]
1 [-9.49, -9.49, -9.51, -9.52]
2 [-9.46, -9.44, -9.44, -9.44]
3 [-9.39, -9.38, -9.4, -9.39]
4 [-9.31, -9.3, -9.31, -9.34]
5 [-9.23, -9.22, -9.24, -9.21]
6 [-9.15, -9.15, -9.15, -9.17]
7 [-9.07, -9.07, -9.09, -9.08]
8 [-9.04, -9.0, -9.02, -9.0]
9 [-8.92, -8.9, -8.91, -8.96]
10 [-8.85, -8.85, -8.87, -8.84]
11 [-8.81, -8.8, -8.81, -8.8]
12 [-9.55, -9.55, -9.57, -9.55]
13 [-9.52, -9.5, -9.51, -9.52]
14 [-9.45, -9.45, -9.45, -9.5]
15 [-9.41, -9.39, -9.41, -9.41]
16 [-9.32, -9.31, -9.32, -9.42]
17 [-9.23, -9.23, -9.24, -9.33]
18 [-9.16, -9.16, -9.2, -9.24]
19 [-9.08, -9.07, -9.1, -9.14]
20 [-9.0, -9.01, -9.02, -9.01]
21 [-8.91, -8.92, -8.93, -8.96]
22 [-8.84, -8.81, -8.83, -8.84]
23 [-8.74, -8.74, -8.73, -8.73]
24 [-9.58, -9.58, -9.6, -9.61]
25 [-9.53, -9.53, -100.0, -9.56]
26 [-9.48, -9.47, -100.0, -9.5]
27 [-9.43, -9.41, -100.0, -9.5]
28 [-9.36, -9.35, -100.0, -9.39]
29 [-9.28, -9.26, -100.0, -9.33]
30 [-9.18