In [22]:
import gym
import random
import numpy as np

from gridworld.environment import GridWorldEnv
from gridworld.utils import plotVTableInGrid, plotQTableInGrid
from gridworld.utils import VTableVisualizer, QTableVisualizer
from collections import defaultdict
from tqdm import tqdm

In [23]:
# State identifiers of the MDP.
STATES = list( range( 5 ) )

# Transition model: P[state][action] is a list of possible outcomes, each a
# tuple (transition probability, next state, reward, flag).
# NOTE(review): the fourth field is ignored by the backups in this notebook;
# it is presumably a "terminal" marker (only state 4's self-loop sets it) - confirm.
P = {
    0 : {
        'facebook' : [ (1.0, 2, -1.0, False) ],
        'study' : [ (1.0, 1, -2.0, False) ],
    },
    1 : {
        'study' : [ (1.0, 3, -2.0, False) ],
        'sleep' : [ (1.0, 4, 0.0, False) ],
    },
    2 : {
        'facebook' : [ (1.0, 2, -1.0, False) ],
        'quit' : [ (1.0, 0, 0.0, False) ],
    },
    3 : {
        'study' : [ (1.0, 4, 10.0, False) ],
        # the only stochastic action: three possible next states
        'pub' : [
            (0.2, 0, 1.0, False),
            (0.4, 1, 1.0, False),
            (0.4, 3, 1.0, False),
        ],
    },
    4 : {
        'end' : [ (1.0, 4, 0.0, True) ],
    },
}

In [24]:
# policy evaluation: how good is my policy???

def policy_random( s, a ) :
    """Uniform-random policy: probability of picking action `a` in state `s`.

    Every action listed in P[s] receives the same probability, so `a` itself
    is never inspected; it is accepted only so that all policy functions
    share the (s, a) signature.
    """
    n_actions = len( P[s] )
    return 1. / n_actions

def policy_responsible( s, a ) :
    """Deterministic policy that prefers 'study', then 'quit'.

    Returns the probability of taking action `a` in state `s`: 1.0 for the
    first preferred action available in P[s] and 0.0 for every other action.
    States offering neither preferred action fall back to policy_random.
    """
    available = P[s]
    # order matters: 'study' takes precedence over 'quit'
    for preferred in ( 'study', 'quit' ) :
        if preferred in available :
            if a == preferred :
                return 1.0
            return 0.0
    return policy_random( s, a )

# Policy to evaluate; swap in the commented line to evaluate the random one.
policy_fn = policy_responsible
## policy_fn = policy_random

N_iters = 100   # number of full sweeps over STATES
GAMMA = 1.0     # discount factor applied to the next-state value
V_pi = defaultdict( float )   # state -> value estimate; unseen states read as 0.0

for i_iter in tqdm( range( N_iters ), desc = 'Backing-up...' ) :
    # snapshot of the previous sweep, so every backup in this sweep reads
    # only old values (synchronous, not in-place, updates)
    V_old = V_pi.copy()
    for s in STATES :
        # expected one-step return from s under policy_fn
        backup = 0.0
        for a, outcomes in P[s].items() :
            a_prob = policy_fn( s, a )
            for t_prob, snext, reward, _ in outcomes :
                backup += a_prob * t_prob * ( reward + GAMMA * V_old[snext] )
        V_pi[s] = backup

print( dict(V_pi) )
                

Solving...: 100%|██████████| 100/100 [00:00<00:00, 45358.54it/s]

{0: 6.0, 1: 8.0, 2: 6.0, 3: 10.0, 4: 0.0}





In [32]:
# policy iteration: what is the optimal policy???
#
# Alternates two steps, starting from the uniformly random policy:
#   1. policy evaluation  - iterate Bellman expectation backups to estimate V_pi
#   2. policy improvement - make pi greedy with respect to Q(s, a) derived from V_pi
# N_iters and GAMMA are the module-level values set in the policy-evaluation cell.

# pi[s][a] is the probability of taking action a in state s
pi = { 0 : { 'facebook' : 0.5, 'study' : 0.5 },
       1 : { 'study' : 0.5, 'sleep' : 0.5 },
       2 : { 'quit' : 0.5, 'facebook' : 0.5 },
       3 : { 'pub' : 0.5, 'study' : 0.5 }, 
       4 : { 'end' : 1.0 } }

V_pi = defaultdict( lambda : 0.0 )
N_iter_improv = 10

for _ in tqdm( range( N_iter_improv ), desc = 'Improving...' ) :
    # policy-evaluation (warm-started from the previous round's V_pi)
    for i_iter in tqdm( range( N_iters ), desc = 'Backing-up...', leave = False ) : 
        # keep a copy for the bellman backups (no in-place updates)
        V_old = V_pi.copy()
        # do a bellman backup for every state
        for s in STATES :
            V_pi[s] = 0.0 # clear the state-value to be accumulated in the table
            actions = list( P[s].keys() )
            for a in actions :
                a_prob = pi[s][a]
                for t_prob, snext, reward, _ in P[s][a] :
                    V_pi[s] += a_prob * t_prob * ( reward + GAMMA * V_old[snext] )
    
    # policy-improvement
    for s in STATES :
        actions = list( P[s].keys() )
        if not actions :
            continue # nothing to choose from in this state
        # Q(s, a) = sum over outcomes of p * ( r + GAMMA * V(s') ).
        # It must NOT be weighted by pi[s][a]: once the policy is greedy that
        # weight is 0 for every non-selected action, which would hide their
        # true value and prevent the policy from ever switching to them.
        Qs = { a : 0.0 for a in actions }
        for a in actions :
            for t_prob, snext, reward, _ in P[s][a] :
                Qs[a] += t_prob * ( reward + GAMMA * V_pi[snext] )

        # argmax over action VALUES; a bare max(Qs) would compare the action
        # names (dict keys) alphabetically. Ties go to the first action listed.
        best_action = max( Qs, key = Qs.get )
        for a in actions :
            pi[s][a] = 1.0 if a == best_action else 0.

print( dict(V_pi) )
print( pi )

Improving...:   0%|          | 0/10 [00:00<?, ?it/s]
Backing-up...:   0%|          | 0/100 [00:00<?, ?it/s][A
                                                      [A
Backing-up...:   0%|          | 0/100 [00:00<?, ?it/s][A
                                                      [A
Backing-up...:   0%|          | 0/100 [00:00<?, ?it/s][A
                                                      [A
Backing-up...:   0%|          | 0/100 [00:00<?, ?it/s][A
                                                      [A
Backing-up...:   0%|          | 0/100 [00:00<?, ?it/s][A
                                                      [A
Backing-up...:   0%|          | 0/100 [00:00<?, ?it/s][A
                                                      [A
Backing-up...:   0%|          | 0/100 [00:00<?, ?it/s][A
                                                      [A
Backing-up...:   0%|          | 0/100 [00:00<?, ?it/s][A
                                                      [A
Backing-up...:   0%

{0: 6.0, 1: 8.0, 2: 6.0, 3: 10.0, 4: 0.0}
{0: {'facebook': 0.0, 'study': 1.0}, 1: {'study': 1.0, 'sleep': 0.0}, 2: {'quit': 1.0, 'facebook': 0.0}, 3: {'pub': 0.0, 'study': 1.0}, 4: {'end': 1.0}}





In [3]:
# value iteration: what is the optimal value function???