In [4]:
import numpy as np
import random
# Seed the stdlib generator that random_pi() draws from, so a fresh
# top-to-bottom run samples the same action sequence every time.
random.seed(0)

In [11]:
class MDP:
    """Small deterministic MDP: a row of five cells with three exits.

    States 1-5 are linked left-to-right by 'w'/'e' moves; taking 's' from
    state 1, 3 or 5 leads to the terminal state 6, 7 or 8 respectively.
    Transitions and rewards are both keyed by the string '<state>_<action>'.

    Attributes:
        states: all state ids, 1 through 8.
        terminal_states: dict whose keys are the terminal state ids (the
            values are the placeholder 1; only key membership is tested).
        actions: the four action labels 'n', 'e', 's', 'w'.
        rewards: reward for each '<state>_<action>' pair that pays off;
            pairs not listed are worth 0.0.
        t: successor state for each '<state>_<action>' pair that moves the
            agent; pairs not listed leave the state unchanged.
        gamma: 0.8, presumably the discount factor; no method of this class
            reads it.
    """

    def __init__(self):
        self.states = list(range(1, 9))

        # Used as a set: only the keys matter.
        self.terminal_states = dict.fromkeys((6, 7, 8), 1)

        self.actions = ['n', 'e', 's', 'w']

        # Only the three 's' exits carry a reward.
        self.rewards = {
            '1_s': -1.0,
            '3_s': 1.0,
            '5_s': -1.0,
        }

        # Deterministic transition table.
        self.t = {
            '1_s': 6, '1_e': 2,
            '2_w': 1, '2_e': 3,
            '3_s': 7, '3_w': 2, '3_e': 4,
            '4_w': 3, '4_e': 5,
            '5_s': 8, '5_w': 4,
        }

        self.gamma = 0.8

    def transform(self, state, action):
        """Apply ``action`` in ``state`` and report the outcome.

        Args:
            state: current state id.
            action: action label; the pair is looked up as '<state>_<action>'.

        Returns:
            A tuple ``(is_terminal, next_state, reward)``. Calling this on a
            terminal state returns ``(True, state, 0)`` without moving.
        """
        # Terminal states absorb: no movement, no reward.
        if state in self.terminal_states:
            return True, state, 0

        lookup = '{}_{}'.format(state, action)

        # A pair missing from the table keeps the agent where it is.
        successor = self.t.get(lookup, state)
        # A pair missing from the reward table is worth nothing.
        payoff = self.rewards.get(lookup, 0.0)

        return successor in self.terminal_states, successor, payoff

In [12]:
def random_pi():
    """Uniform random policy: return one of 'n', 'w', 'e', 's'.

    Consumes exactly one ``random.random()`` draw per call.
    """
    moves = ('n', 'w', 'e', 's')
    # random.random() lies in [0, 1), so the scaled value truncates to a
    # valid index 0..len(moves)-1.
    return moves[int(random.random() * len(moves))]

In [64]:
def compute_random_pi_state_value(num=10000, discount=0.5):
    """Estimate state values under the random policy by Monte Carlo rollouts.

    For every non-terminal state of a fresh ``MDP``, ``num`` episodes are
    played with actions drawn from ``random_pi`` until a terminal state is
    reached. The discounted return of each episode is accumulated and the
    per-state average is printed as one list, ordered like ``mdp.states``.

    Args:
        num: Number of episodes sampled per start state; must be >= 1.
        discount: Factor the reward weight is multiplied by after each step.
            Defaults to 0.5, the value this experiment was written with; it
            is deliberately independent of ``MDP.gamma``.

    Raises:
        ValueError: If ``num`` is smaller than 1 (an average over zero
            episodes is undefined).
    """
    if num < 1:
        raise ValueError('num must be at least 1, got {}'.format(num))

    # transform() only reads the tables, so one instance serves every episode.
    mdp = MDP()
    start_states = [s for s in mdp.states if s not in mdp.terminal_states]
    totals = dict((s, 0.0) for s in start_states)

    for _ in range(num):
        for start in start_states:
            state = start
            is_terminal = False
            weight = 1.0          # discount ** (steps taken so far)
            episode_return = 0.0

            while not is_terminal:
                action = random_pi()
                is_terminal, state, reward = mdp.transform(state, action)
                episode_return += weight * reward
                weight *= discount

            totals[start] += episode_return

    # Divide by the explicit episode count rather than a leaked loop variable.
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print([totals[s] / num for s in start_states])

In [65]:
# Run the simulation; it prints the estimated values for states 1-5.
compute_random_pi_state_value()

[-0.3394371614446666, -0.01228487538436094, 0.28274287779097057, -0.008091038756461267, -0.3360884527952411]


![](http://www.algorithmdog.com/wp-content/uploads/2016/04/mdp-value.png)

![](http://www.algorithmdog.com/wp-content/uploads/2016/04/mdp.png)