### Let us consider a simple example of a Markov decision process (MDP)

<img src='./mdp_example.png' width=500px>

In [1]:
# State space of the example MDP.
S = ['s0', 's1', 's2']

# Actions the agent can take.
A = ['a0', 'a1']

# Dense transition model: one matrix per action.
# Row i holds the probabilities of moving from S[i] to S[0], S[1], S[2].
P = {
    'a0': [
        [0.5, 0, 0.5],      # from s0
        [0.7, 0.1, 0.2],    # from s1
        [0.4, 0.6, 0],      # from s2
    ],
    'a1': [
        [0, 0, 1],          # from s0
        [0, 0.95, 0.05],    # from s1
        [0.3, 0.3, 0.4],    # from s2
    ],
}

# Static rewards, keyed as '<state>_<action>_<next state>'.
# Any transition that is not listed here yields a reward of zero.
R = {
    's1_a0_s0': 5,
    's2_a1_s0': -1,
}


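### Usually each state is connected to only a few other states, so the transitions are sparse. <br> The above model can therefore be represented more compactly as a nested dict (the dict version below also adds a terminal state `T`, reachable from `s2` via `a1`, which is why its `s2`/`a1` probabilities differ from the matrix above)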

In [2]:
# Sparse transition model: state -> action -> {next state: probability}.
# Only transitions with a non-zero probability are listed.
transition_probs = {
    's0': {
        'a0': {'s0': 0.5, 's2': 0.5},
        'a1': {'s2': 1},
    },
    's1': {
        'a0': {'s0': 0.7, 's1': 0.1, 's2': 0.2},
        'a1': {'s1': 0.95, 's2': 0.05},
    },
    's2': {
        'a0': {'s0': 0.4, 's1': 0.6},
        'a1': {'s0': 0.3, 's1': 0.3, 's2': 0.1, 'T': 0.3},
    },
    # 'T' has no outgoing actions.
    'T': {},
}

# Sparse rewards: state -> action -> {next state: reward}.
# Transitions that are not listed carry no explicit reward entry.
rewards = {
    's1': {
        'a0': {'s0': 5},
    },
    's2': {
        'a1': {'s0': -1},
    },
}

In [3]:
from mdp import MDP

# Build the environment from the sparse transition and reward dicts defined above.
# MDP is imported from the local `mdp` module, whose implementation is not shown here.
mdp = MDP(transition_probs, rewards)

In [4]:
import pandas as pd
# Show complete cell contents when a DataFrame of episodes is displayed.
# `None` means "no limit"; the older `-1` sentinel has been deprecated since
# pandas 1.0 and is no longer accepted by pandas 2.x.
pd.set_option('display.max_colwidth', None)

def get_return(episode, t=0, gamma=1):
    """Return the discounted sum of rewards of `episode` from step `t` onwards.

    Computes ``r_t + gamma * r_{t+1} + gamma**2 * r_{t+2} + ...`` where
    ``r_k`` is ``episode[k][2]``.

    Parameters
    ----------
    episode : sequence
        Indexable sequence of steps; element 2 of each step is read as the
        reward.
    t : int, default 0
        Index of the first step to include.
    gamma : number, default 1
        Discount factor applied once per step.

    Returns
    -------
    number
        The discounted return, or ``0`` when ``t`` is at or past the end of
        the episode (including an empty episode).
    """
    total = 0
    # Accumulate from the last step back to `t`. This performs exactly the
    # same arithmetic as the recursive definition G_t = r_t + gamma * G_{t+1},
    # but without one Python stack frame per step, so long episodes cannot
    # hit the interpreter's recursion limit.
    for step in range(len(episode) - 1, t - 1, -1):
        total = episode[step][2] + gamma * total
    return total
    
def generate_episodes(count=10, start_state=None):
    """Sample `count` episodes from the global `mdp` and summarise them.

    Each episode is obtained from ``mdp.generate_episodes(start_state=...)``.
    The result is a DataFrame with one row per episode and the columns:

    * ``0`` - the raw episode as returned by the MDP
    * ``length`` - number of steps in the episode
    * ``state_t_zero`` - first element of the first step
      (presumably the state at t=0; verify against the MDP class)
    * ``return_t_zero`` - ``get_return`` of the episode with its default
      arguments
    """
    sampled = []
    for _ in range(count):
        sampled.append(mdp.generate_episodes(start_state=start_state))

    frame = pd.Series(sampled).to_frame()
    trajectories = frame[0]
    frame['length'] = trajectories.apply(len)
    frame['state_t_zero'] = trajectories.apply(lambda steps: steps[0][0])
    frame['return_t_zero'] = trajectories.apply(get_return)
    return frame

In [5]:
# Sample 10 episodes (the default `count`) without fixing a start state,
# then display one row per episode with its length, first state and return.
episodes_df = generate_episodes()
episodes_df

Unnamed: 0,0,length,state_t_zero,return_t_zero
0,"[(s0, a0, 0.0, s2), (s2, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a1, -1, s0), (s0, a1, 0.0, s2), (s2, a1, -1, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a1, -1, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s2), (s2, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a1, -1, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 0.0, s2), (s2, a1, -1, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 0.0, s2), (s2, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 0.0, s2), (s2, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s2), ...]",101,s0,60.0
1,"[(s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, T)]",21,s0,15.0
2,"[(s0, a1, 0.0, s2), (s2, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s2), (s2, a0, 0.0, s1), (s1, a1, 0.0, s2), (s2, a1, -1, s0), (s0, a0, 0.0, s2), (s2, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 0.0, s2), (s2, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s0), (s0, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a1, 0.0, T)]",27,s0,4.0
3,"[(s1, a0, 5, s0), (s0, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a1, -1, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, T)]",11,s1,9.0
4,"[(s1, a0, 0.0, s2), (s2, a1, -1, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a1, -1, s0), (s0, a0, 0.0, s2), (s2, a0, 0.0, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, T)]",21,s1,8.0
5,"[(s2, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 0.0, s2), (s2, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a1, -1, s0), (s0, a0, 0.0, s2), (s2, a1, -1, s0), (s0, a0, 0.0, s2), (s2, a1, 0.0, T)]",12,s2,3.0
6,"[(s0, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a1, 0.0, T)]",7,s0,5.0
7,"[(s0, a1, 0.0, s2), (s2, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, T)]",5,s0,5.0
8,"[(s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, T)]",7,s1,5.0
9,"[(s1, a0, 5, s0), (s0, a0, 0.0, s2), (s2, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, T)]",5,s1,5.0


In [7]:
# Sample 10 episodes with the start state fixed to 's2' and display them.
episodes_df = generate_episodes(count=10,start_state='s2')
episodes_df

Unnamed: 0,0,length,state_t_zero,return_t_zero
0,"[(s2, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a1, -1, s0), (s0, a0, 0.0, s2), (s2, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s2), (s2, a1, 0.0, T)]",19,s2,9.0
1,"[(s2, a1, 0.0, T)]",1,s2,0.0
2,"[(s2, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a1, 0.0, s2), (s2, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 0.0, s2), (s2, a0, 0.0, s0), (s0, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a1, -1, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a1, 0.0, s1), (s1, a0, 0.0, s2), (s2, a0, 0.0, s0), (s0, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, s1), (s1, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, T)]",52,s2,24.0
3,"[(s2, a0, 0.0, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s2), (s2, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s2), (s2, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, s1), (s1, a0, 0.0, s2), (s2, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, T)]",25,s2,20.0
4,"[(s2, a0, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a1, -1, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 0.0, s2), (s2, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a0, 0.0, s0), (s0, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 0.0, s2), (s2, a1, -1, s0), (s0, a0, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s2), (s2, a1, 0.0, T)]",46,s2,28.0
5,"[(s2, a1, -1, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, s2), (s2, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 0.0, s2), (s2, a1, -1, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a1, -1, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, T)]",33,s2,12.0
6,"[(s2, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, T)]",10,s2,5.0
7,"[(s2, a1, -1, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 0.0, s2), (s2, a1, -1, s0), (s0, a0, 0.0, s2), (s2, a1, 0.0, T)]",7,s2,-2.0
8,"[(s2, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, T)]",8,s2,5.0
9,"[(s2, a1, 0.0, s1), (s1, a0, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a1, -1, s0), (s0, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a1, -1, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a1, -1, s0), (s0, a1, 0.0, s2), (s2, a1, -1, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 0.0, s2), (s2, a1, -1, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 0.0, s2), (s2, a1, 0.0, s2), (s2, a0, 0.0, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s0), (s0, a0, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a0, 0.0, s1), (s1, a0, 5, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, s1), (s1, a1, 0.0, s1), (s1, a0, 5, s0), (s0, a0, 0.0, s0), (s0, a1, 0.0, s2), (s2, a1, 0.0, T)]",100,s2,60.0


In [19]:
# Monte-Carlo estimate for state 's2': average the return from t=0 over
# 10000 sampled episodes that all start in 's2'. The result is therefore
# named after 's2', the state the episodes actually start from.
episodes_df = generate_episodes(count=10000,start_state='s2')
value_s2 = episodes_df.return_t_zero.mean()
value_s2

9.9692