In [8]:
import numpy as np


class MDP():
    def __init__(self):
        # Discount factor
        self.γ = 0.95
        self.A = [0, 1]
        self.S = [0, 1, 2]

        # Transition matrix if dont' invest
        P0 = np.array([[1, 0, 0],
                       [0.1, .75, 0.15],
                       [0.05, .1, 0.85]])

        R0 = np.array([0, 1, 2])

        # Transition matrix if invest
        P1 = np.array([[1, 0, 0],
                       [0.05, .75, 0.2],
                       [0.02, .06, 0.92]])
        R1 = np.array([0, 0.5, 1.5])

        self.P = [P0, P1]
        self.R = [R0, R1]

    def step(self, s, a):
        s_prime = np.random.choice(len(self.S), p=self.P[a][s])
        R = self.R[a][s]
        if s_prime == 0:
            done = True
        else:
            done = False
        return s_prime, R, done

    def simulate(self, s, a, π):
        done = False
        t = 0
        history = []
        while not done:
            if t > 0:
                a = π[s]
            s_prime, R, done = self.step(s, a)
            history.append((s, a, R))
            s = s_prime
            t += 1

        return history

In [9]:
# Markov decision process (mathematical formalism of sequential decision making)
mdp = MDP()
s = 1 #initial state
a = 0 #initial action
π = [0, 0, 0] #policy π
# output of the simulation / history of what happened
H = mdp.simulate(s, a, π)
H  # columns are State, Action, Reward

[(1, 0, 1), (1, 0, 1), (2, 0, 2), (2, 0, 2)]

In [14]:
H

[(1, 0, 1), (1, 0, 1), (2, 0, 2), (2, 0, 2)]

In [17]:
mdp = MDP()

# Estimating vπ
S = np.zeros(3)
N = np.zeros(3)
vπ = np.zeros(3)
π = [0, 1, 1]


def update(S, N):
    s = 2
    H = mdp.simulate(s, π[s], π)
    T = len(H)
    G = 0
    for t in np.arange(T - 1, -1, -1):
        s, a, R = H[t]
        G = mdp.γ * G + R
        S[s] += G #calculating the sum of G
        N[s] += 1 #count
        # vπ[s] = vπ[s] + 1 / N[s] * (G - vπ[s])

    return S, N

In [16]:
for episodes in range(10000):
    S, N = update(S, N)

vπ = S / N
print(vπ)

[        nan 13.69525648 18.06026759]


  vπ = S / N
