In [None]:
import numpy as np


class MDP:
  """Three-state, two-action Markov decision process.

  States are indexed 0, 1, 2 and actions 0, 1. Under both actions the
  transition row of state 0 is [1, 0, 0], so state 0 is absorbing; `step`
  reports an episode as finished as soon as the next state is 0.

  Attributes:
    γ: discount factor.
    A: list of action indices.
    S: list of state indices.
    P: P[a] is the transition matrix for action a; row s is the
      distribution over next states.
    R: R[a][s] is the reward for taking action a in state s.
  """

  def __init__(self):
    self.γ = 0.95  # discount factor
    self.A = [0, 1]
    self.S = [0, 1, 2]

    # Action 0: transition matrix if we don't invest.
    no_invest_P = np.array([[1, 0, 0],
                            [0.1, .75, 0.15],
                            [0.05, .1, 0.85]])

    # Action 1: transition matrix if we invest.
    invest_P = np.array([[1, 0, 0],
                         [0.05, .75, 0.2],
                         [0.02, .06, 0.92]])

    self.P = [no_invest_P, invest_P]
    # Per-state rewards, listed in the same action order as self.P.
    self.R = [np.array([0, 1, 2]), np.array([0, 0.5, 1.5])]

  def step(self, s, a):
    """Sample one transition from state `s` under action `a`.

    Returns:
      (next_state, reward, done): the sampled next state, the reward
      R[a][s] of the current state/action pair, and a bool that is True
      exactly when the next state is 0.
    """
    next_state = np.random.choice(len(self.S), p=self.P[a][s])
    reward = self.R[a][s]
    # Reaching state 0 is what ends an episode.
    finished = bool(next_state == 0)
    return next_state, reward, finished

  def simulate(self, s, a, π):
    """Roll out one episode starting from state `s` with first action `a`.

    Only the first action is taken from `a`; every later action is looked
    up as π[state]. The rollout stops once `step` reports done.

    Returns:
      A list of (state, action, reward) tuples, one per step taken. The
      state that ends the episode is not recorded as its own entry.
    """
    trajectory = []
    action = a
    while True:
      next_state, reward, finished = self.step(s, action)
      trajectory.append((s, action, reward))
      if finished:
        return trajectory
      # From the second step onwards the policy picks the action.
      s = next_state
      action = π[s]

In [None]:
mdp = MDP()

# TD(0) estimate of vπ, the state values under the fixed policy π.
S = np.zeros(3)
vπ = np.zeros(3)  # one value estimate per state, all starting at 0
π = [0, 0, 0]  # the policy picks action 0 in every state

α = 0.005  # step size of each TD update

s = 1  # every episode begins in state 1
for iteration in range(200000):
  s_prime, R, done = mdp.step(s, π[s])

  # TD error: one-step bootstrapped target minus the current estimate.
  TD_target = R + mdp.γ * vπ[s_prime]
  δ = TD_target - vπ[s]

  # Move the estimate a fraction α of the way towards the target.
  vπ[s] += α * δ

  # Restart from state 1 once the step reports the episode as done.
  s = 1 if done else s_prime

print(vπ)


[ 0.         11.37563946 15.80011157]
