In [None]:
import numpy as np


class MDP:
  """Three-state, two-action Markov decision process with a terminal state.

  State 0 is absorbing under both actions (its transition rows are
  ``[1, 0, 0]``) and reaching it ends an episode. Action 0 is "don't invest"
  and action 1 is "invest"; each action has its own transition matrix and
  per-state reward vector.
  """

  def __init__(self):
    # Discount factor
    self.γ = 0.95
    # Action indices: 0 = don't invest, 1 = invest
    self.A = [0, 1]
    # State indices; reaching state 0 terminates the episode
    self.S = [0, 1, 2]

    # Transition matrix if we don't invest (row s = distribution of next state)
    P0 = np.array([[1, 0, 0],
                   [0.1, .75, 0.15],
                   [0.05, .1, 0.85]])
    # Reward received in each state if we don't invest
    R0 = np.array([0, 1, 2])

    # Transition matrix if we invest
    P1 = np.array([[1, 0, 0],
                   [0.05, .75, 0.2],
                   [0.02, .06, 0.92]])
    # Reward received in each state if we invest
    R1 = np.array([0, 0.5, 1.5])

    # Both lists are indexed by action: self.P[a][s] and self.R[a][s]
    self.P = [P0, P1]
    self.R = [R0, R1]

  def step(self, s, a):
    """Sample one transition from state ``s`` under action ``a``.

    Args:
      s: Current state index.
      a: Action index.

    Returns:
      Tuple ``(s_prime, R, done)``: the sampled next state, the reward
      ``self.R[a][s]`` of the current state/action, and a bool that is True
      when the next state is the terminal state 0.
    """
    s_prime = np.random.choice(len(self.S), p=self.P[a][s])
    R = self.R[a][s]
    # bool(...) keeps `done` a plain Python bool rather than a numpy bool.
    done = bool(s_prime == 0)
    return s_prime, R, done

  def simulate(self, s, a, π, max_steps=None):
    """Roll out one episode and record the visited (state, action, reward).

    The first action is the given ``a``; every later action is sampled from
    the policy row ``π[s]`` of the current state.

    Args:
      s: Initial state index.
      a: Action taken at the first step.
      π: Policy indexed as ``π[s]``, giving a probability for each action
        in ``self.A``.
      max_steps: Optional cap on the number of steps. ``None`` (default)
        runs until the terminal state is reached.

    Returns:
      List of ``(s, a, R)`` tuples, one per step, in chronological order.
    """
    n_actions = len(self.A)
    history = []
    done = False
    t = 0
    while not done and (max_steps is None or t < max_steps):
      if t > 0:
        # Use the size of the action set instead of a hard-coded 2.
        a = np.random.choice(n_actions, p=π[s])
      s_prime, R, done = self.step(s, a)
      history.append((s, a, R))
      s = s_prime
      t += 1

    return history


In [None]:
# Tables indexed as [state, action] (3 states x 2 actions).
S = np.zeros((3, 2))  # not read by any code visible in this notebook
N = np.zeros_like(S)  # visit counts per (state, action)
Q = np.zeros_like(S)  # running mean of the returns per (state, action)

mdp = MDP()

# Initial policy: both actions equally likely in every state.
π = np.full((3, 2), 0.5)


def update(π, ε=0.01):
  """Run one Monte Carlo control episode and improve the policy in place.

  An episode is simulated on the global ``mdp`` starting from state 1, with
  the first action drawn from ``π[1]``. Walking the episode backwards, every
  visited (state, action) pair updates the global visit count ``N`` and
  running-mean value ``Q`` with the discounted return, and the row of ``π``
  for that state is reset to probability ``1 - ε`` on the greedy action and
  ``ε`` on the other one.

  Args:
    π: Policy array indexed ``[state, action]``; modified in place.
    ε: Probability assigned to the non-greedy action.

  Returns:
    The same ``π`` object that was passed in.
  """
  start_state = 1
  first_action = np.random.choice(2, p=π[start_state])
  episode = mdp.simulate(start_state, first_action, π)

  G = 0
  # Iterate from the last step to the first so G accumulates the return.
  for s, a, R in reversed(episode):
    G = mdp.γ * G + R
    N[s, a] += 1
    # Incremental mean of the returns observed for (s, a).
    Q[s, a] += 1 / N[s, a] * (G - Q[s, a])

    # Policy improvement
    greedy = np.argmax(Q[s])
    for action in range(2):
      π[s, action] = 1 - ε if action == greedy else ε
  return π


# Restart from the uniform policy (N and Q are not reset here).
π = np.full((3, 2), 0.5)

# 10,000 episodes with the default ε of update().
for _ in range(10_000):
  π = update(π)

print(π)


[[0.5  0.5 ]
 [0.99 0.01]
 [0.01 0.99]]


In [None]:
# Keep training from the current policy with a much smaller ε.
for _ in range(1_000):
  π = update(π, ε=1e-5)
print(π)

[[5.0000e-01 5.0000e-01]
 [1.0000e-05 9.9999e-01]
 [1.0000e-05 9.9999e-01]]


In [None]:
# Display the current policy; row s holds the action probabilities for state s.
# NOTE(review): the recorded output shows 0.01/0.99 entries although the
# preceding cell ran update() with ε=0.00001 — looks like out-of-order
# execution; confirm with Restart & Run All.
π

array([[0.5 , 0.5 ],
       [0.01, 0.99],
       [0.01, 0.99]])