In [5]:
import numpy as np


class MDP():
    """Three-state, two-action Markov decision process with an absorbing state.

    States are ``0, 1, 2`` and actions are ``0`` (don't invest) and ``1``
    (invest). State 0 is absorbing under both actions, and an episode is
    considered finished as soon as it is entered.

    Parameters
    ----------
    seed : int or None, optional
        Seed for a private ``numpy.random.Generator``, which makes sampled
        trajectories reproducible. When ``None`` (the default) transitions
        are drawn from NumPy's global random state, as before.
    """

    def __init__(self, seed=None):
        # Discount factor
        self.γ = 0.95
        self.A = [0, 1]
        self.S = [0, 1, 2]

        # Source of randomness for `step`. Both the `np.random` module and a
        # `Generator` expose `choice(n, p=...)`, so `step` does not care which.
        self.rng = np.random if seed is None else np.random.default_rng(seed)

        # Transition matrix if don't invest (row = current state,
        # column = next state)
        P0 = np.array([[1, 0, 0],
                       [0.1, .75, 0.15],
                       [0.05, .1, 0.85]])

        # Reward per current state if don't invest
        R0 = np.array([0, 1, 2])

        # Transition matrix if invest
        P1 = np.array([[1, 0, 0],
                       [0.05, .75, 0.2],
                       [0.02, .06, 0.92]])
        # Reward per current state if invest
        R1 = np.array([0, 0.5, 1.5])

        # Both lists are indexed by action first, then by state.
        self.P = [P0, P1]
        self.R = [R0, R1]

    def step(self, s, a):
        """Sample one transition from state ``s`` under action ``a``.

        Returns
        -------
        tuple
            ``(s_prime, R, done)``: the sampled next state, the reward
            ``self.R[a][s]`` (it depends on the current state and the action,
            not on the next state), and ``True`` when the next state is the
            absorbing state 0.
        """
        s_prime = self.rng.choice(len(self.S), p=self.P[a][s])
        R = self.R[a][s]
        done = bool(s_prime == 0)
        return s_prime, R, done

    def simulate(self, s, a, π):
        """Roll out one episode until the absorbing state is entered.

        Parameters
        ----------
        s : int
            Start state.
        a : int
            Action taken at the first step only.
        π : sequence
            Policy, indexed by state; it chooses every action after the first.

        Returns
        -------
        list of tuple
            One ``(state, action, reward)`` entry per step taken, in order.
            The state entered by the final step is not itself recorded.
        """
        done = False
        t = 0
        history = []
        while not done:
            # The caller fixes the first action; the policy takes over after.
            if t > 0:
                a = π[s]
            s_prime, R, done = self.step(s, a)
            history.append((s, a, R))
            s = s_prime
            t += 1

        return history

In [10]:
# Monte Carlo estimate of the action-value function qπ
mdp = MDP()
π = [0] * 3

# One row per state, one column per action: S sums the sampled returns,
# N counts the visits, and qπ holds the running mean return.
S, N, qπ = (np.zeros((3, 2)) for _ in range(3))


def update(π, start_state=1, exploring_starts=False):
    """Run one episode and apply every-visit Monte Carlo updates to ``qπ``.

    The episode is sampled with ``mdp.simulate``. Walking it backwards, the
    discounted return ``G`` of each visited ``(state, action)`` pair is added
    to ``S``, its visit count in ``N`` is incremented, and ``qπ`` is moved to
    the running mean. After each update the policy is made greedy with
    respect to ``qπ`` at the visited state.

    Reads and mutates the module-level ``mdp``, ``S``, ``N`` and ``qπ``.

    Parameters
    ----------
    π : list
        Policy, indexed by state. It is modified in place.
    start_state : int, optional
        State the episode starts from when ``exploring_starts`` is false.
        Defaults to ``1``.
    exploring_starts : bool, optional
        When true, the start state and the first action are drawn uniformly
        at random, so every ``(state, action)`` pair can be visited. When
        false (the default) the episode starts at ``start_state`` with the
        policy's own action, so actions the greedy policy never picks are
        never evaluated.

    Returns
    -------
    list
        The same ``π`` object, after the greedy updates.
    """
    if exploring_starts:
        s0 = np.random.randint(len(mdp.S))
        a0 = np.random.randint(len(mdp.A))
    else:
        s0 = start_state
        a0 = π[s0]

    episode = mdp.simulate(s0, a0, π)

    G = 0
    # Backward pass: G always holds the return from the current step onwards.
    for s, a, R in reversed(episode):
        G = mdp.γ * G + R
        S[s, a] += G
        N[s, a] += 1
        qπ[s, a] = qπ[s, a] + 1 / N[s, a] * (G - qπ[s, a])
        π[s] = np.argmax(qπ[s])

    return π


# Sample many episodes, carrying the improved policy from one episode to the
# next, then show the resulting action-value table.
n_episodes = 100_000
for episodes in range(n_episodes):
    π = update(π)
print(qπ)

[[ 0.          0.        ]
 [11.40657443  0.        ]
 [16.01964417  0.        ]]
