In [5]:
import numpy as np

In [169]:
class Env:
    """Stationary multi-armed bandit environment with Gaussian rewards.

    Each action (arm) ``a`` has a fixed mean reward ``rewards[a]``; pulling
    the arm returns one sample from a normal distribution centred on that
    mean with standard deviation ``std``.

    Args:
        rewards: Mean reward of each arm. Defaults to the four-armed setup
            ``[-2, 1, 3, -1]``.
        std: Standard deviation of the reward noise, shared by all arms.
    """

    def __init__(self, rewards: "list[float] | None" = None, std: float = 1):
        # A None default avoids sharing one mutable list between instances.
        self.rewards = [-2, 1, 3, -1] if rewards is None else list(rewards)
        # Actions are the arm indices, so they always line up with `rewards`.
        self.action_space = list(range(len(self.rewards)))
        self.std = std

    def step(self, action: int) -> float:
        """Pull arm ``action`` and return a noisy reward sample.

        Args:
            action: Index into ``rewards`` of the arm to pull.

        Returns:
            A draw from ``np.random.normal(rewards[action], std)``.

        Raises:
            IndexError: If ``action`` is outside the range of ``rewards``.
        """
        mean = self.rewards[action]
        return np.random.normal(mean, self.std)


class Bandit:
    """Epsilon-greedy agent with constant step-size action-value estimates.

    The agent keeps one value estimate per action in ``q_table`` and, on
    every step, either explores (uniformly random action) with probability
    ``epsilon`` or exploits the action with the highest current estimate.

    Args:
        env: Environment exposing ``action_space`` (a sequence of actions)
            and ``step(action)`` returning a numeric reward.
        epsilon: Default exploration probability.
        step_size: Learning rate used in the incremental value update.
        q_init: Initial value estimate assigned to every action.
    """

    def __init__(
        self,
        env: "Env",
        *,
        epsilon: float = 0.1,
        step_size: float = 0.1,
        q_init: float = 1e-8,
    ):
        self.env = env
        self.action_space = env.action_space
        self.epsilon = epsilon
        self.step_size = step_size
        self.q_init = q_init
        self.q_table = {action: q_init for action in self.action_space}

    def get_action(self, epsilon=None):
        """Choose an action epsilon-greedily.

        Args:
            epsilon: Exploration probability for this call; falls back to
                ``self.epsilon`` when ``None``.

        Returns:
            An element of ``action_space``: a uniformly random one when
            exploring, otherwise the action with the largest value estimate
            (the first such action on ties). ``None`` is returned on the
            greedy path if ``action_space`` is empty.
        """
        if epsilon is None:
            epsilon = self.epsilon

        # Strict `<` so that epsilon == 0 can never explore
        # (np.random.rand() draws from [0, 1) and may return exactly 0.0).
        if np.random.rand() < epsilon:
            # Draw an index and map it to an action, so action spaces that
            # are not simply 0..n-1 still yield valid actions.
            idx = np.random.randint(len(self.action_space))
            return self.action_space[idx]

        # Compare by estimate only. A truthiness check on the running best
        # action/value would treat action 0 or a value of 0.0 as "unset"
        # and wrongly discard it.
        return max(
            self.action_space,
            key=self.q_table.__getitem__,
            default=None,
        )

    def pick_and_take_action(self):
        """Select an action, apply it to the environment, and learn from it.

        The chosen action's estimate is moved towards the observed reward:
        ``Q <- Q + step_size * (reward - Q)``.
        """
        action = self.get_action()
        reward = self.env.step(action)

        previous = self.q_table[action]
        self.q_table[action] = previous + self.step_size * (reward - previous)

In [170]:
# Build the environment and an agent bound to it.
env = Env()
bandit = Bandit(env=env)

# Run 100 interaction steps; each one selects an action, samples a reward
# from the environment and updates the agent's estimate for that action.
for i in range(0, 100):
    bandit.pick_and_take_action()

2