In [1]:
import operator

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [3]:
class BanditEnv:
    """Multi-armed Bernoulli bandit.

    Every arm pays a reward of 1 with its own fixed probability and 0
    otherwise. The per-arm probabilities are drawn uniformly from [0, 1)
    when the environment is created.

    Parameters
    ----------
    n_arms : int
        Number of arms.
    rng : optional
        Random source exposing ``random(size=None)``, for example
        ``np.random.default_rng(seed)``. When omitted, NumPy's global
        generator is used, so ``np.random.seed`` keeps controlling the
        draws exactly as before.

    Attributes
    ----------
    n_arms : int
        Number of arms.
    probs : numpy.ndarray
        Win probability of each arm, shape ``(n_arms,)``.
    """

    def __init__(self, n_arms, rng=None):
        self.n_arms = n_arms
        # The ``np.random`` module itself offers ``random(size)``, so it can
        # stand in for a Generator when no explicit source is supplied.
        self._rng = np.random if rng is None else rng
        self.probs = self._rng.random(n_arms)  # Random probabilities for each arm

    def step(self, action):
        """Pull one arm and return its reward.

        Parameters
        ----------
        action : int
            Index of the arm to pull, ``0 <= action < n_arms``. NumPy integer
            scalars are accepted.

        Returns
        -------
        int
            1 on a win, 0 otherwise.

        Raises
        ------
        ValueError
            If ``action`` is not an integer or is out of range.
        """
        try:
            arm = operator.index(action)
        except TypeError:
            # Floats, strings, None, ... are not usable as an arm index; route
            # them to the same ValueError as an out-of-range index instead of
            # letting an IndexError/TypeError escape further down.
            arm = -1

        if arm < 0 or arm >= self.n_arms:
            raise ValueError("Invalid action. Must be between 0 and n_arms - 1.")

        win_prob = self.probs[arm]  # Probability of winning for the selected arm
        return 1 if self._rng.random() < win_prob else 0

In [100]:
# Tally-based experiment: one bandit plus two per-arm running totals.
n_arms = 1
n_steps = 0  # initialised here; not read by any cell visible in this notebook

reward_table = np.zeros(n_arms)   # summed reward collected from each arm
attempt_table = np.zeros(n_arms)  # number of pulls of each arm
env = BanditEnv(n_arms)

# Show the hidden win probabilities so the estimate can be checked against them.
print(env.probs)

[0.54232661]


In [129]:

# Pull a uniformly random arm 100 times, accumulating the reward total and the
# pull count per arm. Re-running this cell keeps adding to the same tallies.
for _ in range(100):
    action = np.random.randint(n_arms)
    reward = env.step(action)

    reward_table[action] += reward
    attempt_table[action] += 1




In [130]:
print(reward_table)
print(attempt_table)

# Empirical win rate per arm = total reward / number of pulls.
# An arm that has never been pulled has no estimate yet: report NaN for it
# explicitly instead of evaluating 0/0, which emits a RuntimeWarning.
estimate_value = np.divide(
    reward_table,
    attempt_table,
    out=np.full_like(reward_table, np.nan, dtype=float),
    where=attempt_table > 0,
)

print(estimate_value)

[791.]
[1500.]
[0.52733333]


I wonder how long it will take to converge to a certain value? I bet you could say something about it...

Super beautiful trick here to do it "online"

Instead of keeping two tallies...

Do more of a weighted average...

if fully update each time...

value = last_reward

but.. if you add a small step in there...

value = curr_value + 0.1 * (curr_reward - curr_value) -> the impact of previous rewards decays exponentially... you can write it out and you will see it's a geometric series!

value = curr_value + 1/n * (curr_reward - curr_value) -> the impact of previous rewards decays and this value converges. Not good if the state is changing, though.

Is that equivalent to taking the average at each time step? I feel not entirely, because you weigh the earlier rewards more than the later rewards... I guess that is how first impressions work though :)

Ok you can do it! (see my notebook)

new_value = curr_value * (n-1)/n + curr_reward/n

This just requires you to remember the current value and the number of times you tried it

Using the other way.

I still need to remember the number of times I tried it... but I need to hold this very large value of total rewards to date, and then I calculate the value...

Doing the online version, I don't need to keep track of total rewards to date. I just remember the value. This is more biologically plausible, as I don't remember every single ice cream I had, but I do remember some overall sense of goodness... and I know how to update it...


In [140]:
# Online experiment: a fresh bandit, tracking a running value estimate per arm
# instead of a summed-reward tally.
n_arms = 1
n_steps = 0  # initialised here; not read by any cell visible in this notebook

value_table = np.zeros(n_arms)    # current running-average reward of each arm
attempt_table = np.zeros(n_arms)  # number of pulls of each arm
env = BanditEnv(n_arms)

# Show the hidden win probabilities so the estimate can be checked against them.
print(env.probs)

[0.24500936]


In [141]:

# Pull a uniformly random arm 100 times and fold every reward straight into
# that arm's running average; no summed-reward total is stored.
for _ in range(100):
    action = np.random.randint(n_arms)
    reward = env.step(action)

    # Pull count for this arm including the current pull.
    n_attempt = attempt_table[action] + 1
    attempt_table[action] = n_attempt

    # Old average weighted by (n-1)/n plus the new reward weighted by 1/n.
    value_table[action] = (
        value_table[action] * (n_attempt - 1) / n_attempt
        + reward / n_attempt
    )




In [142]:
# Show the running-average value estimate held for each arm.
print(value_table)

[0.25]
