In [1]:
import numpy as np

# Reward probabilities for the 3 slot machines (bandits); index = action id.
true_probabilities = [0.2, 0.5, 0.75]
num_actions = len(true_probabilities)

# Number of times each machine (action) has been tried so far.
action_count = np.zeros(num_actions)
# Running estimate of each machine's expected reward (Q-values).
estimated_values = np.zeros(num_actions)

# Hyperparameters
epsilon = 0.1  # Exploration rate: probability of picking a random machine
num_trials = 1000  # Number of lever pulls to simulate
random_seed = 42  # Fixed seed so the simulation is reproducible

# Seed NumPy's global generator: the simulation draws through np.random.random
# and np.random.choice, so without this every run prints different estimates.
np.random.seed(random_seed)

# Function to simulate pulling a lever
def pull_lever(action):
    """Simulate one pull of the given machine and return its reward.

    Draws one uniform sample from NumPy's global random generator and
    compares it against the module-level ``true_probabilities[action]``.

    Args:
        action: Index into ``true_probabilities`` selecting the machine.

    Returns:
        1 if the draw falls below the machine's reward probability, else 0.
    """
    win_probability = true_probabilities[action]
    is_win = np.random.random() < win_probability
    return int(is_win)

# Run the epsilon-greedy simulation
for trial in range(num_trials):
    # With probability epsilon explore a random machine; otherwise exploit the
    # machine with the highest current estimate.
    should_explore = np.random.random() < epsilon
    action = (
        np.random.choice(num_actions)
        if should_explore
        else np.argmax(estimated_values)
    )

    # Observe the reward for the chosen machine
    reward = pull_lever(action)

    # Incremental sample-average update: nudge the estimate toward the new
    # reward by 1/n, where n is how often this machine has been tried.
    action_count[action] += 1
    prediction_error = reward - estimated_values[action]
    estimated_values[action] += prediction_error / action_count[action]

# Report the learned estimates next to the ground truth
print(f"True probabilities: {true_probabilities}")
print(f"Estimated values after {num_trials} trials: {estimated_values}")


True probabilities: [0.2, 0.5, 0.75]
Estimated values after 1000 trials: [0.15217391 0.4516129  0.73997833]
