In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models

In [2]:
# Define the reward probabilities for 3 slot machines (bandits)
true_probabilities  = [0.2, 0.5, 0.75]
num_actions         = len(true_probabilities)

In [3]:
# Hyperparameters
epsilon             = 0.1  # Exploration rate
num_trials          = 2000  # Number of trials
gamma               = 0.99  # Discount factor
alpha               = 0.01  # Learning rate

In [4]:
# Create a simple neural network model
# One input node for the action, one output node for the reward probability
model = models.Sequential([
    layers.Input(shape=(1,)),  # Input layer with one node (action)
    layers.Dense(20, activation='relu'),  # Hidden layer
    layers.Dense(1)  # Output layer: estimated reward probability
])

In [5]:
# Optimizer and loss function
optimizer           = keras.optimizers.Adam(learning_rate=alpha)
loss_fn             = tf.keras.losses.MeanSquaredError()  

In [6]:
# Function to simulate pulling a lever
def pull_lever(action):
    return 1 if np.random.random() < true_probabilities[action] else 0

In [7]:
# Simulate trials
for trial in range(num_trials):
    # Explore (random) vs Exploit (best known action)
    if np.random.random() < epsilon:
        action = np.random.choice(num_actions)  # Explore: choose a random machine
    else:
        # Exploit: Choose the action with the highest estimated reward probability
        q_values = [model.predict(np.array([[a]]))[0, 0] for a in range(num_actions)]
        action = np.argmax(q_values)
    
    # Pull the lever and get a reward
    reward = pull_lever(action)

    # Calculate the target (reward)
    target = np.array([[reward]])  # We want the network to predict this reward

    # Update the neural network
    with tf.GradientTape() as tape:
        # Predict the reward probability for the selected action
        q_value = model(np.array([[action]]), training=True)  
        # Calculate the loss
        loss    = loss_fn(target, q_value)
    
    # Backpropagate the error and update the model's weights
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1

In [8]:
# Display the estimated probabilities for each action
for action in range(num_actions):
    estimated_prob = model.predict(np.array([[action]]))[0, 0]
    print(f"True probability of action {action}: {true_probabilities[action]}")
    print(f"Estimated probability after {num_trials} trials: {estimated_prob}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
True probability of action 0: 0.2
Estimated probability after 2000 trials: 0.3114534318447113
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
True probability of action 1: 0.5
Estimated probability after 2000 trials: 0.5338101983070374
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
True probability of action 2: 0.75
Estimated probability after 2000 trials: 0.7587900757789612
