In [4]:

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models

# Define the reward probabilities for 3 slot machines (bandits)
true_probabilities = [0.2, 0.5, 0.75]
num_actions = len(true_probabilities)

# Hyperparameters
epsilon = 0.1  # Exploration rate
num_trials = 1000  # Number of trials
gamma = 0.99  # Discount factor
alpha = 0.01  # Learning rate

# Create the neural network model to approximate Q-values
#model = keras.Sequential([
#    layers.Dense(10, input_shape=(num_actions,), activation='relu'),
#    layers.Dense(num_actions)  # Output layer with one neuron per action
#])

# Create a simple neural network model
# One input node for the action, one output node for the reward probability
model = models.Sequential([
    layers.Input(shape=(1,)),  # Input layer with one node (action)
    layers.Dense(10, activation='relu'),  # Hidden layer
    layers.Dense(1)  # Output layer: estimated reward probability
])

# Optimizer and loss function
optimizer = keras.optimizers.Adam(learning_rate=alpha)
loss_fn = tf.keras.losses.MeanSquaredError()  # Use tf.keras.losses.MeanSquaredError()

# Function to simulate pulling a lever
def pull_lever(action):
    return 1 if np.random.random() < true_probabilities[action] else 0

# One-hot encode the action to feed into the network
def one_hot_encode(action, num_actions):
    return np.eye(num_actions)[action].reshape(1, -1)

# Apply softmax to Q-values to get action probabilities
def q_values_to_probabilities(q_values):
    exp_q_values = np.exp(q_values - np.max(q_values))  # Subtracting max Q-value for numerical stability
    return exp_q_values / np.sum(exp_q_values)

# Simulate trials
for trial in range(num_trials):
    # Explore (random) vs Exploit (best known action)
    if np.random.random() < epsilon:
        action = np.random.choice(num_actions)  # Explore: choose a random machine
    else:
        '''# Predict Q-values for all actions in the current state.
        # We pass a one-hot encoded representation of the current state 
        # to the model to get Q-values for all actions.
        # Since we don't have an explicit state in this case, 
        # we use an identity matrix to represent all possible actions as individual states. 
        q_values = model.predict(np.eye(num_actions))  
        # Get the action with the highest predicted Q-value.
        action = np.argmax(q_values)  
        # Ensure the action is within the valid range.
        action = action % num_actions # This ensures 'action' is always within the valid range (0, 1, 2)'''
        # Exploit: Choose the action with the highest estimated reward probability
        q_values = [model.predict(np.array([[a]]))[0, 0] for a in range(num_actions)]
        action = np.argmax(q_values)
    
    # Pull the lever and get a reward
    reward = pull_lever(action)

    # Calculate the target (reward)
    target = np.array([[reward]])  # We want the network to predict this reward

    # Update the neural network
    with tf.GradientTape() as tape:
        # Predict the reward probability for the selected action
        q_value = model(np.array([[action]]), training=True)  
        # Calculate the loss
        loss = loss_fn(target, q_value)
    
    # Backpropagate the error and update the model's weights
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1

In [5]:
# Display the estimated probabilities for each action
for action in range(num_actions):
    estimated_prob = model.predict(np.array([[action]]))[0, 0]
    print(f"True probability of action {action}: {true_probabilities[action]}")
    print(f"Estimated probability after {num_trials} trials: {estimated_prob}")

'''# Display the final estimated Q-values
q_values = model.predict(np.eye(num_actions)) 
print(f"True probabilities: {true_probabilities}")
print(f"Estimated Q-values after {num_trials} trials: {q_values}")'''

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
True probability of action 0: 0.2
Estimated probability after 1000 trials: 0.3220093250274658
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
True probability of action 1: 0.5
Estimated probability after 1000 trials: 0.5452567934989929
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
True probability of action 2: 0.75
Estimated probability after 1000 trials: 0.76850426197052


'# Display the final estimated Q-values\nq_values = model.predict(np.eye(num_actions)) \nprint(f"True probabilities: {true_probabilities}")\nprint(f"Estimated Q-values after {num_trials} trials: {q_values}")'

In [6]:
print(target)


[[1]]


In [7]:
print(one_hot_encode(0, 3))

'''    # Update Q-values using neural network
    with tf.GradientTape() as tape:
        # Get the Q-value for the selected action.
        q_values = model(one_hot_encode(action, num_actions), training=True)  
        # Calculate the target Q-value.
        # Keep target calculation within the tape's context
        target = reward + gamma * tf.reduce_max(model(np.eye(num_actions)))  
        # Get the predicted Q-value for the selected action.
        # Reshape target to match q_values shape
        target = tf.reshape(target, [1, 1]) # Reshape target to have 
        # Get the predicted Q-value for the selected action.
        q_value_for_action = q_values[0, action] 
        # Calculate the loss.
        # Ensure loss calculation is part of the TensorFlow graph
        loss = loss_fn(target, q_value_for_action) 

    # Backpropagate the error and update the model's weights.
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))'''

[[1. 0. 0.]]
