In [6]:
import numpy as np
import random

def greedy_action_selection(Q, state):
    """Return the index of the highest-valued action for ``state``.

    Args:
        Q: Table of action values indexed as ``Q[state]``.
        state: Index of the row of ``Q`` to inspect.

    Returns:
        The position of the largest entry in ``Q[state]``; ``np.argmax``
        returns the first such position when several entries tie.
    """
    state_values = Q[state]
    best_action = np.argmax(state_values)
    return best_action

def epsilon_greedy_action_selection(Q, state, epsilon=0.1):
    """Pick an action epsilon-greedily for ``state``.

    A number is drawn with ``random.uniform(0, 1)``; when it falls below
    ``epsilon`` a column index of ``Q`` is chosen uniformly at random,
    otherwise the arg-max of ``Q[state]`` is returned.

    Args:
        Q: 2-D array of action values (``Q.shape[1]`` is the action count).
        state: Index of the row of ``Q`` to act in.
        epsilon: Threshold compared against the uniform draw.

    Returns:
        The selected action index.
    """
    should_explore = random.uniform(0, 1) < epsilon
    if not should_explore:
        # Exploit: take the currently best-valued action.
        return np.argmax(Q[state])

    # Explore: every action index is equally likely.
    n_actions = Q.shape[1]
    return random.choice(range(n_actions))

def softmax_action_selection(Q, state, tau=1.0):
    """Sample an action from the softmax (Boltzmann) distribution of ``Q[state]``.

    Each action ``a`` is drawn with probability proportional to
    ``exp(Q[state][a] / tau)``.

    Args:
        Q: Table of action values indexed as ``Q[state]``.
        state: Index of the row of ``Q`` to act in.
        tau: Temperature dividing the action values before exponentiation.

    Returns:
        The sampled action index, as returned by ``np.random.choice``.
    """
    scaled = np.asarray(Q[state], dtype=float) / tau
    # Subtract the maximum before exponentiating: the probabilities are
    # mathematically unchanged, but the largest exponent becomes 0, so
    # np.exp cannot overflow to inf (which would turn the normalised
    # probabilities into NaN and make np.random.choice raise).
    exp_q = np.exp(scaled - np.max(scaled))
    action_probabilities = exp_q / np.sum(exp_q)
    return np.random.choice(len(action_probabilities), p=action_probabilities)

def ucb_action_selection(Q, state, action_counts, total_counts, c=1.0):
    """Choose an action for ``state`` by upper-confidence-bound scoring.

    The first action whose entry in ``action_counts[state]`` is zero is
    returned immediately. Otherwise every action is scored as
    ``Q[state, a] + c * sqrt(log(total_counts) / (count + 1e-5))`` and the
    arg-max of those scores is returned.

    Args:
        Q: 2-D array of action values (``Q.shape[1]`` is the action count).
        state: Index of the row of ``Q`` / ``action_counts`` to act in.
        action_counts: 2-D array indexed as ``action_counts[state, a]``.
        total_counts: Value passed to ``np.log`` in the exploration bonus.
        c: Multiplier applied to the exploration bonus.

    Returns:
        The selected action index.
    """
    n_actions = Q.shape[1]

    # An action with a zero count takes priority over any scored action.
    first_untried = next(
        (a for a in range(n_actions) if action_counts[state, a] == 0),
        None,
    )
    if first_untried is not None:
        return first_untried

    # All counts are non-zero: score every action in one vectorised pass.
    visits = action_counts[state, :n_actions]
    bonus = c * np.sqrt(np.log(total_counts) / (visits + 1e-5))
    scores = Q[state, :n_actions] + bonus
    return np.argmax(scores)

# Demo: run each selection strategy once on a small 3-state x 4-action table.
Q = np.array([
    [1.0, 0.5, 0.2, 0.8],
    [0.1, 2.0, 0.3, 0.4],
    [0.5, 0.4, 3.0, 1.0],
])
state = 0
total_counts = 1
action_counts = np.zeros(Q.shape)  # every count starts at zero

# Select with each strategy in the same order as the report below.
action_greedy = greedy_action_selection(Q, state)
action_epsilon_greedy = epsilon_greedy_action_selection(Q, state, epsilon=0.1)
action_softmax = softmax_action_selection(Q, state, tau=1.0)
action_ucb = ucb_action_selection(
    Q, state, action_counts, total_counts, c=2.0
)

print(f"Greedy action selected: {action_greedy}")
print(f"Epsilon-greedy action selected: {action_epsilon_greedy}")
print(f"Softmax action selected: {action_softmax}")
print(f"UCB action selected: {action_ucb}")

Greedy action selected: 0
Epsilon-greedy action selected: 3
Softmax action selected: 2
UCB action selected: 0
