In [1]:
import numpy as np

class MultiArmedBandit:
    def __init__(self, k=10, stationary=True, alpha=None, c=2, gradient=False, baseline=True):
        self.k = k  # Number of arms
        self.stationary = stationary  # Whether rewards change over time
        self.alpha = alpha  # Step size for nonstationary problems
        self.c = c  # UCB exploration parameter
        self.gradient = gradient  # Use gradient bandit method
        self.baseline = baseline  # Use baseline for gradient method

        # True action values (stationary or nonstationary)
        self.q_true = np.random.normal(0, 1, self.k)
        self.q_est = np.zeros(self.k)  # Estimated values
        self.action_count = np.zeros(self.k)  # Action selection count
        self.time = 0  # Time step
        self.average_reward = 0  # Baseline for gradient bandit
        self.preferences = np.zeros(self.k)  # Preferences for gradient method

    def select_action(self, method="epsilon-greedy", epsilon=0.1):
        if method == "epsilon-greedy":
            if np.random.rand() < epsilon:
                return np.random.choice(self.k)  # Exploration
            return np.argmax(self.q_est)  # Exploitation

        elif method == "optimistic":
            return np.argmax(self.q_est)  # Always pick best-known option

        elif method == "ucb":
            if np.any(self.action_count == 0):
                return np.argmin(self.action_count)  # Pick an untried action first
            ucb_values = self.q_est + self.c * np.sqrt(np.log(self.time + 1) / (self.action_count + 1e-5))
            return np.argmax(ucb_values)

        elif method == "gradient":
            exp_prefs = np.exp(self.preferences)
            probabilities = exp_prefs / np.sum(exp_prefs)
            return np.random.choice(self.k, p=probabilities)

    def update_estimate(self, action, reward, method="epsilon-greedy"):
        self.time += 1
        self.action_count[action] += 1

        if method == "gradient":
            probabilities = np.exp(self.preferences) / np.sum(np.exp(self.preferences))
            if self.baseline:
                self.average_reward += (reward - self.average_reward) / self.time
            self.preferences[action] += self.alpha * (reward - self.average_reward) * (1 - probabilities[action])
            self.preferences -= self.alpha * (reward - self.average_reward) * probabilities

        else:
            step_size = 1 / self.action_count[action] if self.alpha is None else self.alpha
            self.q_est[action] += step_size * (reward - self.q_est[action])

        if not self.stationary:
            self.q_true += np.random.normal(0, 0.01, self.k)  # Small random drift in true values

In [2]:
if __name__ == "__main__":
    bandit = MultiArmedBandit(k=10, stationary=False, alpha=0.1, c=2, gradient=True, baseline=True)

    rewards = []
    for _ in range(1000):  # Run for 1000 steps
        action = bandit.select_action(method="epsilon-greedy", epsilon=0.1)
        reward = np.random.normal(bandit.q_true[action], 1)  # Sample reward
        bandit.update_estimate(action, reward, method="epsilon-greedy")
        rewards.append(reward)

    print("Final Estimated Values of Arms:", bandit.q_est)


Final Estimated Values of Arms: [ 1.66387785  0.65580474 -0.47443214  0.20600693 -0.443106   -0.46891081
  0.17439698  0.37757008 -0.34369471 -0.46814821]
