In [1]:
import numpy as np
import random

class TicTacToe:
    """Minimal Tic Tac Toe board holder with a tabular state-value estimate.

    The board is a 3x3 NumPy array of zeros; values are stored in a dict
    keyed by the string form of the flattened board.
    """

    def __init__(self):
        # 3x3 float grid, initialised to all zeros.
        self.board = np.zeros((3, 3))
        # Maps the string key produced by get_state() to a scalar estimate.
        self.value_function = {}

    def get_state(self):
        """Return the flattened board rendered as a string (usable as a dict key)."""
        return str(self.board.reshape(-1))

    def update_value(self, reward, alpha=0.1):
        """Move the current state's value estimate toward ``reward``.

        Applies ``V(s) <- V(s) + alpha * (reward - V(s))`` for the state
        returned by ``get_state()``. States not seen before start at 0.

        Args:
            reward: Target the estimate is nudged toward.
            alpha: Step size of the update. Defaults to 0.1, the value that
                was previously hard-coded, so existing callers are unaffected.
        """
        state = self.get_state()
        current = self.value_function.get(state, 0)
        self.value_function[state] = current + alpha * (reward - current)

In [6]:
class QLearningAgent:
    """Tabular Q-learning agent with epsilon-greedy action selection.

    Q-values live in a dict keyed by ``(state, action)``; pairs that have
    never been updated read as 0.
    """

    def __init__(self, actions, alpha=0.1, gamma=0.9, epsilon=0.1):
        """Store the action set and the learning hyper-parameters.

        Args:
            actions: Indexable sequence of actions. States and actions must be
                hashable because they are used as dict keys.
            alpha: Learning rate.
            gamma: Discount factor.
            epsilon: Probability of taking a random (exploratory) action.
        """
        self.q_table = {}  # Stores Q-values for state-action pairs
        self.actions = actions
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate

    def get_q_value(self, state, action):
        """Return the stored Q-value for ``(state, action)``, or 0 if unseen."""
        return self.q_table.get((state, action), 0)

    def update_q_value(self, state, action, reward, next_state):
        """Apply one Q-learning update for the observed transition.

        Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a'))
        """
        # default=0 keeps the update defined when the action set is empty.
        best_next_q = max(
            (self.get_q_value(next_state, a) for a in self.actions), default=0
        )
        td_target = reward + self.gamma * best_next_q
        self.q_table[(state, action)] = (
            (1 - self.alpha) * self.get_q_value(state, action)
            + self.alpha * td_target
        )

    def select_action(self, state):
        """Pick an action epsilon-greedily for ``state``.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value is returned (the first
        such action on ties).
        """
        if np.random.rand() < self.epsilon:
            # Draw an index rather than calling np.random.choice on the action
            # list: choice() converts the list to an ndarray, which fails for
            # non-scalar actions (e.g. (row, col) tuples) and hands back NumPy
            # scalars instead of the caller's own action objects.
            return self.actions[np.random.randint(len(self.actions))]  # Exploration
        q_values = [self.get_q_value(state, a) for a in self.actions]
        return self.actions[int(np.argmax(q_values))]  # Exploitation


In [3]:
class MultiArmedBandit:
    """k-armed bandit with incremental sample-average value estimates."""

    def __init__(self, k=10):
        """Create ``k`` arms whose true values are drawn from N(0, 1).

        Args:
            k: Number of arms.
        """
        self.k = k
        self.q_true = np.random.normal(0, 1, k)  # True mean reward of each arm
        self.q_est = np.zeros(k)  # Current estimate of each arm's value
        self.action_counts = np.zeros(k)  # Times each arm has been updated

    def select_action(self, epsilon=0.1):
        """Return an arm index chosen epsilon-greedily.

        With probability ``epsilon`` a uniformly random arm is returned;
        otherwise the arm with the highest current estimate (first on ties).
        """
        # Use NumPy's global RNG for the exploration check as well, so that
        # np.random.seed(...) alone makes a run reproducible; every other
        # random draw in this class already goes through np.random.
        if np.random.rand() < epsilon:
            return np.random.randint(self.k)
        return np.argmax(self.q_est)

    def update_estimate(self, action, reward):
        """Fold ``reward`` into the running average for arm ``action``.

        Uses the incremental mean: Q_n = Q_{n-1} + (1 / n) * (reward - Q_{n-1}).
        """
        self.action_counts[action] += 1
        step = 1 / self.action_counts[action]
        self.q_est[action] += step * (reward - self.q_est[action])

In [7]:
if __name__ == "__main__":
    # Seed both NumPy's global RNG and the stdlib `random` module so the
    # stochastic steps below give the same numbers on every
    # Restart Kernel -> Run All.
    SEED = 42
    np.random.seed(SEED)
    random.seed(SEED)

    # Tic Tac Toe Simulation
    tic_tac_toe = TicTacToe()
    print("Tic Tac Toe State Representation:", tic_tac_toe.get_state())

    # Q-Learning Agent Test
    actions = list(range(9))  # Tic Tac Toe board positions
    q_agent = QLearningAgent(actions)
    state = tic_tac_toe.get_state()  # Get state from TicTacToe
    action = q_agent.select_action(state)
    print(f"Selected Action by Q-Agent: {action}")

    # 10-Armed Bandit Test
    N_ROUNDS = 1000  # Number of simulated pulls
    BANDIT_EPSILON = 0.1  # Exploration rate used for the bandit
    bandit = MultiArmedBandit()
    for _ in range(N_ROUNDS):
        action = bandit.select_action(epsilon=BANDIT_EPSILON)
        # Reward is sampled around the chosen arm's true value with unit std-dev.
        reward = np.random.normal(bandit.q_true[action], 1)
        bandit.update_estimate(action, reward)

    print("Final Estimated Values of Arms:", bandit.q_est)


Tic Tac Toe State Representation: [0. 0. 0. 0. 0. 0. 0. 0. 0.]
Selected Action by Q-Agent: 0
Final Estimated Values of Arms: [-0.34450019  1.13758546  0.48896538  0.95154961 -0.51816908 -0.17645907
  0.98262685  0.56456066 -1.04570107  1.24710448]
