# 10-Armed Testbed

# Import required libraries

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import time
%matplotlib inline

# Define classes

## Class: TestBed

Class to define a TestBed containing the actions and their true action values

In [25]:
class TestBed:
    """One multi-armed bandit problem: a set of arms with true action values.

    The true action value of every arm is sampled from a normal distribution
    with mean ``av_mean`` and standard deviation ``av_std``.

    Attributes:
        n_arms: Number of arms in the bandit.
        av_mean: Mean used when sampling the true action values.
        av_std: Standard deviation used when sampling the true action values.
        av: Array of length ``n_arms`` holding the true action values.
        opt_act: Index of the arm with the highest true action value.
    """

    def __init__(self, n_arms, av_mean, av_std):
        self.n_arms = n_arms # Number of arms in the bandit
        self.av_mean = av_mean # Mean value to use for all action values
        self.av_std = av_std # Standard deviation to use for all action values
        # Placeholder to store action values of all arms. The size must come
        # from the ``n_arms`` parameter (the previous ``nArms`` was undefined).
        self.av = np.zeros(n_arms)
        self.opt_act = 0 # Placeholder to store index of optimal action
        self.initialize()

    def initialize(self):
        """Initialize a new testbed (a new bandit problem).

        Re-samples the true action values of all arms and records which arm
        is optimal, overwriting the previous problem.
        """
        # True action values for all actions sampled from a normal distribution
        # with mean = av_mean and standard deviation = av_std
        self.av = np.random.normal(self.av_mean, self.av_std, self.n_arms)
        self.opt_act = np.argmax(self.av) # Action with highest (true) action value

## Class: Agent

In [None]:
class Agent:
    """Epsilon-greedy agent that learns sample-average action-value estimates.

    Attributes:
        n_arms: Number of arms in the bandit.
        eps: Probability of choosing a non-greedy (random) action; 0 means
            purely greedy selection.
        timeStep: Current time step (T).
        prev_action: Action taken at the previous time step (T - 1), or None
            if no action has been taken since construction / reset.
        action_count: Per-arm count of actions taken till time step (T - 1).
        cum_reward: Per-arm cumulative reward obtained till time step (T - 1).
        av_est: Per-arm estimated action value at time step T.
    """

    def __init__(self, n_arms, eps = 0):
        self.n_arms = n_arms # Number of arms in the bandit
        self.eps = eps # Probability of choosing non-greedy action in eps-greedy algorithm
        self.timeStep = 0 # Placeholder to store current time step (T)
        self.prev_action = None # Placeholder to store action taken at previous time step (T-1)
        self.action_count = np.zeros(n_arms) # Count of actions taken till time step (T - 1)
        self.cum_reward = np.zeros(n_arms) # Cumulative reward obtained for each action till
                                           # time step (T - 1)
        self.av_est = np.zeros(n_arms) # Estimated action value at time step T

    def __str__(self):
        """Return the string to use as legend while plotting."""
        if self.eps == 0:
            return "Greedy"
        return "epsilon = " + str(self.eps)

    def action(self):
        """Select an action using epsilon-greedy behaviour.

        With probability ``eps`` a uniformly random arm is chosen; otherwise
        an arm with the highest current estimate is chosen, breaking ties
        uniformly at random. ``eps = 0`` gives purely greedy selection.

        Returns:
            Index of the selected arm (also stored in ``prev_action``).
        """
        if np.random.random() < self.eps:
            # Exploration: pick any arm uniformly at random
            a = np.random.choice(len(self.av_est))
        else:
            # Exploitation: collect every arm whose estimate equals the
            # maximum *value* (comparing against the argmax index, as before,
            # matched the wrong arms).
            best_arms = np.flatnonzero(self.av_est == np.max(self.av_est))
            if best_arms.size == 0:
                # Only reachable when the comparison matches nothing (e.g. NaN
                # estimates); fall back to the plain argmax.
                a = np.argmax(self.av_est)
            else:
                # If multiple actions share the max value, select one at random
                a = np.random.choice(best_arms)

        # Remember the action so interpreter() can credit the reward to it
        self.prev_action = a
        return a

    def interpreter(self, reward):
        """Update the value estimates based on the reward for the last action.

        Args:
            reward: Reward received for the action stored in ``prev_action``.

        Raises:
            RuntimeError: If no action has been taken since construction or
                the last reset.
        """
        if self.prev_action is None:
            # Indexing a NumPy array with None adds an axis and would silently
            # update every arm, so fail loudly instead.
            raise RuntimeError("interpreter() called before action()")

        arm = self.prev_action
        self.action_count[arm] += 1 # Add 1 to action selection count
        self.cum_reward[arm] += reward # Add reward to cumulative sum

        # New action-value estimate is the sample average: sum(r) / count
        self.av_est[arm] = self.cum_reward[arm] / self.action_count[arm]

        # Increase time step
        self.timeStep += 1

    def reset(self):
        """Reset all learning state for the next iteration."""
        self.timeStep = 0 # Time step T
        self.prev_action = None # No action taken yet

        self.action_count[:] = 0 # Count of actions taken
        self.cum_reward[:] = 0 # Cumulative reward per action
        self.av_est[:] = 0 # Action value estimates Qt ~= Q*(a)

In [17]:
# Experiment configuration.
# start_time: timer reference taken when this cell runs.
start_time = time.time()
# n_arms:     number of arms in the bandit
# iterations: number of repeated and independent bandit problems
# plays:      number of timesteps to run for each problem
n_arms, iterations, plays = 10, 2000, 1000

In [24]:
# Build one bandit problem (action values sampled with mean 0, std 1) and show
# its true action values followed by the index of the optimal action.
testbed = TestBed(n_arms, av_mean=0, av_std=1)
print(testbed.av, testbed.opt_act, sep="\n")

[ 0.23374298 -1.42394939  1.56231347 -2.36682809 -0.11530937 -1.66648644
 -0.12181757  0.15724511  0.48356595 -0.85652382]
2


In [62]:
# Sanity check of weighted sampling: draw 1000 single samples from (0, 1) with
# probabilities (0.1, 0.9) and report the fraction of draws that came out as 1.
out = [
    np.random.choice((0, 1), size=1, p=(0.1, 0.9))[0]
    for _ in range(1000)
]
sum(out) / len(out)

0.905

0.55