# A parameter study of various bandit algorithms in the nonstationary case
**Exercise 2.9 (programming):** Make a figure analogous to Figure 2.6 for the non-stationary case outlined in Exercise 2.5. Include the constant-step-size ε-greedy algorithm with α = 0.1. Use runs of 200,000 steps and, as a performance measure for each algorithm and parameter setting, use the average reward over the last 100,000 steps.

![Figure 2.6: parameter study of various bandit algorithms](Fig2_6.png)

In [2]:
import numpy as np
import pandas as pd
from numpy.random import normal, uniform

In [8]:
# Non-stationary testbed helpers, carried over from Exercise 2.5.
# Seed NumPy's global RNG so the random draws below are repeatable.
np.random.seed(2021)
def generate_r(num_of_arms = 10, centre = 0, sd = 0):
    """Draw the initial true action value of every arm.

    Returns an array of ``num_of_arms`` samples from a normal distribution
    with mean ``centre`` and standard deviation ``sd``. With the default
    ``sd = 0`` every arm starts at exactly ``centre``.
    """
    true_values = normal(loc=centre, scale=sd, size=num_of_arms)
    return true_values

def update_r(r, step_centre = 0, step_sd = 0.01):
    """Return ``r`` after one independent random-walk increment per arm.

    Each entry of ``r`` is shifted by its own sample from a normal
    distribution with mean ``step_centre`` and standard deviation
    ``step_sd``. ``r`` itself is not modified; a new array is returned.
    """
    increments = normal(loc=step_centre, scale=step_sd, size=r.shape)
    return r + increments

def get_r(r, sd = 1):
    """Sample a noisy reward around each true action value in ``r``.

    Every reward is drawn from a normal distribution whose mean is the
    corresponding entry of ``r`` and whose standard deviation is ``sd``.
    """
    noisy_reward = normal(loc=r, scale=sd)
    return noisy_reward

In [6]:
class rl(object):
    """Baseline bandit agent: random actions with sample-average estimates.

    Subclasses override ``choose_action`` and/or ``update_q`` to implement a
    specific algorithm; ``step`` ties the two together and records history.

    Attributes:
        num_of_arms: number of available actions.
        q: per-arm value estimates (float array of length ``num_of_arms``).
        actions: list of the actions taken so far, in order.
        count: per-arm number of times the arm has been selected.
        rewards: list of the rewards received so far, in order.
    """

    def __init__(self, num_of_arms = 10, init_q = None):
        """Create an agent with zeroed estimates, or estimates from ``init_q``.

        Args:
            num_of_arms: number of available actions.
            init_q: optional initial estimates, one per arm. It is copied so
                later in-place updates do not modify the caller's array.
        """
        self.num_of_arms = num_of_arms
        # `is None` rather than `!= None`: comparing an ndarray with `!=`
        # is elementwise and cannot be used as an `if` condition.
        if init_q is None:
            self.q = np.zeros(num_of_arms)
        else:
            self.q = np.array(init_q, dtype=float)
        self.actions = []
        self.count = np.zeros(num_of_arms)
        self.rewards = []

    def step(self, r):
        """Select an action, record it and its reward, and update estimates.

        Args:
            r: indexable collection holding this step's reward for each arm;
                only ``r[action]`` is read.
        """
        action = self.choose_action()
        self.actions.append(action)
        self.rewards.append(r[action])
        self.count[action] += 1
        self.update_q(r, action)

    def choose_action(self):
        """Return an arm index drawn uniformly at random."""
        return np.random.randint(0, self.num_of_arms)

    def update_q(self, r, action):
        """Move ``q[action]`` toward ``r[action]`` with step size 1/count.

        ``count[action]`` must already include the current selection
        (``step`` increments it before calling this method).
        """
        self.q[action] += (1/self.count[action]) * (r[action] - self.q[action])

In [7]:
# epsilon_greedy
class epsilon_greedy(rl):
    """Epsilon-greedy agent on top of the base sample-average estimates.

    With probability ``epsilon`` it explores through the base class's
    ``choose_action``; otherwise it takes the arm with the highest current
    estimate.
    """

    def __init__(self, num_of_arms = 10, init_q = None, epsilon = 0.1):
        """Store the exploration rate in addition to the base-agent state.

        Args:
            num_of_arms: number of available actions.
            init_q: optional initial value estimates, forwarded to ``rl``.
            epsilon: probability of taking an exploratory action.
        """
        rl.__init__(self, num_of_arms, init_q)
        self.epsilon = epsilon

    def choose_action(self):
        """Return the greedy arm, or an exploratory one with prob. epsilon."""
        # uniform() samples from [0, 1), so this branch is taken with
        # probability 1 - epsilon. np.argmax resolves ties to the lowest index.
        if uniform() > self.epsilon:
            return np.argmax(self.q)
        return rl.choose_action(self)
# UCB
class UCB(rl):
    """Upper-confidence-bound action selection with sample-average estimates.

    Each arm is tried once, in index order, before the UCB rule is applied,
    so every entry of ``count`` is non-zero when it is used as a divisor.
    """

    def __init__(self, num_of_arms = 10, init_q = None, c = 1):
        """Store the exploration coefficient in addition to base-agent state.

        Args:
            num_of_arms: number of available actions.
            init_q: optional initial value estimates, forwarded to ``rl``.
            c: weight of the exploration bonus.
        """
        rl.__init__(self, num_of_arms, init_q)
        self.c = c

    def choose_action(self, t = None):
        """Return the arm maximising ``q + c * sqrt(ln(t) / count)``.

        Args:
            t: current time step, expected to be >= 1. When omitted it
                defaults to the number of actions taken so far plus one.
        """
        pulls = len(self.actions)
        # Initial sweep: arm k is chosen on the k-th call.
        if pulls < self.num_of_arms:
            return pulls
        if t is None:
            t = pulls + 1
        bonus = self.c * np.sqrt(np.log(t) / self.count)
        return np.argmax(self.q + bonus)

    def step(self, r, t = None):
        """Select an action, record it and its reward, and update estimates.

        Args:
            r: indexable collection holding this step's reward for each arm.
            t: current time step, forwarded to ``choose_action``; optional so
                the method can also be called as ``step(r)`` like the base
                class.
        """
        action = self.choose_action(t)
        self.actions.append(action)
        self.rewards.append(r[action])
        self.count[action] += 1
        self.update_q(r, action)
    
    
# gradient_bandit
def softmax(H):
    """Return the softmax distribution over the preferences ``H``.

    The maximum of ``H`` is subtracted before exponentiating. This leaves the
    result mathematically unchanged but prevents ``exp`` from overflowing (or
    every term underflowing to zero) for large-magnitude preferences, which
    would otherwise yield NaN probabilities.

    Args:
        H: array-like of preferences.

    Returns:
        Float array of the same shape as ``H`` whose entries sum to 1
        (an empty array when ``H`` is empty).
    """
    H = np.asarray(H, dtype=float)
    if H.size == 0:
        # np.max has no identity for empty input; nothing to normalise.
        return H
    shifted = np.exp(H - np.max(H))
    return shifted / np.sum(shifted)

class gradient_bandit(rl):
    """Gradient bandit agent with a running-average reward baseline.

    ``self.q`` holds the per-arm preferences (not value estimates); actions
    are sampled from ``softmax(self.q)``.

    Attributes:
        alpha: step size of the preference update.
        R: running mean of all rewards received so far (the baseline).
        T: number of steps taken so far.
    """

    def __init__(self, num_of_arms = 10, init_q = None, alpha = 0.25):
        """Store the step size and reset the baseline.

        Args:
            num_of_arms: number of available actions.
            init_q: optional initial preferences, forwarded to ``rl``.
            alpha: step size of the preference update.
        """
        rl.__init__(self, num_of_arms, init_q)
        self.alpha = alpha
        self.R = 0
        self.T = 0

    def choose_action(self, p = None):
        """Sample an arm index according to the probabilities ``p``.

        Args:
            p: action probabilities; when omitted they are computed as
                ``softmax(self.q)``.
        """
        if p is None:
            p = softmax(self.q)
        return np.random.choice(self.num_of_arms, p = p)

    def step(self, r):
        """Sample an action, record it, update the baseline and preferences.

        Args:
            r: indexable collection holding this step's reward for each arm.
        """
        p = softmax(self.q)
        action = self.choose_action(p)
        self.actions.append(action)
        self.rewards.append(r[action])
        self.count[action] += 1

        # Incremental mean: R_T = R_{T-1} + (reward - R_{T-1}) / T.
        # The baseline is updated before the preferences, so it already
        # includes the current reward.
        self.T += 1
        self.R += (r[action] - self.R) / self.T

        self.update_q(r, action, p)

    def update_q(self, r, action, p):
        """Apply one gradient-ascent update to the preferences.

        The chosen arm moves by ``alpha * (reward - R) * (1 - p[action])``;
        every other arm ``a`` moves by ``-alpha * (reward - R) * p[a]``.

        Args:
            r: indexable collection holding this step's reward for each arm.
            action: index of the arm that was selected.
            p: action probabilities the action was sampled from.
        """
        scaled_advantage = self.alpha * (r[action] - self.R)
        # Compute the chosen arm's new value first; the vectorised line below
        # applies the "other arms" rule to every entry, including this one.
        chosen_preference = self.q[action] + scaled_advantage * (1 - p[action])
        self.q = self.q - scaled_advantage * p
        self.q[action] = chosen_preference
        
#greedyWithInit  alpha = 0.1
class greedyWithInit(rl):
    """Greedy agent with a configurable initial estimate and constant step.

    Every arm starts at ``Q0``; a ``Q0`` above the typical reward makes the
    purely greedy policy try each arm before settling (optimistic
    initialisation).
    """

    def __init__(self, num_of_arms = 10, init_q = None, Q0 = 1, alpha = 0.1):
        """Initialise every estimate to ``Q0`` and store the step size.

        Args:
            num_of_arms: number of available actions.
            init_q: forwarded to ``rl``, but the estimates are then
                overwritten with ``Q0``, so it has no lasting effect here.
            Q0: initial value estimate assigned to every arm.
            alpha: constant step size of the estimate update.
        """
        rl.__init__(self, num_of_arms, init_q)
        self.alpha = alpha
        # Force a float array: with an integer Q0, np.full would create an
        # integer array and fractional updates would be truncated on store.
        self.q = np.full(num_of_arms, Q0, dtype=float)

    def choose_action(self):
        """Return the arm with the highest estimate (ties: lowest index)."""
        return np.argmax(self.q)

    def update_q(self, r, action):
        """Move ``q[action]`` toward ``r[action]`` by the fraction ``alpha``."""
        self.q[action] += self.alpha * (r[action] - self.q[action])

#constant_step_size epsilon greedy
class css_epsilon_greedy(epsilon_greedy):
    """Epsilon-greedy agent whose estimates use a constant step size.

    Action selection is inherited from ``epsilon_greedy``; only the estimate
    update differs, using a fixed ``alpha`` in place of 1/count.
    """

    def __init__(self, num_of_arms = 10, init_q = None, epsilon = 0.1, alpha = 0.1):
        """Set up the epsilon-greedy state and remember the step size."""
        epsilon_greedy.__init__(self, num_of_arms, init_q, epsilon)
        self.alpha = alpha

    def update_q(self, r, action):
        """Shift ``q[action]`` toward ``r[action]`` by the fraction ``alpha``."""
        error = r[action] - self.q[action]
        self.q[action] += self.alpha * error

array([0.09003057, 0.24472847, 0.66524096])

2

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])