# Different agents for the multi-armed bandit problem

This notebook contains several learning agents for the multi-armed bandit problem: a random agent, a greedy agent, an epsilon-greedy agent, and an upper confidence bound (UCB) agent.

In [None]:
import gym
import gym_bandits
import pandas as pd
import numpy as np

In [None]:
# Create the bandit environment registered under the id below
# (presumably registered by the gym_bandits import — verify).
env = gym.make("BanditTenArmedGaussian-v0")

In [None]:
# Show the environment's observation and action spaces.
print("States",env.observation_space)
print("Actions",env.action_space)

In [None]:
class Agent:
    """Base class for bandit agents that track sample-average value estimates.

    Attributes:
        n_bandits: number of arms (actions) available.
        all_n: per-arm count of how often each arm has been selected.
        all_Q: per-arm running average of the rewards observed for that arm.

    Subclasses provide the action-selection policy by overriding next_action().
    """
    def __init__(self,n_bandits):
        self.n_bandits = n_bandits
        self.reset()
    def reset(self):
        """Forget everything learned: zero all counts and value estimates."""
        self.all_n = np.zeros(self.n_bandits)
        self.all_Q = np.zeros(self.n_bandits)
    def next_action(self,observation):
        """Return the index of the arm to pull next; must be overridden."""
        # Defined here so that using the base class directly fails with an
        # explicit message instead of a bare AttributeError.
        raise NotImplementedError("subclasses must implement next_action()")
    def update(self,state,action,reward,t):
        """Fold one observed reward into the estimate of the chosen arm.

        state and t are accepted for interface compatibility but are not used.
        """
        self.all_n[action]+=1
        # Incremental mean: Q_new = Q_old + (reward - Q_old) / n
        self.all_Q[action]+=(reward-self.all_Q[action])/self.all_n[action]

In [None]:
def run_multi_armed_bandit(env,time_steps,agent):
    """Let `agent` interact with `env` for `time_steps` steps.

    Both the environment and the agent are reset first. On every step the
    agent chooses an action, the environment is stepped with it, and the agent
    is updated with the new observation, the action, the reward and the
    zero-based step index. The `done` and `info` values returned by the
    environment are ignored.

    Returns the mean of all collected rewards.
    """
    obs = env.reset()
    agent.reset()
    collected = []
    for step_index in range(time_steps):
        chosen = agent.next_action(obs)
        obs, payoff, _done, _info = env.step(chosen)
        agent.update(obs, chosen, payoff, step_index)
        collected.append(payoff)
    return np.asarray(collected).mean()

RANDOM AGENT

In [None]:
class RandomAgent(Agent):
    """Agent that ignores its value estimates and picks an arm at random."""
    def next_action(self,observation):
        """Return an arm index drawn uniformly from 0 .. n_bandits - 1."""
        arm_count = self.n_bandits
        return np.random.randint(arm_count)

In [None]:
# Random baseline agent over 10 arms.
random_agent = RandomAgent(n_bandits=10)

GREEDY AGENT

In [None]:
class GreedyAgent(Agent):
    """Agent that always exploits the arm with the highest current estimate."""
    def next_action(self,observation):
        """Return the index of the largest entry of all_Q (first one on ties)."""
        estimates = self.all_Q
        return np.argmax(estimates)

In [None]:
# Purely greedy agent over 10 arms.
greedy_agent = GreedyAgent(10)

EPSILON-GREEDY AGENT

In [None]:
class EpsilonGreedyAgent(Agent):
    """Agent that explores with probability epsilon and is greedy otherwise."""
    def __init__(self,n_bandits,epsilon):
        # Store epsilon first; the base constructor then records n_bandits
        # and resets the counts and estimates.
        self.epsilon = epsilon
        super().__init__(n_bandits)
    def next_action(self,observation):
        """Return a random arm with probability epsilon, else the best arm."""
        explore = np.random.random() < self.epsilon
        if not explore:
            return np.argmax(self.all_Q)
        return np.random.randint(0, self.n_bandits)

In [None]:
# Epsilon-greedy agent over 10 arms with epsilon = 0.1.
epsilon_greedy_agent = EpsilonGreedyAgent(10,0.1)

UPPER CONFIDENCE BOUND AGENT

In [None]:
class UCB_LearningAgent:
    """Upper-confidence-bound agent.

    Each arm is scored as Q + sqrt(2 * ln(rounds) / N), where Q is the arm's
    running average reward and N its selection count; the highest-scoring arm
    is played next.

    Attributes:
        n_bandits: number of arms.
        Q_all: per-arm value estimates, initialised uniformly in [0, 1).
        N_all: per-arm selection counts, initialised to one.
        ucb_all: per-arm upper confidence bounds used by next_action().
    """
    def __init__(self, n_bandits):
        self.n_bandits = n_bandits
        self.reset()
    def reset(self):
        """Re-initialise the estimates, counts and confidence bounds."""
        self.Q_all = np.random.rand(self.n_bandits)
        self.N_all = np.ones(self.n_bandits) # initialize number of selections for all bandits as ones
        # ln(1) == 0, so the initial bounds equal the initial estimates.
        self.ucb_all = self._upper_bounds(1)
    def _upper_bounds(self, rounds):
        """Return the per-arm bounds after `rounds` rounds have been played."""
        # Clamp to 1 so the logarithm is never negative (sqrt would give NaN).
        log_rounds = np.log(max(rounds, 1))
        return self.Q_all + np.sqrt(2*(log_rounds/self.N_all))
    def update(self, observation,action, reward, t):
        """Record `reward` for `action` and refresh all confidence bounds.

        t is the zero-based index of the step that was just played;
        observation is accepted for interface compatibility but not used.
        """
        self.N_all[action]+=1 # increment number of selections for selected bandit (action)
        self.Q_all[action]+=(reward - self.Q_all[action])/self.N_all[action] # update Q with the formula. Average so far
        # t is zero-based, so t + 1 rounds have been played. Using t directly
        # would evaluate log(0) = -inf on the first update and yield NaN bounds.
        self.ucb_all = self._upper_bounds(t + 1)
    def next_action(self,observation):
        """Return the index of the arm with the highest upper confidence bound."""
        return self.ucb_all.argmax()

In [None]:
# Upper-confidence-bound agent over 10 arms.
ucb_agent = UCB_LearningAgent(10)

TESTING

In [None]:
# Run every agent for 1000 steps on the same environment and print the
# mean reward each one obtained.
for label, agent in (
    ("Random agent", random_agent),
    ("Greedy agent", greedy_agent),
    ("Epsilon-greedy agent", epsilon_greedy_agent),
    ("Upper Confidence Bound agent", ucb_agent),
):
    print(label, run_multi_armed_bandit(env, 1000, agent))