In [1]:
import sys
import numpy as np
import random
import matplotlib.pyplot as plt

In [3]:
class NArmBandit(object):
    """Stationary n-armed bandit whose arm values are drawn once from a normal distribution.

    Attributes:
        num_arms: number of arms, as passed in.
        mean: mean of the distribution the arm values were drawn from.
        var: variance of the distribution the arm values were drawn from.
        actions: 1-D array holding the true (noise-free) value of each arm.
        optimal_action: 1-D array with the index of every arm whose value equals
            the maximum; it has more than one entry only when arms tie.
    """

    def __init__(self, n, mean, var):
        """Draw the true value of each of the ``n`` arms.

        Args:
            n: number of arms.
            mean: mean of the arm-value distribution.
            var: variance of the arm-value distribution; must be non-negative.

        Raises:
            ValueError: if ``var`` is negative.
        """
        if var < 0:
            raise ValueError("var must be non-negative, got {}".format(var))
        self.num_arms = n
        self.mean = mean
        self.var = var
        # np.random.normal expects a standard deviation as its scale argument,
        # so the variance has to be converted before sampling.
        self.actions = np.random.normal(mean, np.sqrt(var), n)
        self.optimal_action = np.flatnonzero(self.actions == self.actions.max())

    def reward_for(self, action_index):
        """Return the value of arm ``action_index`` plus one draw of N(0, 1) noise."""
        return self.actions[action_index] + np.random.normal(0, 1)

In [21]:
class Agent(object):
    """Common bookkeeping for bandit agents that learn by sample averaging.

    ``q_vals`` holds the running mean reward per action and ``k_vals`` how many
    rewards have been folded into each mean. This class never assigns
    ``num_actions`` itself; it must exist on the instance before ``reset`` runs.
    """

    def __init__(self):
        pass

    def reset(self):
        """Forget everything learned: one zero estimate and one zero count per action."""
        self.q_vals = [0.0] * self.num_actions
        self.k_vals = [0] * self.num_actions

    def update_q_vals(self, action, reward):
        """Fold ``reward`` into the running mean of ``action`` and bump its count."""
        seen = self.k_vals[action]
        reward_total = self.q_vals[action] * seen + reward
        self.q_vals[action] = reward_total / (seen + 1)
        self.k_vals[action] = seen + 1

class EpsilonGreedyAgent(Agent):
    """Agent that explores uniformly with probability ``epsilon`` and is greedy otherwise."""

    def __init__(self, num_actions, e):
        """Create an agent over ``num_actions`` arms with exploration rate ``e``."""
        self.num_actions = num_actions
        self.epsilon = e
        self.reset()

    def name(self):
        """Return the label used for this agent in progress output and plot legends."""
        return "E-Greedy Agent {}".format(self.epsilon)

    def select_action(self, bandit):
        """Choose an arm, pull it on ``bandit``, and learn from the reward.

        Args:
            bandit: object exposing ``reward_for(index)`` and ``optimal_action``.

        Returns:
            Tuple ``(reward, hit)`` where ``hit`` is 1 if the pulled arm is one of
            the bandit's optimal arms and 0 otherwise.
        """
        if random.random() < self.epsilon:
            action = random.randrange(self.num_actions)
        else:
            # list.index returns the first match, so ties go to the lowest index.
            action = self.q_vals.index(max(self.q_vals))

        reward = bandit.reward_for(action)
        self.update_q_vals(action, reward)

        # optimal_action may be a scalar or an array of tied indices; np.any
        # handles both, whereas taking the truth value of a multi-element
        # comparison raises ValueError.
        hit = bool(np.any(np.asarray(bandit.optimal_action) == action))
        return reward, (1 if hit else 0)
    
class SoftmaxAgent(Agent):
    """Agent that samples arms from a Gibbs (softmax) distribution over its estimates.

    ``temp_vals`` holds one unnormalised weight per action and ``gibbs_sum`` their
    total. Weights are stored relative to the best current estimate, i.e.
    ``exp((q - max(q)) / temperature)``, which defines the same distribution as
    ``exp(q / temperature)`` but cannot overflow.
    """

    def __init__(self, num_actions, t):
        """Create an agent over ``num_actions`` arms with temperature ``t``.

        Raises:
            ValueError: if ``t`` is not strictly positive.
        """
        if t <= 0:
            raise ValueError("temperature must be positive, got {}".format(t))
        self.num_actions = num_actions
        self.temperature = t
        self.reset()

    def name(self):
        """Return the label used for this agent in progress output and plot legends."""
        return "Softmax Agent {}".format(self.temperature)

    def reset(self):
        """Zero the estimates and counts and restore uniform selection weights."""
        self.q_vals = [0.0 for _ in range(self.num_actions)]
        self.k_vals = [0 for _ in range(self.num_actions)]
        # Every estimate is 0, so every weight is exp(0) == 1.
        self.temp_vals = [1.0 for _ in range(self.num_actions)]
        self.gibbs_sum = float(self.num_actions)

    def update_temp_vals(self, action):
        """Rebuild all selection weights from the current estimates.

        ``action`` is accepted for compatibility with existing callers. All
        weights are recomputed rather than only that one, because they are
        expressed relative to the current maximum estimate and because a running
        add/subtract total loses small terms next to very large ones.
        """
        best = max(self.q_vals)
        shifted = (np.asarray(self.q_vals, dtype=float) - best) / self.temperature
        self.temp_vals = np.exp(shifted).tolist()
        self.gibbs_sum = sum(self.temp_vals)

    def select_action(self, bandit):
        """Sample an arm, pull it on ``bandit``, and learn from the reward.

        Args:
            bandit: object exposing ``reward_for(index)`` and ``optimal_action``.

        Returns:
            Tuple ``(reward, hit)`` where ``hit`` is 1 if the pulled arm is one of
            the bandit's optimal arms and 0 otherwise.
        """
        threshold = random.random() * self.gibbs_sum
        # Walk the cumulative weights; the last arm is the fallback should
        # rounding keep the running total from ever exceeding the threshold.
        action = len(self.temp_vals) - 1
        running = 0.0
        for index, weight in enumerate(self.temp_vals):
            running += weight
            if running > threshold:
                action = index
                break

        reward = bandit.reward_for(action)
        self.update_q_vals(action, reward)
        self.update_temp_vals(action)

        # optimal_action may be a scalar or an array of tied indices; np.any
        # handles both, whereas taking the truth value of a multi-element
        # comparison raises ValueError.
        hit = bool(np.any(np.asarray(bandit.optimal_action) == action))
        return reward, (1 if hit else 0)
    

In [23]:
# Experiment configuration.
num_bandits = 2000   # independent bandit problems each agent is averaged over
num_steps = 1000     # pulls each agent makes on every bandit
bandit_arms = 10     # arms per bandit
bandit_mean = 0.0    # mean passed to NArmBandit for the arm values
bandit_var = 1.0     # var passed to NArmBandit for the arm values

# Seed both generators the notebook draws from so a fresh
# Restart-and-Run-All reproduces the same figure.
random_seed = 0
random.seed(random_seed)
np.random.seed(random_seed)

In [26]:
def run_agent(agent, bandits, num_steps):
    """Run ``agent`` for ``num_steps`` pulls on every bandit and average per step.

    The agent is reset before each bandit. Only running per-step totals are
    kept, not every individual result.

    Returns:
        Tuple ``(avg_reward, frac_optimal)`` of 1-D arrays of length ``num_steps``:
        the mean reward at each step and the fraction of bandits on which the
        optimal arm was pulled at that step.
    """
    totals = np.zeros((num_steps, 2))
    for bandit in bandits:
        agent.reset()
        # select_action returns a (reward, hit) pair for each pull.
        steps = [agent.select_action(bandit) for _ in range(num_steps)]
        totals += np.array(steps, dtype=float).reshape(num_steps, 2)
    averages = totals / len(bandits)
    return averages[:, 0], averages[:, 1]


agents = [
    EpsilonGreedyAgent(num_actions=bandit_arms, e=0.0),
    EpsilonGreedyAgent(num_actions=bandit_arms, e=0.01),
    EpsilonGreedyAgent(num_actions=bandit_arms, e=0.1),
    SoftmaxAgent(num_actions=bandit_arms, t=0.01),
    SoftmaxAgent(num_actions=bandit_arms, t=0.1),
    SoftmaxAgent(num_actions=bandit_arms, t=1)
]

# Every agent is evaluated on the same set of bandits.
bandits = [NArmBandit(n=bandit_arms, mean=bandit_mean, var=bandit_var) for i in range(num_bandits)]

fig, (ax1, ax2) = plt.subplots(2, 1)
fig.subplots_adjust(hspace=0.5)  # room for the lower title and the x label
x_data = np.arange(num_steps)
colors = ['r', 'g', 'b', 'c', 'y', 'm', 'k', 'r']

for a, agent in enumerate(agents):
    # Single-argument call form prints identically on Python 2 and Python 3.
    print("Running agent {}".format(agent.name()))
    avg_r, avg_o = run_agent(agent, bandits, num_steps)

    color = colors[a % len(colors)]
    ax1.plot(x_data, avg_r, label=agent.name(), color=color)
    # The axis is labelled in percent, so scale the 0-1 fraction accordingly.
    ax2.plot(x_data, 100.0 * avg_o, label=agent.name(), color=color)

# Narrow both panels to leave space on the right for the legend.
for ax in (ax1, ax2):
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.7, box.height])

ax1.set_ylabel('Average Reward')
ax1.set_title("Average Reward per Agent")

ax2.set_xlabel('Steps')
ax2.set_ylabel('% Opt Action')
ax2.set_title("% Opt Action per Agent")

ax1.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=10)
plt.show()

Running agent E-Greedy Agent 0.0
Running agent E-Greedy Agent 0.01
Running agent E-Greedy Agent 0.1
Running agent Softmax Agent 0.01
Running agent Softmax Agent 0.1
Running agent Softmax Agent 1
