In [1]:
import numpy as np

In [2]:
class Game:
    """Repeated game between a player object and a slot object.

    Every round the player refreshes its arm probabilities, the slot is asked
    for a reward vector given those probabilities, and the player then pulls
    an arm with that reward vector.
    """

    # Class-level fallbacks; each instance overwrites all four in __init__.
    player = None
    slot = None
    arms = 0
    times = 0

    def __init__(self, player, slot, arms, times):
        """Store the two participants, the arm count and the number of rounds.

        Args:
            player: Object providing ``update_probs()``, ``get_probs()``,
                ``pull_arm(rewards)`` and an ``earned_rewards`` attribute.
            slot: Object providing ``return_rewards(probs)``.
            arms: Number of arms (only stored; not used by this class).
            times: Number of rounds ``play()`` runs.
        """
        self.player, self.slot = player, slot
        self.arms, self.times = arms, times

    def play(self):
        """Run ``times`` rounds back to back."""
        for _round in range(self.times):
            self.turn()

    def turn(self):
        """Play one round: update probabilities, fetch rewards, pull an arm."""
        gambler = self.player
        gambler.update_probs()

        # The slot sees the player's current probabilities before the pull.
        round_rewards = self.slot.return_rewards(gambler.get_probs())
        gambler.pull_arm(round_rewards)

    def result(self):
        """Return the sum of all rewards the player has earned so far."""
        earned = self.player.earned_rewards
        return earned.sum()

class Player:
    """Base class for bandit players that record their state round by round.

    Subclasses are expected to provide ``update_probs()`` and
    ``pull_arm(rewards)``; those are the methods ``Game.turn`` calls.

    Attributes:
        weightss: ``(rounds + 1, arms)`` history of arm weights; row 0 is the
            uniform initial weighting.
        probss: ``(rounds, arms)`` history of arm-selection probabilities.
        cumulative_rewardss: ``(rounds + 1, arms)`` history of per-arm
            cumulative rewards; row 0 is all zeros.
        choiced_arms: 1-D array of the arm index pulled each round (stored as
            floats, because it grows from an empty float array).
        earned_rewards: 1-D array of the reward earned each round.
    """

    # Class-level fallbacks kept for backward compatibility; __init__ gives
    # every instance its own arrays.
    weightss = None
    probss = None
    cumulative_rewardss = None
    choiced_arms = np.array([])
    earned_rewards = np.array([])

    def __init__(self, arms):
        """Create the empty histories for a player facing ``arms`` arms."""
        initial_weights = np.ones(arms) / arms
        self.weightss = initial_weights.reshape(1, -1)
        self.probss = np.empty((0, arms), float)
        self.cumulative_rewardss = np.zeros(arms).reshape(1, -1)
        # Per-instance histories, so no state ever lives on the class itself.
        self.choiced_arms = np.array([])
        self.earned_rewards = np.array([])

    def get_weights(self):
        """Return the most recent row of the weight history."""
        return self.weightss[-1]

    def get_probs(self):
        """Return the most recent row of the probability history.

        Raises:
            IndexError: If no probabilities have been computed yet.
        """
        if len(self.probss) == 0:
            raise IndexError("no probabilities recorded yet; call update_probs() first")
        return self.probss[-1]


class Hedge(Player):
    """Hedge (exponential weights) player.

    Each round every arm's weight is multiplied by ``exp(eta * reward)`` and
    the pull probabilities are the weights divided by their sum.

    Weights are stored up to a common power-of-two scale factor: only their
    ratios are meaningful. Rescaling keeps them inside the floating-point
    range for arbitrarily long games without changing the probabilities.
    """

    def __init__(self, arms, eta=1.0):
        """Create a Hedge player.

        Args:
            arms: Number of arms.
            eta: Learning rate applied to the rewards inside the exponential.
                The default of 1.0 reproduces the un-parameterised update.
        """
        super().__init__(arms)
        self.eta = eta

    def update_probs(self):
        """Append the current weights, normalised to sum to one, to ``probss``."""
        weights = self.get_weights()
        probs = weights / weights.sum()
        self.probss = np.vstack((self.probss, probs))

    def update_weights(self, rewards):
        """Apply one multiplicative-weights step for a full reward vector.

        Args:
            rewards: Per-arm rewards of the round (one entry per arm).
        """
        rewards = np.asarray(rewards, dtype=float)
        new_weights = self.get_weights() * np.exp(self.eta * rewards)

        # Without rescaling the largest weight grows like exp(cumulative
        # reward) and overflows to inf after roughly 710 units of reward,
        # which turns the probabilities into NaN. Dividing by a power of two
        # is exact in binary floating point, so arms with equal cumulative
        # reward keep exactly equal weights and the probabilities match the
        # unscaled computation.
        _, exponent = np.frexp(new_weights.max())
        new_weights = np.ldexp(new_weights, -int(exponent))

        self.weightss = np.vstack((self.weightss, new_weights))
        self.cumulative_rewardss = np.vstack(
            (self.cumulative_rewardss, self.cumulative_rewardss[-1] + rewards)
        )

    def pull_arm(self, rewards):
        """Sample an arm from the current probabilities and record the outcome.

        The chosen arm and its reward are appended to ``choiced_arms`` and
        ``earned_rewards``; the weights are then updated with the complete
        reward vector (full-information feedback).

        Args:
            rewards: Per-arm rewards of the round.
        """
        probs = self.get_probs()
        choiced_arm = np.random.choice(len(probs), p=probs)
        self.choiced_arms = np.append(self.choiced_arms, choiced_arm)
        reward = rewards[choiced_arm]
        self.earned_rewards = np.append(self.earned_rewards, reward)

        self.update_weights(rewards)
        
class Slot:
    """Base slot machine holding the history of per-round reward vectors.

    ``rewardss`` starts as an empty float array with one column per arm; the
    subclasses in this file append one row to it every round.
    """

    # Class-level placeholder; replaced by a per-instance array in __init__.
    rewardss = None

    def __init__(self, arms):
        """Start with an empty ``(0, arms)`` reward history."""
        self.rewardss = np.zeros((0, arms), dtype=float)
        
class Random(Slot):
    """Slot that gives reward 1 to one uniformly random arm each round."""

    def return_rewards(self, probs):
        """Return a one-hot reward vector and append it to ``rewardss``.

        Args:
            probs: The player's current probabilities; only its length is
                used, to determine the number of arms.

        Returns:
            Float array with a single 1 at the randomly chosen arm.
        """
        n_arms = len(probs)
        payout = np.zeros(n_arms)
        lucky_arm = np.random.randint(0, n_arms)
        payout[lucky_arm] = 1

        # Record the round as one new row of the history.
        self.rewardss = np.concatenate((self.rewardss, payout[np.newaxis, :]), axis=0)
        return payout

class FixedProbs(Slot):
    """Slot whose single rewarded arm is drawn each round from fixed probabilities."""

    # Class-level placeholder; set per instance in __init__.
    probs = None

    def __init__(self, probs):
        """Remember the payout distribution and start an empty reward history.

        Args:
            probs: Per-arm probabilities, later passed to ``np.random.choice``
                as ``p``; its length fixes the number of arms.
        """
        n_arms = len(probs)
        self.probs = probs
        self.rewardss = np.zeros((0, n_arms), dtype=float)

    def return_rewards(self, _):
        """Return a one-hot reward vector and append it to ``rewardss``.

        The argument (the player's probabilities) is ignored.

        Returns:
            Float array with a single 1 at the arm sampled from ``self.probs``.
        """
        n_arms = len(self.probs)
        winner = np.random.choice(n_arms, p=self.probs)
        payout = np.zeros(n_arms)
        payout[winner] = 1

        # Record the round as one new row of the history.
        self.rewardss = np.concatenate((self.rewardss, payout[np.newaxis, :]), axis=0)
        return payout
    
class AdaptiveAdversary(Slot):
    """Slot that rewards the arm the player is currently least likely to pull."""

    def return_rewards(self, probs):
        """Return a one-hot reward vector and append it to ``rewardss``.

        Args:
            probs: The player's current arm probabilities (a NumPy array).

        Returns:
            Float array with a single 1 at a lowest-probability arm.
        """
        # Find every arm tied for the lowest probability; when there are
        # several, pick one of them uniformly at random.
        lowest = probs.min()
        tied_arms = np.flatnonzero(probs == lowest)
        target = np.random.choice(tied_arms)

        payout = np.zeros(len(probs))
        payout[target] = 1

        # Record the round as one new row of the history.
        self.rewardss = np.concatenate((self.rewardss, payout[np.newaxis, :]), axis=0)
        return payout

In [3]:
# Stochastic setting: only arm 0 is likely to pay out.
# The player and slot classes draw from NumPy's global legacy RNG
# (np.random.choice / np.random.randint), so seed it to make the run
# repeatable. Re-run the following cells to refresh their recorded outputs.
np.random.seed(0)

arms = 5
times = 1000
player = Hedge(arms)

probs = np.array([0.5,0.125,0.125,0.125,0.125])
slot = FixedProbs(probs)

game = Game(player, slot, arms, times)
game.play()

In [4]:
# Total reward the Hedge player earned over the fixed-probability run.
game.result()

516.0

In [5]:
# Adversarial setting: the slot reacts to the player's current probabilities.
# The player and slot classes draw from NumPy's global legacy RNG, so seed it
# to make the run repeatable. Re-run the following cells to refresh their
# recorded outputs.
np.random.seed(0)

arms = 5
times = 1000
player = Hedge(arms)

slot = AdaptiveAdversary(arms)
game = Game(player, slot, arms, times)

In [6]:
# Play all `times` rounds against the adaptive adversary.
game.play()

In [7]:
# Total reward the Hedge player earned against the adversary.
game.result()

111.0

In [8]:
# Column sums of the adversary's per-round reward history,
# i.e. the total reward it assigned to each arm.
game.slot.rewardss.sum(axis=0)

array([200., 200., 200., 200., 200.])

In [9]:
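# The player's probability of pulling an arm is determined solely by that arm's cumulative reward.
# The adversary always puts the reward on the arm the player is least likely to pull
# (= the arm with the smallest cumulative reward so far), so it ends up keeping the cumulative
# rewards of all arms equal: 1000 rounds over 5 arms gives 200 per arm, as shown above.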