In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import numpy as np

In [3]:
class Games:
    
    player_class = None
    slot_class = None
    arms = 0
    times = 0
    count = 0
    expected_rewards_arms = None
    expected_earned_rewards = None

    def __init__(self, player_class, slot_class, arms, times, count):
        self.player_class = player_class
        self.slot_class = slot_class
        self.arms = arms
        self.times = times
        self.count = count
        self.expected_rewards = np.empty(0, float)
        self.expected_rewards_arms = np.empty((0, arms), float)
        
    def execute(self):
        for _ in range(count):
            game = Game(self.player_class, self.slot_class, self.arms, self.times)
            game.play()
            
            expected_reward_arms = game.player.cal_expected_reward_arms()
            self.expected_rewards_arms = np.append(self.expected_rewards_arms, expected_reward_arms)
            
            expected_reward = game.player.cal_expected_reward()
            self.expected_earned_rewards = np.append(self.expected_rewards, expected_reward)
            
            
    def pseudo_regret(self):
        return self.max_arm_expected_rewards_mean() + self.expected_rewards_mean()
        
    def max_arm_expected_rewards_mean(self):
        return self.expected_rewardss_arms.mean(axis=0).max()
    
    def expected_rewards_mean(self):
        return self.expected_rewards.mean()

class Game:
    player = None
    slot = None
    arms = 0
    times = 0
    
    def __init__(self, player_class, slot, arms, times):
        self.player = player_class(arms,)
        self.slot = slot_class()
        self.arms = arms
        self.times = times
        
    def play(self):
        for _ in range(self.times):
            self.turn()
    
    def turn(self):
        self.player.update_probs()
        
        rewards = self.slot.return_rewards(self.player.get_probs())
        self.player.pull_arm(rewards)
        
    def result(self):
        return self.player.earned_rewards.sum()
    
class Player:
    weightss = None
    probss = None
    cumulative_rewardss = None
    choiced_arms = np.array([])
    earned_rewards = np.array([])
    expected_rewardss = None
    
    def __init__(self, arms):
        initial_weights = np.ones(arms) / arms
        self.weightss = initial_weights.reshape(1,-1)
        self.probss = np.empty((0,arms),float)
        self.cumulative_rewardss = np.zeros(arms).reshape(1,-1)
        self.expected_rewardss = np.empty((0,arms), float)
        
    def get_weights(self):
        return self.weightss[-1]

    def get_probs(self):
        return self.probss[-1]
    
    def cal_expected_rewards(self, probs, rewards):
        expected_rewards = probs * rewards
        return expected_rewards
    
    def cal_expected_reward(self):
        reward = self.expected_rewardss.sum()
        return reward

    def cal_expected_reward_arms(self):
        reward_arms = self.expected_rewardss.sum(axis=0)
        return reward_arms

        
class Slot:
    
    rewardss = None
    
    def __init__(self,arms):
        self.rewardss = np.empty((0,arms), float)
        
class Random(Slot):
    def return_rewards(self, probs):
        rewards = np.zeros(len(probs))
        idx = np.random.randint(0, len(probs))
        rewards[idx] = 1
        self.rewardss = np.append(self.rewardss, rewards.reshape(1,-1), axis=0)
        
        return rewards

# class FixedProbs(Slot):
#     probs = None
#     def __init__(self, probs):
#         self.rewardss = np.empty((0,len(probs)), float)
#         self.probs = probs
        
#     def return_rewards(self, _):
#         idx = np.random.choice(len(self.probs), p=self.probs)
#         rewards = np.zeros(len(self.probs))
#         rewards[idx] = 1
#         self.rewardss = np.append(self.rewardss, rewards.reshape(1,-1), axis=0)
        
#         return rewards
    
class AdaptiveAdversary(Slot):
    def return_rewards(self, probs):
        # 最も確率が低いアームのindexを取得。複数ある場合はrandomに選ぶ。
        min_index = np.random.choice(np.flatnonzero(probs == probs.min()))
        rewards = np.zeros(len(probs))
        rewards[min_index] = 1
        self.rewardss = np.append(self.rewardss, rewards.reshape(1,-1), axis=0)
        return rewards

In [9]:
def create_player_hedge(eta):
    class Hedge(Player):

        eta = 1

        def __init__(self, arms, eta=1):
            super().__init__(arms)
            self.eta = eta

        def update_probs(self):
            weights = self.get_weights()
            probs = weights / weights.sum()
            self.probss = np.vstack((self.probss, probs))

        def update_weights(self, rewards):
            weights = self.get_weights()
            new_weights = weights * np.exp(rewards * self.eta)
            self.weightss = np.vstack((self.weightss, new_weights))

        def pull_arm(self, rewards):
            probs = self.get_probs()
            choiced_arm = np.random.choice(len(probs), p=probs)
            self.choiced_arms = np.append(self.choiced_arms, choiced_arm)
            reward = rewards[choiced_arm]
            self.earned_rewards = np.append(self.earned_rewards, reward)

            self.update_weights(rewards)

            expected_rewards = self.cal_expected_rewards(probs, rewards)
            self.expected_rewardss = np.vstack((self.expected_rewardss, expected_rewards))

    return Hedge

In [4]:
def create_fixed_probs_class(probs):
    class FixedProbs(Slot):
        def __init__(self):
            self.rewardss = np.empty((0,len(self.probs)), float)
            self.probs = probs
            
        def return_rewards(self, _):
            idx = np.random.choice(len(self.probs), p=self.probs)
            rewards = np.zeros(len(self.probs))
            rewards[idx] = 1
            self.rewardss = np.append(self.rewardss, rewards.reshape(1,-1), axis=0)

            return rewards
    
    return FixedProbs

In [10]:
# 1アームだけ報酬でやすい
arms = 5 
times = 1000
count = 100

eta = 0.7
player = create_player_hedge(eta)

probs = np.array([0.5,0.125,0.125,0.125,0.125])
slot = create_fixed_probs_class(probs)

games = Games(player, slot, arms, times, count)
games.execute()

TypeError: update_probs() missing 1 required positional argument: 'self'

In [None]:
# 1アームだけ報酬でやすい
arms = 5 
times = 1000
eta = 0.7
player = Hedge(arms,eta)

probs = np.array([0.5,0.125,0.125,0.125,0.125])
slot = FixedProbs(probs)

game = Game(player, slot, arms, times)
game.play()

In [None]:
game.result()

In [None]:
game.player.

In [None]:
game.player.expected_rewardss

In [None]:
# etaを微小な値にすると重みが更新されない
arms = 5
times = 1000
eta = 0.000001
player = Hedge(arms,eta)

probs = np.array([0.5,0.125,0.125,0.125,0.125])
slot = FixedProbs(probs)

game = Game(player, slot, arms, times)
game.play()

In [None]:
game.result()

In [None]:
game.player.weightss

In [None]:
# 敵対的
arms = 5
times = 1000
eta = 0.7
player = Hedge(arms, eta)

slot = AdaptiveAdversary(arms)
game = Game(player, slot, arms, times)

In [None]:
game.play()

In [None]:
game.result()

In [None]:
game.slot.rewardss.sum(axis=0)

In [None]:
# プレイヤーがアームを引く確率は累積報酬のみで決まる。
# プレイヤーが引く確率が低いアーム（＝累積報酬が最も少ないアーム）に報酬を設定することになるため
# 敵対者は各アームの累積報酬は等しくなるように設定する