In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import numpy as np

In [67]:
class Games:
    
    player_class = None
    slot_class = None
    arms = 0
    times = 0
    game_count = 0
    expected_rewards_arms = None
    expected_earned_rewards = None

    def __init__(self, player_class, slot_class, arms, times, game_count):
        self.player_class = player_class
        self.slot_class = slot_class
        self.arms = arms
        self.times = times
        self.game_count = game_count
        self.expected_rewards = np.empty(0, float)
        self.expected_rewards_arms = np.empty((0, arms), float)
        
    def execute(self):
        for _ in range(self.game_count):
            game = Game(self.player_class, self.slot_class, self.arms, self.times)
            game.play()
            
            expected_reward_arms = game.player.cal_expected_reward_arms()
            self.expected_rewards_arms = np.vstack((self.expected_rewards_arms, expected_reward_arms))
            
            expected_reward = game.player.cal_expected_reward()
            self.expected_rewards = np.append(self.expected_rewards, expected_reward)
            
            
    def pseudo_regret(self):
        return self.max_arm_expected_rewards_mean() - self.expected_rewards_mean()
        
    def max_arm_expected_rewards_mean(self):
        return self.expected_rewards_arms.mean(axis=0).max()
    
    def expected_rewards_mean(self):
        return self.expected_rewards.mean()

class Game:
    player = None
    slot = None
    arms = 0
    times = 0
    
    def __init__(self, player_class, slot_class, arms, times):
        self.player = player_class(arms)
        self.slot = slot_class(arms)
        self.arms = arms
        self.times = times
        
    def play(self):
        for _ in range(self.times):
            self.turn()
    
    def turn(self):
        self.player.update_probs()
        
        rewards = self.slot.return_rewards(self.player.get_probs())
        self.player.pull_arm(rewards)
        
    def result(self):
        return self.player.earned_rewards.sum()
    
class Player:
    weightss = None
    probss = None
    cumulative_rewardss = None
    choiced_arms = np.array([])
    earned_rewards = np.array([])
    expected_rewardss = None
    
    def __init__(self, arms):
        initial_weights = np.ones(arms) / arms
        self.weightss = initial_weights.reshape(1,-1)
        self.probss = np.empty((0,arms),float)
        self.cumulative_rewardss = np.zeros(arms).reshape(1,-1)
        self.expected_rewardss = np.empty((0,arms), float)
        
    def get_weights(self):
        return self.weightss[-1]

    def get_probs(self):
        return self.probss[-1]
    
    def cal_expected_rewards(self, probs, rewards):
        expected_rewards = probs * rewards
        return expected_rewards
    
    def cal_expected_reward(self):
        reward = self.expected_rewardss.sum()
        return reward

    def cal_expected_reward_arms(self):
        reward_arms = self.expected_rewardss.sum(axis=0)
        return reward_arms

class Slot:
    
    rewardss = None
    
    def __init__(self,arms):
        self.rewardss = np.empty((0,arms), float)
        
class Random(Slot):
    def return_rewards(self, probs):
        rewards = np.zeros(len(probs))
        idx = np.random.randint(0, len(probs))
        rewards[idx] = 1
        self.rewardss = np.append(self.rewardss, rewards.reshape(1,-1), axis=0)
        
        return rewards
    
class AdaptiveAdversary(Slot):
    def return_rewards(self, probs):
        # 最も確率が低いアームのindexを取得。複数ある場合はrandomに選ぶ。
        min_index = np.random.choice(np.flatnonzero(probs == probs.min()))
        rewards = np.zeros(len(probs))
        rewards[min_index] = 1
        self.rewardss = np.append(self.rewardss, rewards.reshape(1,-1), axis=0)
        return rewards

In [68]:
def create_player_hedge(eta):
    class Hedge(Player):

        eta = 1

        def __init__(self, arms):
            super().__init__(arms)
            self.eta = eta

        def update_probs(self):
            weights = self.get_weights()
            probs = weights / weights.sum()
            self.probss = np.vstack((self.probss, probs))

        def update_weights(self, rewards):
            weights = self.get_weights()
            new_weights = weights * np.exp(rewards * self.eta)
            self.weightss = np.vstack((self.weightss, new_weights))

        def pull_arm(self, rewards):
            probs = self.get_probs()
            choiced_arm = np.random.choice(len(probs), p=probs)
            self.choiced_arms = np.append(self.choiced_arms, choiced_arm)
            reward = rewards[choiced_arm]
            self.earned_rewards = np.append(self.earned_rewards, reward)

            self.update_weights(rewards)

            expected_rewards = self.cal_expected_rewards(probs, rewards)
            self.expected_rewardss = np.vstack((self.expected_rewardss, expected_rewards))

    return Hedge

In [69]:
def create_fixed_probs_class(probs):
    class FixedProbs(Slot):
        
        probs = None
        
        def __init__(self, _):
            self.probs = probs
            self.rewardss = np.empty((0,len(self.probs)), float)
            
        def return_rewards(self, _):
            idx = np.random.choice(len(self.probs), p=self.probs)
            rewards = np.zeros(len(self.probs))
            rewards[idx] = 1
            self.rewardss = np.append(self.rewardss, rewards.reshape(1,-1), axis=0)

            return rewards
    
    return FixedProbs

In [70]:
# 1アームだけ報酬でやすい
arms = 5 
times = 1000
count = 100

eta = 0.7
hedge_class = create_player_hedge(eta)

probs = np.array([0.5,0.125,0.125,0.125,0.125])
fixed_probs_class = create_fixed_probs_class(probs)

games = Games(hedge_class, fixed_probs_class, arms, times, count)
games.execute()

In [71]:
games.pseudo_regret()

-1.1420398887521515

In [72]:
# アームの期待報酬の平均が最も大きいものは50％の確率で報酬が出る
# そのため500ぐらいになっておかしくない。（実際には収束するのに時間がかかるため500より少し少ないはず）
print(games.max_arm_expected_rewards_mean())
assert(450 < games.max_arm_expected_rewards_mean() < 525)

497.2870429116531


In [7]:
games.

AttributeError: 'Games' object has no attribute 'game'

In [73]:
games.expected_rewards_mean()

498.42908280040524

In [74]:
# 1アームだけ報酬でやすい
arms = 5 
times = 1000
count = 100

eta = 0.7
hedge_class = create_player_hedge(eta)

games = Games(hedge_class, AdaptiveAdversary, arms, times, count)
games.execute()

In [75]:
games.pseudo_regret()

-118.88136888228937

In [77]:
games.max_arm_expected_rewards_mean()

29.794284317951075

In [76]:
games.expected_rewards_mean()

148.67565320024045

In [78]:
games.expected_rewards_arms

array([[28.68787086, 30.95901602, 29.54673999, 30.46353983, 29.0184865 ],
       [29.78209842, 29.57543042, 29.98015324, 29.31000204, 30.02796908],
       [29.34478164, 29.6940059 , 29.74621971, 30.10334469, 29.78730127],
       [29.73914071, 29.37017718, 30.02794275, 29.15370874, 30.38468382],
       [29.88237022, 30.25069081, 29.35627423, 29.6278325 , 29.55848545],
       [30.14587529, 29.73492551, 30.34965989, 29.62648053, 28.81871198],
       [29.3010022 , 29.55939629, 29.79619977, 30.3872583 , 29.63179664],
       [30.15121441, 29.89618623, 28.83323826, 29.7059828 , 30.08903149],
       [29.69638538, 30.2354196 , 29.0588988 , 29.69069482, 29.9942546 ],
       [30.38575385, 29.34439648, 29.60747113, 29.3262021 , 30.01182963],
       [29.22759734, 29.81444407, 30.74486427, 29.31751609, 29.57123143],
       [30.30083007, 29.79092741, 29.28887832, 29.46946806, 29.82554933],
       [29.45140256, 29.59672006, 30.19682681, 30.12849193, 29.30221183],
       [29.95047516, 29.21217308, 29.8

In [None]:
games.expected_rewards_arms

In [None]:
# 1アームだけ報酬でやすい
arms = 5 
times = 1000
eta = 0.7
player = Hedge(arms,eta)

probs = np.array([0.5,0.125,0.125,0.125,0.125])
slot = FixedProbs(probs)

game = Game(player, slot, arms, times)
game.play()

In [None]:
game.result()

In [None]:
game.player.

In [None]:
game.player.expected_rewardss

In [None]:
# etaを微小な値にすると重みが更新されない
arms = 5
times = 1000
eta = 0.000001
player = Hedge(arms,eta)

probs = np.array([0.5,0.125,0.125,0.125,0.125])
slot = FixedProbs(probs)

game = Game(player, slot, arms, times)
game.play()

In [None]:
game.result()

In [None]:
game.player.weightss

In [None]:
# 敵対的
arms = 5
times = 1000
eta = 0.7
player = Hedge(arms, eta)

slot = AdaptiveAdversary(arms)
game = Game(player, slot, arms, times)

In [None]:
game.play()

In [None]:
game.result()

In [None]:
game.slot.rewardss.sum(axis=0)

In [None]:
# プレイヤーがアームを引く確率は累積報酬のみで決まる。
# プレイヤーが引く確率が低いアーム（＝累積報酬が最も少ないアーム）に報酬を設定することになるため
# 敵対者は各アームの累積報酬は等しくなるように設定する