In [34]:
import random

class HeadlineNews:
    """Simulated headline-placement environment with Bernoulli click rewards.

    Every news item has a fixed click-through rate (CTR). Placing an item
    yields a reward of 1 (clicked) with probability equal to its CTR and a
    reward of 0 (not clicked) otherwise.
    """

    def __init__(self, actual_ctr=(0.1, 0.2)):
        """Store the true CTR of every news item.

        Args:
            actual_ctr: Sequence of click probabilities, one per news item.
                Every value must lie in [0, 1].

        Raises:
            ValueError: If any CTR lies outside [0, 1].
        """
        # Copy into a fresh list so the default value is never shared between
        # instances and later edits to the caller's sequence cannot silently
        # change this environment.
        self.actual_ctr = list(actual_ctr)
        self.num_news = len(self.actual_ctr)

        for ctr in self.actual_ctr:
            # An out-of-range CTR would produce a negative sampling weight in
            # place(), which random.choices does not reject on its own.
            if not 0 <= ctr <= 1:
                raise ValueError(f'the click-through rate {ctr} is not within [0, 1]')

    def place(self, selected_news_index):
        """Place one news item and sample whether it gets clicked.

        Args:
            selected_news_index: Zero-based index of the news item to place.

        Returns:
            1 if the simulated user clicked the item, 0 otherwise.

        Raises:
            IndexError: If the index is not in ``range(self.num_news)``.
                IndexError is a subclass of Exception, so callers that catch
                Exception keep working.
        """
        # Valid indices are 0 .. num_news - 1; negative values are rejected
        # too, otherwise they would wrap around to the end of the list.
        if not 0 <= selected_news_index < self.num_news:
            raise IndexError(f'the selected index {selected_news_index} is invalid')

        ctr = self.actual_ctr[selected_news_index]
        # Bernoulli draw: outcome 1 carries weight ctr, outcome 0 the remainder.
        reward = random.choices([0, 1], weights=[1 - ctr, ctr], k=1)

        return reward[0]
            

In [132]:
import statistics 

# Always place the news item at index 1 and look at the sampled click outcomes.
hn = HeadlineNews(actual_ctr=[0.3, 0.1])
action = 1  # the same item is placed on every iteration
rewards = []
for i in range(100):
    reward = hn.place(action)
    print(f'reward is {reward} for the action {action}')
    rewards.append(reward)

# The mean of the 0/1 rewards is the observed click-through rate of that item.
print(f'average reward value is {statistics.mean(rewards)}')

reward is 0 for the action 1
reward is 0 for the action 1
reward is 1 for the action 1
reward is 1 for the action 1
reward is 1 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 1 for the action 1
reward is 0 for the action 1
reward is 1 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 for the action 1
reward is 0 fo

In [155]:
import numpy as np


class EpsilonGreedy:
    """Epsilon-greedy action selector driven by observed click-through rates.

    A uniform draw in [0, 1] is compared with ``epsilon``: when the draw is
    less than or equal to ``epsilon`` a uniformly random action is returned,
    otherwise the action with the highest observed CTR is returned.
    """

    def __init__(self, epsilon, num_action, verbose=False):
        """Set up the per-action counters.

        Args:
            epsilon: Threshold compared against a uniform draw in [0, 1] to
                decide whether to pick a random action.
            num_action: Number of available actions.
            verbose: When true, get_action prints which kind of choice it made.
        """
        self.num_action = num_action
        self.epsilon = epsilon
        self.verbose = verbose
        # Placement counts start at 1 (not 0) for every action, which keeps
        # the denominator in get_ctr non-zero before any update happens.
        self.placement_count = np.ones(num_action)
        self.click_count = np.zeros(num_action)

    def get_action(self):
        """Return the index of the next action to take."""
        explore = random.uniform(0, 1) <= self.epsilon

        if not explore:
            observed = self.get_ctr()
            # Every index tied for the highest observed CTR is a candidate;
            # ties are broken uniformly at random.
            candidates = np.flatnonzero(observed == np.max(observed)).tolist()
            action = random.choice(candidates)
            if self.verbose:
                print(f'select best known action:{action}')
            return action

        action = random.randint(0, self.num_action - 1)
        if self.verbose:
            print(f'random selection:{action}')
        return action

    def get_ctr(self):
        """Return the observed CTR per action (clicks divided by placements)."""
        return self.click_count / self.placement_count

    def update(self, action, reward):
        """Record one placement of ``action`` and add ``reward`` to its clicks."""
        self.placement_count[action] += 1
        self.click_count[action] += reward

In [164]:
eg = EpsilonGreedy(epsilon = 0.5, num_action = 2, verbose=True)
# With epsilon = 0.5, about half of the selections printed below should be random picks;
# the rest pick among the actions tied for the best observed CTR.

for i in range(10):
    eg.get_action()

    


select best known action:1
random selection:0
random selection:0
select best known action:1
random selection:0
random selection:0
random selection:0
select best known action:1
random selection:0
select best known action:1


In [184]:
# Build a case where the first news item (action index 0) is clicked on every one of 100 placements.
eg = EpsilonGreedy(epsilon = 0.5, num_action = 2, verbose=True)

for i in range(100):
    eg.update(0, 1)


print(f'observed CTR is {eg.get_ctr()}')

# In the roughly half of the calls that choose the action by best observed CTR,
# the first news item (action index 0) is always selected.
for i in range(10):
    eg.get_action()

observed CTR is [0.99009901 0.        ]
select best known action:0
random selection:0
select best known action:0
random selection:1
select best known action:0
random selection:1
select best known action:0
random selection:0
random selection:1
select best known action:0


In [185]:
# Placements per action. Counts start at 1 (np.ones in EpsilonGreedy.__init__), so 100 updates on action 0 show as 101.
eg.placement_count

array([101.,   1.])

In [186]:
# Accumulated rewards (clicks) per action, as added up by EpsilonGreedy.update.
eg.click_count

array([100.,   0.])

In [187]:
# Observed CTR per action: click_count divided element-wise by placement_count.
eg.get_ctr()

array([0.99009901, 0.        ])

In [179]:
import statistics 


# Environment: two news items whose true click-through rates are 0.1 and 0.2.
actual_ctr = [0.1, 0.2]
hn = HeadlineNews(actual_ctr)

# Agent: one action per news item, random pick when the uniform draw is <= epsilon.
num_action = len(actual_ctr)
epsilon = 0.1
eg = EpsilonGreedy(epsilon, num_action)

num_episode = 100
num_step = 1000
verbose = False

ep_rewards = []

# eg is created once, outside the episode loop, so its placement and click
# counts carry over from one episode to the next.
for episode in range(num_episode):
    cumulative_reward = 0
    for step in range(num_step):
        action = eg.get_action()
        reward = hn.place(action)

        eg.update(action, reward)
        cumulative_reward += reward

        # Optional progress dump every 100 steps.
        if verbose and step % 100 == 0:
            print('==================================')
            print(f'at step {step}')
            print(f'current ctr is: {eg.get_ctr()}')
            print(f'placement_count is: {eg.placement_count}')
            print(f'click_count is: {eg.click_count}')

    ep_rewards.append(cumulative_reward)
    print(f'for the episode {episode}, total rewards is {cumulative_reward}')

# Summary statistics over the per-episode cumulative rewards.
print(f'mean value of cumulative rewards for 1000 steps is {statistics.mean(ep_rewards)}')
print(f'standard deviation of cumulative rewards for 1000 steps is {statistics.stdev(ep_rewards)}')

for the episode0, total rewards is 187
for the episode1, total rewards is 202
for the episode2, total rewards is 211
for the episode3, total rewards is 215
for the episode4, total rewards is 193
for the episode5, total rewards is 212
for the episode6, total rewards is 176
for the episode7, total rewards is 220
for the episode8, total rewards is 195
for the episode9, total rewards is 194
for the episode10, total rewards is 177
for the episode11, total rewards is 214
for the episode12, total rewards is 179
for the episode13, total rewards is 201
for the episode14, total rewards is 184
for the episode15, total rewards is 184
for the episode16, total rewards is 191
for the episode17, total rewards is 217
for the episode18, total rewards is 214
for the episode19, total rewards is 187
for the episode20, total rewards is 201
for the episode21, total rewards is 190
for the episode22, total rewards is 187
for the episode23, total rewards is 205
for the episode24, total rewards is 210
for the ep

In [180]:
import numpy as np


class RandomSelection:
    """Baseline agent that picks every action uniformly at random.

    It keeps the same placement/click counters as the other agent in this
    notebook, but get_action never consults them.
    """

    def __init__(self, num_action, verbose=False):
        """Set up the per-action counters.

        Args:
            num_action: Number of available actions.
            verbose: Stored on the instance; no method of this class reads it.
        """
        self.verbose = verbose
        self.num_action = num_action
        # Placement counts start at 1 (not 0) for every action, which keeps
        # the denominator in get_ctr non-zero before any update happens.
        self.placement_count = np.ones(num_action)
        self.click_count = np.zeros(num_action)

    def get_action(self):
        """Return a uniformly random action index in [0, num_action - 1]."""
        return random.randint(0, self.num_action - 1)

    def update(self, action, reward):
        """Record one placement of ``action`` and add ``reward`` to its clicks."""
        self.placement_count[action] += 1
        self.click_count[action] += reward

    def get_ctr(self):
        """Return the observed CTR per action (clicks divided by placements)."""
        return self.click_count / self.placement_count

In [182]:
# Environment: two news items whose true click-through rates are 0.1 and 0.2.
actual_ctr = [0.1, 0.2]
hn = HeadlineNews(actual_ctr)

# Baseline agent: one action per news item, always chosen uniformly at random.
num_action = len(actual_ctr)
rs = RandomSelection(num_action)

num_episode = 100
num_step = 1000
verbose = False

rs_rewards = []

# rs is created once, outside the episode loop, so its counters keep growing
# across episodes; they only feed the optional progress dump below.
for episode in range(num_episode):
    cumulative_reward = 0
    for step in range(num_step):
        action = rs.get_action()
        reward = hn.place(action)

        rs.update(action, reward)
        cumulative_reward += reward

        # Optional progress dump every 100 steps.
        if verbose and step % 100 == 0:
            print('==================================')
            print(f'at step {step}')
            print(f'current ctr is: {rs.get_ctr()}')
            print(f'placement_count is: {rs.placement_count}')
            print(f'click_count is: {rs.click_count}')

    rs_rewards.append(cumulative_reward)
    print(f'for the episode {episode}, total rewards is {cumulative_reward}')

# Summary statistics over the per-episode cumulative rewards.
print(f'mean value of cumulative rewards for 1000 steps is {statistics.mean(rs_rewards)}')
print(f'standard deviation of cumulative rewards for 1000 steps is {statistics.stdev(rs_rewards)}')

for the episode 0, total rewards is 147
for the episode 1, total rewards is 171
for the episode 2, total rewards is 158
for the episode 3, total rewards is 161
for the episode 4, total rewards is 171
for the episode 5, total rewards is 143
for the episode 6, total rewards is 158
for the episode 7, total rewards is 133
for the episode 8, total rewards is 143
for the episode 9, total rewards is 136
for the episode 10, total rewards is 152
for the episode 11, total rewards is 155
for the episode 12, total rewards is 136
for the episode 13, total rewards is 148
for the episode 14, total rewards is 141
for the episode 15, total rewards is 158
for the episode 16, total rewards is 145
for the episode 17, total rewards is 155
for the episode 18, total rewards is 141
for the episode 19, total rewards is 150
for the episode 20, total rewards is 139
for the episode 21, total rewards is 156
for the episode 22, total rewards is 146
for the episode 23, total rewards is 150
for the episode 24, total 

In [183]:
# Relative difference between two hard-coded numbers.
# NOTE(review): 195 and 149 look like the mean cumulative rewards of the epsilon-greedy and
# random-selection runs above (their printed means are truncated in this view) — confirm.
(195-149)/195

0.2358974358974359