<a href="https://colab.research.google.com/github/yhk775206/2023.RL/blob/main/a_karmed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 1. Toy k-armed *bandit*

In [None]:
import numpy as np

In [None]:
# 1. Initialization
# (1) Reward table: row b lists, in order, the rewards of bandit b.
#     Only bandits 1-3 are used; row 0 is an all-zero placeholder so that
#     bandit numbers can be used directly as row indices.
bandit = [
    [0, 0, 0, 0, 0],
    [1, 0, 8, 5, 2],
    [0, 1, -1, 2, 28],
    [7, -3, -3, -3, -3],
]

# (2) Per-bandit position inside its reward row. Bandits 1-3 start at -1,
#     so incrementing once before the first read lands on column 0.
action = [0] + [-1] * 3

# (3) Action-value estimates: 0.0 for bandits 1-3. The placeholder slot 0
#     starts at -1.0, i.e. below the initial estimate of the real bandits.
Q = np.array([-1.0] + [0.0] * 3)

# (4) Step size used in the Q update
alpha = 0.5

In [None]:
# 2. Run the toy k-armed bandit
# (1) The first pull is fixed to bandit 3
selected = 3
print(f"[1] bandit {selected}")
action[selected] = action[selected] + 1

# (2) Selections 2 through 5 (four more rounds)
for i in range(1, 5):
    # (2-1) Read the reward at the selected bandit's current position
    reward = bandit[selected][action[selected]]

    # (2-2) Move the estimate toward the reward: Q <- Q + alpha * (R - Q)
    Q[selected] += alpha * (reward - Q[selected])

    # (2-3) Greedy choice of the next bandit: index of the largest Q
    #       (argmax returns the first index when several entries tie)
    selected = Q.argmax()
    print(f"[{i+1}]-th bandit {selected} is selected")

    # (2-4) Advance the chosen bandit to its next reward position
    action[selected] = action[selected] + 1


[1] bandit 3
[2]-th bandit 3 is selected
[3]-th bandit 3 is selected
[4]-th bandit 1 is selected
[5]-th bandit 1 is selected


### 2. Sample-average method
#### 1) Bandit class 정의

In [None]:
# 1. Bandit class definition
class Bandit:
    """k-armed bandit with greedy selection and sample-average estimates.

    Each arm's reward is drawn once, at construction, from a normal
    distribution with that arm's mean and standard deviation. After that,
    ``get_reward`` returns the stored value unchanged on every call.
    NOTE(review): rewards are therefore constant per arm for the lifetime
    of the instance -- confirm this is the intended model.
    """

    def __init__(self, k, means, std_devs):
        """Create a bandit with ``k`` arms.

        Args:
            k: Number of arms; sizes the estimate and counter arrays.
            means: Per-arm mean used for the one-time reward draw.
            std_devs: Per-arm standard deviation used for that draw.
        """
        self.k = k

        # One draw per (mean, std_dev) pair, taken in order.
        drawn = []
        for mu, sigma in zip(means, std_devs):
            drawn.append(np.random.normal(loc=mu, scale=sigma))
        self.rewards = np.array(drawn)

        # Action-value estimate per arm, all starting at zero.
        self.Qs = np.zeros(k)
        # How many times each arm has been updated (stored as floats).
        self.num_selected = np.zeros(k)

    def get_reward(self, action):
        """Return the stored reward of arm ``action``."""
        return self.rewards[action]

    def choose_action(self):
        """Return the index of the arm with the largest estimate.

        Ties resolve to the lowest index (``argmax`` behaviour).
        """
        return self.Qs.argmax()

    def update_Qs(self, action, reward):
        """Fold ``reward`` into the running average of arm ``action``.

        Uses the incremental form Q <- Q + (1 / n) * (R - Q), where n is
        the number of updates applied to this arm, this one included.
        """
        self.num_selected[action] += 1
        step = 1.0 / self.num_selected[action]
        error = reward - self.Qs[action]
        self.Qs[action] += step * error


#### 2) 초기화

In [None]:
# 2. Initialization
# (1) Number of arms
k = 3

# (2) Build the k-armed bandit from one mean and one standard deviation
#     per arm
means, std_devs = [2.75, 2.0, 2.5], [0.5, 3.0, 2.0]
bandit = Bandit(k, means, std_devs)

# (3) Number of iterations to run
n_iterations = 1000

#### 3) k-armed bandit 실행

In [None]:
# 3. Run the k-armed bandit
sum_reward = 0
for i in range(n_iterations):
    # (1) Ask the bandit which arm to pull. The call is made on every
    #     iteration, including the first one where its result is replaced.
    action = bandit.choose_action()
    if i == 0:
        # The very first pull is forced to arm 1.
        action = 1
        print(action)

    # (2) Collect the reward of the chosen arm and accumulate it
    reward = bandit.get_reward(action)
    sum_reward = sum_reward + reward

    # (3) Update the estimate of the chosen arm
    bandit.update_Qs(action, reward)


1


In [None]:
# Report pull counts, learned estimates and the accumulated reward.
for caption, shown in (
    ("Number of selections for each action: ", bandit.num_selected),
    ("Estimated values for each action: ", bandit.Qs),
    ("Sum of reward:", sum_reward),
):
    print(caption, shown)


Number of selections for each action:  [   0. 1000.    0.]
Estimated values for each action:  [0.         2.03796579 0.        ]
Sum of reward: 2037.9657911784761


### 3. epsilon-greedy method

In [None]:
# 1. Bandit class definition (epsilon-greedy)
class Bandit:
    """k-armed bandit with epsilon-greedy selection and sample averages.

    Each arm's reward is drawn once, at construction, from a normal
    distribution with that arm's mean and standard deviation;
    ``get_reward`` then returns that stored value on every call.
    """

    def __init__(self, k, means, std_devs, *, epsilon=0.1):
        """Create a bandit with ``k`` arms.

        Args:
            k: Number of arms; sizes the estimate and counter arrays and
                bounds the random arm choice.
            means: Per-arm mean used for the one-time reward draw.
            std_devs: Per-arm standard deviation used for that draw.
            epsilon: Probability of picking a uniformly random arm instead
                of the greedy one. Keyword-only; defaults to 0.1, the
                value that used to be hard-coded.
        """
        self.k = k
        self.rewards = np.array([np.random.normal(loc=mean, scale=std_dev)
                                 for mean, std_dev in zip(means, std_devs)])
        self.Qs = np.zeros(k)  # action-value estimate for each arm
        self.num_selected = np.zeros(k)  # number of updates per arm
        self.epsilon = epsilon

    def get_reward(self, action):
        """Return the stored reward of arm ``action``."""
        return self.rewards[action]

    def choose_action(self):
        """Pick an arm with the epsilon-greedy rule.

        Draws a uniform number in [0, 1). If it is below ``epsilon`` a
        uniformly random arm is returned (explore); otherwise the arm with
        the largest estimate is returned (exploit, ties to lowest index).
        """
        if np.random.random() < self.epsilon:
            return np.random.choice(self.k)
        return np.argmax(self.Qs)

    def update_Qs(self, action, reward):
        """Fold ``reward`` into the running average of arm ``action``.

        Uses Q <- Q + (1 / n) * (R - Q), where n is the number of updates
        applied to this arm, this one included.
        """
        self.num_selected[action] += 1
        alpha = 1.0 / self.num_selected[action]
        self.Qs[action] += alpha * (reward - self.Qs[action])

# 2. Initialization
# (1) Number of arms
k = 3

# (2) Build the k-armed bandit from one mean and one standard deviation
#     per arm
means, std_devs = [2.75, 2.0, 2.5], [0.5, 3.0, 2.0]
bandit = Bandit(k, means, std_devs)

# (3) Number of iterations to run
n_iterations = 1000

# 3. Run the k-armed bandit
sum_reward = 0
for i in range(n_iterations):
    # (1) Ask the bandit which arm to pull. The call is made on every
    #     iteration, including the first one where its result is replaced.
    action = bandit.choose_action()
    if i == 0:
        # The very first pull is forced to arm 1.
        action = 1
        print(action)

    # (2) Collect the reward of the chosen arm and accumulate it
    reward = bandit.get_reward(action)
    sum_reward = sum_reward + reward

    # (3) Update the estimate of the chosen arm
    bandit.update_Qs(action, reward)

# Report pull counts, learned estimates and the accumulated reward.
for caption, shown in (
    ("Number of selections for each action: ", bandit.num_selected),
    ("Estimated values for each action: ", bandit.Qs),
    ("Sum of reward:", sum_reward),
):
    print(caption, shown)


1
Number of selections for each action:  [ 75.  34. 891.]
Estimated values for each action:  [ 3.17583278 -1.83699854  4.30424481]
Sum of reward: 4010.811629336359


### 4. Optimistic initial value

In [None]:
# 1. Bandit class definition (greedy with optimistic initial values)
class Bandit:
    """k-armed bandit with greedy selection and a configurable initial Q.

    Each arm's reward is drawn once, at construction, from a normal
    distribution with that arm's mean and standard deviation;
    ``get_reward`` then returns that stored value on every call.
    """

    def __init__(self, k, means, std_devs, initial_value):
        """Create a bandit with ``k`` arms.

        Args:
            k: Number of arms; sizes the estimate and counter arrays.
            means: Per-arm mean used for the one-time reward draw.
            std_devs: Per-arm standard deviation used for that draw.
            initial_value: Starting estimate assigned to every arm.
        """
        self.k = k
        self.rewards = np.array([np.random.normal(loc=mean, scale=std_dev)
                                 for mean, std_dev in zip(means, std_devs)])
        # dtype=float is required: without it an integer initial_value
        # (e.g. 10) produces an integer array, and every later update
        # would be silently truncated when written back.
        self.Qs = np.full(k, initial_value, dtype=float)
        self.num_selected = np.zeros(k)  # number of updates per arm

    def get_reward(self, action):
        """Return the stored reward of arm ``action``."""
        return self.rewards[action]

    def choose_action(self):
        """Return the arm with the largest estimate (purely greedy).

        No random exploration happens here; ties resolve to the lowest
        index (``argmax`` behaviour).
        """
        return np.argmax(self.Qs)

    def update_Qs(self, action, reward):
        """Fold ``reward`` into the estimate of arm ``action``.

        Uses Q <- Q + (1 / n) * (R - Q), where n is the number of updates
        applied to this arm, this one included. On an arm's first update
        the step size is 1.0, so the initial value is replaced by the
        observed reward.
        """
        self.num_selected[action] += 1
        alpha = 1.0 / self.num_selected[action]
        self.Qs[action] += alpha * (reward - self.Qs[action])

# 2. Initialization
# (1) Number of arms
k = 3

# (2) Build the k-armed bandit from one mean and one standard deviation
#     per arm, with every estimate starting at initial_value
means, std_devs = [2.75, 2.0, 2.5], [0.5, 3.0, 2.0]
initial_value = 10.0
bandit = Bandit(k, means, std_devs, initial_value)

# (3) Number of iterations to run
n_iterations = 1000

# 3. Run the k-armed bandit
sum_reward = 0
for i in range(n_iterations):
    # (1) Ask the bandit which arm to pull. The call is made on every
    #     iteration, including the first one where its result is replaced.
    action = bandit.choose_action()
    if i == 0:
        # The very first pull is forced to arm 1.
        action = 1
        print(action)

    # (2) Collect the reward of the chosen arm and accumulate it
    reward = bandit.get_reward(action)
    sum_reward = sum_reward + reward

    # (3) Update the estimate of the chosen arm
    bandit.update_Qs(action, reward)

# Report pull counts, learned estimates and the accumulated reward.
for caption, shown in (
    ("Number of selections for each action: ", bandit.num_selected),
    ("Estimated values for each action: ", bandit.Qs),
    ("Sum of reward:", sum_reward),
):
    print(caption, shown)


1
Number of selections for each action:  [  1.   1. 998.]
Estimated values for each action:  [2.42336684 2.91849719 6.28727816]
Sum of reward: 6280.045470880532
