In [None]:
import numpy as np
import pandas as pd
import time
import os
from IPython.display import clear_output
# Clear the terminal with the command that matches the host OS:
# 'cls' on Windows (os.name == 'nt'), 'clear' everywhere else.
clear = lambda: os.system('cls' if os.name == 'nt' else 'clear')

### Global variables

In [None]:
# Grid size: number of rows and columns of the grid world.
ROW, COL = 30, 40
N_STATES = ROW*COL   # total number of states (one per grid cell)
ACTIONS = ['left', 'right', 'up', 'down']     # actions the agent can take
EPSILON = 0.9   # epsilon greedy: choose_action explores when a uniform draw exceeds this value
ALPHA = 0.1     # learning rate
GAMMA = 0.9    # discount factor
MAX_EPISODES = 100   # maximum episodes
FRESH_TIME = 0.15    # fresh time for one move
# Best-episode trackers, overwritten by update_env when an episode finishes in
# fewer steps than MIN_STEP; the initial values are large sentinels.
MIN_STEP = 1e8
MIS_STEP_EPISODE = 1e5   # 1-based episode number that achieved MIN_STEP
# np.random.seed(1000)

### 建立Q table


以 3x3 的環境為例，state 的編號方式（Q table 的每一列對應一個 state，每一欄對應一個 action）：
```
0 1 2
3 4 5
6 7 8
```
```
o - -
- - -
- - T
```
以 3x4 的環境為例，state 的編號方式：
```
0 1 2 3
4 5 6 7
8 9 10 11
```


In [None]:
def build_q_table(n_states, actions):
    '''Create the initial Q-table.

    The table has one row per state (0 .. n_states-1) and one column per
    action name; every entry starts at 0.
    '''
    zeros = np.zeros((n_states, len(actions)))
    return pd.DataFrame(data=zeros, columns=actions)

### choose action的功能

In [None]:
def choose_action(state, q_table):
    '''Pick an action name for `state` with an epsilon-greedy policy.

    A random action from ACTIONS is returned when a uniform draw exceeds
    EPSILON, or when every Q-value of this state is still 0 (nothing learned
    yet). Otherwise the action with the largest Q-value is returned.
    '''
    q_values = q_table.iloc[state, :]  # Q-values of this state, one per action (Series)
    explore = np.random.uniform() > EPSILON
    untried = (q_values == 0).all()
    if explore or untried:
        return np.random.choice(ACTIONS)
    # Greedy choice: label of the column holding the maximum Q-value.
    return q_values.idxmax()

### 建立環境對我們行為的feedback
1.   寶藏 reward 給超多，ex: 1e5
2.   撞牆 reward 給負超多，ex: -1e5
3.   當機器人`上一個位置離終點的距離` 比 `這一次位置離終點的距離` 大的話，代表有朝終點走，reward 給正的；反之給負的。`reward()` 回傳 `1/(d - d_)`，原本設計成 reward 隨距離變化，但因為每次只移動一格，實際上只會得到 1 或 -1。



In [None]:
def distance(S, S_):
    '''Manhattan distance (= number of moves needed) between two states.

    States are row-major cell indices on a grid that is COL cells wide, so a
    state's row is S // COL and its column is S % COL. The result is the
    horizontal distance plus the vertical distance.

    Integer floor division is used instead of int(S / COL) so that no float
    rounding is involved, whatever the size of the index. States are expected
    to be non-negative.
    '''
    row, col = divmod(S, COL)
    row_, col_ = divmod(S_, COL)
    return abs(col - col_) + abs(row - row_)

In [None]:
def reward(S, S_):
    '''Shaping reward for moving from state S to state S_.

    The goal (treasure) is state N_STATES-1. Let d and d_ be the distances to
    the goal before and after the move; the reward is 1/(d - d_):
    positive when the move gets closer, negative when it gets farther away
    (+1 / -1 for a single-cell move).

    Returns 0.0 when the move does not change the distance to the goal, so
    the division by zero is avoided.
    '''
    goal = N_STATES - 1
    progress = distance(S, goal) - distance(S_, goal)
    if progress == 0:
        # No progress either way: neutral reward instead of ZeroDivisionError.
        return 0.0
    return 1/progress

In [None]:
# Sanity check: distance from state 0 (top-left) to the treasure at
# N_STATES-1 (bottom-right) should be (COL-1) + (ROW-1).
distance(0, N_STATES - 1)

In [None]:
# Moving from state 2 to state 0 ends two cells farther from the goal,
# so this evaluates to 1/(-2) = -0.5.
reward(2, 0)

In [None]:
def get_env_feedback(S, A):
    '''Apply action A in state S and return (next_state, reward).

    Parameters
    ----------
    S : int
        Current state, a row-major cell index on the ROW x COL grid.
    A : str
        One of 'left', 'right', 'up', 'down'.

    Returns
    -------
    (S_, R)
        * Bumping into a wall: the agent stays put, S_ == S and R == -1e5.
        * Stepping onto the treasure (state N_STATES-1): S_ == 'terminal'
          and R == 1e5.
        * Any other move: S_ is the neighbouring cell and R == reward(S, S_).

    Raises
    ------
    ValueError
        If A is not one of the four known actions.
    '''
    goal = N_STATES - 1

    # Work out the candidate cell and whether the move would leave the grid.
    if A == 'right':
        S_ = S + 1
        hit_wall = (S + 1) % COL == 0      # S is in the last column
    elif A == 'left':
        S_ = S - 1
        hit_wall = S % COL == 0            # S is in the first column
    elif A == 'up':
        S_ = S - COL
        hit_wall = S < COL                 # S is in the top row
    elif A == 'down':
        S_ = S + COL
        hit_wall = S >= (ROW - 1) * COL    # S is in the bottom row
    else:
        raise ValueError('unknown action: %r' % (A,))

    if hit_wall:
        return S, -1e5
    if S_ == goal:
        # The treasure is only reachable from its left or upper neighbour.
        return 'terminal', 1e5
    return S_, reward(S, S_)

In [None]:
# Smoke test: moving down from a bottom-row cell must hit the wall, so the
# agent stays in place and receives the wall penalty. The state is derived
# from N_STATES so it always lies inside the grid.
S, A = N_STATES - 10, 'down'
S_, R = get_env_feedback(S, A)
print("S next:", S_, "Reward:", R)

### 更新環境

In [None]:
def update_env(S, episode, step_counter):
    '''Report training progress and track the shortest episode seen so far.

    While the episode is running (S is not 'terminal') a progress note is
    written on the current line every 1000 steps. When S is 'terminal' a
    summary line is printed and, if this episode used fewer steps than
    MIN_STEP, the globals MIN_STEP / MIS_STEP_EPISODE are updated.
    `episode` is 0-based; the printed and stored episode number is episode+1.
    '''
    global MIN_STEP, MIS_STEP_EPISODE

    finished = S == 'terminal'
    if not finished:
        due = step_counter != 0 and step_counter % 1000 == 0
        if due:
            # Return to the start of the line so the note overwrites the previous one.
            for _ in range(2):
                print('\r', end='')
            print(f'Episode *{episode+1}: current_step = {step_counter}', end='')
        return

    summary = 'Episode %s: total_steps = %s' % (episode+1, step_counter)
    print('\r', end='')
    print('{}'.format(summary))

    # Record the lowest step count and the episode that achieved it.
    if step_counter < MIN_STEP:
        MIN_STEP = step_counter
        MIS_STEP_EPISODE = episode+1

In [None]:
# Smoke test for update_env: step 500 prints nothing, step 1000 prints a
# progress note, and the terminal call prints the episode summary.
update_env(S=4, episode=1, step_counter=500)
update_env(S=4, episode=2, step_counter=1000)
update_env(S='terminal', episode=3, step_counter=1500)
# The terminal call above records 1500 steps as the best run; restore the
# trackers so this smoke test does not leak into the real training statistics.
MIN_STEP, MIS_STEP_EPISODE = 1e8, 1e5

### 建立reinforcement learning

In [None]:
def rl():
    '''Train a tabular Q-learning agent on the grid world.

    Runs MAX_EPISODES episodes. Each episode starts in state 0 and ends when
    get_env_feedback reports 'terminal'. After every move the Q-table entry
    for (S, A) is moved towards the target by the learning rate ALPHA:
        target = R + GAMMA * max_a Q(S_, a)   (just R for the terminal move)

    Side effects: resets the best-episode trackers MIN_STEP / MIS_STEP_EPISODE
    at the start of the run, and prints progress through update_env (which
    also updates those trackers).

    Returns
    -------
    (q_table, step_counter)
        The learned Q-table and the number of steps taken in the last
        episode (0 when MAX_EPISODES is 0).
    '''
    global MIN_STEP, MIS_STEP_EPISODE
    # Fresh trackers for every training run, so results from an earlier run or
    # from ad-hoc update_env calls cannot leak into this run's statistics.
    MIN_STEP, MIS_STEP_EPISODE = 1e8, 1e5

    q_table = build_q_table(N_STATES, ACTIONS)
    step_counter = 0  # bound up front so the return works even with zero episodes
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0  # every episode starts in the top-left cell
        is_terminated = False
        update_env(S, episode, step_counter)
        while not is_terminated:
            A = choose_action(S, q_table)
            S_, R = get_env_feedback(S, A)
            q_predict = q_table.loc[S, A]  # current estimate (label-based lookup)
            if S_ != 'terminal':
                # Bootstrapped target: reward plus discounted best next value.
                q_target = R + GAMMA * q_table.iloc[S_, :].max()
            else:
                q_target = R
                is_terminated = True  # the episode ends after this update

            q_table.loc[S, A] += ALPHA * (q_target - q_predict)
            S = S_  # move to the next state

            step_counter += 1
            update_env(S, episode, step_counter)
    return q_table, step_counter

### Execute Program

In [None]:
if __name__ == "__main__":
    # Train the agent, then report the shortest episode and the learned table.
    q_table, step = rl()
    # clear_output(wait=True)
    print()
    print('minimum steps:', MIN_STEP, '@ episode', MIS_STEP_EPISODE)
    print('\nQ-table:\n')
    print(q_table)

Episode 1: total_steps = 223
Episode 2: total_steps = 215
Episode 3: total_steps = 152
Episode 4: total_steps = 92
Episode 5: total_steps = 92
Episode 6: total_steps = 79
Episode 7: total_steps = 78
Episode 8: total_steps = 96
Episode 9: total_steps = 100
Episode 10: total_steps = 105
Episode 11: total_steps = 82
Episode 12: total_steps = 83
Episode 13: total_steps = 83
Episode 14: total_steps = 81
Episode 15: total_steps = 68
Episode 16: total_steps = 80
Episode 17: total_steps = 77
Episode 18: total_steps = 76
Episode 19: total_steps = 87
Episode 20: total_steps = 74
Episode 21: total_steps = 96
Episode 22: total_steps = 75
Episode 23: total_steps = 86
Episode 24: total_steps = 75
Episode 25: total_steps = 78
Episode 26: total_steps = 76
Episode 27: total_steps = 81
Episode 28: total_steps = 76
Episode 29: total_steps = 78
Episode 30: total_steps = 74
Episode 31: total_steps = 74
Episode 32: total_steps = 74
Episode 33: total_steps = 74
Episode 34: total_steps = 78
Episode 35: total

In [None]:
# Persist the learned Q-table as q_table.csv (path is relative to the
# current working directory).
q_table.to_csv('q_table.csv')