In [0]:
import numpy as np
import pandas as pd
import time
import os
from IPython.display import clear_output
# Clear the terminal using the command that matches the host OS:
# 'cls' on Windows (os.name == 'nt'), 'clear' everywhere else.
clear = lambda: os.system('cls' if os.name == 'nt' else 'clear')

### Global variables

In [0]:
# Grid dimensions: the environment is a ROW x COL board.
ROW = 30
COL = 40
# Total number of states (one per grid cell).
N_STATES = ROW * COL
# Actions the agent can take.
ACTIONS = ['left', 'right', 'up', 'down']
# Epsilon-greedy threshold: choose_action picks a random action when a
# uniform draw is greater than this value.
EPSILON = 0.8
# Learning rate.
ALPHA = 0.1
# Discount factor.
GAMMA = 0.9
# Maximum number of training episodes.
MAX_EPISODES = 500
# Refresh time for one move (only referenced by a commented-out sleep).
FRESH_TIME = 0.15
# Smallest step count of any finished episode; starts at a large sentinel
# and is lowered by update_env when an episode terminates.
MIN_STEP = 999999
# np.random.seed(1000)

### 建立Q table


3x3 環境的 state 編號（上）與地圖（下，`o` 為起點、`T` 為寶藏）:
```
0 1 2
3 4 5
6 7 8
```
```
o - -
- - -
- - T
```
3x4 環境的 state 編號:
```
0 1 2 3
4 5 6 7
8 9 10 11
```


In [0]:
def build_q_table(n_states, actions):
    """Create a zero-initialised Q-table.

    Args:
        n_states: number of rows (one per state).
        actions: sequence of action names, used as the column labels.

    Returns:
        A pandas DataFrame of shape (n_states, len(actions)) filled with 0.0.
    """
    initial_values = np.zeros((n_states, len(actions)))
    return pd.DataFrame(initial_values, columns=actions)

In [0]:
# Build an all-zero Q-table for the whole grid (N_STATES rows, one column
# per action) and display it as the cell output.
q_table1 = build_q_table(N_STATES, ACTIONS)
q_table1

Unnamed: 0,left,right,up,down
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0
...,...,...,...,...
1195,0.0,0.0,0.0,0.0
1196,0.0,0.0,0.0,0.0
1197,0.0,0.0,0.0,0.0
1198,0.0,0.0,0.0,0.0


### choose action的功能

In [0]:
def choose_action(state, q_table):
    """Pick an action for `state` with an epsilon-greedy rule.

    A random action from ACTIONS is returned when a uniform random draw is
    greater than EPSILON, or when every Q-value stored for this state is
    zero. Otherwise the action with the largest Q-value is returned.

    Args:
        state: positional row index into `q_table`.
        q_table: DataFrame with one row per state and one column per action.

    Returns:
        The chosen action name.
    """
    # Q-values of every action for this state (a Series indexed by action).
    action_values = q_table.iloc[state, :]
    explore = np.random.uniform() > EPSILON
    nothing_learned = not (action_values != 0).any()
    if explore or nothing_learned:
        return np.random.choice(ACTIONS)
    # Greedy choice: label of the column holding the highest value.
    return action_values.idxmax()

In [0]:
# Demo on the all-zero q_table1 built above. The trained `q_table` is only
# created later by the training cell, so using it here would raise
# NameError on a fresh kernel.
choose_action(0, q_table1)

'right'

In [0]:
# Inspect the Q-values stored for state 1198.
# NOTE(review): `q_table` is first assigned by the training cell further
# down (`q_table, step = rl()`), so this cell raises NameError under
# Restart & Run All unless training has already been executed.
q_table.iloc[1198, :]

left     0.000000
right    0.468559
up       0.080842
down    -0.063144
Name: 1198, dtype: float64

### 建立環境對我們行為的feedback
撞牆 reward 給負的

In [0]:
def get_env_feedback(S, A):
    """Return the environment's response to taking action `A` in state `S`.

    States are numbered row by row on a ROW x COL grid. Stepping 'right'
    from state N_STATES - 2 or 'down' onto state N_STATES - 1 ends the
    episode with reward 1. Walking into a wall leaves the state unchanged
    and gives reward -1. Every other move gives reward 0.

    `S` is not range-checked; callers are expected to pass a state that
    lies on the grid.

    Args:
        S: current state index.
        A: one of 'left', 'right', 'up', 'down'.

    Returns:
        (S_, R): the next state (an int, or the string 'terminal') and the
        reward for this move.

    Raises:
        ValueError: if `A` is not one of the four supported actions.
    """
    R = 0

    if A == 'right':
        if S == N_STATES - 2:
            # The cell just left of the treasure: stepping right finds it.
            S_ = 'terminal'
            R = 1  # reward is only given when the treasure is found
        elif (S + 1) % COL == 0:
            # Already in the right-most column: bump into the wall.
            S_ = S
            R = -1
        else:
            S_ = S + 1
    elif A == 'left':
        if S % COL == 0:
            # Already in the left-most column: bump into the wall.
            S_ = S
            R = -1
        else:
            S_ = S - 1
    elif A == 'up':
        if S < COL:
            # Already in the top row: bump into the wall.
            S_ = S
            R = -1
        else:
            S_ = S - COL
    elif A == 'down':
        if (S + COL) == N_STATES - 1:
            # The cell right above the treasure: stepping down finds it.
            S_ = 'terminal'
            R = 1
        elif S >= (ROW - 1) * COL:
            # Already in the bottom row: bump into the wall.
            S_ = S
            R = -1
        else:
            S_ = S + COL
    else:
        # Without this branch S_ would be unbound at the return below.
        raise ValueError(
            "unknown action %r; expected 'left', 'right', 'up' or 'down'" % (A,)
        )

    return S_, R

In [0]:
# Demo: ask the environment what happens when moving 'down' from state S.
# NOTE(review): with ROW, COL = 30, 40 the grid states are 0..1199, so
# S = 11990 lies outside the grid. get_env_feedback does not range-check S;
# this value only reaches the "down wall" branch because
# 11990 >= (ROW-1)*COL. TODO confirm the intended state.
S, A = 11990, 'down'
S_, R = get_env_feedback(S, A)
print("S next:", S_, "Reowrd:", R)

S next: 11990 Reowrd: -1


### 更新環境

In [0]:
def update_env(S, episode, step_counter):
    """Print training progress for the current step.

    While the episode is running, a progress message is printed every 1000
    steps (never at step 0) without a trailing newline. When the episode
    has ended, a summary line is printed and the global MIN_STEP is lowered
    if this episode used fewer steps.

    Args:
        S: current state; only compared against the string 'terminal'.
        episode: zero-based episode index (printed as episode + 1).
        step_counter: number of steps taken so far in this episode.
    """
    global MIN_STEP

    if S != 'terminal':
        # Episode still running: report progress only every 1000 steps.
        if step_counter != 0 and step_counter % 1000 == 0:
            print('\r', end='')
            print('\r', end='')
            print(f'Episode *{episode+1}: current_step = {step_counter}', end='')
        return

    # Episode finished: return to the start of the line, then print the summary.
    interaction = 'Episode %s: total_steps = %s' % (episode+1, step_counter)
    print('\r', end='')
    print('{}'.format(interaction))

    # Keep track of the smallest total step count seen so far.
    if step_counter < MIN_STEP:
        MIN_STEP = step_counter

In [8]:
# Smoke-test update_env: silent at step 500, progress message at step 1000,
# summary line on 'terminal'.
# The 'terminal' call writes its step_counter into the global MIN_STEP, so
# save the value first and restore it afterwards. Otherwise the demo's 1500
# would be treated as a real episode result when the minimum is reported
# after training.
_min_step_before_demo = MIN_STEP
update_env(S=4, episode=1, step_counter=500)
update_env(S=4, episode=2, step_counter=1000)
update_env(S='terminal', episode=3, step_counter=1500)
MIN_STEP = _min_step_before_demo

Episode *3: current_step = 1000Episode 4: total_steps = 1500


### 建立reinforcement learning

In [0]:
def rl():
    """Train a tabular Q-learning agent on the grid.

    Runs MAX_EPISODES episodes. Each episode starts in state 0 and repeats
    choose_action -> get_env_feedback -> Q-value update until the
    environment reports 'terminal'. Progress is printed through update_env.

    Returns:
        (q_table, step_counter): the learned Q-table and the number of
        steps taken in the last episode (0 if MAX_EPISODES is 0).
    """
    q_table = build_q_table(N_STATES, ACTIONS)
    # Defined up front so the return below works even with zero episodes.
    step_counter = 0
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0  # every episode starts in state 0
        is_terminated = False
        update_env(S, episode, step_counter)
        while not is_terminated:
            A = choose_action(S, q_table)
            S_, R = get_env_feedback(S, A)
            # Current estimate; .at is the scalar accessor for one label pair.
            q_predict = q_table.at[S, A]
            if S_ != 'terminal':
                # Bootstrapped target: reward plus discounted best next value.
                q_target = R + GAMMA * q_table.iloc[S_, :].max()
            else:
                # No next state: the target is just the reward.
                q_target = R
                is_terminated = True

            # Move the estimate towards the target by the learning rate.
            q_table.at[S, A] = q_predict + ALPHA * (q_target - q_predict)
            S = S_

            step_counter += 1
            update_env(S, episode, step_counter)
    return q_table, step_counter

### Execute Program

In [10]:
# Run the training, then report the smallest episode length recorded in the
# global MIN_STEP and print the learned Q-table.
if __name__ == "__main__":
    q_table, step = rl()
    # clear_output(wait=True)
    # Blank line so the report starts on a fresh line after the progress output.
    print()
    print('minina step:', MIN_STEP)
    print('\nQ-table:\n')
    print(q_table)

Episode 1: total_steps = 116771
Episode 2: total_steps = 60048
Episode 3: total_steps = 126855
Episode 4: total_steps = 54836
Episode 5: total_steps = 16467
Episode 6: total_steps = 7319
Episode 7: total_steps = 81604
Episode 8: total_steps = 31635
Episode 9: total_steps = 9269
Episode 10: total_steps = 89394
Episode 11: total_steps = 16810
Episode 12: total_steps = 9405
Episode 13: total_steps = 13143
Episode 14: total_steps = 9064
Episode 15: total_steps = 21808
Episode 16: total_steps = 50046
Episode 17: total_steps = 3033
Episode 18: total_steps = 8356
Episode 19: total_steps = 1241
Episode 20: total_steps = 20115
Episode 21: total_steps = 58611
Episode 22: total_steps = 19564
Episode 23: total_steps = 7681
Episode 24: total_steps = 6571
Episode 25: total_steps = 6570
Episode 26: total_steps = 7045
Episode 27: total_steps = 15932
Episode 28: total_steps = 13233
Episode 29: total_steps = 14528
Episode 30: total_steps = 20369
Episode 31: total_steps = 7694
Episode 32: total_steps = 2

In [0]:
# Save the Q-table to a CSV file; the file name tags this run as the
# "deduct reward when hitting a wall" variant.
q_table.to_csv('q_table_撞牆扣reward.csv')