# SARSA using PANDAS

In [None]:
import numpy as np
import pandas as pd
import time

## Parameters

In [None]:
np.random.seed(2)  # fix the NumPy RNG seed so runs are reproducible

N_STATES = 6   # number of states in the 1-dimensional world (the last one is terminal)
ACTIONS = ['left', 'right']     # available actions
EPSILON = 0.2  # probability of taking a random (exploratory) action in the epsilon-greedy policy
ALPHA = 0.1     # learning rate
GAMMA = 0.9    # discount factor
MAX_EPISODES = 10   # number of training episodes

## Initialize Q Tables

In [None]:
def build_q_table(n_states, actions):
    """Create a zero-initialised Q-table.

    Args:
        n_states: Number of rows (one per state, labelled 0..n_states-1).
        actions: Sequence of action names used as the column labels.

    Returns:
        A float ``pandas.DataFrame`` of shape ``(n_states, len(actions))``
        filled with zeros.
    """
    zero_values = np.zeros(shape=(n_states, len(actions)))
    return pd.DataFrame(data=zero_values, columns=actions)

## Choose Actions Based On Epsilon Greedy Strategy

In [None]:
def choose_action(state, q_table):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    With probability ``EPSILON`` a uniformly random action from ``ACTIONS``
    is returned; otherwise the action with the largest Q-value in the row
    for ``state`` is returned.

    Args:
        state: Row label of the current state in ``q_table``.
        q_table: DataFrame of Q-values (rows: states, columns: actions).

    Returns:
        The name of the selected action.
    """
    # Look the row up first so an unknown state fails before any random
    # number is drawn.
    q_values = q_table.loc[state, :]
    explore = np.random.uniform() < EPSILON
    if explore:
        return np.random.choice(ACTIONS)
    # idxmax returns the first column label on ties.
    return q_values.idxmax()

## Env Feedback

In [None]:
def get_env_feedback(S, A):
    """Apply action ``A`` in state ``S`` and return the environment's response.

    Moving right yields reward 1, or 10 when the move enters the last state
    (``N_STATES - 1``). Any other action is treated as a move left: it yields
    reward -1 and the agent stays put when it is already in state 0.

    Args:
        S: Current state index.
        A: Action name; ``'right'`` moves right, anything else moves left.

    Returns:
        A tuple ``(next_state, reward)``.
    """
    if A != 'right':
        # Moving left: state 0 is a wall, so the agent does not move there.
        next_state = S if S == 0 else S - 1
        return next_state, -1

    # Moving right: the step into the last state earns the big reward.
    if S == N_STATES - 2:
        return N_STATES - 1, 10
    return S + 1, 1


## Update Env

In [None]:
def update_env(S, episode, step_counter):
    """Print a one-line text rendering of the world and pause briefly.

    When ``S`` is the last state, a summary line with the 1-based episode
    number and the step count is printed. Otherwise the corridor is drawn
    in place (carriage return, no newline) with ``'o'`` at the agent's
    position and ``'T'`` at the last cell.

    Args:
        S: Current state index.
        episode: Zero-based episode index.
        step_counter: Number of steps taken so far in this episode.
    """
    if S == N_STATES - 1:
        print(' Episode {}: total_steps = {}'.format(episode+1,step_counter))
    else:
        # e.g. '-----T' for N_STATES = 6, with the agent's cell overwritten.
        track = ['-'] * (N_STATES - 1) + ['T']
        track[S] = 'o'
        print('\r{}'.format(''.join(track)), end='')
    # Slow the animation down so each frame is visible.
    time.sleep(0.3)


## Update Q Tables 

In [None]:
# SARSA training: learn Q-values on-policy over MAX_EPISODES episodes.
q_table = build_q_table(N_STATES, ACTIONS)
for episode in range(MAX_EPISODES):
    # Every episode starts in state 0 with a fresh step count.
    S, step_counter = 0, 0
    is_terminated = False
    update_env(S, episode, step_counter)

    # SARSA commits to the first action before entering the step loop.
    A = choose_action(S, q_table)
    while not is_terminated:
        S_, R = get_env_feedback(S, A)
        # The next action is chosen before the update so the target uses
        # the action that will actually be taken (on-policy).
        A_ = choose_action(S_, q_table)
        q_current = q_table.loc[S, A]

        # Reaching the last state ends the episode; no bootstrapping then.
        is_terminated = S_ == N_STATES - 1
        q_target = R if is_terminated else R + GAMMA * q_table.loc[S_, A_]

        # Move Q(S, A) a fraction ALPHA of the way towards the target.
        q_table.loc[S, A] = q_current + ALPHA * (q_target - q_current)
        S, A = S_, A_

        step_counter += 1
        update_env(S, episode, step_counter)
       
        


## Final Q Tables

In [None]:
print(q_table)

## Maximum Q-Value Policy

In [None]:
def choose_action_optimal(state, q_table):
    """Return the greedy action for ``state``.

    Args:
        state: Row label of the state in ``q_table``.
        q_table: DataFrame of Q-values (rows: states, columns: actions).

    Returns:
        The column label with the largest Q-value in that row; on ties the
        first such column is returned.
    """
    # idxmax is the pandas counterpart of argmax, returning a label.
    return q_table.loc[state, :].idxmax()

# Roll out the greedy (maximum Q-value) policy once from state 0 and count
# the steps it needs to reach the last state.
S = 0
step_counter = 0
is_terminated = False
# The greedy policy and the environment are both deterministic, so visiting
# a state a second time means the rollout would cycle forever (e.g. when a
# Q-value tie resolves to 'left' in state 0). Track visited states to stop.
visited_states = set()
while not is_terminated:
    if S in visited_states:
        print('\nGreedy policy revisits state {} and would never reach the goal; stopping.'.format(S))
        break
    visited_states.add(S)
    update_env(S, 0, step_counter)
    A = choose_action_optimal(S, q_table)
    S_, R = get_env_feedback(S, A)
    step_counter += 1
    S = S_
    # The last state is terminal.
    is_terminated = S == N_STATES - 1
print(' Total steps = ',step_counter)