# SARSA using NumPy

In [None]:
import numpy as np
import time

## Parameters

In [None]:
N_STATES = 6   # number of states in the 1-D world (the last index is the terminal state)
N_ACTIONS = 2  # number of actions (0 = move left, 1 = move right)
EPSILON = 0.2  # epsilon-greedy policy: probability of picking a random action
ALPHA = 0.1     # learning rate
GAMMA = 0.9    # discount factor
MAX_EPISODES = 10   # number of training episodes

## Initialize the Q Table

In [None]:
# Action-value table: one row per state, one column per action, all zeros
# before any learning has happened.
Q = np.zeros(shape=(N_STATES, N_ACTIONS))
print(Q)

## Epsilon Greedy Action

In [None]:
def choose_action(S, Q, epsilon=None):
    """Select an action for state ``S`` with an epsilon-greedy rule.

    Parameters
    ----------
    S : int
        Row index of the current state in ``Q``.
    Q : numpy.ndarray
        Action-value table with one row per state and one column per action.
    epsilon : float, optional
        Probability of exploring (picking a uniformly random action).
        When omitted, the module-level ``EPSILON`` is used, so existing
        two-argument calls behave as before. Pass ``0.0`` for a purely
        greedy choice.

    Returns
    -------
    int
        Index of the chosen action (a column of ``Q``).
    """
    if epsilon is None:
        epsilon = EPSILON

    if np.random.random() < epsilon:
        # Explore: the number of actions is read from the table itself so the
        # function stays correct for any Q it is handed.
        return np.random.randint(0, Q.shape[1])

    # Exploit: np.argmax returns the first maximum, so ties resolve to the
    # lowest action index.
    return np.argmax(Q[S, :])

## Env Feedback

In [None]:
def get_env_feedback(S, A):
    """Return the environment's response to taking action ``A`` in state ``S``.

    Action ``1`` moves one cell to the right; any other action moves one cell
    to the left (staying in place at state 0).

    Returns
    -------
    tuple
        ``(S_, R)``: the next state and the reward. Moving left always costs
        -1, an ordinary move right yields +1, and the move right that enters
        the terminal state ``N_STATES - 1`` yields +10.
    """
    if A != 1:
        # Move left: the wall at state 0 keeps the agent where it is.
        next_state = S if S == 0 else S - 1
        return next_state, -1

    # Move right.
    goal = N_STATES - 1
    if S == goal - 1:
        # Stepping from the cell just before the goal into the goal.
        return goal, 10
    return S + 1, 1


## Update Env

In [None]:
def update_env(S, episode, step_counter):
    """Print the current state of the corridor, then pause briefly.

    For a non-terminal state the corridor is redrawn on the current line with
    ``'o'`` marking the agent and ``'T'`` marking the terminal cell. For the
    terminal state a summary with the 1-based episode number and the step
    count is printed instead.
    """
    goal = N_STATES - 1
    if S == goal:
        print(' Episode {}: total_steps = {}'.format(episode+1,step_counter))
    else:
        cells = ['-'] * goal + ['T']   # e.g. '-----T' for six states
        cells[S] = 'o'
        # '\r' returns to the start of the line so each frame overwrites the last.
        print('\r{}'.format(''.join(cells)), end='')
    # Same delay in both cases so the animation can be followed by eye.
    time.sleep(0.3)

## Reinforcement Learning using SARSA to update the Q Table

In [None]:
for episode in range(MAX_EPISODES):
    # Every episode starts in the left-most state with a fresh step count.
    S, step_counter, is_terminated = 0, 0, False
    update_env(S, episode, step_counter)

    # SARSA is on-policy: the action is chosen before the step is taken, and
    # the action chosen for the next state is the one used in the update.
    A = choose_action(S, Q)
    while not is_terminated:
        S_, R = get_env_feedback(S, A)
        A_ = choose_action(S_, Q)

        is_terminated = (S_ == N_STATES - 1)
        q_current = Q[S, A]
        # No bootstrapping from the terminal state: its value is taken as 0.
        q_target = R if is_terminated else R + GAMMA*Q[S_, A_]
        Q[S, A] = q_current + ALPHA * (q_target - q_current)

        # Advance to the next state/action pair and redraw the corridor.
        S, A = S_, A_
        step_counter += 1
        update_env(S, episode, step_counter)

## Final Q Table

In [None]:
# Show the learned action values (rows = states, columns = actions).
print(Q)

## Maximum Q-Value Policy

In [None]:
# Follow the greedy (maximum Q-value) policy from the start state until the
# terminal state is reached, counting the steps taken.
S, step_counter, is_terminated = 0, 0, False
while not is_terminated:
    update_env(S, 0, step_counter)
    A = Q[S, :].argmax()          # greedy action, no exploration
    S_, R = get_env_feedback(S, A)
    S = S_
    step_counter += 1
    is_terminated = (S == N_STATES - 1)
print(' Total steps = ',step_counter)

## Policy and Value Function

In [None]:
# Derive the greedy policy (best action per state) and the state-value
# function (best action value per state) from the learned Q table.
V = np.zeros(N_STATES)
policy = {}
for S in range(N_STATES):
    action_values = Q[S, :]
    policy[S] = action_values.argmax()
    V[S] = action_values.max()
print('policy :', policy)
print('value function: ', V)