# In [4]:  (Jupyter notebook cell marker)
import numpy as np
from collections import defaultdict
import random
import numpy as np

class Agent:
    """Tabular temporal-difference agent backed by an action-value table.

    The table ``self.Q`` maps each state to a NumPy array holding one value
    per action.  ``step`` applies a single Sarsa-style update per transition;
    ``select_action`` picks the action to take.
    """

    def __init__(self, nA=6, alpha=0.1, gamma=1.0):
        """ Initialize agent.

        Params
        ======
        - nA: number of actions available to the agent
        - alpha: step size applied by `step` to each update
        - gamma: discount factor applied by `step` to the bootstrap term
        """
        self.nA = nA
        self.alpha = alpha
        self.gamma = gamma
        # States that have not been visited yet read as an all-zero row.
        self.Q = defaultdict(lambda: np.zeros(self.nA))

    @staticmethod
    def epsilon_greedy(Q, state, nA, eps):
        """ Select an epsilon-greedy action for the supplied state.

        Params
        ======
        - Q: action-value table, indexable as Q[state][action]
        - state: the state to act in
        - nA: number of actions in the environment
        - eps: probability of choosing a uniformly random action

        Returns
        =======
        - action: index of the chosen action
        """
        # Exploit with probability (1 - eps), otherwise explore uniformly.
        if random.random() > eps:
            return np.argmax(Q[state])
        return random.choice(np.arange(nA))

    @staticmethod
    def updted_Q_sarsa(alpha, gamma, Q, state, action, reward,
                       next_state=None, next_action=None):
        """ Return the updated Q-value for the most recent experience.

        Params
        ======
        - alpha: step size
        - gamma: discount factor
        - Q: action-value table, indexable as Q[state][action]
        - state, action: the state-action pair being updated
        - reward: reward received after taking `action` in `state`
        - next_state, next_action: the following pair; leave `next_state`
          as None for a terminal transition so nothing is bootstrapped

        Returns
        =======
        - the new estimate for Q[state][action] (Q itself is not modified)
        """
        current = Q[state][action]
        # A terminal transition has no successor value to bootstrap from.
        if next_state is None:
            next_value = 0
        else:
            next_value = Q[next_state][next_action]
        target = reward + gamma * next_value
        return current + alpha * (target - current)

    # Correctly spelled alias; the original misspelled name is kept so that
    # existing callers continue to work.
    update_Q_sarsa = updted_Q_sarsa

    def select_action(self, state):
        """ Given the state, select an action.

        The action is drawn uniformly at random; neither `state` nor the
        learned table `self.Q` is consulted.

        Params
        ======
        - state: the current state of the environment

        Returns
        =======
        - action: an integer, compatible with the task's action space
        """
        return np.random.choice(self.nA)

    def step(self, num_episodes, state, action, reward, next_state, done,
             plot_every=100):
        """ Update the agent's knowledge, using the most recently sampled tuple.

        Applies one Sarsa-style update to `self.Q[state][action]`.  For a
        non-terminal transition the bootstrap action is drawn epsilon-greedily
        from `self.Q[next_state]`; for a terminal one the target is the
        reward alone.

        Params
        ======
        - num_episodes: sets the exploration rate of the bootstrap action,
          eps = 1 / num_episodes (eps = 1 when it is missing or not positive)
          NOTE(review): carried over from the earlier `1.0 / i_episode`
          schedule -- confirm this is what callers intend to pass here
        - state: the previous state of the environment
        - action: the agent's previous choice of action
        - reward: last reward received
        - next_state: the current state of the environment
        - done: whether the episode is complete (True or False)
        - plot_every: accepted for backward compatibility; not used

        Returns
        =======
        - Q: the agent's action-value table (`self.Q`) after the update
        """
        if done:
            # Signal "no successor" to the update rule.
            next_state = None
            next_action = None
        else:
            if num_episodes and num_episodes > 0:
                eps = 1.0 / num_episodes
            else:
                eps = 1.0
            next_action = self.epsilon_greedy(self.Q, next_state, self.nA, eps)

        self.Q[state][action] = self.updted_Q_sarsa(
            self.alpha, self.gamma, self.Q,
            state, action, reward, next_state, next_action,
        )
        return self.Q