## Taxi-v3 using Q-Learning

In [None]:
import gym
import numpy as np
from collections import deque
import sys
import math

In [None]:
import numpy as np
from collections import defaultdict
import random

class Agent:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy."""

    def __init__(self, nA=6, eps=0.1, gamma=1, alpha=0.08):
        """ Initialize agent.

        Params
        ======
        - nA: number of actions available to the agent
        - eps: initial exploration rate (probability of a random action)
        - gamma: discount factor applied to the bootstrapped value
        - alpha: learning rate of the Q-table update
        """
        self.nA = nA
        # Unseen states get an all-zero row of action values on first access.
        self.Q = defaultdict(lambda: np.zeros(self.nA))
        # Keep the starting value so the decay schedule can always be computed
        # from it instead of from the already-decayed value.
        self.eps_start = eps
        self.eps = eps
        self.gamma = gamma
        self.alpha = alpha

    def select_action(self, state, i):
        """ Given the state, select an action (epsilon-greedy).

        Params
        ======
        - state: the current state of the environment
        - i: 1-based episode index; the exploration rate is eps_start / i

        Returns
        =======
        - action: an integer, compatible with the task's action space
        """
        # This method runs once per environment step, so dividing the current
        # epsilon by i here would compound the decay within an episode.
        # Deriving it from the initial value keeps epsilon constant during an
        # episode and decaying as 1/i across episodes.
        self.eps = self.eps_start / i

        if random.random() > self.eps:
            # exploit: action with the highest current estimate
            return np.argmax(self.Q[state])
        # explore: uniformly random action index
        return np.random.choice(self.nA)

    def step(self, state, action, reward, next_state, done):
        """ Update the agent's knowledge, using the most recently sampled tuple.

        Params
        ======
        - state: the previous state of the environment
        - action: the agent's previous choice of action
        - reward: last reward received
        - next_state: the current state of the environment
        - done: whether the episode is complete (True or False); accepted for
          interface compatibility but not used by the update

        Returns
        =======
        - Q: the agent's Q-table (the same object as self.Q)
        """
        # Q-learning target: reward plus discounted best value of the next state.
        target = reward + self.gamma * np.max(self.Q[next_state])
        current = self.Q[state][action]
        self.Q[state][action] = current + self.alpha * (target - current)
        return self.Q

  and should_run_async(code)


In [None]:
def interact(env, agent, num_episodes=20000, window=100):
    """ Run the agent in the environment and monitor its performance.

    Params
    ======
    - env: environment exposing reset() -> state and
      step(action) -> (next_state, reward, done, info)
    - agent: object exposing select_action(state, i_episode) and
      step(state, action, reward, next_state, done)
    - num_episodes: number of episodes of agent-environment interaction
    - window: number of episodes to consider when calculating average rewards

    Returns
    =======
    - avg_rewards: deque containing average rewards
    - best_avg_reward: largest value in the avg_rewards deque
    - Q: the value returned by the last agent.step call; if no step was taken,
      the agent's `Q` attribute (or None when the agent has none)
    """
    # initialize average rewards
    avg_rewards = deque(maxlen=num_episodes)
    # initialize best average reward
    best_avg_reward = -math.inf
    # initialize monitor for most recent rewards
    samp_rewards = deque(maxlen=window)
    # make sure Q is bound even when the loop body never runs (num_episodes == 0)
    Q = getattr(agent, "Q", None)
    # for each episode
    for i_episode in range(1, num_episodes + 1):
        # begin the episode
        state = env.reset()
        # initialize the sampled reward
        samp_reward = 0
        while True:
            # agent selects an action
            action = agent.select_action(state, i_episode)
            # agent performs the selected action
            next_state, reward, done, _ = env.step(action)
            # agent performs internal updates based on sampled experience
            Q = agent.step(state, action, reward, next_state, done)
            # update the sampled reward
            samp_reward += reward
            # update the state (s <- s') to next time step
            state = next_state
            if done:
                # save final sampled reward
                samp_rewards.append(samp_reward)
                break
        # only start averaging once a full window of episodes is available
        if i_episode >= window:
            # get average reward from the last `window` episodes
            avg_reward = np.mean(samp_rewards)
            # append to deque
            avg_rewards.append(avg_reward)
            # update best average reward
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward
        # monitor progress
        print("\rEpisode {}/{} || Best average reward {}".format(i_episode, num_episodes, best_avg_reward), end="")
        sys.stdout.flush()
        # check if task is solved (according to OpenAI Gym)
        if best_avg_reward >= 9.7:
            print('\nEnvironment solved in {} episodes.'.format(i_episode), end="")
            break
        if i_episode == num_episodes:
            print('\n')
    return avg_rewards, best_avg_reward, Q

In [None]:
# Create the Taxi-v3 environment; this instance is the one passed to interact() for training.
env = gym.make('Taxi-v3')

  deprecation(
  deprecation(


In [None]:
# Take a handful of random steps first, to get a feel for what the game looks like.

rew_tot = 0
obs = env.reset()
env.render()
frames = []
for _ in range(6):
    # pick a random action from the environment's action space
    action = env.action_space.sample()
    obs, rew, done, info = env.step(action)
    rew_tot += rew
    # keep a text rendering of the board together with the transition details
    frames.append(
        dict(frame=env.render(mode='ansi'), state=obs, action=action, reward=rew)
    )
# Print the total reward collected by these random actions
print("Reward: %r" % rew_tot)

If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


Reward: -15


  if not isinstance(terminated, (bool, np.bool8)):
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


In [None]:
def print_frames(frames, delay=0.5):
    """Replay recorded frames one after another in the cell output.

    Params
    ======
    - frames: iterable of dicts with the keys 'frame', 'state', 'action'
      and 'reward'
    - delay: seconds to pause after each frame (default 0.5)
    """
    for timestep, frame in enumerate(frames, start=1):
        # wait=True defers clearing until new output arrives, which avoids flicker
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {timestep}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(delay)

In [None]:
from IPython.display import clear_output
from time import sleep
# Replay the frames recorded during the random steps above.
print_frames(frames)

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| :[43m [0m|B: |
+---------+
  (East)

Timestep: 6
State: 449
Action: 2
Reward: -1


In [None]:
# Build the agent with eps=1, discount factor gamma=0.9 and learning rate alpha=0.7.
agent = Agent(eps=1,gamma=0.9, alpha = 0.7)
# Train for up to 5000 episodes; `q` receives the Q-table returned by interact().
avg_rewards, best_avg_reward,q = interact(env, agent,num_episodes=5000)

Episode 5000/5000 || Best average reward 8.49



In [None]:
# NOTE: this cell redefines print_frames, which is already defined in an earlier
# cell of this notebook; the later definition is the one in effect from here on.
def print_frames(frames, delay=0.5):
    """Replay recorded frames one after another in the cell output.

    Params
    ======
    - frames: iterable of dicts with the keys 'frame', 'state', 'action'
      and 'reward'
    - delay: seconds to pause after each frame (default 0.5)
    """
    for timestep, frame in enumerate(frames, start=1):
        # wait=True defers clearing until new output arrives, which avoids flicker
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {timestep}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(delay)

In [None]:
env_test = gym.make("Taxi-v3").env
# Reset FIRST: reset() draws a fresh random start state, so calling it after
# assigning the state would discard the hand-picked configuration.
env_test.reset()
# (taxi row, taxi column, pickup index, destination index)
state1 = env_test.unwrapped.encode(4, 1, 2, 1)
# Assign on the unwrapped base environment: setting an attribute on a wrapper
# object would not reach the underlying Taxi environment's state.
env_test.unwrapped.s = state1
env_test.render()

In [None]:
# Roll out the greedy policy of the learned Q-table and record every frame.
done = False
frames = []
# env_test was created through `.env`, which presumably drops the outer
# time-limit wrapper (TODO confirm for the installed gym version). Cap the
# rollout so a policy that never reaches the goal cannot loop forever;
# 200 matches the step limit Taxi-v3 is registered with.
max_steps = 200
for _ in range(max_steps):
    # always take the action with the highest learned value
    action = np.argmax(q[state1])

    state1, reward, done, info = env_test.step(action)

    frames.append({
        'frame': env_test.render(mode='ansi'),
        'state': state1,
        'action': action,
        'reward': reward
        }
    )
    if done:
        break

### This RL agent picks up the customer (shown in blue) and drops them off at the destination (shown in magenta)

In [None]:
from IPython.display import clear_output
from time import sleep
# Replay the frames recorded during the greedy rollout of the trained agent.
print_frames(frames)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)

Timestep: 15
State: 475
Action: 5
Reward: 20
