In [None]:
!pip install cmake 'gym[atari]' scipy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Set up the environment

In [None]:
import gym

# `.unwrapped` yields the bare environment no matter how many wrappers
# `gym.make` stacks on top of it. Later cells assign `env.s`, index `env.P`
# and call `env.encode`, all of which must act on the environment itself;
# `.env` only peels off a single wrapper layer.
env = gym.make("Taxi-v3").unwrapped

env.render()

+---------+
|R: |[43m [0m: :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+



In [None]:
# Put the environment into state 301 by hand, then draw it.
env.s = 301
env.render()

+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|Y| : |B: |
+---------+



In [None]:
# Start over from a new, random state and draw it.
env.reset()
env.render()

# Report the sizes of the action and state spaces.
print(f"Action Space {env.action_space}")
print(f"State Space {env.observation_space}")

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |B: |
+---------+

Action Space Discrete(6)
State Space Discrete(500)


# State encoding and setting

In [None]:
# Arguments are (taxi row, taxi column, passenger index, destination index);
# the result is the single integer id of that state.
state = env.encode(4, 0, 3, 2)
state

414

In [None]:
# Arguments: (taxi row, taxi column, passenger index, destination index)
state = env.encode(4, 0, 3, 2)
print(f"State: {state}")

# Move the environment to the encoded state and draw it.
env.s = state
env.render()

State: 414
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[43mY[0m[0m| : |[34;1mB[0m: |
+---------+



In [None]:
# Look up the transition table entry for state 416: a dict keyed by action.
# Each value looks like [(probability, next state, reward, done)] — confirm against the gym docs.
env.P[416]

{0: [(1.0, 416, -1, False)],
 1: [(1.0, 316, -1, False)],
 2: [(1.0, 416, -1, False)],
 3: [(1.0, 416, -1, False)],
 4: [(1.0, 416, -10, False)],
 5: [(1.0, 408, -1, False)]}

In [None]:

# Look up the transition table entry for state 100 (a dict keyed by action).
env.P[100]

# Optional: uncomment to draw state 100.
# env.s = 100
# env.render()

{0: [(1.0, 200, -1, False)],
 1: [(1.0, 0, -1, False)],
 2: [(1.0, 120, -1, False)],
 3: [(1.0, 100, -1, False)],
 4: [(1.0, 100, -10, False)],
 5: [(1.0, 100, -10, False)]}

# Brute force approach


In [None]:
# Sample one action from the action space.
env.action_space.sample()

4

`Brute_force_approach(env, s)`: a function that takes an environment and a state and solves the environment by taking random actions, without reinforcement learning.
     

In [None]:
def Brute_force_approach(env,s):
  """Run one episode with purely random actions (no learning).

  Every step draws an action with ``env.action_space.sample()`` and applies
  it with ``env.step`` until the environment reports ``done``. The number of
  steps and the number of ``-10`` rewards (counted as penalties) are printed.

  Args:
    env: environment exposing ``s``, ``action_space.sample()``,
      ``step(action)`` returning ``(state, reward, done, info)`` and
      ``render(mode='ansi')``.
    s: state the environment is put into before the first step.

  Returns:
    list of dicts, one per step, with the keys ``'frame'``, ``'state'``,
    ``'action'`` and ``'reward'``.
  """
  # Start from the state the caller asked for (this used to be hard-coded to
  # 328, which silently ignored the `s` argument).
  env.s = s

  epochs = 0
  penalties = 0

  frames = []  # one record per step, for animation

  done = False

  while not done:

    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    if reward == -10:
      penalties += 1

    # Put each rendered frame into dict for animation
    frames.append({
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward
        }
    )

    epochs += 1

  print("Timesteps taken: {}".format(epochs))
  print("Penalties incurred: {}".format(penalties))

  # Hand the recording back so it can be replayed; it used to be discarded.
  return frames


In [None]:
s = 328  # illustration's state
env.s = s
# Bind whatever the call returns to a name so that it is kept for later use
# and not echoed as this cell's output.
frames = Brute_force_approach(env, s)

Timesteps taken: 936
Penalties incurred: 288


In [None]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames, delay=1):
    """Replay recorded frames one after another in the cell output.

    Args:
        frames: iterable of dicts with the keys 'frame', 'state', 'action'
            and 'reward'.
        delay: seconds to pause after each frame. Defaults to 1, the value
            that used to be hard-coded; pass a smaller value for a faster
            replay.
    """
    for timestep, frame in enumerate(frames, start=1):
        # Clear the previously shown output before drawing this frame.
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {timestep}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(delay)
        


In [None]:
import random
from IPython.display import clear_output
import numpy as np

# Scratch check: print "ok" when a uniform draw from [0, 1] lands below 0.3.
if 0.3 > random.uniform(0, 1):
    print("ok")

Training the agent with Q-learning. `training(env, a, g, e)` is a function that takes an environment and three hyperparameters — `a` (alpha, the learning rate), `g` (gamma, the discount factor) and `e` (epsilon, the exploration rate) — and solves the environment with reinforcement learning.
     - Results: the Q-table is a matrix with one row for every state (500) and one column for each of the 6 actions (south, north, east, west, pickup, dropoff).
     

In [None]:
def training (env,a,g,e,episodes=100000):
  """Train a Q-table on ``env`` with epsilon-greedy tabular Q-learning.

  Relies on the notebook-level imports of ``random``, ``numpy`` (as ``np``)
  and ``clear_output``.

  Args:
    env: environment exposing ``observation_space.n``, ``action_space.n``,
      ``action_space.sample()``, ``reset()`` returning the start state and
      ``step(action)`` returning ``(next_state, reward, done, info)``.
    a: learning rate (alpha).
    g: discount factor (gamma).
    e: exploration rate (epsilon): probability of taking a random action
      instead of the currently best-valued one.
    episodes: number of training episodes. Defaults to 100000, the value
      that used to be hard-coded.

  Returns:
    numpy array with one row per state and one column per action holding
    the learned Q-values.
  """
  # Initialize the q table
  q_table = np.zeros([env.observation_space.n, env.action_space.n])

  # Hyperparameters
  alpha = a
  gamma = g
  epsilon = e

  for i in range(1, episodes + 1):

    state = env.reset()
    done = False

    while not done:

      if random.uniform(0, 1) < epsilon:
        action = env.action_space.sample() # Explore action space
      else:
        action = np.argmax(q_table[state]) # Exploit learned values

      next_state, reward, done, info = env.step(action)

      old_value = q_table[state, action]
      next_max = np.max(q_table[next_state])

      # Blend the old estimate with the observed reward plus the discounted
      # best value reachable from the next state.
      new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
      q_table[state, action] = new_value

      state = next_state

    # Report progress once per 100 episodes. This check used to sit inside
    # the step loop, so it cleared and re-printed the output on every single
    # step of those episodes.
    if i % 100 == 0:
      clear_output(wait=True)
      print(f"Episode: {i}")

  print("Training finished.\n")
  return q_table


In [None]:
 # Hyperparameters
alpha, gamma, epsilon = 0.1, 0.6, 0.1  # learning rate, discount factor, exploration rate
q_table = training(env, alpha, gamma, epsilon)

Episode: 100000
Training finished.



In [None]:
# Display the bound `encode` method itself; nothing is called here.
env.encode

<bound method TaxiEnv.encode of <gym.envs.toy_text.taxi.TaxiEnv object at 0x7f936ca66190>>

# Evaluation

Evaluating the agent. `evaluation(q_table, env)` is a function that takes a Q-table and the environment and evaluates the performance of our agent. We don't need to explore actions any further, so the next action is always selected using the best Q-value.
     - Results: this function returns the average timesteps per episode, the average penalties per episode and the average rewards.

In [None]:
def evaluation (q_table,env,episodes=1000,max_steps=None):
  """Evaluate the greedy policy encoded in ``q_table``.

  In every episode the action with the highest Q-value for the current state
  is taken until the environment reports ``done``. Relies on the
  notebook-level import of ``numpy`` as ``np``.

  Args:
    q_table: table indexable by state that yields one value per action.
    env: environment exposing ``reset()`` returning the start state and
      ``step(action)`` returning ``(state, reward, done, info)``.
    episodes: number of evaluation episodes. Defaults to 1000, the value
      that used to be hard-coded.
    max_steps: optional cap on the steps of a single episode. ``None`` (the
      default) means no cap, as before; set it to stop a greedy policy that
      never reaches a terminal state from looping forever.

  Returns:
    Tuple ``(average timesteps, average penalties, average reward)`` per
    episode, where a penalty is a step with reward ``-10`` and the reward is
    the sum of all step rewards of the episode.

  Raises:
    ValueError: if ``episodes`` is not positive.
  """
  if episodes <= 0:
    raise ValueError("episodes must be a positive integer")

  total_epochs, total_penalties, total_rewards = 0, 0, 0

  for _ in range(episodes):

    state = env.reset()
    epochs, penalties, episode_reward = 0, 0, 0

    done = False

    while not done and (max_steps is None or epochs < max_steps):
      action = np.argmax(q_table[state])
      state, reward, done, info = env.step(action)

      if reward == -10:
        penalties += 1

      # Accumulate the reward of every step. Previously only the reward of
      # the final step was added after the loop, which made the "average
      # reward" equal to the terminal reward regardless of the policy.
      episode_reward += reward
      epochs += 1

    total_penalties += penalties
    total_epochs += epochs
    total_rewards += episode_reward

  avr_time_stampe = total_epochs / episodes
  avr_penalties = total_penalties / episodes
  avr_rewards = total_rewards / episodes

  print(f"Results after {episodes} episodes:")
  print(f"Average timesteps per episode: {avr_time_stampe}")
  print(f"Average penalties per episode: {avr_penalties}")
  return avr_time_stampe,avr_penalties,avr_rewards
  



In [None]:
# Evaluate the trained Q-table; the function prints its summary and returns the three averages.
avr_time_stampe,avr_penalties,avr_rewards=evaluation (q_table,env)

Results after 1000 episodes:
Average timesteps per episode: 13.092
Average penalties per episode: 0.0


TUNING

Tuning: the values of `alpha`, `gamma`, and `epsilon` should all decrease over time as the agent continues to learn. Give the parameters (alpha, gamma, epsilon) different values in the range 0.1 to 0.9 and see which ones affect the agent's behaviour in the environment.

In [None]:
%%time
"""Training the agent"""

import random
from IPython.display import clear_output
import numpy as np

# Initialize the q table
q_table = np.zeros([env.observation_space.n, env.action_space.n])

# Hyperparameters
alpha = 0.7
gamma = 0.7
epsilon = 0.6

# For plotting metrics
all_epochs = []
all_penalties = []
count=0
for i in range(1, 100001):
  
  state = env.reset()

    
  epochs, penalties, reward, = 0, 0, 0
  done = False
    
  while not done:
      if random.uniform(0, 1) < epsilon:
          action = env.action_space.sample() # Explore action space
      else:
          action = np.argmax(q_table[state]) # Exploit learned values

      next_state, reward, done, info = env.step(action) 
        
      old_value = q_table[state, action]
      next_max = np.max(q_table[next_state])
    
      new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
      q_table[state, action] = new_value

      if reward == -10:
          penalties += 1

      state = next_state
      epochs += 1
        
      if i % 100 == 0:
        alpha =alpha*0.9
        gamma =gamma*0.9
        epsilon =epsilon*0.9
        clear_output(wait=True)
        print(f"Episode: {i}")
      count+=1
print("Training finished.\n")

Create a grid search. Training with 10 values for every parameter takes a number of hours, so I use only 3 values for each parameter.

In [None]:
import itertools

# Three candidate values for each hyperparameter.
alpha_list = [0.2, 0.6, 0.8]
gamma_list = [0.2, 0.6, 0.8]
epsilon_list = [0.2, 0.6, 0.8]
lis = [alpha_list, gamma_list, epsilon_list]

# Cartesian product of the three lists: one (alpha, gamma, epsilon) tuple
# per combination.
grid_search = [
    combo for combo in itertools.product(alpha_list, gamma_list, epsilon_list)
]



Hyperparameter tuning: run a grid search and find which parameters give the highest evaluation, using the formula `rewords / (avr_penalties + avr_time_stampe)`,
to choose the parameters that let us get the maximum reward as fast as possible.

Note: the hyperparameter tuning takes more than 3 hours to train the agent with 3 values for every parameter.

In [None]:
all_Evaluations = []
for i, l in enumerate(grid_search):
  # Each grid entry is an (alpha, gamma, epsilon) tuple.
  alpha, gamma, epsilon = l

  # Train with this combination, then measure the resulting greedy policy.
  q_table = training(env, alpha, gamma, epsilon)
  avr_time_stampe, avr_penalties, rewords = evaluation(q_table, env)

  # Score: average reward divided by (average penalties + average timesteps).
  eva = rewords / (avr_penalties + avr_time_stampe)
  all_Evaluations.append(eva)


Episode: 100000
Training finished.

Results after 1000 episodes:
Average timesteps per episode: 13.234
Average penalties per episode: 0.0


In [None]:
# Show the score collected for each grid entry evaluated so far.
all_Evaluations

[1.5124016938898972, 1.511258878645912]

In [None]:
# Find the first position holding the highest score, then show the
# (alpha, gamma, epsilon) combination stored at that position in the grid.
high_parameter, max_evaluation = max(
    enumerate(all_Evaluations), key=lambda pair: pair[1]
)
grid_search[high_parameter]

(0.2, 0.2, 0.2)