# Install the required packages

In [None]:
!pip install cmake 'gym[atari]' scipy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Importing the main libraries

In [None]:
from numpy import array
from math import inf
from numpy.linalg import norm
import gym
from IPython.display import clear_output
from time import sleep
import numpy as np
import random
from IPython.display import clear_output

# Defining environment function

In [None]:
def render(env):
    """Create a Gym environment by id, reset it, display it and return it.

    Args:
        env: Environment id passed to ``gym.make`` (for example 'Taxi-v3').

    Returns:
        The environment created by ``gym.make``, already reset.
    """
    environment = gym.make(env)
    # Reset first so the frame that is displayed is the state the caller
    # actually receives; rendering before the reset showed a state that was
    # immediately replaced.
    environment.reset()
    environment.render()
    print("Action Space {}".format(environment.action_space))
    print("State Space {}".format(environment.observation_space))
    return environment

In [None]:
# Build the Taxi-v3 environment; render() also prints its action and state spaces.
env = render('Taxi-v3')

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+

Action Space Discrete(6)
State Space Discrete(500)


In [None]:
# Encode an illustrative state from its four components.
state = env.encode(3, 1, 2, 0)
print("State:", state)
# gym.make returns a wrapper; assigning `env.s` would only create a new
# attribute on that wrapper and leave the real environment untouched, so the
# state has to be written to the unwrapped environment.
env.unwrapped.s = state
env.render()
# Transition table for state 328: {action: [(probability, next_state, reward, done)]}
env.P[328]

State: 328
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| :[43m [0m|[34;1mB[0m: |
+---------+



{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}

In [None]:
# Write the state to the underlying environment; setting `env.s` on the
# wrapper returned by gym.make does not change the wrapped environment.
env.unwrapped.s = 328
def incorrect_time(env):
    """Drive ``env`` with randomly sampled actions until it reports done.

    The environment is not reset here; stepping starts from whatever state
    ``env`` is currently in.

    Args:
        env: Object exposing ``action_space.sample()``, ``step(action)``
            returning ``(state, reward, done, info)`` and
            ``render(mode='ansi')``.

    Returns:
        A list with one dict per step holding the rendered 'frame', the
        resulting 'state', the sampled 'action' and the 'reward'.
    """
    history = []
    penalty_count = 0
    finished = False
    while not finished:
        chosen = env.action_space.sample()
        observed, reward, finished, _ = env.step(chosen)
        # A reward of exactly -10 is what this notebook counts as a penalty.
        penalty_count += 1 if reward == -10 else 0
        history.append(dict(
            frame=env.render(mode='ansi'),
            state=observed,
            action=chosen,
            reward=reward,
        ))
    # One record is stored per step, so the history length is the step count.
    print("Timesteps taken: {}".format(len(history)))
    print("Penalties incurred: {}".format(penalty_count))
    return history

In [None]:
# Step the Taxi environment with random actions and keep the per-step records.
frames = incorrect_time(env)

Timesteps taken: 200
Penalties incurred: 72


# Defining a function to show frames






In [None]:
def print_frames(frames):
    """Replay recorded steps one at a time in the cell output.

    Args:
        frames: Iterable of dicts with the keys 'frame', 'state', 'action'
            and 'reward'.

    Each record replaces the previous output, and the function pauses for
    0.1 seconds after printing it.
    """
    for step_number, snapshot in enumerate(frames, start=1):
        # wait=True defers the clear until new output arrives.
        clear_output(wait=True)
        print(snapshot['frame'])
        print(f"Timestep: {step_number}")
        for label in ("State", "Action", "Reward"):
            print(f"{label}: {snapshot[label.lower()]}")
        sleep(.1)
# Replay the recorded random-agent steps in the cell output.
print_frames(frames)

+---------+
|R: | : :[35m[43mG[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (North)

Timestep: 200
State: 93
Action: 1
Reward: -1


# Defining the main function for training

In [None]:
def training(env, alpha=0.1, gamma=0.6, epsilon=0.1, episodes=100000):
    """Train a tabular Q-learning agent with an epsilon-greedy policy.

    Args:
        env: Environment with discrete ``observation_space.n`` and
            ``action_space.n``, ``reset()`` returning a state index and
            ``step(action)`` returning ``(next_state, reward, done, info)``.
        alpha: Learning rate used in the Q-value update.
        gamma: Discount factor applied to the best next-state value.
        epsilon: Probability of taking a random action instead of the
            greedy one.
        episodes: Number of training episodes to run.

    Returns:
        The learned Q-table as an array of shape (n_states, n_actions).
    """
    q_table = np.zeros([env.observation_space.n, env.action_space.n])

    for i in range(1, episodes + 1):
        state = env.reset()
        done = False

        while not done:
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()  # Explore action space
            else:
                action = np.argmax(q_table[state])  # Exploit learned values

            next_state, reward, done, info = env.step(action)

            old_value = q_table[state, action]
            next_max = np.max(q_table[next_state])

            # Blend the old estimate with the bootstrapped one-step target.
            q_table[state, action] = (
                (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
            )

            state = next_state

        # Report progress every 100 episodes, replacing the previous line.
        if i % 100 == 0:
            clear_output(wait=True)
            print(f"Episode: {i}")

    print("Training finished.\n")
    return q_table

In [None]:
# Train on the Taxi environment, then display the resulting Q-table.
q_table = training(env)
q_table

Episode: 100000
Training finished.



array([[  0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ],
       [ -2.41837066,  -2.3639511 ,  -2.41837065,  -2.36395106,
         -2.27325184, -11.36395098],
       [ -1.87014398,  -1.45024005,  -1.870144  ,  -1.45024004,
         -0.7504    , -10.45023897],
       ...,
       [ -1.03958441,   0.41599995,  -1.01106646,  -1.27151752,
         -4.6641925 ,  -4.75285588],
       [ -2.14527308,  -2.1220628 ,  -2.14621956,  -2.12206152,
         -6.77056354,  -4.46843968],
       [  2.82610541,   1.46325204,   2.51976598,  11.        ,
         -2.768596  ,  -1.45600185]])

# Evaluation function after training


> Calculating the average time steps and penalties per episode



In [None]:
def apply_evaluate(q_table,env):
    """Run the greedy policy from ``q_table`` for 100 episodes and print averages.

    Args:
        q_table: Array indexed as ``q_table[state]`` giving one value per action.
        env: Environment with ``reset()`` returning a state and
            ``step(action)`` returning ``(state, reward, done, info)``.

    Prints the average number of steps and the average number of penalties
    (rewards equal to -10) per episode. Returns nothing.
    """
    episodes = 100
    step_counts = []
    penalty_counts = []

    for _ in range(episodes):
        state = env.reset()
        steps_taken = 0
        penalized = 0
        finished = False

        while not finished:
            # Always take the highest-valued action; no exploration here.
            state, reward, finished, _info = env.step(np.argmax(q_table[state]))
            if reward == -10:
                penalized += 1
            steps_taken += 1

        step_counts.append(steps_taken)
        penalty_counts.append(penalized)

    print(f"Results after {episodes} episodes:")
    print(f"Average timesteps per episode: {sum(step_counts) / episodes}")
    print(f"Average penalties per episode: {sum(penalty_counts) / episodes}")


In [None]:
# Evaluate the greedy policy derived from the Taxi Q-table.
apply_evaluate(q_table,env)

Results after 100 episodes:
Average timesteps per episode: 12.74
Average penalties per episode: 0.0


In [None]:
# Repeat the workflow on the FrozenLake-v0 environment.
env_b = render('FrozenLake-v0')


[41mS[0mFFF
FHFH
FFFH
HFFG
Action Space Discrete(4)
State Space Discrete(16)


In [None]:
# Random-action baseline on FrozenLake; a penalty is only counted for a reward of -10.
frames_2 = incorrect_time(env_b)

Timesteps taken: 5
Penalties incurred: 0


In [None]:
# Replay the recorded FrozenLake steps.
print_frames(frames_2)

  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG

Timestep: 5
State: 5
Action: 1
Reward: 0.0


In [None]:
# Train a Q-table on FrozenLake.
q_table_b = training(env_b)

Episode: 100000
Training finished.



In [None]:
# Display the FrozenLake Q-table (one row per state, one column per action).
q_table_b

array([[6.45811256e-04, 1.39009412e-03, 8.01426734e-04, 6.65151561e-04],
       [5.98032581e-04, 1.26957527e-03, 7.23996781e-04, 1.83415759e-03],
       [5.30691661e-03, 2.26716351e-03, 3.47446943e-03, 1.43043957e-03],
       [8.41343080e-04, 1.19459329e-03, 7.67681286e-04, 8.37178620e-04],
       [1.47796868e-03, 2.50911636e-03, 1.59690554e-03, 7.78437162e-04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.35877142e-02, 9.08746288e-03, 7.54278803e-03, 1.61139610e-03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.81870935e-03, 7.65187898e-03, 8.20222094e-03, 4.93953252e-03],
       [1.74396077e-02, 4.16208375e-02, 2.94600915e-02, 2.21194719e-02],
       [3.08081957e-02, 5.03243636e-02, 9.04127706e-02, 9.77073758e-03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.99172575e-02, 7.39256117e-02, 7.59393940e

In [None]:
# Evaluate the greedy FrozenLake policy; the penalty figure only counts rewards of -10.
apply_evaluate(q_table_b,env_b)

Results after 100 episodes:
Average timesteps per episode: 8.88
Average penalties per episode: 0.0


In [None]:
# Repeat the workflow on the CliffWalking-v0 environment.
env_c = render('CliffWalking-v0')

o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
x  C  C  C  C  C  C  C  C  C  C  T

Action Space Discrete(4)
State Space Discrete(48)


In [None]:
# Random-action baseline on CliffWalking; runs until the environment reports done.
frames_3 = incorrect_time(env_c)

Timesteps taken: 2556
Penalties incurred: 0


In [None]:
# Train a Q-table on CliffWalking.
q_table_c = training(env_c)

Episode: 100000
Training finished.



In [None]:
# Evaluate the greedy CliffWalking policy.
apply_evaluate(q_table_c,env_c)

Results after 100 episodes:
Average timesteps per episode: 13.0
Average penalties per episode: 0.0


# Hyperparameter tuning function

In [None]:
def parameters_deg(env, alpha, gamma, epsilon, episodes=100000, decay=0.0001,
                   min_alpha=0.01, min_epsilon=0.01):
    """Train a Q-table while decaying the learning and exploration rates.

    The rates are decayed once per episode and clamped to a floor. Decaying
    after every step shrank ``alpha`` to practically zero within the first
    few hundred 200-step episodes, so the table stopped changing long before
    a usable policy was learned. ``gamma`` is kept constant because it
    defines the return being optimized; shrinking it during training changes
    the target the earlier updates were made against.

    Args:
        env: Environment with discrete ``observation_space.n`` and
            ``action_space.n``, ``reset()`` returning a state index and
            ``step(action)`` returning ``(next_state, reward, done, info)``.
        alpha: Initial learning rate.
        gamma: Discount factor (not decayed).
        epsilon: Initial probability of taking a random action.
        episodes: Number of training episodes to run.
        decay: Fraction by which ``alpha`` and ``epsilon`` shrink after each
            episode.
        min_alpha: Lower bound for the decayed learning rate.
        min_epsilon: Lower bound for the decayed exploration rate.

    Returns:
        The learned Q-table as an array of shape (n_states, n_actions).
    """
    q_table = np.zeros([env.observation_space.n, env.action_space.n])

    # Never raise a rate above the value the caller asked for.
    alpha_floor = min(min_alpha, alpha)
    epsilon_floor = min(min_epsilon, epsilon)

    for i in range(1, episodes + 1):
        state = env.reset()
        done = False

        while not done:
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()  # Explore action space
            else:
                action = np.argmax(q_table[state])  # Exploit learned values

            next_state, reward, done, info = env.step(action)

            first = q_table[state, action]
            next_max = np.max(q_table[next_state])
            q_table[state, action] = (
                (1 - alpha) * first + alpha * (reward + gamma * next_max)
            )

            state = next_state

        # Per-episode decay, clamped so learning and exploration never stop.
        alpha = max(alpha_floor, alpha * (1 - decay))
        epsilon = max(epsilon_floor, epsilon * (1 - decay))

        if i % 100 == 0:
            clear_output(wait=True)
            print(f"Episode: {i}")

    print("Training finished.\n")
    return q_table

In [None]:
# Create a separate Taxi-v3 environment for the decaying-hyperparameter run.
env_hyper = render('Taxi-v3')

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m|[43m [0m: |B: |
+---------+

Action Space Discrete(6)
State Space Discrete(500)


In [None]:
# Random-action baseline on the new environment; note this rebinds `frames` from the earlier Taxi run.
frames = incorrect_time(env_hyper)

Timesteps taken: 200
Penalties incurred: 77


In [None]:
# Train with alpha, gamma and epsilon all starting at 0.2.
q_table_d = parameters_deg(env_hyper,0.2,0.2,0.2)

Episode: 100000
Training finished.



In [None]:
# Evaluate the greedy policy from the decaying-hyperparameter Q-table.
apply_evaluate(q_table_d,env_hyper)

Results after 100 episodes:
Average timesteps per episode: 200.0
Average penalties per episode: 0.0


# Evaluation function for grid search

In [None]:
def grid_search_evaluation(q_table,env):
    """Score the greedy policy from ``q_table`` over 100 episodes.

    Args:
        q_table: Array indexed as ``q_table[state]`` giving one value per action.
        env: Environment with ``reset()`` returning a state and
            ``step(action)`` returning ``(state, reward, done, info)``.

    Returns:
        ``(average_timesteps, average_penalties)`` per episode, where a
        penalty is a step whose reward equals -10. The same figures are
        also printed.
    """
    episodes = 100
    step_total = 0
    penalty_total = 0

    for _ in range(episodes):
        state = env.reset()
        done = False
        while not done:
            greedy_action = np.argmax(q_table[state])
            state, reward, done, info = env.step(greedy_action)
            # Accumulate straight into the totals; per-episode counts are not needed.
            penalty_total += 1 if reward == -10 else 0
            step_total += 1

    average_timesteps = step_total / episodes
    average_penalties = penalty_total / episodes
    print(f"Results after {episodes} episodes:")
    print(f"Average timesteps per episode: {average_timesteps}")
    print(f"Average penalties per episode: {average_penalties}")
    return average_timesteps,average_penalties

# Training for grid search

In [None]:
def training_grid_search(env,alpha,gamma,epsilon,episodes=100000):
    """Train a tabular Q-learning agent for one hyperparameter combination.

    Args:
        env: Environment with discrete ``observation_space.n`` and
            ``action_space.n``, ``reset()`` returning a state index and
            ``step(action)`` returning ``(next_state, reward, done, info)``.
        alpha: Learning rate used in the Q-value update.
        gamma: Discount factor applied to the best next-state value.
        epsilon: Probability of taking a random action instead of the
            greedy one.
        episodes: Number of training episodes to run.

    Returns:
        ``(q_table, alpha, gamma, epsilon)``: the learned table together
        with the hyperparameters it was trained with.
    """
    q_table = np.zeros([env.observation_space.n, env.action_space.n])

    for i in range(1, episodes + 1):
        state = env.reset()
        done = False

        while not done:
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()  # Explore action space
            else:
                action = np.argmax(q_table[state])  # Exploit learned values

            next_state, reward, done, info = env.step(action)

            old_value = q_table[state, action]
            next_max = np.max(q_table[next_state])

            # Blend the old estimate with the bootstrapped one-step target.
            q_table[state, action] = (
                (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
            )

            state = next_state

        # Report progress every 100 episodes, replacing the previous line.
        if i % 100 == 0:
            clear_output(wait=True)
            print(f"Episode: {i}")

    print("Training finished.\n")
    return q_table,alpha,gamma,epsilon

# Grid search Function

In [None]:
def grid_search(parm,env):
    """Try every hyperparameter combination and return the best one found.

    Each combination is trained with ``training_grid_search`` and scored
    with ``grid_search_evaluation``. A candidate replaces the current best
    only when both its average timesteps and its average penalties are no
    worse than the best seen so far.

    Args:
        parm: Dict with the keys 'alpha', 'gamma' and 'epsilon', each an
            iterable of candidate values.
        env: Environment passed through to training and evaluation.

    Returns:
        A dict with the keys 'alpha', 'gamma', 'epsilon', 'Time' and
        'penalties' for the selected combination, or ``None`` when the grid
        contains no combinations.
    """
    # Start unbounded so the first candidate is always accepted, whatever
    # its score; a fixed numeric cap could leave the result undefined.
    best_time = float("inf")
    best_penalties = float("inf")
    chosen_parameters = None

    for i in parm['alpha']:
        for j in parm['gamma']:
            for k in parm['epsilon']:
                q_table,alpha,gamma,epsilon = training_grid_search(env,alpha=i,gamma=j,epsilon=k)
                average_timesteps,average_penalties = grid_search_evaluation(q_table,env)
                if average_timesteps <= best_time and average_penalties <= best_penalties:
                    best_time = average_timesteps
                    best_penalties = average_penalties
                    chosen_parameters = {
                        'alpha': alpha,
                        'gamma': gamma,
                        'epsilon': epsilon,
                        'Time': average_timesteps,
                        'penalties': average_penalties,
                    }
    return chosen_parameters

In [None]:
# Candidate values for each hyperparameter: 3 x 3 x 3 = 27 combinations,
# each trained and evaluated on the Taxi environment.
paramter = {'alpha':[0.1,0.2,0.3],'gamma':[0.1,0.2,0.3],'epsilon':[0.1,0.2,0.3]}
grid_search(paramter,env)

Episode: 100000
Training finished.

Results after 100 episodes:
Average timesteps per episode: 12.73
Average penalties per episode: 0.0


{'Time': 12.66, 'alpha': 0.2, 'epsilon': 0.3, 'gamma': 0.1, 'penalties': 0.0}