`Note - For detailed documentation wse SpaceInvaders`

## Import required Packages

In [18]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
import random

## Creatinng the environment

In [2]:
env  = gym.make('Taxi-v3')

## Taking random Actions

In [3]:
episodes = 10 

for episode in range(1,episodes):
    
    state = env.reset() #Every time we iterate we reset state to its original poistion and restart our agent at the begining
    done = False # whether our agent has completed the level
    score = 0
    
    while not done:
        env.render() #for visualisation
        state,reward,done,info = env.step(env.action_space.sample()) ##the action that our agents will take within each frame
        #In each frame our agent will take action,now we are just going to take a random action
        #.sample() will do random action out of action_space(total 6 possible action)
        
        # state will be next state after taking this action
        
        score += reward#What current reward is within this while loop
        
        clear_output(wait=True)
        
        print('Episode{}\n score{}\n'.format(episode,score))
        print(reward,"reward")
        print(state,"state")
        print(done,"done")
        print(info,"info")
        
env.close()


#env.step()=  Run one timestep of the environment's dynamics. When end of
#episode is reached, you are responsible for calling `reset()`
#to reset this environment's state.
        

Episode9
 score-794

-1 reward
339 state
True done
{'prob': 1.0, 'TimeLimit.truncated': True} info


## Creating Q-table

In [4]:
actions = env.action_space.n
print("Total no. actions agent can take :",actions)

state = env.observation_space.n
print("Total number of state our environment has:",state)

q_table = np.zeros((state,actions))
print("Created array of zeros with shape :",q_table.shape)
print("Visualise q table:\n",q_table)

Total no. actions agent can take : 6
Total number of state our environment has: 500
Created array of zeros with shape : (500, 6)
Visualise q table:
 [[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


## Creating parameters for q-learnning algorithm

In [5]:
num_episodes = 10000 #No. of timmes we are going to re-iterate through our algorithm
max_steps_per_episode = 100

learning_rate = 0.1
discount_rate = 0.99 # High priority on current reward than future rewars

exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = 0.01 # 1% probability that we take a random action by the end of our decay of exploration rate

exploration_decay_rate = 0.001 # we want to decay our exploration_rate as we train our neural networks

#creating the list where we store all our rewards
rewards_all_episodes = []

## Q-learning Algorithm

In [6]:
for episode in range(num_episodes):
    state = env.reset()
    done = False
    rewards_current_episode = 0 # this is the current reward we have per episode
    
    
    for step in range(max_steps_per_episode):
        
        #Coding Exploration vs exploitation trade-off
        exploration_threshold = random.uniform(0,1)
        if exploration_threshold > exploration_rate: #intially  this value will be false
            action = np.argmax(q_table[state,:]) # Take the action which give maximum value depennding on state we are in
        else:
            action = env.action_space.sample() # take random actions
            
        
        #taking our action
        new_state,reward,done,info = env.step(action)
        
        
        #Updating q table or computing q-value
        
        
        #New Q[s,a] = Q[s,a] + α([R(s,a) + γ(maxQ'(s',a') - Q(s,a))])
        
        q_table[state,action] = q_table[state,action]*(1-learning_rate) + learning_rate*(reward + discount_rate*np.max(q_table[new_state,:]))
        
        #note - this equation has slight variation from the equation mentioned above
        
        state = new_state
        
        rewards_current_episode += reward
        
        if done == True:
            break
            
        
    exploration_rate = min_exploration_rate + \
                        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode) 
    # it decays the exploration rate
    #so that we start taking actions from our q-table rather than taking random actions
    
    rewards_all_episodes.append(rewards_current_episode)
    
print("*****Training over*******")
            
            
        
        
            
        
        

*****Training over*******


In [7]:
q_table

array([[  0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ],
       [ -2.36613996,   1.44087409,  -2.17755678,  -0.8980715 ,
          9.6220697 ,  -8.58988931],
       [  4.77225736,   3.69182565,   2.05328962,   6.24382132,
         14.11880599,  -2.13533785],
       ...,
       [ -1.23789808,   2.51208828,  -1.27013433,  -1.33135003,
         -8.18309942,  -7.37379978],
       [ -2.75595891,   1.77124471,  -2.61926953,  -2.72934177,
        -10.09029138,  -9.05297012],
       [  0.2212298 ,   0.59087738,  -0.41941   ,  17.3539126 ,
         -2.72881   ,  -3.73944683]])

## Calculate and prinnt average reward per thousand episodes

In [8]:
rewards_per_thousand_episodes = np.split(np.array(rewards_all_episodes),num_episodes/1000)
count = 1000

print("Average per thousand Episodes")

for r in rewards_per_thousand_episodes:
    print(count, " : ", str(sum(r/1000)))
    count += 1000

Average per thousand Episodes
1000  :  -255.34100000000015
2000  :  -39.164000000000016
3000  :  2.1339999999999937
4000  :  5.631999999999976
5000  :  6.8089999999999735
6000  :  7.254999999999961
7000  :  7.241999999999964
8000  :  7.298999999999962
9000  :  7.3119999999999665
10000  :  7.515999999999963


In [22]:
## Visulaising the agent

import time

for episode in range(3):
    state = env.reset()
    done = False
    print("Episode is:" + str(episode))
    time.sleep(1)
    
    for step in range(max_steps_per_episode):
        clear_output(wait=True)
        
        env.render() #For visualisitaion
        
        time.sleep(0.4)
        
        action = np.argmax(q_table[state,:]) # maximum value in q_table of a given state
        
        new_state,reward,done,info= env.step(action)
        
        if done:
            clear_output(wait=True)
            env.render()
            if reward == 1:
                print("***Goal Reached*****")
                time.sleep(2)
                clear_output(wait=True)
            else:
                print("***Failed :( *******")
                clear_output(wait=True)
                time.sleep(2)
                
            break
                
        state = new_state
        
env.close()
        

+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
***Failed :( *******


In [None]:
! git add TaxilRL.ipynb
! git commit -m "14:18/23-05-2021"
! git push origin main