`Note - For detailed documentation wse SpaceInvaders`

## Import required Packages

In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
import random

## Creatinng the environment

In [2]:
env  = gym.make('FrozenLake-v0')

## Taking random Actions

In [4]:
episodes = 10 

for episode in range(1,episodes):
    
    state = env.reset() #Every time we iterate we reset state to its original poistion and restart our agent at the begining
    done = False # whether our agent has completed the level
    score = 0
    
    while not done:
        env.render() #for visualisation
        state,reward,done,info = env.step(env.action_space.sample()) ##the action that our agents will take within each frame
        #In each frame our agent will take action,now we are just going to take a random action
        #.sample() will do random action out of action_space(total 5 possible action)
        
        # state will be next state after taking this action
        
        score += reward#What current reward is within this while loop
        
        clear_output(wait=True)
        
        print('Episode{}\n score{}\n'.format(episode,score))
        print(reward,"reward")
        print(state,"state")
        print(done,"done")
        print(info,"info")
        
env.close()


#env.step()=  Run one timestep of the environment's dynamics. When end of
#episode is reached, you are responsible for calling `reset()`
#to reset this environment's state.
        

Episode9
 score0.0

0.0 reward
5 state
True done
{'prob': 0.3333333333333333} info


## Creating Q-table

In [5]:
actions = env.action_space.n
print("Total no. actions agent can take :",actions)

state = env.observation_space.n
print("Total number of state our environment has:",state)

q_table = np.zeros((state,actions))
print("Created array of zeros with shape :",q_table.shape)
print("Visualise q table:\n",q_table)

Total no. actions agent can take : 4
Total number of state our environment has: 16
Created array of zeros with shape : (16, 4)
Visualise q table:
 [[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


## Creating parameters for q-learnning algorithm

In [6]:
num_episodes = 10000 #No. of timmes we are going to re-iterate through our algorithm
max_steps_per_episode = 100

learning_rate = 0.1
discount_rate = 0.99 # High priority on current reward than future rewars

exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = 0.01 # 1% probability that we take a random action by the end of our decay of exploration rate

exploration_decay_rate = 0.001 # we want to decay our exploration_rate as we train our neural networks

#creating the list where we store all our rewards
rewards_all_episodes = []

## Q-learning Algorithm

In [7]:
for episode in range(num_episodes):
    state = env.reset()
    done = False
    rewards_current_episode = 0 # this is the current reward we have per episode
    
    
    for step in range(max_steps_per_episode):
        
        #Coding Exploration vs exploitation trade-off
        exploration_threshold = random.uniform(0,1)
        if exploration_threshold > exploration_rate: #intially  this value will be false
            action = np.argmax(q_table[state,:]) # Take the action which give maximum value depennding on state we are in
        else:
            action = env.action_space.sample() # take random actions
            
        
        #taking our action
        new_state,reward,done,info = env.step(action)
        
        
        #Updating q table or computing q-value
        
        
        #New Q[s,a] = Q[s,a] + α([R(s,a) + γ(maxQ'(s',a') - Q(s,a))])
        
        q_table[state,action] = q_table[state,action]*(1-learning_rate) + learning_rate*(reward + discount_rate*np.max(q_table[new_state,:]))
        
        #note - this equation has slight variation from the equation mentioned above
        
        state = new_state
        
        rewards_current_episode += reward
        
        if done == True:
            break
            
        
    exploration_rate = min_exploration_rate + \
                        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode) 
    # it decays the exploration rate
    #so that we start taking actions from our q-table rather than taking random actions
    
    rewards_all_episodes.append(rewards_current_episode)
    
print("*****Training over*******")
            
            
        
        
            
        
        

*****Training over*******


In [8]:
q_table

array([[0.54101808, 0.4827518 , 0.47506185, 0.48083297],
       [0.30505589, 0.29231558, 0.26341336, 0.46606886],
       [0.40565976, 0.39960215, 0.39219694, 0.42743327],
       [0.36137333, 0.34883911, 0.27471928, 0.41432135],
       [0.5731125 , 0.36956339, 0.42071479, 0.27391452],
       [0.        , 0.        , 0.        , 0.        ],
       [0.26649005, 0.16675625, 0.18851886, 0.14993957],
       [0.        , 0.        , 0.        , 0.        ],
       [0.36404505, 0.31179086, 0.31528235, 0.62047771],
       [0.38409006, 0.65669199, 0.49898953, 0.40203865],
       [0.57984979, 0.38307678, 0.29619692, 0.29224732],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.40749339, 0.62964478, 0.71458455, 0.49577822],
       [0.68631928, 0.81062093, 0.68718409, 0.64358696],
       [0.        , 0.        , 0.        , 0.        ]])

## Calculate and prinnt average reward per thousand episodes

In [9]:
rewards_per_thousand_episodes = np.split(np.array(rewards_all_episodes),num_episodes/1000)
count = 1000

print("Average per thousand Episodes")

for r in rewards_per_thousand_episodes:
    print(count, " : ", str(sum(r/1000)))
    count += 1000

Average per thousand Episodes
1000  :  0.05200000000000004
2000  :  0.20500000000000015
3000  :  0.4040000000000003
4000  :  0.5620000000000004
5000  :  0.5990000000000004
6000  :  0.6900000000000005
7000  :  0.7050000000000005
8000  :  0.6650000000000005
9000  :  0.6910000000000005
10000  :  0.6530000000000005


In [11]:
## Visulaising the agent

import time

for episode in range(3):
    state = env.reset()
    done = False
    print("Episode is:" + str(episode))
    time.sleep(1)
    
    for step in range(max_steps_per_episode):
        clear_output(wait=True)
        
        env.render() #For visualisitaion
        
        time.sleep(0.4)
        
        action = np.argmax(q_table[state,:]) # maximum value in q_table of a given state
        
        new_state,reward,done,info= env.step(action)
        
        if done:
            clear_output(wait=True)
            env.render()
            if reward == 1:
                print("***Goal Reached*****")
                time.sleep(2)
                clear_output(wait=True)
            else:
                print("***Failed :( *******")
                clear_output(wait=True)
                time.sleep(2)
                
            break
                
        state = new_state
        
env.close()
        

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
***Goal Reached*****


In [None]:
! git add FrozenLake.ipynb
! git commit -m "14:15/23-05-2021"
! git push origin main