## Importing the packages

In [3]:
import tensorflow as tf
import random
import gym
import numpy as np
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Activation,Flatten,Conv2D,MaxPooling2D
from tensorflow.keras.optimizers import Adam

import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY #Agent will move only right
# from nes_py.wrappers import joypad_space
from nes_py.wrappers import JoypadSpace

from IPython.display import clear_output

from keras.models import save_model
from keras.models import load_model

import time


## Creating the Environment

In [4]:
# Build the Super Mario Bros environment and wrap it so that only the
# RIGHT_ONLY action set is exposed to the agent.
env = JoypadSpace(gym_super_mario_bros.make('SuperMarioBros-v0'), RIGHT_ONLY)

## Taking random actions in our environment

In [5]:
# total_reward = 0
# done = True

# for step in range(100000):
#     env.render()
#     if done:
#         state = env.reset()
#     state,reward,done,info = env.step(env.action_space.sample())
# #     preprocess_state(state)
#     print(info)
#     print(state)
# #     break
    
#     total_reward += reward
#     clear_output(wait=True)
    
# env.close()
    

In [6]:
#Shows Preprocess_state image
# state = env.reset()
# state = preprocess_state(state)
# print(f"array of preprocessed image is\n {state}")
# print(f"state  shape is {state.shape}")

## Building class(Brain) for Mario Agent

In [7]:
class DQNAgent:
    """Deep Q-Network agent with a main (online) network and a target network.

    The main network chooses actions and is the one being fitted; the target
    network supplies the bootstrap value for the next state and only changes
    when `update_target_network` copies the main network's weights into it.

    Parameters
    ----------
    state_size : tuple
        Input shape handed to the first convolution layer (without batch axis).
    action_size : int
        Number of discrete actions, i.e. the width of the output layer.
    """

    # `act` only makes a fresh decision while the y position it is given is
    # below this value; at or above it the previous action is repeated.
    GROUND_Y_THRESHOLD = 83

    def __init__(self, state_size, action_size): #state_size is input layer and action_size is output_layer
        #Create variables for our agent
        self.state_space = state_size
        self.action_space = action_size
        self.memory = deque(maxlen=5000)  # replay buffer; oldest transitions are dropped first
        self.gamma = 0.8 #discount factor - prioritises immediate reward over long-term reward
        self.chosenAction = 0  # last action picked by act(); repeated while airborne


        #Creating exploration variables (see update_epsilon for the schedule)
        self.epsilon = 1
        self.max_epsilon = 1
        self.min_epsilon = 0.01
        self.decay_epsilon = 0.0001


        #Building Neural Networks for agent
        self.main_network = self.build_network()
        self.target_network = self.build_network()
        self.update_target_network() # copy the weights of the main network into the target network


    def build_network(self):
        """Build and compile the convolutional Q-network.

        Three ReLU convolution layers are followed by two dense ReLU layers
        and a linear output with one unit per action. The model is compiled
        with MSE loss and a default-configured Adam optimizer.

        Returns
        -------
        The compiled Sequential model.
        """
        model = Sequential()
        model.add(Conv2D(64,(4,4),strides=4,padding = 'same', input_shape=self.state_space))
        model.add(Activation('relu'))

        model.add(Conv2D(64,(4,4),strides = 2,padding='same'))
        model.add(Activation('relu'))

        model.add(Conv2D(64,(3,3),strides = 1, padding = 'same'))
        model.add(Activation('relu'))
        model.add(Flatten())

        model.add(Dense(512,activation='relu'))
        model.add(Dense(256,activation='relu'))
        model.add(Dense(self.action_space,activation='linear')) #one Q-value per possible action

        model.compile(loss='mse',optimizer = Adam())

        return model


    def update_target_network(self):
        """Copy the main network's weights into the target network."""
        self.target_network.set_weights(self.main_network.get_weights())


    def act(self,state,onGround):
        """Pick an action with an epsilon-greedy policy, but only while on the ground.

        Parameters
        ----------
        state :
            Batched observation passed to the main network's `predict`.
        onGround :
            Current y position. Values below `GROUND_Y_THRESHOLD` count as
            "on the ground".

        Returns
        -------
        The chosen action index. When `onGround` is at or above the threshold
        no new decision is made and the previously chosen action is returned.
        """
        if onGround < self.GROUND_Y_THRESHOLD:
            print("on Ground")
            # Explore: with probability epsilon take a uniformly random action.
            if random.uniform(0,1) < self.epsilon:
                self.chosenAction = np.random.randint(self.action_space)
                return self.chosenAction

            # Exploit: take the action with the highest predicted Q(s, a).
            # verbose=0 keeps Keras from printing a progress bar on every call.
            Q_value = self.main_network.predict(state, verbose=0)
            print(Q_value)
            self.chosenAction = np.argmax(Q_value[0])
            return self.chosenAction
        else :
            # Airborne: keep repeating the action computed by the last decision.
            print("We are not on ground")
            return self.chosenAction


    def update_epsilon(self,episode):
        """Decay epsilon exponentially from max_epsilon towards min_epsilon.

        epsilon = min + (max - min) * exp(-decay_epsilon * episode)
        """
        self.epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon) * np.exp(-self.decay_epsilon * episode)


    def train(self,batch_size):
        """Fit the main network on a random minibatch drawn from replay memory.

        Each sampled transition is fitted individually for one epoch. The
        target for the taken action is the reward for terminal transitions,
        otherwise reward + gamma * max_a' Q_target(next_state, a').

        Does nothing when the memory holds fewer than `batch_size`
        transitions, because a minibatch of that size cannot be sampled yet.
        """
        if len(self.memory) < batch_size:
            return

        #Taking a minibatch from memory
        minibatch = random.sample(self.memory,batch_size)

        for state,action,reward,next_state,done in minibatch:
            # Start from the current predictions so that only the entry of the
            # action actually taken contributes to the loss.
            target = self.main_network.predict(state, verbose=0)

            if done:
                target[0][action] = reward
            else:
                # Bootstrapping from the target network (whose weights change
                # only in update_target_network) keeps the goal from shifting
                # on every fit and damps oscillation.
                target[0][action] = (reward + self.gamma*np.amax(self.target_network.predict(next_state, verbose=0)))

            self.main_network.fit(state,target,epochs=1,verbose = 0)


    def store_transition(self,state,action,reward,next_state,done):
        """Append one (state, action, reward, next_state, done) tuple to replay memory."""
        self.memory.append((state,action,reward,next_state,done))

    def get_pred_act(self,state):
        """Return the greedy action for `state` (no epsilon-greedy exploration)."""
        Q_values = self.main_network.predict(state, verbose=0)
        return np.argmax(Q_values[0])


    def load(self,name):
        """Replace both networks with the model stored at path `name`.

        Loaded through tf.keras, the same Keras implementation the networks
        are built with.
        """
        self.main_network = tf.keras.models.load_model(name)
        self.target_network = tf.keras.models.load_model(name)

    def save(self,name):
        """Save the main network to path `name`."""
        self.main_network.save(name)



                    
            
            
        
        

### `We create a target network to avoid oscillation, which makes training more stable and improves accuracy.`

`To be explicit, the role of the model (self.main_network) is to do the actual predictions on what action to take, and the target model (self.target_network) tracks what action we want our model to take.`

`Why not just have a single model that does both? After all, if something is predicting the action to take, shouldn’t it implicitly determine what action we want our model to take? This is actually one of those “weird tricks” in deep learning that DeepMind developed to get convergence in the DQN algorithm. If you use a single model, it can (and often does) converge in simple environments (such as the CartPole). But, the reason it doesn’t converge in these more complex environments is because of how we’re training the model: because we’re training it “on the fly.”`

`As a result, we are doing training at each time step and, if we used a single network, would also be essentially changing the “goal” at each time step. Think of how confusing that would be! That would be like if a teacher told you to go finish pg. 6 in your textbook and, by the time you finished half of it, she changed it to pg. 9, and by the time you finished half of that, she told you to do pg. 21! This, therefore, causes a lack of convergence by a lack of clear direction in which to employ the optimizer, i.e. the gradients are changing too rapidly for stable convergence. So, to compensate, we have a network that changes more slowly that tracks our eventual goal and one that is trying to achieve those.`

In [8]:

# Shape of one preprocessed frame as fed to the networks (no batch axis).
state_space = (80, 88, 1)
# Number of discrete actions exposed by the wrapped environment.
action_space = env.action_space.n

print("env.observation_space", env.observation_space)

## Grayscaling and preprocessing our image to make it computationally inexpensive

from PIL import  Image

def preprocess_state(state):
    """Shrink a raw frame and convert it to grayscale.

    The frame is turned into a PIL image, resized with `resize((88, 80))`,
    converted to mode 'L' (8-bit grayscale) and returned as a NumPy array.
    """
    frame = Image.fromarray(state).resize((88,80)).convert('L')
    return np.array(frame)



# We need to shrink each frame from the env.observation_space shape (240, 256, 3)
# down to (80, 88, 1) so that training is not computationally expensive.



env.observation_space Box(0, 255, (240, 256, 3), uint8)


## Creating the training loop

In [9]:
# Training hyperparameters.
num_episodes = 1_000_000   # number of episodes the training loop runs
num_timesteps = 400_000    # upper bound on the number of steps within a single episode
batch_size = 64            # minibatch size passed to dqn.train
DEBUG_LENGTH = 300         # length of the x-position window used to detect being stuck

# Creating our deep Q-network

In [10]:
# Instantiate the agent; this builds both the main and the target network.
dqn = DQNAgent(state_size=state_space, action_size=action_space)


In [None]:
print('Starting Training')

# Rolling window of Mario's most recent x positions, used to detect that he is stuck.
stuck_buffer = deque(maxlen=DEBUG_LENGTH)

for i in range(num_episodes):
    Return = 0
    done = False
    time_step = 0
    onGround = 79 #this is set to 79 because when the y_pos is 79 we are on the ground
    # we are only on ground at first

    # Forget x positions from the previous episode so they cannot influence
    # stuck detection in this one.
    stuck_buffer.clear()

    state = preprocess_state(env.reset())
    state = state.reshape(-1,80,88,1)  # add batch and channel axes for the network

    for t in range(num_timesteps):
        env.render()
        time_step += 1

        if t>1 and stuck_buffer.count(stuck_buffer[-1]) > DEBUG_LENGTH - 50:
            # More than DEBUG_LENGTH - 50 of the buffered x positions equal the
            # latest one, so Mario has barely moved: pass the on-ground value
            # to force the agent to make a fresh decision instead of repeating
            # its previous action.
            action = dqn.act(state,onGround = 79)

        else:

            action = dqn.act(state,onGround)

        print(f"Action is {str(action)}")

        next_state,reward,done,info = env.step(action)

        print(f"info of  is y_pos {info['y_pos']}")
        onGround = info['y_pos']
        stuck_buffer.append(info['x_pos'])

        next_state = preprocess_state(next_state)
        next_state = next_state.reshape(-1,80,88,1)

        dqn.store_transition(state,action,reward,next_state,done) #Store the transition
        state = next_state #Set the state to next_state

        Return += reward
        print(f"Episode is :{str(i)}\n,Total Time Step:{str(time_step)}\n Current Reward:{str(Return)}\n Epsilon is :{str(dqn.epsilon)}")

        clear_output(wait=True)

        if done:
            break

        #If we have more data than batch_size then we can start training on it
        if len(dqn.memory) > batch_size and i > 5: # also we'll start training when i is greater than 5
            dqn.train(batch_size)


    # End of episode: decay exploration, sync the target network and checkpoint.
    dqn.update_epsilon(i)
    clear_output(wait=True)
    dqn.update_target_network()

    #Save model
    dqn.save('MarioRL.h5')

env.close()
    
           
           
        

We are not on ground
Action is 0
info of  is y_pos 103
Episode is :1
,Total Time Step:9
 Current Reward:1
 Epsilon is :1.0


` We can check whether we are on the ground by checking this y position`

In [None]:
# dqn is the DQNAgent instance; this writes its main network to disk.
dqn.save(name='MarioRL.h5')

In [12]:
# Restore both of the agent's networks from the saved model file.
dqn.load(name='MarioRL.h5')

# Visualising the Model

In [None]:
# Replay the greedy policy episode after episode until the kernel is
# interrupted. The try/finally makes sure the environment is closed when the
# loop is stopped; a statement placed after a bare `while True:` never runs.
try:
    while True:
        done = False
        state = preprocess_state(env.reset())
        state = state.reshape(-1,80,88,1)
        total_reward = 0

        while not done:
            env.render()
            action = dqn.get_pred_act(state)
            next_state,reward,done,info = env.step(action)
            total_reward += reward

            next_state = preprocess_state(next_state)
            next_state = next_state.reshape(-1,80,88,1)
            state = next_state

        print(f"Episode finished, total reward: {total_reward}")
except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to stop the replay.
    print("Visualisation stopped")
finally:
    env.close()
        
        

In [None]:
# Shell commands: stage the notebook, commit it with a timestamp message
# and push the commit to the `main` branch of the `origin` remote.
! git add MarioRL.ipynb
! git commit -m "22:57/25-05-2021"
! git push origin main