## DQN Agent for Moutain Car Game Environment

In [1]:
import gym
import pygame
import time
from PIL import Image

In [7]:
!pip3 install 'gym[classic_control]'

You should consider upgrading via the '/Users/sushantkhattar/Desktop/PythonDev/ML/rl_maountain_car_v0/bin/python3.9 -m pip install --upgrade pip' command.[0m


In [5]:
!pip3 install pygame --upgrade

Collecting pygame
  Using cached pygame-2.5.2-cp39-cp39-macosx_11_0_arm64.whl (12.2 MB)
Installing collected packages: pygame
  Attempting uninstall: pygame
    Found existing installation: pygame 2.1.0
    Uninstalling pygame-2.1.0:
      Successfully uninstalled pygame-2.1.0
Successfully installed pygame-2.5.2


## Hard-Coded Strategy(Random)

In [7]:
# initialize pygame
pygame.init()

# set the desired frame rate
desired_fps = 250

# # pygame clock object
# clock = pygame.time.Clock()

# Create the Gym environment with 'human' rendering mode
env = gym.make('MountainCar-v0',render_mode='human')
env.metadata['render_fps'] = desired_fps
episode_gif = False

# 10 game episodes with 200 steps for each episode
for e in range(10):
    observation = env.reset()
    action=2
    max_vel=env.observation_space.high[1]
    min_vel=env.observation_space.low[1]
    initial_pos=observation[0]
    if(e%4==0):
        # Capture the GIF of this episode
        episode_gif = True
        # List to store frames for GIF creation
        frames = []
        
    for t in range(200):
        env.render()
        pygame.display.update()
        
        if(episode_gif and t%2==0):
            # Capture the screen frame
            screenshot = pygame.surfarray.array3d(pygame.display.get_surface())
            frames.append(Image.fromarray(screenshot))
        
        observation,reward,done,truncated, other_info = env.step(action)
        #print("velocity: ",observation[1])
        if(action==2):
            if(observation[1]>0.0):
                #print("Continue Forward running")
                action=2
            else:
                #print("Start Backward running")
                action=0
        
        elif(action==0):
            if(observation[1]<0.0):
                #print("Continue Backward running")
                action=0
            else:
                #print("Start Forward running")
                action=2
        if done: #Game Over
            print("Game Episode :{}/{}, High Score:{},Exploration Rate:{:.2}".format(e,10,199-t,1.0))
            break
    if(episode_gif):
        # Save frames as a GIF for each episode
        gif_filename = f'episode_{e+1}.gif'
        frames[0].save(gif_filename, save_all=True, append_images=frames[1:], duration= 1000 // desired_fps)
        episode_gif = False
env.reset()
env.close()
print("Game Over!")

## RL Based Strategy

#### Designing AI agent

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import random


In [3]:
class Agent:
    def __init__(self,state_size,action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95 #Discount Factor
        self.epsilon = 1.0 # Exploration Rate: How much to act randomly, 
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.learning_rate = 0.001 
        self.model = self._create_model()
        
    
    def _create_model(self):
        #Neural Network To Approximate Q-Value function
        model = Sequential()
        model.add(Dense(24,input_dim=self.state_size,activation='relu')) #1st Hidden Layer
        model.add(Dense(24,activation='relu')) #2nd Hidden Layer
        model.add(Dense(self.action_size,activation='linear'))
        model.compile(loss='mse',optimizer=Adam(lr=self.learning_rate))
        return model
    
    def remember(self,state,action,reward,next_state,done):
        self.memory.append((state,action,reward,next_state,done)) #remembering previous experiences
        
    def act(self,state):
        # Exploration vs Exploitation
        if np.random.rand()<=self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state) # predict reward value based upon current state
        return np.argmax(act_values[0]) #Left or Right
    
    def train(self,batch_size=32): #method that trains NN with experiences sampled from memory
        minibatch = random.sample(self.memory,batch_size)
        for state,action,reward,next_state,done in minibatch:
            
            if not done: #boolean 
                target = reward + self.gamma*np.amax(self.model.predict(next_state)[0])
            else:
                target = reward
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state,target_f,epochs=1,verbose=0) #single epoch, x =state, y = target_f, loss--> target_f - 
            
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
    
    def load(self,name):
        self.model.load_weights(name)
    def save(self,name):
        self.model.save_weights(name)

#### Training the DQN Agent (Deep Q-Learner)

In [15]:
n_episodes = 1000
output_dir = "mountain_car_self/"

In [5]:
agent = Agent(state_size=2,action_size=3)
done = False
state_size = 2
action_size = 3
batch_size = 32

Metal device set to: Apple M1


2023-09-23 16:23:54.495982: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-09-23 16:23:54.496781: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
  super(Adam, self).__init__(name, **kwargs)


In [16]:
agent = Agent(state_size, action_size) # initialise agent
done = False
for e in range(n_episodes):
    state,info = env.reset()
    init_pos=state[0]
    state = np.reshape(state,[1,state_size])
    for time in range(200):
        env.render()
        action = agent.act(state) #action is 0 or 1
        prev_vel=state[0][1]
        
        next_state,reward,done,truncated,other_info = env.step(action) 
        new_vel=next_state[1]
        
        if(np.abs(new_vel)>np.abs(prev_vel)):
            n=1
        else:
            n=-1
        if done:
            if(time<199):
                reward=40
            else:
                reward=-10
        else:
            reward = n
        
        next_state = np.reshape(next_state,[1,state_size])
        agent.remember(state,action,reward,next_state,done)
        state = next_state
        
        if done:
            print("Game Episode :{}/{}, High Score:{},Exploration Rate:{:.2}".format(e,n_episodes,199-time,agent.epsilon))
            break
            
    if len(agent.memory)>batch_size:
        agent.train(batch_size)
    
    if e%50==0:
        agent.save(output_dir+"weights_"+'{:04d}'.format(e)+".hdf5")
        
env.close()                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

2023-09-23 16:39:06.374132: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2023-09-23 16:39:06.512330: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Game Episode :189/1000, High Score:33,Exploration Rate:0.39
Game Episode :193/1000, High Score:34,Exploration Rate:0.38
Game Episode :200/1000, High Score:1,Exploration Rate:0.37
Game Episode :250/1000, High Score:0,Exploration Rate:0.29


KeyboardInterrupt: 

In [None]:
print(agent)