In [18]:
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.models import Sequential, load_model
import tensorflow as tf
import numpy as np
import gym
import time
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook, tqdm

In [19]:
class ReplayBuffer(object):
    def __init__(self, max_size, input_shape, n_actions, discrete=False):
        self.mem_size = max_size
        self.input_shape = input_shape
        self.discrete = discrete
        self.state_memory = np.zeros((self.mem_size, input_shape))
        self.new_state_memory = np.zeros((self.mem_size, input_shape))
        dtype = np.int8 if self.discrete else np.float32
        self.action_memory = np.zeros((self.mem_size, n_actions), dtype=dtype)
        self.reward_memory = np.zeros((self.mem_size))
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.mem_cntr = 0
        
    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.terminal_memory[index] = 1-int(done)
        if self.discrete:
            actions = np.zeros(self.action_memory.shape[1])
            actions[action] = 1.0
            self.action_memory[index] = actions
        else:
            self.action_memory[index] = action
        self.mem_cntr += 1
        
    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size)
        
        states = self.state_memory[batch]
        states_ = self.new_state_memory[batch]
        rewards = self.reward_memory[batch]
        actions = self.action_memory[batch]
        terminal = self.terminal_memory[batch]
        
        return states, actions, rewards, states_, terminal
    
def build_dqn(lr, n_actions, input_dims, fc1_dims, fc2_dims):
    model = Sequential([
        Dense(fc1_dims, input_shape=(input_dims, )),
        Activation('relu'),
        Dense(fc2_dims), 
        Activation('relu'),
        Dense(n_actions)
    ])
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=lr), loss='mse')
    
    return model

class Agent(object):
    def __init__(self, alpha, gamma, n_actions, epsilon, batch_size, input_dims, epsilon_dec=0.996, epsilon_end=0.01, mem_size=100_000, fname='dqn_model.h5'):
        self.action_space = [i for i in range(n_actions)]
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_dec = epsilon_dec
        self.epsilon_min = epsilon_end
        self.batch_size = batch_size
        self.model_file = fname
        
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions, discrete=True)
        
        self.q_eval = build_dqn(alpha, n_actions, input_dims, 256, 256)
        
    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)
        
    def choose_action(self, state):
        state = state[np.newaxis, :]
        rand = np.random.random()
        if rand < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            actions = self.q_eval.predict(state)
            action = np.argmax(actions)
            
        return action
    
    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)
        action_values = np.array(self.action_space, dtype=np.int8)
        action_indices = np.dot(action, action_values)
        
        q_eval = self.q_eval.predict(state)
        q_next = self.q_eval.predict(new_state)
        
        q_target = q_eval.copy()
        
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        
        q_target[batch_index, action_indices] = reward + self.gamma*np.max(q_next, axis=1)*done
        
        _ = self.q_eval.fit(state, q_target, verbose=0)
        
        self.epsilon = self.epsilon*self.epsilon_dec if self.epsilon > self.epsilon_min else self.epsilon_min
        
    
    def save_model(self):
        self.q_eval.save(self.model_file)
        
    def load_model(self):
        self.q_eval = load_model(self.model_file)

In [20]:
MODEL_NAME = 'lunar_256'
GAMMA = 0.99
EPSILON_START = 1.0
EPSILON_END = 0.01
ALPHA = 0.0005
INPUT_DIMS = 8
N_ACTIONS = 4
REPLAY_BUFFER_SIZE = 100_000
BATCH_SIZE = 64
SHOW = False
EPISODES = 500

In [21]:
if __name__ == '__main__':
    env = gym.make('LunarLander-v2')
    n_games = EPISODES
    agent = Agent(gamma=GAMMA, epsilon=EPSILON_START, alpha=ALPHA, input_dims=INPUT_DIMS, n_actions=N_ACTIONS, mem_size=REPLAY_BUFFER_SIZE, batch_size=BATCH_SIZE, epsilon_end=EPSILON_END)
    
    scores = []
    eps_history = []
    
    for i in tqdm_notebook(range(n_games)):
        done = False
        score = 0
        observation = env.reset()
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            
            if SHOW and not i % 5:
                env.render()
            
            agent.remember(observation, action, reward, observation_, done)
            observation = observation_
            agent.learn()
        
        eps_history.append(agent.epsilon)
        scores.append(score)
        
        avg_score = np.mean(scores[max(0, i-100):(i+1)])

        print('episode ', i, 'score %.2f' % score, 'average score %.2f' % avg_score)
        if i % 10 == 0 and i > 0:
            agent.save_model()
            
        filename = 'lunar_lander'
        x = [i+1 for i in range(n_games)]
    
    env.close()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm_notebook(range(n_games)):


  0%|          | 0/500 [00:00<?, ?it/s]

2022-06-24 12:51:04.829379: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2022-06-24 12:51:04.937221: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


episode  0 score -121.90 average score -121.90


2022-06-24 12:51:06.183613: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


episode  1 score -172.70 average score -147.30
episode  2 score -51.51 average score -115.37
episode  3 score -92.37 average score -109.62
episode  4 score -120.65 average score -111.83
episode  5 score -36.87 average score -99.33
episode  6 score -222.52 average score -116.93
episode  7 score -99.93 average score -114.81
episode  8 score -229.47 average score -127.55
episode  9 score -121.40 average score -126.93
episode  10 score -77.13 average score -122.40
episode  11 score -256.07 average score -133.54
episode  12 score -200.54 average score -138.70
episode  13 score -114.38 average score -136.96
episode  14 score -280.60 average score -146.54
episode  15 score -5.06 average score -137.69
episode  16 score -353.36 average score -150.38
episode  17 score -127.24 average score -149.10
episode  18 score -100.51 average score -146.54
episode  19 score -64.10 average score -142.42
episode  20 score -283.63 average score -149.14
episode  21 score -38.56 average score -144.11
episode  22

In [24]:
if __name__ == '__main__':    
    done = False
    score = 0
    env = gym.make('LunarLander-v2')
    observation = env.reset()
    smart_agent = Agent(gamma=GAMMA, epsilon=0, alpha=ALPHA, input_dims=INPUT_DIMS, n_actions=N_ACTIONS, mem_size=REPLAY_BUFFER_SIZE, batch_size=BATCH_SIZE)
    smart_agent.load_model()

    while not done:
        action = smart_agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        score += reward
        env.render()  
        smart_agent.remember(observation, action, reward, observation_, done)
        observation = observation_
    env.close()
    print(score)

2022-06-24 16:35:22.467013: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


235.74135801818903
