In [3]:
import gym
import torch 
import collections
import os
import numpy as np
from utils import *
from exp_replay_memory import ReplayMemory




In [4]:
def _mask_observation(state):
    """Return the observation the agent is given for ``state``.

    If the element at ``state[0][1]`` lies in the closed band [0.9, 1.1] the
    state is replaced by an all-zero tensor of the same shape (all zeros
    represent "no signal"); otherwise ``state`` is returned unchanged.
    """
    # NOTE(review): which physical quantity lives at index [0][1] depends on
    # the layout produced by lmn_input -- confirm against utils.
    if 0.9 <= state[0][1] <= 1.1:
        return torch.zeros(state.shape)
    return state


def dqn_lander(env, n_episodes, gamma, lr, min_eps,
               batch_size=32, memory_capacity=50000,
               network='linear', learning_starts=1000,
               train_freq=1, target_network_update_freq=1000,
               print_freq=500, render_freq=500, save_freq=1000):
    """Train a DQN agent on ``env`` and return the per-episode returns.

    Args:
        env: environment exposing ``action_space.n``, ``reset()`` and
            ``step(action)`` returning ``(state, reward, done, info)``.
        n_episodes: number of episodes to run.
        gamma: forwarded to ``fit``.
        lr: forwarded to ``build_qnetwork``.
        min_eps: forwarded to ``decay_epsilon`` after every episode.
        batch_size: size of the minibatch sampled from the replay memory.
        memory_capacity: capacity handed to ``ReplayMemory``.
        network: network kind forwarded to ``build_qnetwork``.
        learning_starts: parameter updates begin once the global step
            counter exceeds this value.
        train_freq: the q-network is fitted every ``train_freq`` steps.
        target_network_update_freq: the target network is refreshed every
            this many steps.
        print_freq: episode statistics are printed every ``print_freq``
            episodes.
        render_freq: accepted for interface compatibility; rendering is
            disabled in this function, so the value is not used.
        save_freq: every ``save_freq`` episodes the model is saved if the
            mean return of the last 100 episodes improved.

    Returns:
        list of float: one return per finished episode, followed by a
        trailing ``0.0`` placeholder for the episode that never started
        (``n_episodes + 1`` entries in total).
    """
    # set device to run on
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    loss_function = torch.nn.MSELoss()

    # path to save checkpoints; exist_ok avoids the isdir/mkdir race
    PATH = "./models"
    os.makedirs(PATH, exist_ok=True)

    num_actions = env.action_space.n

    # Hard-coded instead of env.observation_space.shape[-1].
    # NOTE(review): presumably the feature width produced by lmn_input -- confirm.
    input_shape = 9

    qnet, qnet_optim = build_qnetwork(num_actions, lr, input_shape, network, device)
    qtarget_net, _ = build_qnetwork(num_actions, lr, input_shape, network, device)
    qtarget_net.load_state_dict(qnet.state_dict())
    qnet.train()
    qtarget_net.eval()
    replay_memory = ReplayMemory(memory_capacity)

    epsilon = 1.0
    return_per_ep = [0.0]
    saved_mean_reward = None
    t = 0  # global step counter across all episodes

    for i in range(n_episodes):
        episode = i + 1
        curr_observation = _mask_observation(lmn_input(env.reset()))

        while True:
            # choose action A using behaviour policy -> epsilon-greedy; use q-network
            action = epsilon_greedy(qnet, curr_observation.to(device), epsilon, num_actions)

            # take action A, earn immediate reward R and land into next state S'
            next_state, reward, done, _ = env.step(action)
            next_observation = _mask_observation(lmn_input(next_state))

            # store transition (S, A, R, S', Done) in replay memory
            replay_memory.store(curr_observation, action, float(reward), next_observation, float(done))

            # once more than 'learning_starts' steps have elapsed, sample a
            # random mini-batch and update the q-network's parameters
            if t > learning_starts and t % train_freq == 0:
                curr_observations, actions, rewards, next_observations, dones = \
                    replay_memory.sample_minibatch(batch_size)
                fit(qnet,
                    qnet_optim,
                    qtarget_net,
                    loss_function,
                    curr_observations,
                    actions,
                    rewards,
                    next_observations,
                    dones,
                    gamma,
                    num_actions,
                    device)

            # periodically update q-target network's parameters
            if t > learning_starts and t % target_network_update_freq == 0:
                update_target_network(qnet, qtarget_net)

            t += 1
            return_per_ep[-1] += reward

            if done:
                if episode % print_freq == 0:
                    print("\nEpisode: {}".format(episode))
                    print("Episode return : {}".format(return_per_ep[-1]))
                    print("Total time-steps: {}".format(t))

                report_mean = episode % 100 == 0
                save_due = t > learning_starts and episode % save_freq == 0

                if report_mean or save_due:
                    # The placeholder for the next episode has not been
                    # appended yet, so the last 100 entries are exactly the
                    # 100 most recently finished episodes (current included).
                    mean_100ep_reward = round(np.mean(return_per_ep[-100:]), 1)

                if report_mean:
                    print("\nLast 100 episodes mean reward: {}".format(mean_100ep_reward))

                if save_due and (saved_mean_reward is None or mean_100ep_reward > saved_mean_reward):
                    print("\nSaving model due to mean reward increase: {} -> {}".format(saved_mean_reward, mean_100ep_reward))
                    save_model(qnet, episode, PATH)
                    saved_mean_reward = mean_100ep_reward

                return_per_ep.append(0.0)
                epsilon = decay_epsilon(epsilon, min_eps)
                break

            curr_observation = next_observation

    return return_per_ep

In [5]:
# Run configuration for the DQN training cell below.
environment = gym.make("LunarLander-v2")  # environment handed to dqn_lander

n_episodes = 10000   # number of training episodes
gamma = 0.99         # forwarded to dqn_lander as `gamma`
lr = 1e-3            # forwarded to dqn_lander as `lr`
final_eps = 0.01     # forwarded to dqn_lander as `min_eps`

In [6]:
# Announce the run configuration, train the agent, and keep the list returned
# by dqn_lander in `total_rewards` for later cells.
print(
    "\nTraining DQN lander with arguments num_episodes={}, learning rate={}, gamma={}, final_epsilon={} ...".format(
        n_episodes, lr, gamma, final_eps
    )
)
total_rewards = dqn_lander(
    env=environment,
    n_episodes=n_episodes,
    gamma=gamma,
    lr=lr,
    min_eps=final_eps,
)
print("Done!")


Training DQN lander with arguments num_episodes=10000, learning rate=0.001, gamma=0.99, final_epsilon=0.01 ...

Last 100 episodes mean reward: -158.7

Last 100 episodes mean reward: -94.7

Last 100 episodes mean reward: -130.7

Last 100 episodes mean reward: -41.6

Episode: 500
Episode return : 294.7675979923026
Total time-steps: 180682

Last 100 episodes mean reward: 44.2

Last 100 episodes mean reward: 127.2

Last 100 episodes mean reward: 191.8

Last 100 episodes mean reward: 196.3

Last 100 episodes mean reward: 204.6

Episode: 1000
Episode return : 29.29274415991489
Total time-steps: 360631

Last 100 episodes mean reward: 160.5

Saving model due to mean reward increase: None -> 160.5

Last 100 episodes mean reward: 136.9

Last 100 episodes mean reward: 187.9

Last 100 episodes mean reward: 218.4

Last 100 episodes mean reward: 212.0

Episode: 1500
Episode return : 269.58891315600835
Total time-steps: 513229

Last 100 episodes mean reward: 218.4

Last 100 episodes mean reward: 227

In [None]:
def plot_rewards(chosen_agents, agents_returns, num_episodes, window):
    """Plot the windowed mean return of one or more agents.

    Args:
        chosen_agents: agent names, used in the printed summary and the legend.
        agents_returns: one sequence of per-episode returns per agent, in the
            same order as ``chosen_agents``.
        num_episodes: number of episodes to cover on the x axis; also the
            divisor of the printed average reward.
        window: number of consecutive episodes averaged into one plotted
            point. Only complete windows are plotted.

    NOTE(review): ``plt`` is not imported in the visible part of this
    notebook; it is assumed to be in scope (e.g. via ``from utils import *``)
    -- confirm.
    """
    num_intervals = int(num_episodes / window)
    # Start index of every complete window. Bounding by num_intervals * window
    # keeps x and y the same length when num_episodes is not a multiple of
    # window.
    window_starts = range(0, num_intervals * window, window)

    for agent, agent_total_returns in zip(chosen_agents, agents_returns):
        print(len(agent_total_returns))
        print("\n{} lander average reward = {}".format(agent, sum(agent_total_returns) / num_episodes))
        # Average over `window` episodes (previously hard-coded to 100).
        window_means = [
            round(np.mean(agent_total_returns[start:start + window]), 1)
            for start in window_starts
        ]
        plt.plot(window_starts, window_means)

    plt.xlabel("Episodes")
    plt.ylabel("Reward per {} episodes".format(window))
    plt.title("RL Lander(s)")
    plt.legend(chosen_agents, loc="lower right")
    plt.show()