In [1]:
import gym
import torch 
import collections
import os
import numpy as np
from utils import *
from exp_replay_memory import ReplayMemory




# SARSA

In [2]:
def epsilon_greedy(q_func, observation, eps, env_actions):
    """Select an action index with an epsilon-greedy policy.

    With probability ``eps`` a uniformly random action in
    ``range(env_actions)`` is returned; otherwise the action with the largest
    estimated value under ``q_func`` is returned.

    Parameters
    ----------
    q_func : CNN, LinearMapNet or mapping
        Either a network instance, which is called on ``observation``, or a
        mapping indexed by ``observation + (action,)`` tuples. When a
        ``collections.defaultdict`` is passed, the greedy lookup inserts any
        missing keys with the default value.
    observation
        Network branch: the input handed to the network. The output must have
        a single row, because ``.item()`` is called on the arg-max over dim 1.
        Tabular branch: a tuple that is concatenated with ``(action,)``.
    eps : float
        Exploration probability.
    env_actions : int
        Number of discrete actions.

    Returns
    -------
    int
        The chosen action index, as a plain Python ``int`` on every branch.
    """
    # Both the coin flip and the exploratory action come from NumPy's global
    # generator, so np.random.seed(...) alone makes action selection
    # reproducible and no separately imported `random` module is required.
    if np.random.random() < eps:
        return int(np.random.randint(env_actions))

    if isinstance(q_func, (CNN, LinearMapNet)):
        # Pure inference: no gradient tracking needed to pick an action.
        with torch.no_grad():
            return q_func(observation).max(1)[1].item()

    qvals = [q_func[observation + (action, )] for action in range(env_actions)]
    # np.argmax yields a NumPy integer; convert so all branches return `int`.
    return int(np.argmax(qvals))
    
def greedy(qstates_dict, observation, env_actions):
    """Return the largest value stored for ``observation`` over all actions.

    Each candidate is looked up under the key ``observation + (action,)`` for
    ``action`` in ``range(env_actions)``; the maximum value (not the action
    index) is returned.
    """
    best_value = max(
        qstates_dict[observation + (candidate, )]
        for candidate in range(env_actions)
    )
    return best_value

In [3]:
def sarsa_lander(env, n_episodes, gamma, lr, min_eps, print_freq=500, render_freq=500):
    """Train a tabular SARSA agent on ``env`` and return the per-episode returns.

    Q-values are kept in a ``defaultdict(float)`` keyed by
    ``state + (action,)``, where ``state`` is whatever ``discretize_state``
    returns for an observation (it must be a tuple, since it is concatenated
    with ``(action,)``). Actions are chosen with ``epsilon_greedy``; epsilon
    starts at 1.0 and is passed through ``decay_epsilon(epsilon, min_eps)``
    once per finished episode.

    Parameters
    ----------
    env
        Environment exposing ``action_space.n``, ``reset()`` returning an
        observation and ``step(action)`` returning a 4-tuple
        ``(observation, reward, done, info)``.
    n_episodes : int
        Number of episodes to run.
    gamma : float
        Discount factor applied to the next state-action value.
    lr : float
        Step size of the TD update.
    min_eps : float
        Forwarded to ``decay_epsilon`` (presumably the floor for epsilon).
    print_freq : int
        Every ``print_freq`` episodes, the episode length, its return and the
        size of the Q-table are printed.
    render_freq : int
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    list of float
        One return per episode, followed by a trailing ``0.0`` placeholder
        (the list therefore has ``n_episodes + 1`` entries).
    """
    q_states = collections.defaultdict(float)   # first access of a key initializes its value to 0.0
    return_per_ep = [0.0]
    epsilon = 1.0
    num_actions = env.action_space.n

    for i in range(n_episodes):
        t = 0

        # Initial episode state S, and first action A chosen from S using policy π
        curr_state = discretize_state(env.reset())
        action = epsilon_greedy(q_states, curr_state, epsilon, num_actions)

        while True:
            # (S, A) pair being evaluated
            qstate = curr_state + (action, )

            # Take action A, earn immediate reward R and land into next state S'
            observation, reward, done, _ = env.step(action)
            next_state = discretize_state(observation)

            # Choose A' from S' using policy π and build the (S', A') pair
            next_action = epsilon_greedy(q_states, next_state, epsilon, num_actions)
            new_qstate = next_state + (next_action, )

            # Policy evaluation step: a terminal S' contributes no bootstrap value
            if done:
                td_target = reward
            else:
                td_target = reward + gamma * q_states[new_qstate]
            q_states[qstate] += lr * (td_target - q_states[qstate])

            return_per_ep[-1] += reward

            if done:
                episode = i + 1
                if episode % print_freq == 0:
                    print("\nEpisode finished after {} timesteps".format(t + 1))
                    print("Episode {}: Total Return = {}".format(episode, return_per_ep[-1]))
                    print("Total keys in q_states dictionary = {}".format(len(q_states)))

                if episode % 100 == 0:
                    # The placeholder for the next episode has not been appended
                    # yet, so the last 100 entries are exactly the last 100
                    # finished episodes, including the one that just ended.
                    mean_100ep_reward = round(np.mean(return_per_ep[-100:]), 1)
                    print("Last 100 episodes mean reward: {}".format(mean_100ep_reward))

                epsilon = decay_epsilon(epsilon, min_eps)
                return_per_ep.append(0.0)   # accumulator for the next episode
                break

            curr_state = next_state
            action = next_action
            t += 1

    return return_per_ep

In [4]:
# Hyper-parameters for the tabular SARSA run.
n_episodes = 10000   # number of training episodes
lr = 0.1             # TD step size
gamma = 0.99         # discount factor
final_eps = 0.01     # value handed to the epsilon decay as its minimum

# Environment instance used for training.
environment = gym.make("LunarLander-v2")

In [None]:
# Announce the configuration, train the SARSA agent, then build a new environment instance.
print(
    "\nTraining Sarsa lander with arguments num_episodes={}, step-size={}, gamma={}, final_epsilon={} ...".format(
        n_episodes, lr, gamma, final_eps
    )
)
sarsa_total_rewards = sarsa_lander(environment, n_episodes, gamma, lr, final_eps)
print("Done!")
environment = gym.make("LunarLander-v2")



Training Sarsa lander with arguments num_episodes=10000, step-size=0.1, gamma=0.99, final_epsilon=0.01 ...
Last 100 episodes mean reward: -159.3
Last 100 episodes mean reward: -96.6
Last 100 episodes mean reward: -118.8
Last 100 episodes mean reward: -140.6

Episode finished after 158 timesteps
Episode 500: Total Return = -205.89852223166193
Total keys in q_states dictionary = 8711
Last 100 episodes mean reward: -136.1
Last 100 episodes mean reward: -142.7
Last 100 episodes mean reward: -128.6
Last 100 episodes mean reward: -136.9
Last 100 episodes mean reward: -111.4

Episode finished after 252 timesteps
Episode 1000: Total Return = -219.40028862625013
Total keys in q_states dictionary = 15409
Last 100 episodes mean reward: -100.9
Last 100 episodes mean reward: -95.2
Last 100 episodes mean reward: -47.5
Last 100 episodes mean reward: -40.5
Last 100 episodes mean reward: -36.5

Episode finished after 623 timesteps
Episode 1500: Total Return = -83.65257368198641
Total keys in q_states 

In [None]:
# Persist the per-episode SARSA returns as a NumPy array on disk.
a = np.array(sarsa_total_rewards)
np.save('sarsa_total_rewards.npy', a)

# DQN

In [None]:
def dqn_lander(env, n_episodes, gamma, lr, min_eps,
               batch_size=32, memory_capacity=50000,
               network='linear', learning_starts=1000,
               train_freq=1, target_network_update_freq=1000,
               print_freq=500, render_freq=500, save_freq=1000):
    """Train a DQN agent on ``env`` and return the per-episode returns.

    An online network and a target network are created with
    ``build_qnetwork``; the target starts as a copy of the online network and
    is refreshed through ``update_target_network``. Transitions are stored in
    a ``ReplayMemory`` and mini-batches sampled from it are passed to ``fit``.
    Actions are chosen with ``epsilon_greedy`` on the online network; epsilon
    starts at 1.0 and is passed through ``decay_epsilon(epsilon, min_eps)``
    after every episode. Checkpoints are written with ``save_model`` into
    ``./models`` (created if missing).

    Parameters
    ----------
    env
        Environment exposing ``action_space.n``, ``observation_space.shape``,
        ``reset()`` returning an observation and ``step(action)`` returning a
        4-tuple ``(observation, reward, done, info)``.
    n_episodes : int
        Number of episodes to run.
    gamma : float
        Discount factor, forwarded to ``fit``.
    lr : float
        Learning rate, forwarded to ``build_qnetwork``.
    min_eps : float
        Forwarded to ``decay_epsilon`` (presumably the floor for epsilon).
    batch_size : int
        Mini-batch size requested from the replay memory.
    memory_capacity : int
        Capacity passed to ``ReplayMemory``.
    network : str
        Network kind, forwarded to ``build_qnetwork``.
    learning_starts : int
        Updates, target syncs and checkpoints only happen once the global
        step counter is strictly greater than this value.
    train_freq : int
        ``fit`` is called on steps that are multiples of this value.
    target_network_update_freq : int
        The target network is refreshed on steps that are multiples of this
        value.
    print_freq : int
        Every ``print_freq`` episodes the episode return and step count are
        printed.
    render_freq : int
        Currently unused; kept so existing callers keep working.
    save_freq : int
        Every ``save_freq`` episodes the 100-episode mean is compared with
        the best saved one and, if higher, the online network is saved.

    Returns
    -------
    list of float
        One return per episode, followed by a trailing ``0.0`` placeholder
        (the list therefore has ``n_episodes + 1`` entries).
    """
    # set device to run on
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    loss_function = torch.nn.MSELoss()

    # path to save checkpoints; exist_ok avoids the check-then-create race
    PATH = "./models"
    os.makedirs(PATH, exist_ok=True)

    num_actions = env.action_space.n
    input_shape = env.observation_space.shape[-1]
    qnet, qnet_optim = build_qnetwork(num_actions, lr, input_shape, network, device)
    qtarget_net, _ = build_qnetwork(num_actions, lr, input_shape, network, device)
    # the target network starts out as an exact copy of the online network
    qtarget_net.load_state_dict(qnet.state_dict())
    qnet.train()
    qtarget_net.eval()
    replay_memory = ReplayMemory(memory_capacity)

    epsilon = 1.0
    return_per_ep = [0.0]
    saved_mean_reward = None
    t = 0   # global step counter across all episodes

    for i in range(n_episodes):
        curr_state = lmn_input(env.reset())

        while True:
            # choose action A using behaviour policy -> ε-greedy; use q-network
            action = epsilon_greedy(qnet, curr_state.to(device), epsilon, num_actions)
            # take action A, earn immediate reward R and land into next state S'
            next_state, reward, done, _ = env.step(action)
            next_state = lmn_input(next_state)

            # store transition (S, A, R, S', Done) in replay memory
            replay_memory.store(curr_state, action, float(reward), next_state, float(done))

            # once more than 'learning_starts' steps have been taken,
            # sample a random mini-batch and update q_network's parameters
            if t > learning_starts and t % train_freq == 0:
                states, actions, rewards, next_states, dones = replay_memory.sample_minibatch(batch_size)
                fit(qnet,
                    qnet_optim,
                    qtarget_net,
                    loss_function,
                    states,
                    actions,
                    rewards,
                    next_states,
                    dones,
                    gamma,
                    num_actions,
                    device)

            # periodically update q-target network's parameters
            if t > learning_starts and t % target_network_update_freq == 0:
                update_target_network(qnet, qtarget_net)

            t += 1
            return_per_ep[-1] += reward

            if done:
                episode = i + 1
                if episode % print_freq == 0:
                    print("\nEpisode: {}".format(episode))
                    print("Episode return : {}".format(return_per_ep[-1]))
                    print("Total time-steps: {}".format(t))

                report_mean = episode % 100 == 0
                checkpoint_due = t > learning_starts and episode % save_freq == 0

                if report_mean or checkpoint_due:
                    # The placeholder for the next episode has not been
                    # appended yet, so the last 100 entries are the last 100
                    # finished episodes, including the one that just ended.
                    # Computing it here also keeps the checkpoint comparison
                    # valid when save_freq is not a multiple of 100.
                    mean_100ep_reward = round(np.mean(return_per_ep[-100:]), 1)

                if report_mean:
                    print("\nLast 100 episodes mean reward: {}".format(mean_100ep_reward))

                if checkpoint_due:
                    if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                        print("\nSaving model due to mean reward increase: {} -> {}".format(saved_mean_reward, mean_100ep_reward))
                        save_model(qnet, episode, PATH)
                        saved_mean_reward = mean_100ep_reward

                return_per_ep.append(0.0)   # accumulator for the next episode
                epsilon = decay_epsilon(epsilon, min_eps)
                break

            curr_state = next_state

    return return_per_ep

In [None]:
# Hyper-parameters for the DQN run (these rebind the names used by the SARSA cells).
n_episodes = 10000   # number of training episodes
lr = 0.001           # learning rate forwarded to the network builder
gamma = 0.99         # discount factor
final_eps = 0.01     # value handed to the epsilon decay as its minimum

# Environment instance used for training.
environment = gym.make("LunarLander-v2")

In [None]:
# Announce the configuration and train the DQN agent.
print(
    "\nTraining DQN lander with arguments num_episodes={}, learning rate={}, gamma={}, final_epsilon={} ...".format(
        n_episodes, lr, gamma, final_eps
    )
)
dqn_total_rewards = dqn_lander(environment, n_episodes, gamma, lr, final_eps)
print("Done!")

In [None]:
# Persist the per-episode DQN returns as a NumPy array on disk.
a = np.array(dqn_total_rewards)
np.save('dqn_total_rewards.npy', a)

# Plot

In [None]:
def plot_rewards(chosen_agents, agents_returns, num_episodes, window):
    """Plot windowed mean returns for one or more agents on a shared figure.

    For every agent the overall average (``sum(returns) / num_episodes``) is
    printed, then the returns are cut into ``int(num_episodes / window)``
    consecutive buckets of ``window`` episodes each; the rounded mean of each
    bucket is plotted against the index of the bucket's first episode.

    Parameters
    ----------
    chosen_agents : sequence of str
        Agent labels, used in the printed summary and as the legend.
    agents_returns : sequence of sequences
        Per-episode returns, one sequence per agent, in the same order as
        ``chosen_agents``. Each must support slicing.
    num_episodes : int
        Number of episodes to cover; trailing episodes that do not fill a
        complete bucket are not plotted.
    window : int
        Bucket size in episodes.

    Notes
    -----
    ``plt`` is not imported in this notebook's import cell; it is presumably
    provided by ``from utils import *`` -- TODO confirm.
    """
    num_intervals = int(num_episodes / window)
    # One x position per complete bucket, so x and y always have equal length
    # even when num_episodes is not a multiple of window.
    bucket_starts = [j * window for j in range(num_intervals)]

    for agent, agent_total_returns in zip(chosen_agents, agents_returns):
        average_return = sum(agent_total_returns) / num_episodes
        print("\n{} lander average reward = {} without blind area".format(agent, average_return))

        # Bucket boundaries follow `window` rather than a fixed size of 100.
        bucket_means = [
            round(np.mean(agent_total_returns[start:start + window]), 1)
            for start in bucket_starts
        ]
        plt.plot(bucket_starts, bucket_means)

    plt.xlabel("Episodes")
    plt.ylabel("Average reward per {} episodes".format(window))
    plt.title("RL Lander(s)")
    plt.legend(chosen_agents, loc="lower right")
    plt.show()

In [None]:
# Compare both agents on one figure, averaging returns over windows of `win` episodes.
chosen_agents = ['SARSA', 'DQN']
agents_returns = [sarsa_total_rewards, dqn_total_rewards]
win = 100

plot_rewards(chosen_agents, agents_returns, n_episodes, win)