In [1]:
import gym
import random
import numpy as np
from collections import deque

import torch

from IPython import display
import matplotlib.pyplot as plt

import sys
sys.path.append('../')
from per_agent import PERAgent
from model import QNetwork

In [2]:
env = gym.make('LunarLander-v2')
o_dim = env.observation_space.shape[0]
a_dim = env.action_space.n
print('State shape: ', o_dim)
print('Number of actions: ', a_dim)

State shape:  8
Number of actions:  4


## poritized experience replay

In [3]:
LR = 5e-4               # learning rate
LR_STEP_SIZE = 4
BATCH_SIZE = 64         # minibatch size
GAMMA = 0.99            # discount factor
TAU = 0.001             # for soft update of target parameters
BUFFER_SIZE = int(1e5)  # replay buffer size
UPDATE_EVERY = 1        
SEED = 1234

In [4]:
# parameters for training
NUM_EPISODES = 800
PRINT_EVERY = 50
TMAX = 1000

eps_start = 1.0
eps_end = 0.01
eps_decay = 0.995

In [5]:
agent = PERAgent(QNetwork, o_dim, a_dim, lr = LR, lr_step_size = LR_STEP_SIZE, 
              batch_size = BATCH_SIZE, gamma = GAMMA, tau = TAU, buffer_size = BUFFER_SIZE,
              update_every = UPDATE_EVERY, seed = SEED, algorithm = "ddqn")
print(agent.algorithm)
agent.target_net

ddqn


QNetwork(
  (main): Sequential(
    (0): Linear(in_features=8, out_features=64, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU(inplace=True)
    (4): Linear(in_features=64, out_features=4, bias=True)
  )
)

In [6]:
score_list = []
score_window = deque(maxlen = PRINT_EVERY)  
best_score = 0.0

eps = eps_start
for e in range(1, NUM_EPISODES+1):
    
    state = env.reset()
    score = 0
    for t in range(TMAX):
        action = agent.get_action(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        score += reward
        if done:
            break
            
    eps = max(eps_end, eps_decay*eps) # decrease epsilon
    score_window.append(score)
    mean_score = np.mean(score_window)
    score_list.append([score, mean_score])
    
    print('\r\rEpisode {}\tsum of rewards {:8.2f}'.format(e, mean_score), end = "")
    
    if e % PRINT_EVERY == 0:
        print('\rEpisode {}\tsum of rewards {:8.2f}'.format(e, mean_score))
        agent.scheduler.step()
        
        if mean_score >= best_score:
            print('\nmodel saved!')
            torch.save(agent.online_net.state_dict(), 'checkpoint.pth')
            best_score = mean_score



score1 = np.asarray(score_list).T

Episode 50	sum of rewards  -138.49
Episode 100	sum of rewards   -96.00
Episode 150	sum of rewards   -28.10
Episode 200	sum of rewards   -28.45
Episode 250	sum of rewards    27.19

model saved!
Episode 300	sum of rewards    80.86

model saved!
Episode 350	sum of rewards   130.10

model saved!
Episode 400	sum of rewards   177.08

model saved!
Episode 450	sum of rewards   157.41
Episode 500	sum of rewards   175.48
Episode 550	sum of rewards   211.04

model saved!
Episode 600	sum of rewards   166.72
Episode 650	sum of rewards   179.87
Episode 697	sum of rewards   109.04

KeyboardInterrupt: 

In [None]:
fig = plt.figure()
plt.plot(np.arange(score1.shape[1]), score1[0])
plt.plot(np.arange(score1.shape[1]), score1[1])
plt.ylabel('Total rewards')
plt.xlabel('Episode #')
plt.show()

In [None]:
# load checkpoint
agent.online_net.load_state_dict(torch.load('checkpoint.pth'))
# load to cpu and save the best weights
torch.save(agent.online_net.cpu().state_dict(), "LunarLander.pth")