### Cartpole-v0
### Double Deep Q-Network (DDQN)

### 1. Import packages

In [1]:
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

import gym
import numpy as np
import torch
import matplotlib

import matplotlib.pyplot as plt
import time
from  torch.autograd import Variable
from collections import deque

from ddqn_agent_1 import Agent, FloatTensor
from replay_buffer import ReplayMemory, Transition


## set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display
plt.ion()

<matplotlib.pyplot._IonContext at 0x1d6faa982e0>

In [2]:
## Applying GPU
use_cuda = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
device = torch.device("cuda" if use_cuda else "cpu")

## Hyperparameter Definition
BATCH_SIZE = 64  
gamma = 0.99
lr = 0.001
TARGET_UPDATE = 10
num_episodes = 10000
print_every = 50
hidden_dim = 32
min_eps = 0.01
max_eps_episode = 200

In [3]:
## Environment Loading
env = gym.make('CartPole-v1')
env = gym.wrappers.Monitor(env, directory="monitors", force=True)

In [4]:
## Cartpole Constraints
threshold = env.spec.reward_threshold        
space_dim =  env.observation_space.shape[0]
action_dim = env.action_space.n 
print('input_dim: ', space_dim, ', hidden_dim: ', hidden_dim, ', output_dim: ', action_dim, 'threshold: ', threshold)

input_dim:  4 , output_dim:  2 , hidden_dim:  32
threshold:  475.0


In [5]:
agent = Agent(space_dim, action_dim, hidden_dim, BATCH_SIZE, lr)

In [6]:
# Further Demo Play Parameters
def save(directory, filename):
    torch.save(agent.q_old.state_dict(), '%s/%s_local.pth' % (directory, filename))
    torch.save(agent.q_new.state_dict(), '%s/%s_target.pth' % (directory, filename))

### 2. Applying epsilon decreasing

In [7]:
def epsilon_decreasing(i_epsiode, max_episode, min_eps: float):
    slope = (min_eps - 1.0) / max_episode
    ret_eps = max(slope * i_epsiode + 1.0, min_eps)
    return ret_eps  

### 3. Define run on episode

In [8]:
def run_episode(env, agent, eps):
    """Play an epsiode and train
    """
    state = env.reset()
    done = False
    total_reward = 0
    

    while not done:

        action = agent.get_action(FloatTensor(np.array([state])), eps)
        
        next_state, reward, done, _ = env.step(action.item())

        total_reward += reward

        if done:
            reward = -1
                    
        # Store the transition in memory
        agent.rememory.push(
                (FloatTensor(np.array([state])), 
                 action, # action is already a tensor
                 FloatTensor(np.array([reward])), 
                 FloatTensor(np.array([next_state])), 
                 FloatTensor(np.array([done]))))
                 
        if len(agent.rememory) > BATCH_SIZE:
            batch = agent.rememory.sample(BATCH_SIZE)
            agent.learn(batch, gamma, BATCH_SIZE)
        state = next_state

    return total_reward

### 4. Training

In [9]:
## Training Process
avg_scores_array = []  
scores_var_array = []
scores_array = []
scores_deque = deque(maxlen=100) 
    
time_start = time.time()

for i_episode in range(num_episodes):
    
     eps = epsilon_decreasing(i_episode, max_eps_episode, min_eps)
     score = run_episode(env, agent, eps)

     scores_deque.append(score)
     scores_array.append(score)
        
     avg_score = np.mean(scores_deque)
     avg_scores_array.append(avg_score)
        
     scores_var = np.var(scores_deque)
     scores_var_array.append(scores_var) 

     dt = (int)(time.time() - time_start)
            
     if i_episode % print_every == 0 and i_episode > 0:
         
          print('Episode: {:5} Score: {:5}  Avg.Score: {:.2f}, eps-greedy: {:5.2f} Time: {:02}:{:02}:{:02}'.\
                    format(i_episode, score, avg_score, eps, dt//3600, dt%3600//60, dt%60))
            
     if len(scores_deque) == scores_deque.maxlen:      
            ### 195.0: for cartpole-v0 and 475 for v1
        if np.mean(scores_deque) >= threshold: 
            print('\n Environment solved in {:d} episodes!\tAverage Score: {:.2f}'. \
                    format(i_episode, np.mean(scores_deque)))
            break

                        
     if i_episode % TARGET_UPDATE == 0:
          agent.q_new.load_state_dict(agent.q_old.state_dict()) 
    
agent.q_new.load_state_dict(agent.q_old.state_dict())

scores = scores_array
avg_scores = avg_scores_array
scores_var = scores_var_array



Episode:    50 Score:  11.0  Avg.Score: 27.10, eps-greedy:  0.75 Time: 00:00:07
Episode:   100 Score: 121.0  Avg.Score: 43.52, eps-greedy:  0.51 Time: 00:00:19
Episode:   150 Score:  41.0  Avg.Score: 100.42, eps-greedy:  0.26 Time: 00:00:47
Episode:   200 Score: 116.0  Avg.Score: 142.24, eps-greedy:  0.01 Time: 00:01:15
Episode:   250 Score:  88.0  Avg.Score: 197.50, eps-greedy:  0.01 Time: 00:02:07
Episode:   300 Score: 229.0  Avg.Score: 243.69, eps-greedy:  0.01 Time: 00:02:55
Episode:   350 Score: 111.0  Avg.Score: 184.91, eps-greedy:  0.01 Time: 00:03:24
Episode:   400 Score: 105.0  Avg.Score: 114.57, eps-greedy:  0.01 Time: 00:03:45
Episode:   450 Score: 140.0  Avg.Score: 112.70, eps-greedy:  0.01 Time: 00:04:11


KeyboardInterrupt: 

### 5.Plot

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

print('length of scores: ', len(scores), ', len of avg_scores: ', len(avg_scores))

fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(scores)+1), scores, label="Score")
plt.plot(np.arange(1, len(avg_scores)+1), avg_scores, label="Avg on 100 episodes")
plt.legend(bbox_to_anchor=(1.05, 1)) 
plt.ylabel('Score')
plt.xlabel('Episodes #')
plt.show()

### 6.Save model

In [None]:
#save('dir_chk_V0_ddqn', 'cartpole-v0-ddqn-239epis')