# Imports

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gym
import time
from environment import CacheEnv
# from environment import scheduler
%load_ext autoreload
%autoreload 2

In [2]:
# Reproducibility: seed the global RNGs before anything else runs. numpy's
# global RNG drives the epsilon-greedy draw and replay sampling in DQN; torch's
# drives the initial network weights.
# NOTE(review): CacheEnv may keep its own RNG that these seeds do not reach — confirm.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# env vars
EPS_LEN = 100       # passed to CacheEnv as eps_len
N_PAGES = 10        # passed to CacheEnv as n_pages
CACHE_LIMIT = 5     # passed to CacheEnv as limit
env = CacheEnv(
        eps_len=EPS_LEN,
        n_pages=N_PAGES,
        limit=CACHE_LIMIT
        )

# dqn vars
N_EPS = 60000               # number of training episodes
BATCH_SIZE = 32             # transitions sampled per learning step
LR = 0.01                   # learning rate
EPSILON = 0.9               # probability of taking the greedy action (else uniform random)
GAMMA = 0.9                 # reward discount
TARGET_REPLACE_ITER = 100   # target update frequency (in learning steps)
MEMORY_CAPACITY = 20000     # number of rows in the replay buffer

# Reset once before reading the action count.
# NOTE(review): presumably action_space_n is only valid after a reset — confirm.
s = env.reset()
N_ACTIONS = env.action_space_n
# NOTE(review): assumes each observation flattens to CACHE_LIMIT * 2 values — confirm against CacheEnv.
STATE_SHAPE = (CACHE_LIMIT, 2)
N_STATES = STATE_SHAPE[0]*STATE_SHAPE[1]

In [3]:
# Display the action count read from env.action_space_n in the config cell.
N_ACTIONS

5

In [4]:
class Net(nn.Module):
    """Small fully connected network: one output per action for a flat state.

    Layer sizes are N_STATES -> 100 -> 10 -> N_ACTIONS (both read from the
    notebook-level constants), with ReLU after each hidden layer and a softmax
    over the last (action) dimension on the output.
    """

    def __init__(self, ):
        super(Net, self).__init__()
        input_size = N_STATES
        self.fc1 = nn.Linear(input_size, 100)
        self.fc2 = nn.Linear(100, 10)
        self.out = nn.Linear(10, N_ACTIONS)

    def forward(self, x):
        """Map a batch of flattened states to per-action scores.

        Args:
            x: float tensor whose last dimension has size N_STATES.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size N_ACTIONS; values along it are non-negative
            and sum to 1.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.out(x)
        # Pass `dim` explicitly: the implicit form is deprecated and emits a
        # UserWarning on every call. For the 2-D (batch, actions) inputs used
        # in this notebook, dim=-1 selects the same axis the implicit form did.
        # NOTE(review): softmax bounds every output to (0, 1), yet DQN.learn
        # regresses these against r + GAMMA * max(q_next); confirm that
        # bounding the Q-estimates is intended.
        return F.softmax(x, dim=-1)

In [5]:
class DQN(object):
    """Q-learning agent with a replay buffer and a periodically synced target net.

    Reads the notebook-level constants N_STATES, N_ACTIONS, MEMORY_CAPACITY,
    BATCH_SIZE, LR, EPSILON, GAMMA and TARGET_REPLACE_ITER, and builds two
    ``Net`` instances: ``eval_net`` (trained) and ``target_net`` (a copy that
    is refreshed every TARGET_REPLACE_ITER learning steps).
    """

    def __init__(self):
        # Use the GPU when present, otherwise run on the CPU instead of
        # crashing inside .cuda().
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.eval_net, self.target_net = Net().to(self.device), Net().to(self.device)
        self.learn_step_counter = 0                                     # for target updating
        self.memory_counter = 0                                         # for storing memory
        # One row per transition: [state | action | reward | next state].
        self.memory = np.zeros((MEMORY_CAPACITY, N_STATES * 2 + 2))     # initialize memory
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)
        self.loss_func = nn.MSELoss()
        self.PATH = 'model/'

    def choose_action(self, x):
        """Pick an action for state ``x``.

        With probability EPSILON the action with the highest ``eval_net``
        output is chosen; otherwise an action is drawn uniformly at random.

        Args:
            x: a single state holding N_STATES values (array-like; flattened
                before being fed to the network).

        Returns:
            int: action index in ``[0, N_ACTIONS)``. Both branches return a
            plain Python int.
        """
        if np.random.uniform() < EPSILON:   # greedy
            state = torch.as_tensor(np.asarray(x), dtype=torch.float32)
            state = state.reshape(1, -1).to(self.device)   # batch of one sample
            with torch.no_grad():
                actions_value = self.eval_net(state)
            return int(torch.argmax(actions_value).item())
        # random
        return int(np.random.randint(0, N_ACTIONS))

    def store_transition(self, s, a, r, s_):
        """Write one (s, a, r, s_) transition into the replay buffer.

        The buffer is circular: once MEMORY_CAPACITY transitions are stored,
        the oldest rows are overwritten. States are flattened so both flat and
        multi-dimensional observations pack into one row.
        """
        transition = np.hstack((np.ravel(s), [a, r], np.ravel(s_)))
        # replace the old memory with new memory
        index = self.memory_counter % MEMORY_CAPACITY
        self.memory[index, :] = transition
        self.memory_counter += 1

    def learn(self):
        """Run one gradient step on a batch sampled from the replay buffer.

        Does nothing when no transition has been stored yet. Only rows that
        have actually been written are sampled (with replacement), so a
        partially filled buffer never feeds all-zero placeholder rows to the
        network.
        """
        n_stored = min(self.memory_counter, MEMORY_CAPACITY)
        if n_stored == 0:
            return

        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1

        # sample batch transitions and unpack the row layout used by store_transition
        sample_index = np.random.choice(n_stored, BATCH_SIZE)
        b_memory = self.memory[sample_index, :]
        b_s = torch.as_tensor(b_memory[:, :N_STATES], dtype=torch.float32, device=self.device)
        b_a = torch.as_tensor(b_memory[:, N_STATES:N_STATES+1].astype(int), dtype=torch.long, device=self.device)
        b_r = torch.as_tensor(b_memory[:, N_STATES+1:N_STATES+2], dtype=torch.float32, device=self.device)
        b_s_ = torch.as_tensor(b_memory[:, -N_STATES:], dtype=torch.float32, device=self.device)

        # q_eval w.r.t the action in experience; everything stays on one device
        q_eval = self.eval_net(b_s).gather(1, b_a)                  # shape (batch, 1)
        with torch.no_grad():                                       # don't backpropagate through the target net
            q_next = self.target_net(b_s_)
        q_target = b_r + GAMMA * q_next.max(1, keepdim=True)[0]     # shape (batch, 1)
        loss = self.loss_func(q_eval, q_target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def save_model(self, eval_name = 'eval_net.m', train_name = 'train_net.m'):
        """Save both networks' state dicts under ``self.PATH``.

        File names are ``PATH + str(N_STATES) + name``. The defaults match
        ``load_model`` so that ``save_model()`` followed by ``load_model()``
        round-trips. The target directory is created if it does not exist.
        """
        import os

        eval_path = self.PATH + str(N_STATES) + eval_name
        target_path = self.PATH + str(N_STATES) + train_name
        for path in (eval_path, target_path):
            directory = os.path.dirname(path)
            if directory:
                os.makedirs(directory, exist_ok=True)
        torch.save(self.eval_net.state_dict(), eval_path)
        torch.save(self.target_net.state_dict(), target_path)

    def load_model(self, eval_name = 'eval_net.m', train_name = 'train_net.m'):
        """Load both networks' state dicts from ``self.PATH``.

        ``map_location`` moves the stored tensors onto this agent's device, so
        a checkpoint written on a GPU machine also loads on a CPU-only one.
        """
        self.eval_net.load_state_dict(
            torch.load(self.PATH + str(N_STATES) + eval_name, map_location=self.device))
        self.target_net.load_state_dict(
            torch.load(self.PATH + str(N_STATES) + train_name, map_location=self.device))

In [None]:
# Train the agent for N_EPS episodes. Each step stores the transition in the
# replay buffer; gradient updates only begin once more than MEMORY_CAPACITY
# transitions have been collected.
start = time.time()
dqn = DQN()
# dqn.load_model()
print("Start")
print('\nCollecting experience...')
history = []    # total reward of every finished episode
for i_episode in range(N_EPS):
    s = env.reset()
    ep_r, n = 0, 0      # episode return and step count
    done = False
    while not done:
        a = dqn.choose_action(s)
        s_, r, done, info = env.step(a)
        dqn.store_transition(s, a, r, s_)
        ep_r += r
        n += 1

        # learn() does not touch memory_counter, so one check serves both the
        # update below and the progress report after the episode.
        warmed_up = dqn.memory_counter > MEMORY_CAPACITY
        if warmed_up:
            dqn.learn()

        # Only advance the state while the episode is still running.
        if not done:
            s = s_

    history.append(ep_r)
    if warmed_up and i_episode % 100 == 0:
        print('Ep: ', i_episode, '| Ep_r: ', ep_r, 'Ran for:', n)
end = time.time()

Start

Collecting experience...


  app.launch_new_instance()


Ep:  200 | Ep_r:  0 Ran for: 100
Ep:  300 | Ep_r:  -14 Ran for: 100
Ep:  400 | Ep_r:  10 Ran for: 100
Ep:  500 | Ep_r:  6 Ran for: 100
Ep:  600 | Ep_r:  -14 Ran for: 100
Ep:  700 | Ep_r:  8 Ran for: 100
Ep:  800 | Ep_r:  16 Ran for: 100
Ep:  900 | Ep_r:  14 Ran for: 100
Ep:  1000 | Ep_r:  12 Ran for: 100
Ep:  1100 | Ep_r:  24 Ran for: 100
Ep:  1200 | Ep_r:  18 Ran for: 100
Ep:  1300 | Ep_r:  6 Ran for: 100
Ep:  1400 | Ep_r:  8 Ran for: 100
Ep:  1500 | Ep_r:  0 Ran for: 100
Ep:  1600 | Ep_r:  2 Ran for: 100
Ep:  1700 | Ep_r:  0 Ran for: 100
Ep:  1800 | Ep_r:  12 Ran for: 100
Ep:  1900 | Ep_r:  24 Ran for: 100
Ep:  2000 | Ep_r:  8 Ran for: 100
Ep:  2100 | Ep_r:  26 Ran for: 100
Ep:  2200 | Ep_r:  24 Ran for: 100
Ep:  2300 | Ep_r:  0 Ran for: 100
Ep:  2400 | Ep_r:  14 Ran for: 100
Ep:  2500 | Ep_r:  12 Ran for: 100
Ep:  2600 | Ep_r:  10 Ran for: 100
Ep:  2700 | Ep_r:  -10 Ran for: 100
Ep:  2800 | Ep_r:  18 Ran for: 100
Ep:  2900 | Ep_r:  2 Ran for: 100
Ep:  3000 | Ep_r:  4 Ran for: 100
Ep