In [1]:
import gym
import math
import random
import numpy as np
import pandas as pd

from collections import namedtuple, deque
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
import time

import collections
import copy

#env = gym.make('CartPole-v0').unwrapped

SEED = 1234

# Seed every random number generator the notebook draws from so that a
# "Restart & Run All" reproduces the same exploration and initial weights.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Seed all visible CUDA devices, not only the current one (no-op without CUDA).
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and disable its auto-tuner explicitly,
# so determinism does not depend on the default or on another cell's setting.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Use the first GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [2]:
# One replay-buffer record. Field order matters: records are built
# positionally (see ReplayMemory.push) and unpacked by name when batching.
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'next_state', 'reward', 'content'],
)


class ReplayMemory:
    """Fixed-capacity buffer of Transition records with uniform sampling."""

    def __init__(self, capacity):
        # A bounded deque drops its oldest entry once `capacity` is reached.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Build a Transition from the positional fields and store it."""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return `batch_size` stored entries drawn without replacement."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of entries currently held."""
        return len(self.memory)

In [3]:
class DQN(nn.Module):
    """LSTM Q-network: dropout on the input, then an LSTM whose outputs are the Q-values.

    The network returns the LSTM's per-step outputs, so the size of the last
    output dimension equals ``hid_dim``. The rest of the notebook builds it as
    ``DQN(5, 2)`` and indexes columns 0 and 1 of the result.

    Args:
        input_dim: size of each input feature vector fed to the LSTM.
        hid_dim: LSTM hidden size (and therefore the output width).
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability applied to the input, and between LSTM
            layers when ``n_layers > 1``.
        emb_dim: accepted for compatibility but unused; the embedding layer
            it used to size is disabled.
    """

    def __init__(self, input_dim, hid_dim, n_layers=1, dropout=0.5, emb_dim=None):
        # The original signature placed non-default parameters after
        # ``emb_dim=None``, which is a SyntaxError; defaulted parameters now
        # come last so the notebook's ``DQN(5, 2)`` call is valid.
        super().__init__()

        self.hid_dim = hid_dim
        self.n_layers = n_layers

        # nn.LSTM applies dropout only between stacked layers; passing a
        # non-zero value with a single layer has no effect and only warns.
        rnn_dropout = dropout if n_layers > 1 else 0.0
        self.rnn = nn.LSTM(input_dim, hid_dim, n_layers, dropout=rnn_dropout)

        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Run ``src`` through dropout and the LSTM and return the LSTM outputs.

        Args:
            src: float tensor whose last dimension is ``input_dim``, laid out
                the way ``nn.LSTM`` expects (sequence-first).

        Returns:
            The LSTM ``outputs`` tensor (top layer, every step); its last
            dimension is ``hid_dim``.
        """
        embedded = self.dropout(src)

        # The final hidden and cell states are not needed by any caller.
        outputs, (hidden, cell) = self.rnn(embedded)

        return outputs

In [4]:
# Directory holding the trace dataset (absolute path on the training machine).
data_path = '/home/ubuntu/data/dataset/R267_U1_V10/'
# Request trace; the displayed frame has columns u, i, day, video_type,
# level1..level4, time and v.
UIT = pd.read_csv("".join([data_path, 'UIT.csv']))
UIT

Unnamed: 0,u,i,day,video_type,level1,level2,level3,level4,time,v
0,343,5753,0,11,0,0,0,0,0,5753
1,22,5806,0,11,0,0,0,0,0,5806
2,294,5463,0,11,0,0,0,2,0,5463
3,551,2902,0,11,0,0,0,0,0,2902
4,278,4277,0,11,0,0,0,0,0,4277
...,...,...,...,...,...,...,...,...,...,...
267680,913,2875,29,11,0,0,0,0,43198,2875
267681,594,7449,29,11,0,0,0,0,43199,7449
267682,271,8016,29,13,0,0,0,0,43199,8016
267683,482,1772,29,13,0,0,0,2,43199,1772


In [5]:
# Training split: rows whose `day` value is below 18.
trainUIT = UIT.loc[UIT['day'] < 18]
# Distinct item / user ids over the WHOLE trace (not only the training split).
# NOTE(review): later cells index arrays of these lengths directly by id, which
# presumably requires ids to be dense in 0..N-1 - confirm against the dataset.
contentNum = UIT['i'].drop_duplicates().shape[0]
userNum = UIT['u'].drop_duplicates().shape[0]

In [6]:
class ENV(object):
    """Shared caching environment: per-user request state plus one LRU edge cache.

    Attributes (all re-created by ``reset``):
        r: (userNum, contentNum) int matrix; row ``u`` is OR-ed with every
           action user ``u`` takes.
        p: per-content float vector, starts at ``1/userNum`` and grows by
           ``1/userNum`` for each content newly set in some user's row of ``r``.
        e: per-content 0/1 flags, 1 while the content sits in the edge cache.
        S: per-content int vector of ones (presumably content sizes - confirm).
        l_edge, l_cp: scalars 0.1 and 1 (presumably edge / provider latency - confirm).
        B: per-user int vector filled with 10 (read by UE as its action budget).
        pipe: OrderedDict content id -> last update time, oldest entry first.

    Args:
        userNum: number of users (rows of ``r``).
        contentNum: number of contents (length of the per-content vectors).
        cacheCapacity: maximum number of contents kept in the edge cache.
            Defaults to 500, the value that used to be hard-coded.

    Raises:
        ValueError: if ``cacheCapacity`` is smaller than 1.
    """

    def __init__(self, userNum, contentNum, cacheCapacity=500):
        if cacheCapacity < 1:
            raise ValueError("cacheCapacity must be at least 1")
        self.userNum = userNum
        self.contentNum = contentNum
        self.cacheCapacity = cacheCapacity
        # The initial state is exactly the reset state; build it in one place.
        self.reset()

    # LRU implemented with an ordered dict.
    # (The misspelled method name is kept so existing callers keep working.)
    def updateEgdeCache(self, action, t):
        """Insert or refresh every content selected in ``action`` in the edge cache.

        Args:
            action: 1-D 0/1 array over contents; entries equal to 1 are cached.
            t: value stored for the content in ``pipe`` (the caller passes a
               field of the trace row).
        """
        for i in np.argwhere(action == 1).squeeze(-1):
            if i in self.pipe:
                # Already cached: remove so the re-insert below moves it to the
                # most-recently-used end.
                self.pipe.pop(i)
            elif len(self.pipe) >= self.cacheCapacity:
                # Cache full: evict the least recently used entry and clear its flag.
                evicted, _ = self.pipe.popitem(last=False)
                self.e[evicted] = 0
            self.pipe[i] = t
            self.e[i] = 1

    def updateEnv(self, u, action, t):
        """Apply user ``u``'s ``action`` to ``p``, ``r`` and the edge cache.

        Args:
            u: user index (row of ``r``).
            action: 1-D 0/1 int array over contents.
            t: value forwarded to ``updateEgdeCache``.
        """
        merged = self.r[u] | action
        # Only contents that flip from 0 to 1 for this user raise p.
        # This rebinds self.p to a new array rather than updating it in place.
        self.p = (merged - self.r[u]) * (1 / self.userNum) + self.p

        # In-place row update: tensors from earlier getStatus() calls see it.
        self.r[u] = merged

        self.updateEgdeCache(action, t)

    def getStatus(self):
        """Return ``(r, p, e, S, l_edge, l_cp)`` with the arrays wrapped as tensors.

        The tensors come from ``torch.from_numpy`` and therefore share memory
        with the underlying arrays. ``r`` and ``e`` are modified in place by
        ``updateEnv`` / ``updateEgdeCache``, so tensors returned earlier reflect
        later updates; ``p`` is rebound on every ``updateEnv``, so an earlier
        ``p`` tensor is effectively a snapshot.
        NOTE(review): callers that keep a "previous" ``r`` to compare against
        the current one will see identical values - confirm this is intended.
        """
        return (torch.from_numpy(self.r),
                torch.from_numpy(self.p),
                torch.from_numpy(self.e),
                torch.from_numpy(self.S),
                self.l_edge,
                self.l_cp)

    def reset(self):
        """Re-create all state arrays and empty the edge cache."""
        self.r = np.zeros(shape=(self.userNum, self.contentNum), dtype=int)
        self.p = np.full(shape=self.contentNum, fill_value=1 / self.userNum)
        self.e = np.zeros(shape=self.contentNum)
        self.S = np.ones(shape=self.contentNum, dtype=int)
        self.l_edge = 0.1
        self.l_cp = 1
        self.B = np.full(shape=self.userNum, fill_value=10, dtype=int)
        self.pipe = collections.OrderedDict()


In [7]:
# Each neural network is trained separately, with its own reward.
class UE(object):
    """Per-user agent: tracks one user's view history and picks caching actions.

    Args:
        u: index of the user in the environment arrays.
        env: environment exposing ``contentNum``, ``userNum``, ``B`` and
            ``getStatus()`` (and ``updateEnv`` for ``selectAction``).
        rewardPara: dict with the reward weights under the keys
            ``'alpha'``, ``'betao'`` and ``'betal'``.
    """

    def __init__(self, u, env, rewardPara):
        self.u = u

        # W: viewed content ids in request order; v: 0/1 flags of viewed contents.
        self.W = []
        self.v = torch.zeros(size=(env.contentNum,), dtype=int)

        # Bu: number of contents this user may select per action.
        self.Bu = int(env.B[self.u])
        self.contentNum = env.contentNum
        self.userNum = env.userNum

        self.r, self.p, self.e, self.S, self.l_edge, self.l_cp = env.getStatus()

        self.action = torch.zeros(size=(env.contentNum,), dtype=int)
        self.lastAction = self.action

        self.reward = 0
        self.ALPHAh = rewardPara['alpha']
        self.BETAo = rewardPara['betao']
        self.BETAl = rewardPara['betal']
        # Epsilon-greedy schedule, advanced once per selectAction call.
        self.EPS_START = 0.9
        self.EPS_END = 0.1
        self.EPS_DECAY = 10

        self.t = 0

        self.statusFeature = self.statusEmbedding()

    def updateViewContent(self, i):
        """Record that content ``i`` was just requested by this user."""
        self.W.append(i)
        self.v[i] = 1

    def statusEmbedding(self):
        """Stack v, r[u], p, e and S into a (contentNum, 5) float tensor on ``device``.

        The values are copied into a fresh tensor, so the result is a snapshot.
        """
        # Use the size stored on the instance; the original read the notebook's
        # global `env`, which ties the class to one particular global variable.
        statusFeature = torch.zeros(size=(5, self.contentNum)).to(device)

        statusFeature[0] = self.v
        statusFeature[1] = self.r[self.u]
        statusFeature[2] = self.p
        statusFeature[3] = self.e
        statusFeature[4] = self.S

        return statusFeature.T

    def getReward(self, lastru, lastp, ru, p, i, action, S, Bu, l_edge, l_cp, e):
        """Compute the per-content reward vector for the latest request ``i``.

        Every content gets the ``ALPHAh``-weighted term ``Rh``; the entry of the
        requested content ``i`` additionally gets ``Ro + Rl``.

        Args:
            lastru, lastp: previous request row of this user and previous p.
            ru, p: current request row of this user and current p.
            i: id of the content just requested.
            action: 0/1 vector of the action in effect when ``i`` was requested.
            S, e: per-content vectors from the environment.
            Bu: this user's action budget.
            l_edge, l_cp: scalars from the environment.

        Returns:
            Tensor with one reward per content.
        """
        Rh = - self.ALPHAh * (torch.log(lastru * lastp + (1 - lastru) * (1 - lastp))
                              - torch.log(ru * p + (1 - ru) * (1 - p)))

        # Blend of l_edge and l_cp selected by whether i is in the edge cache.
        blended = e[i] * l_edge + (1 - e[i]) * l_cp

        # The weights are stored as BETAo / BETAl in __init__; the original
        # read the never-defined ALPHAo / ALPHAl and raised AttributeError.
        Ro = self.BETAo * action[i] * (S[i] / Bu + blended / S[i])

        Rl = self.BETAl * ((1 - action[i]) * (l_cp - blended)) / S[i]

        Rh[i] = Rh[i] + Ro + Rl

        return Rh

    def selectAction(self, env, uit, QNetwork, train, memory):
        """Process one trace row: score the previous action, then choose a new one.

        Args:
            env: environment to read the status from and to update.
            uit: trace row; ``uit[1]`` is the requested content id and
                ``uit[2]`` is forwarded to ``env.updateEnv``.
            QNetwork: network scoring ``statusFeature``; columns 0 and 1 of its
                output are read.
            train: truthy to store the transition and explore epsilon-greedily.
            memory: replay buffer with ``push`` (only used when ``train``).

        Returns:
            The new 0/1 action tensor (also stored on ``self.action``).
        """
        # Keep the previous step's view of the world for the reward / transition.
        self.lastStatusFeature = self.statusFeature
        self.lastAction = self.action
        self.lastp = self.p
        # NOTE(review): ENV.getStatus wraps env.r with torch.from_numpy, so this
        # "previous" r shares memory with the current one - confirm intended.
        self.lastr = self.r

        self.updateViewContent(uit[1])
        self.r, self.p, self.e, self.S, self.l_edge, self.l_cp = env.getStatus()
        self.statusFeature = self.statusEmbedding()

        # self.action still holds the previous action at this point.
        self.reward = self.getReward(self.lastr[self.u], self.lastp, self.r[self.u], self.p,
                                     self.W[-1], self.action, self.S, self.Bu,
                                     self.l_edge, self.l_cp, self.e)

        if train:
            # Two columns per content: column 0 = "not selected", column 1 = "selected".
            lastAction = torch.cat(((1 - self.lastAction).unsqueeze(1),
                                    self.lastAction.unsqueeze(1)), 1)
            memory.push(self.lastStatusFeature,
                        lastAction.to(device),
                        self.statusFeature,
                        self.reward.float().to(device),
                        torch.tensor([self.W[-1]]).to(device))

        sample = random.random()
        eps_threshold = self.EPS_END + (self.EPS_START - self.EPS_END) * np.exp(-1. * self.t / self.EPS_DECAY)
        self.t = self.t + 1

        # The content just requested is always part of the new action.
        self.action = torch.zeros(size=(env.contentNum,), dtype=int)
        self.action[self.W[-1]] = 1
        if not train or sample > eps_threshold:
            QNetwork.eval()
            with torch.no_grad():
                Q_value = QNetwork(self.statusFeature)
                # NOTE(review): argsort is ascending, so this takes the Bu
                # contents with the SMALLEST Q[:,1]-Q[:,0] - confirm that the
                # largest advantage was not intended.
                actionIndex = list((Q_value[:, 1] - Q_value[:, 0]).argsort()[0:self.Bu])
            QNetwork.train()
        else:
            actionIndex = list(torch.randint(0, self.contentNum, (self.Bu,)))
        # The requested content already uses one slot; drop a candidate so the
        # budget is not exceeded.
        if self.W[-1] not in actionIndex:
            actionIndex.pop()
        for index in actionIndex:
            self.action[index] = 1

        env.updateEnv(self.u, self.action.numpy(), uit[2])

        return self.action

In [8]:
# Optimisation hyper-parameters read by optimize_model().
BATCH_SIZE = 256   # transitions sampled from the replay buffer per update
GAMMA = 0.999      # factor applied to the next-state values in the target

# Epsilon-greedy constants and step counter; the visible code does not
# reference them (UE keeps its own schedule on the instance).
EPS_START, EPS_END, EPS_DECAY = 0.9, 0.05, 200
steps_done = 0


def optimize_model():
    """Run one optimisation step of ``policy_net`` on a replay-buffer batch.

    Uses the notebook globals ``memory``, ``policy_net``, ``target_net``,
    ``optimizer``, ``contentNum``, ``BATCH_SIZE`` and ``GAMMA``.

    Returns:
        ``0`` when the buffer holds fewer than ``BATCH_SIZE`` transitions;
        otherwise the Huber loss of the step as a detached tensor, so that
        callers can accumulate it without keeping autograd history alive.
    """
    if len(memory) < BATCH_SIZE:
        return 0
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch: a list of Transitions becomes one Transition whose
    # fields are tuples of per-sample tensors.
    batch = Transition(*zip(*transitions))

    # Concatenate along dim 0: each per-sample tensor is stacked one after
    # another, so row blocks of the results correspond to samples.
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)
    content_batch = torch.cat(batch.content)
    next_state_batch = torch.cat(batch.next_state)

    # Q(s_t, a): for every content keep the Q column of the choice actually
    # taken (the stored actions are one-hot over the two columns).
    taken_mask = action_batch.ge(0.5)
    state_action_values = torch.masked_select(policy_net(state_batch), taken_mask)

    def getNextStatusQ(s_batch, c_batch):
        """Q-values of the action the target network would pick in each next state."""
        with torch.no_grad():
            # One (contentNum, 2) block of Q-values per sample.
            Q_value = torch.stack(target_net(s_batch).chunk(BATCH_SIZE, dim=0))
            c = c_batch.chunk(BATCH_SIZE, dim=0)
            action = torch.zeros(size=(BATCH_SIZE, contentNum, 2), dtype=int).to(device)
            for b in range(BATCH_SIZE):
                # Same selection rule as UE.selectAction, with a fixed budget of 10.
                # NOTE(review): argsort is ascending, so this takes the 10
                # smallest Q[:,1]-Q[:,0] - confirm largest was not intended.
                Q_value_sortindex = list((Q_value[b, :, 1] - Q_value[b, :, 0]).argsort()[0:10])
                i = c[b].squeeze()
                # The requested content is always selected; make room for it.
                if i not in Q_value_sortindex:
                    Q_value_sortindex.pop()
                action[b, i, 1] = 1
                for index in Q_value_sortindex:
                    action[b, index, 1] = 1
            action[:, :, 0] = 1 - action[:, :, 1]
            action_mask = action.ge(0.5).to(device)
            return torch.masked_select(Q_value, action_mask).float()

    next_state_values = getNextStatusQ(next_state_batch, content_batch)

    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values.unsqueeze(1), expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # Clamp every gradient element to [-1, 1]; unlike a manual loop over
    # param.grad this skips parameters that received no gradient.
    nn.utils.clip_grad_value_(policy_net.parameters(), clip_value=1)
    optimizer.step()

    # Detach so that `loss += optimize_model()` in the training loop does not
    # chain autograd graphs across steps.
    return loss.detach()

In [9]:
# Policy and target networks are built with the same arguments; the target
# starts as an exact copy of the policy network and stays in eval mode.
policy_net = DQN(5, 2).to(device)
target_net = DQN(5, 2).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Adam with its default settings; only the policy network is optimised.
optimizer = optim.Adam(policy_net.parameters())

# Replay buffer holding at most 10000 transitions.
memory = ReplayMemory(10000)

# Prefix of the checkpoint file names written by the training loop.
MODELPATH = './model_dict/lstm1_ep'

# Training schedule and reward weights.
num_episodes, TARGET_UPDATE = 20, 2
bestReward = 0
rewardPara = {"alpha": 1, "betao": 0.5, "betal": 0.5}

env = ENV(userNum, contentNum)

# Per-episode bookkeeping (re-initialised at the end of every episode).
UEs = {}
sumReward = loss = edgeHit = 0
UEHit = np.zeros(userNum)


# Training loop: every episode replays the whole training trace once, then
# optionally checkpoints / syncs the target network and resets all state.
for i_episode in range(num_episodes):
    # The environment and the per-episode counters are fresh here: they are
    # created before the loop and re-created at the end of each episode.

    # `index` is the DataFrame index label of the row, not a running counter.
    # NOTE(review): the averages below divide by index+1, which assumes the
    # labels of trainUIT run 0..N-1 - confirm for the filtered frame.
    for index,trace in trainUIT.iterrows():
        uit = trace.to_numpy()
        # uit[0] identifies the user; agents are created lazily on first request.
        if uit[0] not in UEs:
            UEs[uit[0]] = UE(uit[0],env,rewardPara)

        ue = UEs[uit[0]]
        
        # Hit accounting for the requested content uit[1]: first the user's own
        # selection, otherwise the shared edge cache.
        # NOTE(review): UE.selectAction copies `action` into `lastAction` only
        # at its start, so here lastAction is one step older than ue.action -
        # confirm which of the two was meant.
        actionIndex = np.argwhere(ue.lastAction)
        if uit[1] in actionIndex:
            UEHit[uit[0]] += 1
        elif uit[1] in env.pipe.keys():
            edgeHit += 1

        # train=1: the transition is pushed to `memory` and exploration is on.
        ue.selectAction(env,uit,policy_net,1,memory)

        sumReward += ue.reward.sum()
        
        # Perform one step of the optimization (on the policy network)
        # every 1000 trace rows.
        if index % 1000 == 0:
            loss += optimize_model()
        
        # Progress report every 10000 trace rows.
        if index % 10000 == 0:
            print("Time:",time.asctime( time.localtime(time.time())),"--Episode:",i_episode,"  Index:",index,"  Loss:",float(loss/(index+1)),"  Reward:",sumReward/(index+1),)
            print("UEHitrate:",UEHit.sum()/(index+1)," edgeHitrate",edgeHit/(index+1),"sumHitrate",(edgeHit+UEHit.sum())/(index+1))
            print()
    # End-of-episode summary; relies on `index` still holding the last row label.
    print("----------------------------------------------------------------")
    print("Time:",time.asctime( time.localtime(time.time())),"--End episode:",i_episode,"  Loss:",loss/(index+1),"  Reward:",sumReward/(index+1))
    print("UEHitrate:",UEHit.sum()/(index+1)," edgeHitrate",edgeHit/(index+1),"sumHitrate",(edgeHit+UEHit.sum())/(index+1))
    print("----------------------------------------------------------------")
    print()
    
    # Update the target network, copying all weights and biases in DQN
    # (only on every TARGET_UPDATE-th episode).
    if i_episode % TARGET_UPDATE == 0: 
        # Checkpoint when this episode's total reward beats the best so far.
        # NOTE(review): bestReward starts at 0, so nothing is saved while the
        # total reward is non-positive; the target directory must already exist.
        if sumReward > bestReward:
            bestLoss = loss
            bestReward = sumReward
            bestUEHit = UEHit
            bestEdgeHit = edgeHit
            torch.save(policy_net.state_dict(),MODELPATH+"{}_".format(i_episode)+time.strftime("%m%d-%H-%M-%S",time.localtime(time.time())))

        target_net.load_state_dict(policy_net.state_dict())

    # Start the next episode from a clean environment, new agents and zeroed
    # counters (the replay buffer and the networks are kept).
    env.reset()
    UEs = {}
    sumReward = 0
    loss = 0
    UEHit = np.zeros(userNum)
    edgeHit = 0

    

Time: Sat Sep 11 14:49:58 2021 --Episode: 0   Index: 0   Loss: 0.0   Reward: tensor(0., dtype=torch.float64)
UEHitrate: 0.0  edgeHitrate 0.0 sumHitrate 0.0

Time: Sat Sep 11 14:50:46 2021 --Episode: 0   Index: 10000   Loss: 1.1372811059118249e-05   Reward: tensor(0.0654, dtype=torch.float64)
UEHitrate: 0.0098990100989901  edgeHitrate 0.1421857814218578 sumHitrate 0.15208479152084792

Time: Sat Sep 11 14:51:35 2021 --Episode: 0   Index: 20000   Loss: 8.50613560032798e-06   Reward: tensor(0.0982, dtype=torch.float64)
UEHitrate: 0.01239938003099845  edgeHitrate 0.2136393180340983 sumHitrate 0.22603869806509674

Time: Sat Sep 11 14:52:25 2021 --Episode: 0   Index: 30000   Loss: 6.514484539366094e-06   Reward: tensor(0.1200, dtype=torch.float64)
UEHitrate: 0.014599513349555015  edgeHitrate 0.2619579347355088 sumHitrate 0.27655744808506383

Time: Sat Sep 11 14:53:17 2021 --Episode: 0   Index: 40000   Loss: 5.192690423427848e-06   Reward: tensor(0.1354, dtype=torch.float64)
UEHitrate: 0.01592