In [1]:
import gym
import math
import random
import numpy as np
import pandas as pd

from collections import namedtuple, deque
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
import time

import collections
import copy

#env = gym.make('CartPole-v0').unwrapped

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# if gpu is to be used
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=1)

In [2]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward','content'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([],maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [3]:
class DQN(nn.Module):

    def __init__(self,inputs,outputs):
        super(DQN, self).__init__()
        self.conn1 = nn.Linear(inputs,32)
        self.conn2 = nn.Linear(32,128)
        self.conn3 = nn.Linear(128,16)
        self.conn4 = nn.Linear(16, outputs)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        x = x.float().to(device)
        x = F.relu(self.conn1(x))
        x = F.relu(self.conn2(x))
        x = F.relu(self.conn3(x))
        return self.conn4(x)

    

In [4]:
data_path = '/home/ubuntu/data/dataset/R3009_U5_V100/'
UIT = pd.read_csv(data_path + 'UIT.csv')
UIT

Unnamed: 0,u,i,day,time,viewtime,video_type,video_format,city,city_isp,client_ip,conn_type,device_type
0,365,3391,0,0,148,1030,101001,0,0,11807,1,2
1,203,5779,0,0,7,1030,10203,0,0,15068,1,2
2,208,4675,0,0,92,1035,10203,0,0,5375,1,2
3,159,332,0,0,56,1030,10202,0,0,5992,1,2
4,50,674,0,0,439,1030,10203,0,0,3468,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...
300978,483,6831,29,2591880,34,1030,10203,0,0,10010,1,2
300979,158,8448,29,2591880,34,1030,10203,0,0,23340,1,2
300980,483,6463,29,2591940,35,1030,10203,0,0,10010,1,2
300981,158,4715,29,2591940,34,1030,10203,0,0,23340,1,2


In [5]:
trainUIT = UIT[UIT['day']<18]
contentNum = len(UIT.i.drop_duplicates())
userNum = len(UIT.u.drop_duplicates())
contentNum,userNum,trainUIT

(10000,
 500,
           u     i  day     time  viewtime  video_type  video_format  city  \
 0       365  3391    0        0       148        1030        101001     0   
 1       203  5779    0        0         7        1030         10203     0   
 2       208  4675    0        0        92        1035         10203     0   
 3       159   332    0        0        56        1030         10202     0   
 4        50   674    0        0       439        1030         10203     0   
 ...     ...   ...  ...      ...       ...         ...           ...   ...   
 198170  264  7442   17  1555140        90        1035         10203     0   
 198171   19  9362   17  1555140       424        1035         10203     0   
 198172   82  9223   17  1555140        94        1037         10203     0   
 198173   35  4164   17  1555140        22        1030         10203     0   
 198174  239  5062   17  1555140        89        1035         10203     0   
 
         city_isp  client_ip  conn_type  device_

In [6]:
class ENV(object):
    def __init__(self,userNum,contentNum,latency):
        self.userNum = userNum
        self.contentNum =contentNum

        self.r = np.zeros(shape=(userNum,contentNum),dtype=int)
        self.p = np.full(shape=contentNum,fill_value = 1/userNum)
        self.e = np.zeros(shape=contentNum)
        self.S = np.ones(shape=contentNum,dtype=int)
        self.l = np.array(latency)
        

        self.B = np.full(shape=userNum,fill_value=15,dtype=int)

        self.pipe = collections.OrderedDict()


    #有序字典实现LRU
    def updateEgdeCache(self,action,t):
        for i in np.argwhere(action==1).squeeze(-1):
            if i in self.pipe.keys():
                self.pipe.pop(i)
            elif len(self.pipe) >= 500:
                self.e[self.pipe.popitem(last=False)[0]] = 0
            self.pipe[i] = t
            self.e[i] = 1

    
    def updateEnv(self,u,action,t):
        
        p_tmp = ((self.r[u] | action)-self.r[u])*(1/self.userNum) + self.p
        self.p = np.where(p_tmp<1-1/self.userNum,p_tmp,1-1/self.userNum)

        self.r[u] = self.r[u] | action

        self.updateEgdeCache(action,t)

    def getStatus(self):
        return (torch.from_numpy(self.r),
                torch.from_numpy(self.p), 
                torch.from_numpy(self.e),
                torch.from_numpy(self.S),
                torch.from_numpy(self.l))

    #def reset(self):
    #    self.r = np.zeros(shape=(self.userNum,self.contentNum),dtype=int)
    #    self.p = np.full(shape=self.contentNum,fill_value = 1/self.userNum)
    #    self.e = np.zeros(shape=self.contentNum)
    #    self.S = np.ones(shape=self.contentNum,dtype=int)
    #    self.l_edge = 0.1
    #    self.l_cp = 1
    #    self.B = np.full(shape=self.userNum,fill_value=15,dtype=int)
    #    self.pipe = collections.OrderedDict()


In [7]:
class UE(object):
    def __init__(self,u,env,rewardPara):
        self.u = u

        self.W = []
        self.v = torch.zeros(size=(env.contentNum,),dtype=int)

        self.Bu = int(env.B[self.u])
        self.contentNum = env.contentNum
        self.userNum = env.userNum

        self.r , self.p , self.e, self.S, self.l = env.getStatus()

        self.action = torch.zeros(size=(env.contentNum,),dtype=int)
        self.lastAction = self.action

        self.reward = 0
        self.ALPHAh = rewardPara['alpha']
        self.BETAo =  rewardPara['betao']
        self.BETAl =  rewardPara['betal']             
        self.statusFeature = self.statusEmbedding()

    def updateViewContent(self,i):
        self.W.append(i)
        self.v[i] = 1


    def statusEmbedding(self):
        statusFeature = torch.zeros(size=(5,env.contentNum)).to(device)
        
        statusFeature[0] = self.v
        statusFeature[1] = self.r[self.u]
        statusFeature[2] = self.p
        statusFeature[3] = self.e
        statusFeature[4] = self.S

        #statusFeature[5] = status['r']
        return statusFeature.T
    
    def getReward(self,lastru,lastp,ru,p,i,action,S,l,e,v):
        
        #self.Rh =   self.ALPHAh * (torch.log(v * p + (1-v) * (1-p)).sum() / torch.log(ru * p + (1-ru) * (1-p)).sum() )

        #self.Rh =   self.ALPHAh * (torch.log(v * p + (1-v) * (1-p)).sum() - torch.log(ru * p + (1-ru) * (1-p)).sum())
        self.Rh =   self.ALPHAh *( torch.log(lastru * lastp + (1-lastru) * (1-lastp)).sum() - torch.log(ru * p + (1-ru) * (1-p)).sum() )

        self.Ro =   self.BETAo * action[i] * S[i] * ( 1 + e[i] * l[0] + ( 1 - e[i] ) * l[1] )

        self.Rl =   self.BETAl *  ( 1 - action[i] ) * S[i] * e[i] * l[2]

        #self.Rh[i] = self.Rh[i] + self.Ro + self.Rl

        #return  self.Rh.sum()
        return  self.Rh + self.Ro + self.Rl

    def selectAction(self,env,uit,QNetwork,train,memory):

        self.lastStatusFeature = self.statusFeature
        self.lastAction = self.action
        self.lastp = self.p
        self.lastr = self.r

        self.updateViewContent(uit[1])
        self.r , self.p , self.e, self.S, self.l = env.getStatus()
        self.statusFeature = self.statusEmbedding()
        
        self.reward = self.getReward(self.lastr[self.u],self.lastp,self.r[self.u],self.p,self.W[-1],self.lastAction,self.S,self.l,self.e,self.v)
        
        if train: 
            lastAction = torch.cat(((1-self.lastAction).unsqueeze(1),self.lastAction.unsqueeze(1)),1)
            memory.push(self.lastStatusFeature, 
                    lastAction.to(device), 
                    self.statusFeature,
                    torch.tensor([self.reward.float()]).to(device),
                    torch.tensor([self.W[-1]]).to(device))
        
            sample = random.random()
            eps_threshold = EPS_END + (EPS_START - EPS_END) *  np.exp(-1. * agentStep / EPS_DECAY)
        

        if  not train or (train and sample > eps_threshold):
            QNetwork.eval()
            with torch.no_grad():
                Q_value = QNetwork(self.statusFeature)
                actionIndex = list((Q_value[:,1]-Q_value[:,0]).argsort()[0:self.Bu])
            QNetwork.train()
        else:
            actionIndex = list(torch.randint(0,self.contentNum,(self.Bu,)))
            
        self.action = torch.zeros(size=(env.contentNum,),dtype=int)
        self.action[self.W[-1]] = 1
        if self.W[-1] not in actionIndex:
            actionIndex.pop()
        for index in actionIndex:
            self.action[index] = 1

        env.updateEnv(self.u,self.action.numpy(),uit[2])

        return self.action

In [8]:
BATCH_SIZE = 128
GAMMA = 0.99

def optimize_model():
    if len(memory) < BATCH_SIZE:
        #print(len(memory))
        return 0
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    #non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)), device=device, dtype=torch.bool)
    #non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)
    content_batch = torch.cat(batch.content)
    next_state_batch = torch.cat(batch.next_state)
    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    state_action_mask_bacth = action_batch.ge(0.5)
    state_action_values = torch.stack(torch.masked_select(policy_net(state_batch),state_action_mask_bacth).chunk(BATCH_SIZE,dim=0)).sum(dim=1)
    #print(state_action_values,state_action_values.dtype)
    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    #next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    
    def getNextStatusQ(s_batch,c_batch):
        
        Q_value= torch.stack(target_net(s_batch).detach().chunk(BATCH_SIZE,dim=0))
        c = c_batch.chunk(BATCH_SIZE,dim=0)
        action = torch.zeros(size=(BATCH_SIZE,contentNum,2),dtype=int).to(device)
        for b in range(BATCH_SIZE):
            Q_value_sortindex = list((Q_value[b,:,1]-Q_value[b,:,0]).argsort()[0:10])
            i = c[b].squeeze()
            if i not in Q_value_sortindex:
                Q_value_sortindex.pop()
            action[b,i,1] = 1
            for index in Q_value_sortindex:
                action[b,index,1] = 1
        action[:,:,0]=1-action[:,:,1]
        action_mask = action.ge(0.5).to(device)
        next_state_values = torch.stack(torch.masked_select(Q_value,action_mask).chunk(BATCH_SIZE,dim=0)).sum(dim=1).float()

        return next_state_values
    
    next_state_values =  getNextStatusQ(next_state_batch,content_batch)

    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values.unsqueeze(1), expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()

    return loss

In [9]:
MODELPATH =  './model_dict/dnn_h1_reward_'
rewardPara = {"alpha":1,"betao":10,"betal":10}
latency = [0.1,1,0.9]

# Get screen size so that we can initialize layers correctly based on shape
# returned from AI gym. Typical dimensions at this point are close to 3x40x90
# which is the result of a clamped and down-scaled render buffer in get_screen()
#init_screen = get_screen()
#_, _, screen_height, screen_width = init_screen.shape
# Get number of actions from gym action space
#n_actions = env.action_space.n
policy_net = DQN(5, 2).to(device)
target_net = DQN(5, 2).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
optimizer = optim.Adam(policy_net.parameters())
memory = ReplayMemory(10000)
torch.cuda.empty_cache()


In [10]:
num_episodes = 10
TARGET_UPDATE = 1
EPS_START = 0.9
EPS_END = 0.1
EPS_DECAY = trainUIT.shape[0]*5
agentStep = 0


bestReward =  float("-inf")
env = ENV(userNum,contentNum,latency)
UEs = {}
sumReward = np.zeros(3)
loss = 0
UEHit = np.zeros(userNum)
edgeHit = 0


for i_episode in range(num_episodes):
    # Initialize the environment and state
    for index,trace in trainUIT.iterrows():
        uit = trace.to_numpy()
        if uit[0] not in UEs:
            UEs[uit[0]] = UE(uit[0],env,rewardPara)
        ue = UEs[uit[0]]
        actionIndex = np.argwhere(ue.lastAction)
        if uit[1] in actionIndex:
            UEHit[uit[0]] += 1
        elif uit[1] in env.pipe.keys():
            edgeHit += 1
        ue.selectAction(env,uit,policy_net,1,memory)
        agentStep += 1
        sumReward[0] += float(ue.Rh)
        sumReward[1] += float(ue.Ro)
        sumReward[2] += float(ue.Rl)
        if index % 25 == 0:
            # Perform one step of the optimization (on the policy network)
            loss += float(optimize_model())
        if index % 100 == 0 :
            # Update the target network, copying all weights and biases in DQN
            target_net.load_state_dict(policy_net.state_dict())
            if index % 20000 == 0 :
                psi = 0
                p = torch.from_numpy(env.p)
                for u in UEs:
                    psi += torch.log(UEs[u].r[u] * p + (1-UEs[u].r[u]) * (1-p)).sum() / torch.log(UEs[u].v * p + (1-UEs[u].v) * (1-p)).sum()
                print("--Time:",time.asctime( time.localtime(time.time())),"Episode:",i_episode,"  Index:",index,"  Loss:",round(loss/(index+1),5),"--")
                print("Reward:",np.around(sumReward/(index+1),5),"total reward:",round(sumReward.sum()/(index+1),5))
                print("UEHitrate:",round(UEHit.sum()/(index+1),5)," edgeHitrate",round(edgeHit/(index+1),5),"sumHitrate",round((edgeHit+UEHit.sum())/(index+1),5)," privacy:",round(float(psi)/len(UEs),5))
                print()
    psi = 0
    p = torch.from_numpy(env.p)
    for u in UEs:
        psi += torch.log(UEs[u].r[u] * p + (1-UEs[u].r[u]) * (1-p)).sum() / torch.log(UEs[u].v * p + (1-UEs[u].v) * (1-p)).sum()
    print()
    print("----------------------------------------------------------------")
    print("--Time:",time.asctime( time.localtime(time.time())),"Episode:",i_episode,"  Index:",index,"  Loss:",round(loss/(index+1),5),"--")
    print("Reward:",np.around(sumReward/(index+1),5),"total reward:",round(sumReward.sum()/(index+1),5))
    print("UEHitrate:",round(UEHit.sum()/(index+1),5)," edgeHitrate",round(edgeHit/(index+1),5),"sumHitrate",round((edgeHit+UEHit.sum())/(index+1),5)," privacy:",round(float(psi)/len(UEs),5))
    print("----------------------------------------------------------------")
    # Update the target network, copying all weights and biases in DQN
    target_net.load_state_dict(policy_net.state_dict())
    if sumReward.sum() > bestReward:
        bestReward = sumReward.sum()
        bestLoss = loss
        bestUEHit = UEHit
        bestEdgeHit = edgeHit
        bestEpisode = i_episode
        bestPath = MODELPATH+'ep{}_'.format(bestEpisode)+time.strftime("%m%d-%H-%M-%S",time.localtime(time.time()))
        torch.save(policy_net.state_dict(),bestPath)
        print(bestPath)
    print()
    env = ENV(userNum,contentNum,latency)
    UEs = {}
    sumReward = sumReward = np.zeros(3)
    loss = 0
    UEHit = np.zeros(userNum)
    edgeHit = 0

--Time: Mon Sep 20 11:18:09 2021 Episode: 0   Index: 0   Loss: 0.0 --
Reward: [0. 0. 0.] total reward: 0.0
UEHitrate: 0.0  edgeHitrate 0.0 sumHitrate 0.0  privacy: 4.02118

--Time: Mon Sep 20 11:20:39 2021 Episode: 0   Index: 20000   Loss: 0.48118 --
Reward: [-4.71013  0.08715  1.01425] total reward: -3.60874
UEHitrate: 0.00315  edgeHitrate 0.11314 sumHitrate 0.11629  privacy: 3.21989

--Time: Mon Sep 20 11:23:11 2021 Episode: 0   Index: 40000   Loss: 0.53658 --
Reward: [-3.65692  0.0966   1.04577] total reward: -2.51454
UEHitrate: 0.0043  edgeHitrate 0.1163 sumHitrate 0.1206  privacy: 2.6674

--Time: Mon Sep 20 11:25:32 2021 Episode: 0   Index: 60000   Loss: 0.54554 --
Reward: [-3.03497  0.10378  0.95593] total reward: -1.97525
UEHitrate: 0.00457  edgeHitrate 0.10636 sumHitrate 0.11093  privacy: 2.33183

--Time: Mon Sep 20 11:27:56 2021 Episode: 0   Index: 80000   Loss: 0.54169 --
Reward: [-2.61399  0.10345  0.89403] total reward: -1.61652
UEHitrate: 0.00461  edgeHitrate 0.09949 sumHi

In [11]:
#test

policy_net = DQN(5, 2).to(device)
target_net = DQN(5, 2).to(device)
policy_net.load_state_dict(torch.load(bestPath))
policy_net.eval()
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
torch.cuda.empty_cache()


bestReward =  float("-inf")
env = ENV(userNum,contentNum,latency)
UEs = {}
sumReward = np.zeros(3)
loss = 0
UEHit = np.zeros(userNum)
edgeHit = 0

sumHitrate = np.zeros(UIT.shape[0] // 10000 +1)
UEHitrate = np.zeros(UIT.shape[0] // 10000 +1)
edgeHitrate = np.zeros(UIT.shape[0] // 10000 +1)
privacyReduction = np.zeros(UIT.shape[0] // 10000 +1)

for index,trace in UIT.iterrows():
    uit = trace.to_numpy()
    if uit[0] not in UEs:
        UEs[uit[0]] = UE(uit[0],env,rewardPara)
    ue = UEs[uit[0]]
    
    actionIndex = np.argwhere(ue.lastAction)
    if uit[1] in actionIndex:
        UEHit[uit[0]] += 1
    elif uit[1] in env.pipe.keys():
        edgeHit += 1
    ue.selectAction(env,uit,policy_net,0,memory)
    
    sumReward[0] += float(ue.Rh)
    sumReward[1] += float(ue.Ro)
    sumReward[2] += float(ue.Rl)
     
    if index % 10000 == 0 :
        psi = 0
        p = torch.from_numpy(env.p)
        for u in UEs:
            psi += torch.log(UEs[u].r[u] * p + (1-UEs[u].r[u]) * (1-p)).sum() / torch.log(UEs[u].v * p + (1-UEs[u].v) * (1-p)).sum()
        print("--Time:",time.asctime( time.localtime(time.time())),"  Index:",index,"  Loss:",round(loss/(index+1),5),"--")
        print("Reward:",np.around(sumReward/(index+1),5),"total reward:",round(sumReward.sum()/(index+1),5))
        print("UEHitrate:",round(UEHit.sum()/(index+1),5)," edgeHitrate",round(edgeHit/(index+1),5),"sumHitrate",round((edgeHit+UEHit.sum())/(index+1),5)," privacy:",round(float(psi)/len(UEs),5))
        print()
        sumHitrate[int(index // 10000)]   = round((edgeHit+UEHit.sum())/(index+1),5)
        UEHitrate [int(index // 10000)]   = round(UEHit.sum()/(index+1),5)
        edgeHitrate [int(index // 10000)] = round(edgeHit/(index+1),5)
        privacyReduction [int(index // 10000)] = round(float(psi)/len(UEs),5)
        
psi = 0
p = torch.from_numpy(env.p)
for u in UEs:
    psi += torch.log(UEs[u].r[u] * p + (1-UEs[u].r[u]) * (1-p)).sum() / torch.log(UEs[u].v * p + (1-UEs[u].v) * (1-p)).sum()

print()
print("----------------------------------------------------------------")
print("--Time:",time.asctime( time.localtime(time.time())),"  Index:",index,"  Loss:",round(loss/(index+1),5),"--")
print("Reward:",np.around(sumReward/(index+1),5),"total reward:",round(sumReward.sum()/(index+1),5))
print("UEHitrate:",round(UEHit.sum()/(index+1),5)," edgeHitrate",round(edgeHit/(index+1),5),"sumHitrate",round((edgeHit+UEHit.sum())/(index+1),5)," privacy:",round(float(psi)/len(UEs),5))
print("----------------------------------------------------------------")
print()
sumHitrate [int(round(index / 10000,0))]  = round((edgeHit+UEHit.sum())/(index+1),5)
UEHitrate  [int(round(index / 10000,0))]  = round(UEHit.sum()/(index+1),5)
edgeHitrate[int(round(index / 10000,0))]  = round(edgeHit/(index+1),5)
privacyReduction [int(round(index / 10000,0))] = round(float(psi)/len(UEs),5)

--Time: Mon Sep 20 16:55:46 2021   Index: 0   Loss: 0.0 --
Reward: [0. 0. 0.] total reward: 0.0
UEHitrate: 0.0  edgeHitrate 0.0 sumHitrate 0.0  privacy: 4.02118

--Time: Mon Sep 20 16:56:32 2021   Index: 10000   Loss: 0.0 --
Reward: [-5.48366  0.06589  0.60384] total reward: -4.81393
UEHitrate: 0.0021  edgeHitrate 0.06749 sumHitrate 0.06959  privacy: 3.83595

--Time: Mon Sep 20 16:57:16 2021   Index: 20000   Loss: 0.0 --
Reward: [-4.86256  0.08015  0.63582] total reward: -4.1466
UEHitrate: 0.0021  edgeHitrate 0.07095 sumHitrate 0.07305  privacy: 3.3345

--Time: Mon Sep 20 16:58:10 2021   Index: 30000   Loss: 0.0 --
Reward: [-4.2262   0.09073  0.59308] total reward: -3.54239
UEHitrate: 0.0026  edgeHitrate 0.0663 sumHitrate 0.0689  privacy: 2.97243

--Time: Mon Sep 20 16:59:05 2021   Index: 40000   Loss: 0.0 --
Reward: [-3.68458  0.0952   0.82303] total reward: -2.76635
UEHitrate: 0.00322  edgeHitrate 0.09185 sumHitrate 0.09507  privacy: 2.77774

--Time: Mon Sep 20 17:00:04 2021   Index:

In [12]:
num_episodes = 10
TARGET_UPDATE = 1
EPS_START = 0.9
EPS_END = 0.1
EPS_DECAY = trainUIT.shape[0]*5
agentStep = 0


bestReward =  float("-inf")
env = ENV(userNum,contentNum,latency)
UEs = {}
sumReward = np.zeros(3)
loss = 0
UEHit = np.zeros(userNum)
edgeHit = 0


for i_episode in range(num_episodes):
    # Initialize the environment and state
    for index,trace in trainUIT.iterrows():
        uit = trace.to_numpy()
        if uit[0] not in UEs:
            UEs[uit[0]] = UE(uit[0],env,rewardPara)
        ue = UEs[uit[0]]
        actionIndex = np.argwhere(ue.lastAction)
        if uit[1] in actionIndex:
            UEHit[uit[0]] += 1
        elif uit[1] in env.pipe.keys():
            edgeHit += 1
        ue.selectAction(env,uit,policy_net,1,memory)
        agentStep += 1
        sumReward[0] += float(ue.Rh)
        sumReward[1] += float(ue.Ro)
        sumReward[2] += float(ue.Rl)
        if index % 64 == 0:
            # Perform one step of the optimization (on the policy network)
            loss += float(optimize_model())
        if index % 10000 == 0 :
            # Update the target network, copying all weights and biases in DQN
            target_net.load_state_dict(policy_net.state_dict())
            if index % 20000 == 0 :
                psi = 0
                p = torch.from_numpy(env.p)
                for u in UEs:
                    psi += torch.log(UEs[u].r[u] * p + (1-UEs[u].r[u]) * (1-p)).sum() / torch.log(UEs[u].v * p + (1-UEs[u].v) * (1-p)).sum()
                print("--Time:",time.asctime( time.localtime(time.time())),"Episode:",i_episode,"  Index:",index,"  Loss:",round(loss/(index+1),5),"--")
                print("Reward:",np.around(sumReward/(index+1),5),"total reward:",round(sumReward.sum()/(index+1),5))
                print("UEHitrate:",round(UEHit.sum()/(index+1),5)," edgeHitrate",round(edgeHit/(index+1),5),"sumHitrate",round((edgeHit+UEHit.sum())/(index+1),5)," privacy:",round(float(psi)/len(UEs),5))
                print()
    psi = 0
    p = torch.from_numpy(env.p)
    for u in UEs:
        psi += torch.log(UEs[u].r[u] * p + (1-UEs[u].r[u]) * (1-p)).sum() / torch.log(UEs[u].v * p + (1-UEs[u].v) * (1-p)).sum()
    print()
    print("----------------------------------------------------------------")
    print("--Time:",time.asctime( time.localtime(time.time())),"Episode:",i_episode,"  Index:",index,"  Loss:",round(loss/(index+1),5),"--")
    print("Reward:",np.around(sumReward/(index+1),5),"total reward:",round(sumReward.sum()/(index+1),5))
    print("UEHitrate:",round(UEHit.sum()/(index+1),5)," edgeHitrate",round(edgeHit/(index+1),5),"sumHitrate",round((edgeHit+UEHit.sum())/(index+1),5)," privacy:",round(float(psi)/len(UEs),5))
    print("----------------------------------------------------------------")
    # Update the target network, copying all weights and biases in DQN
    target_net.load_state_dict(policy_net.state_dict())
    if sumReward.sum() > bestReward:
        bestReward = sumReward.sum()
        bestLoss = loss
        bestUEHit = UEHit
        bestEdgeHit = edgeHit
        bestEpisode = i_episode
        bestPath = MODELPATH+'ep{}_'.format(bestEpisode)+time.strftime("%m%d-%H-%M-%S",time.localtime(time.time()))
        torch.save(policy_net.state_dict(),bestPath)
        print(bestPath)
    print()
    env = ENV(userNum,contentNum,latency)
    UEs = {}
    sumReward = sumReward = np.zeros(3)
    loss = 0
    UEHit = np.zeros(userNum)
    edgeHit = 0

--Time: Mon Sep 20 17:23:19 2021 Episode: 0   Index: 0   Loss: 7.98768 --
Reward: [0. 0. 0.] total reward: 0.0
UEHitrate: 0.0  edgeHitrate 0.0 sumHitrate 0.0  privacy: 4.02118

--Time: Mon Sep 20 17:25:31 2021 Episode: 0   Index: 20000   Loss: 0.31862 --
Reward: [-4.59599  0.0901   0.79646] total reward: -3.70944
UEHitrate: 0.00205  edgeHitrate 0.0892 sumHitrate 0.09125  privacy: 3.39814

--Time: Mon Sep 20 17:27:38 2021 Episode: 0   Index: 40000   Loss: 0.333 --
Reward: [-3.51379  0.10105  0.9272 ] total reward: -2.48554
UEHitrate: 0.00335  edgeHitrate 0.10365 sumHitrate 0.107  privacy: 2.81475

--Time: Mon Sep 20 17:29:43 2021 Episode: 0   Index: 60000   Loss: 0.31758 --
Reward: [-2.85226  0.11405  1.17943] total reward: -1.55878
UEHitrate: 0.00443  edgeHitrate 0.1317 sumHitrate 0.13613  privacy: 2.4606

--Time: Mon Sep 20 17:31:51 2021 Episode: 0   Index: 80000   Loss: 0.30044 --
Reward: [-2.42193  0.11396  1.31578] total reward: -0.99219
UEHitrate: 0.00464  edgeHitrate 0.14709 sumH

KeyboardInterrupt: 

In [None]:
#test

policy_net = DQN(5, 2).to(device)
target_net = DQN(5, 2).to(device)
policy_net.load_state_dict(torch.load(bestPath))
policy_net.eval()
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
torch.cuda.empty_cache()


bestReward =  float("-inf")
env = ENV(userNum,contentNum,latency)
UEs = {}
sumReward = np.zeros(3)
loss = 0
UEHit = np.zeros(userNum)
edgeHit = 0

sumHitrate = np.zeros(UIT.shape[0] // 10000 +1)
UEHitrate = np.zeros(UIT.shape[0] // 10000 +1)
edgeHitrate = np.zeros(UIT.shape[0] // 10000 +1)
privacyReduction = np.zeros(UIT.shape[0] // 10000 +1)

for index,trace in UIT.iterrows():
    uit = trace.to_numpy()
    if uit[0] not in UEs:
        UEs[uit[0]] = UE(uit[0],env,rewardPara)
    ue = UEs[uit[0]]
    
    actionIndex = np.argwhere(ue.lastAction)
    if uit[1] in actionIndex:
        UEHit[uit[0]] += 1
    elif uit[1] in env.pipe.keys():
        edgeHit += 1
    ue.selectAction(env,uit,policy_net,0,memory)
    
    sumReward[0] += float(ue.Rh)
    sumReward[1] += float(ue.Ro)
    sumReward[2] += float(ue.Rl)
     
    if index % 10000 == 0 :
        psi = 0
        p = torch.from_numpy(env.p)
        for u in UEs:
            psi += torch.log(UEs[u].r[u] * p + (1-UEs[u].r[u]) * (1-p)).sum() / torch.log(UEs[u].v * p + (1-UEs[u].v) * (1-p)).sum()
        print("--Time:",time.asctime( time.localtime(time.time())),"  Index:",index,"  Loss:",round(loss/(index+1),5),"--")
        print("Reward:",np.around(sumReward/(index+1),5),"total reward:",round(sumReward.sum()/(index+1),5))
        print("UEHitrate:",round(UEHit.sum()/(index+1),5)," edgeHitrate",round(edgeHit/(index+1),5),"sumHitrate",round((edgeHit+UEHit.sum())/(index+1),5)," privacy:",round(float(psi)/len(UEs),5))
        print()
        sumHitrate[int(index // 10000)]   = round((edgeHit+UEHit.sum())/(index+1),5)
        UEHitrate [int(index // 10000)]   = round(UEHit.sum()/(index+1),5)
        edgeHitrate [int(index // 10000)] = round(edgeHit/(index+1),5)
        privacyReduction [int(index // 10000)] = round(float(psi)/len(UEs),5)
        
psi = 0
p = torch.from_numpy(env.p)
for u in UEs:
    psi += torch.log(UEs[u].r[u] * p + (1-UEs[u].r[u]) * (1-p)).sum() / torch.log(UEs[u].v * p + (1-UEs[u].v) * (1-p)).sum()

print()
print("----------------------------------------------------------------")
print("--Time:",time.asctime( time.localtime(time.time())),"  Index:",index,"  Loss:",round(loss/(index+1),5),"--")
print("Reward:",np.around(sumReward/(index+1),5),"total reward:",round(sumReward.sum()/(index+1),5))
print("UEHitrate:",round(UEHit.sum()/(index+1),5)," edgeHitrate",round(edgeHit/(index+1),5),"sumHitrate",round((edgeHit+UEHit.sum())/(index+1),5)," privacy:",round(float(psi)/len(UEs),5))
print("----------------------------------------------------------------")
print()
sumHitrate [int(round(index / 10000,0))]  = round((edgeHit+UEHit.sum())/(index+1),5)
UEHitrate  [int(round(index / 10000,0))]  = round(UEHit.sum()/(index+1),5)
edgeHitrate[int(round(index / 10000,0))]  = round(edgeHit/(index+1),5)
privacyReduction [int(round(index / 10000,0))] = round(float(psi)/len(UEs),5)