# Initialization

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import numpy as np
from copy import deepcopy
import copy
import pickle

In [2]:
import argparse
import torch
import numpy as np
import os
import datetime
import torch.nn as nn
import torchvision
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim
from torch.nn import TransformerEncoder
from torch.nn import TransformerEncoderLayer
from torch.nn import Module, Parameter
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, random_split
from tqdm import tqdm
import time
import math

from collections import OrderedDict
from typing import List, Tuple, Union
import matplotlib.pyplot as plt

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Training on {DEVICE}")

Training on cuda:0


In [3]:
import random
SEED = 42

torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [4]:
class Options:
    def __init__(self):
      self.dataset = 'diginetica'
      self.batchSize = 32
      self.hiddenSize = 200
      self.nhead = 2
      self.layer = 3
      self.feedforward = 4
      self.epoch = 12
      self.lr = 0.001
      self.lr_dc = 0.1
      self.lr_dc_step = 3
      self.l2 = 1e-5
      self.patience = 12

opt = Options()

# Now you can access parameters like this:
print(opt.dataset)

diginetica


In [5]:
train_data = []
valid_data = []
list_valid_users = []

for i in range(45):
  if os.path.isfile(f'./SR_SAN_Diginetica/train_{i}.txt'):
    print(i)
    list_valid_users.append(i)
    tr_data = pickle.load(open(f'./SR_SAN_Diginetica/train_{i}.txt', 'rb'))
    ts_data = pickle.load(open(f'./SR_SAN_Diginetica/test_{i}.txt', 'rb'))

    train_data.append(tr_data)
    valid_data.append(ts_data)

print(f"len train: {len(train_data)}")
print(f"len validation: {len(valid_data)}")

0
3
7
8
9
16
21
23
24
25
28
31
32
35
36
37
39
41
42
43
44
len train: 21
len validation: 21


In [6]:
list_valid_users

[0, 3, 7, 8, 9, 16, 21, 23, 24, 25, 28, 31, 32, 35, 36, 37, 39, 41, 42, 43, 44]

In [7]:
# def data_masks(all_usr_pois, item_tail):
#   us_lens = [len(upois) for upois in all_usr_pois]
#   len_max = max(us_lens)
#   us_pois = [upois + item_tail * (len_max - le) for upois, le in zip(all_usr_pois, us_lens)]
#   us_msks = [[1] * le + [0] * (len_max - le) for le in us_lens]
#   return us_pois, us_msks, len_max

In [8]:
def data_masks(all_usr_pois, item_tail):
    if not all_usr_pois or all(len(upois) == 0 for upois in all_usr_pois):
        raise ValueError("Input all_usr_pois is empty or contains only empty lists")

    us_lens = [len(upois) for upois in all_usr_pois]
    len_max = max(us_lens)
    us_pois = [upois + item_tail * (len_max - le) for upois, le in zip(all_usr_pois, us_lens)]
    us_msks = [[1] * le + [0] * (len_max - le) for le in us_lens]
    return us_pois, us_msks, len_max

In [9]:
class Data():
    def __init__(self, data, shuffle=False, graph=None):
      inputs = data[0]
      inputs, mask, len_max = data_masks(inputs, [0])
      self.inputs = np.asarray(inputs)
      self.mask = np.asarray(mask)
      self.len_max = len_max
      self.targets = np.asarray(data[1])
      self.length = len(inputs)
      self.shuffle = shuffle
      self.graph = graph

    def generate_batch(self, batch_size):
      if self.shuffle:
        shuffled_arg = np.arange(self.length)
        np.random.shuffle(shuffled_arg)
        self.inputs = self.inputs[shuffled_arg]
        self.mask = self.mask[shuffled_arg]
        self.targets = self.targets[shuffled_arg]
      n_batch = int(self.length / batch_size)
      if self.length % batch_size != 0:
        n_batch += 1
      slices = np.split(np.arange(n_batch * batch_size), n_batch)
      slices[-1] = slices[-1][:(self.length - batch_size * (n_batch - 1))]
      return slices

    def get_slice(self, i):
      inputs, mask, targets = self.inputs[i], self.mask[i], self.targets[i]
      items, n_node, A, alias_inputs = [], [], [], []
      for u_input in inputs:
        n_node.append(len(np.unique(u_input)))
      max_n_node = np.max(n_node)
      for u_input in inputs:
        node = np.unique(u_input)
        items.append(node.tolist() + (max_n_node - len(node)) * [0])
        u_A = np.zeros((max_n_node, max_n_node))
        for i in np.arange(len(u_input) - 1):
          if u_input[i + 1] == 0:
            break
          u = np.where(node == u_input[i])[0][0]
          v = np.where(node == u_input[i + 1])[0][0]
          u_A[u][v] = 1
        u_sum_in = np.sum(u_A, 0)
        u_sum_in[np.where(u_sum_in == 0)] = 1
        u_A_in = np.divide(u_A, u_sum_in)
        u_sum_out = np.sum(u_A, 1)
        u_sum_out[np.where(u_sum_out == 0)] = 1
        u_A_out = np.divide(u_A.transpose(), u_sum_out)
        u_A = np.concatenate([u_A_in, u_A_out]).transpose()
        A.append(u_A)
        alias_inputs.append([np.where(node == i)[0][0] for i in u_input])
      return alias_inputs, A, items, mask, targets

    def __len__(self):
      return self.length  # or return len(self.inputs)

In [10]:
class SelfAttentionNetwork(Module):
  def __init__(self, opt, n_node):
    super(SelfAttentionNetwork, self).__init__()
    self.hidden_size = opt.hiddenSize
    self.n_node = n_node
    self.batch_size = opt.batchSize
    self.embedding = nn.Embedding(self.n_node, self.hidden_size)
    self.transformerEncoderLayer = TransformerEncoderLayer(d_model=self.hidden_size, nhead=opt.nhead,dim_feedforward=self.hidden_size * opt.feedforward)
    self.transformerEncoder = TransformerEncoder(self.transformerEncoderLayer, opt.layer)
    self.loss_function = nn.CrossEntropyLoss()
    self.optimizer = torch.optim.Adam(self.parameters(), lr=opt.lr, weight_decay=opt.l2)
    self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=opt.lr_dc_step, gamma=opt.lr_dc)
    self.reset_parameters()

  def reset_parameters(self):
    stdv = 1.0 / math.sqrt(self.hidden_size)
    for weight in self.parameters():
      weight.data.uniform_(-stdv, stdv)

  def compute_scores(self, hidden, mask):
    ht = hidden[torch.arange(mask.shape[0]).long(), torch.sum(mask, 1) - 1]  # batch_size x latent_size
    b = self.embedding.weight[1:]  # n_nodes x latent_size
    scores = torch.matmul(ht, b.transpose(1, 0))
    return scores

  def forward(self, inputs, A):
    hidden = self.embedding(inputs)
    hidden = hidden.transpose(0,1).contiguous()
    hidden = self.transformerEncoder(hidden)
    hidden = hidden.transpose(0,1).contiguous()
    return hidden

In [11]:
def trans_to_cuda(variable):
  if torch.cuda.is_available():
    return variable.cuda()
  else:
    return variable

In [12]:
def trans_to_cpu(variable):
  if torch.cuda.is_available():
    return variable.cpu()
  else:
    return variable

In [13]:
def forward(model, i, data):
  alias_inputs, A, items, mask, targets = data.get_slice(i)
  alias_inputs = trans_to_cuda(torch.Tensor(alias_inputs).long())
  items = trans_to_cuda(torch.Tensor(items).long())
  A = trans_to_cuda(torch.Tensor(A).float())
  mask = trans_to_cuda(torch.Tensor(mask).long())
  hidden = model(items, A)
  get = lambda i: hidden[i][alias_inputs[i]]
  seq_hidden = torch.stack([get(i) for i in torch.arange(len(alias_inputs)).long()])
  return targets, model.compute_scores(seq_hidden, mask)

In [14]:
def calculate_recall_at_k(scores, targets, k=5):
    recall_at_k = []
    top_k_scores = scores.topk(k)[1]
    for score, target in zip(top_k_scores, targets):
        recall_at_k.append((target - 1) in score)
    return np.mean(recall_at_k)

## Test

In [15]:
def test(model, test_data, device):
    print('start predicting: ', datetime.datetime.now())
    model.to(device)
    model.eval()
    hit, mrr, total_loss = [], [], 0.0
    recall_list = []
    slices = test_data.generate_batch(model.batch_size)

    for i in slices:
        targets, scores = forward(model, i, test_data)
        targets = torch.Tensor(targets).long().to(device)  # Convert targets to a PyTorch tensor and move to the correct device

        loss = model.loss_function(scores, targets - 1)
        total_loss += loss.item()

        sub_scores = scores.topk(5)[1] #top-k items
        sub_scores = sub_scores.cpu().detach().numpy()  # Move to CPU if necessary

        targets_np = targets.cpu().numpy()  # Convert targets to NumPy array for use with NumPy functions
        for score, target_np, mask in zip(sub_scores, targets_np, test_data.mask):
            hit.append(np.isin(target_np - 1, score))
            mrr_index = np.where(score == target_np - 1)[0]
            mrr.append(0 if len(mrr_index) == 0 else 1 / (mrr_index[0] + 1))

        recall = calculate_recall_at_k(scores, targets)
        recall_list.append(recall)

    hit = np.mean(hit) * 100
    mrr = np.mean(mrr) * 100
    recall = np.mean(recall_list) * 100
    average_loss = total_loss / len(slices)

    results = {'recall': recall, 'hit': hit, 'mrr': mrr}

    return average_loss, results

## Train

In [16]:
def train(model, train_data, test_data, epochs, device):
    print('start training: ', datetime.datetime.now())

    model.to(device)
    model.train()

    for epoch in range(epochs):
        total_loss = 0.0
        slices = train_data.generate_batch(model.batch_size)
        num_batches = len(slices)

        for i, j in zip(slices, np.arange(num_batches)):
            model.optimizer.zero_grad()
            targets, scores = forward(model, i, train_data)
            targets = trans_to_cuda(torch.Tensor(targets).long())
            # targets = targets.to(device)  # Ensure targets are on the correct device
            loss = model.loss_function(scores, targets - 1)
            loss.backward()
            for name, param in model.named_parameters():
                if param.grad is None:
                    print(f"Parameter name: {name}")
                    print(f"Parameter shape: {param.shape}")
                    print(f"Gradient: {param.grad}")
                    print("=" * 20)
                    model.optimizer.step()
                    total_loss += loss.item()  # Use .item() to get the scalar value

            if j % int(num_batches / 5 + 1) == 0:
                print(f'[{j}/{num_batches}] Loss: {loss.item():.4f}')

        avg_loss = total_loss / num_batches
        print(f"Epoch: {epoch}, Average Loss: {avg_loss:.4f}")

        # Evaluate the model
        val_loss, val_results = test(model, test_data, device)

        model.scheduler.step()
    
    return val_results

# Solo

In [17]:
hit_list = []
mrr_list = []
recall_list = []

if opt.dataset == 'diginetica':
  n_node = 889
else:
  n_node = 37484

#iterate over all clients
for i in range(len(train_data)):
  print(f"Client {i}")
  train_data_i = Data(train_data[i], shuffle=True)
  test_data_i = Data(valid_data[i], shuffle=False)
  model = trans_to_cuda(SelfAttentionNetwork(opt, n_node))
  train_res = train(model, train_data_i, test_data_i, opt.epoch, DEVICE)
  loss, results = test(model, test_data_i, DEVICE)
  hit_list.append(results['hit'])
  mrr_list.append(results['mrr'])
  recall_list.append(results['recall'])

  #print hit and mrr for each client
  print(f"Hit: {results['hit']:.4f}")
  print(f"MRR: {results['mrr']:.4f}")
  print(f"Recall: {results['recall']:.4f}")

#print average hit and mrr over all clients
print(f"Average Hit: {np.mean(hit_list):.4f}")
print(f"Average MRR: {np.mean(mrr_list):.4f}")
print(f"Average Recall: {np.mean(recall_list):.4f}")

Client 0




start training:  2023-12-10 03:08:07.690211


  A = trans_to_cuda(torch.Tensor(A).float())


Parameter name: transformerEncoderLayer.self_attn.in_proj_weight
Parameter shape: torch.Size([600, 200])
Gradient: None
Parameter name: transformerEncoderLayer.self_attn.in_proj_bias
Parameter shape: torch.Size([600])
Gradient: None
Parameter name: transformerEncoderLayer.self_attn.out_proj.weight
Parameter shape: torch.Size([200, 200])
Gradient: None
Parameter name: transformerEncoderLayer.self_attn.out_proj.bias
Parameter shape: torch.Size([200])
Gradient: None
Parameter name: transformerEncoderLayer.linear1.weight
Parameter shape: torch.Size([800, 200])
Gradient: None
Parameter name: transformerEncoderLayer.linear1.bias
Parameter shape: torch.Size([800])
Gradient: None
Parameter name: transformerEncoderLayer.linear2.weight
Parameter shape: torch.Size([200, 800])
Gradient: None
Parameter name: transformerEncoderLayer.linear2.bias
Parameter shape: torch.Size([200])
Gradient: None
Parameter name: transformerEncoderLayer.norm1.weight
Parameter shape: torch.Size([200])
Gradient: None
Par

# Centralized

In [18]:
# combine all clients train and test data
train_data_all = []
test_data_all = []
for i in range(len(train_data)):
  train_data_all += train_data[i]
  test_data_all += valid_data[i]

train_data_all = Data(train_data_all, shuffle=True)
test_data_all = Data(test_data_all, shuffle=False)

#initiate model
model = trans_to_cuda(SelfAttentionNetwork(opt, n_node))

#train model on all clients
train_res = train(model, train_data_all, test_data_all, opt.epoch, DEVICE)

#test model on all clients
loss, results = test(model, test_data_all, DEVICE)

#print hit and mrr for all clients
print(f"Hit: {results['hit']:.4f}")
print(f"MRR: {results['mrr']:.4f}")
print(f"Recall: {results['recall']:.4f}")

start training:  2023-12-10 03:08:18.205281
Parameter name: transformerEncoderLayer.self_attn.in_proj_weight
Parameter shape: torch.Size([600, 200])
Gradient: None
Parameter name: transformerEncoderLayer.self_attn.in_proj_bias
Parameter shape: torch.Size([600])
Gradient: None
Parameter name: transformerEncoderLayer.self_attn.out_proj.weight
Parameter shape: torch.Size([200, 200])
Gradient: None
Parameter name: transformerEncoderLayer.self_attn.out_proj.bias
Parameter shape: torch.Size([200])
Gradient: None
Parameter name: transformerEncoderLayer.linear1.weight
Parameter shape: torch.Size([800, 200])
Gradient: None
Parameter name: transformerEncoderLayer.linear1.bias
Parameter shape: torch.Size([800])
Gradient: None
Parameter name: transformerEncoderLayer.linear2.weight
Parameter shape: torch.Size([200, 800])
Gradient: None
Parameter name: transformerEncoderLayer.linear2.bias
Parameter shape: torch.Size([200])
Gradient: None
Parameter name: transformerEncoderLayer.norm1.weight
Parameter

# FL Setting

## Client

In [19]:
class Client():
  def __init__(self, client_config:dict):
    # client config as dict to make configuration dynamic
    self.id = client_config["id"]
    self.config = client_config
    self.__model = None

    # check if CUDA is available
    if torch.cuda.is_available():
      self.device = 'cuda'
    else:
       self.device = 'cpu'

    self.train_loader = self.config["train_data"]
    self.valid_loader = self.config["test_data"]

  @property
  def model(self):
    return self.__model

  @model.setter
  def model(self, model):
    self.__model = model

  def __len__(self):
    """Return a total size of the client's local data."""
    return len(self.train_loader.sampler)

  def train(self):
    model = trans_to_cuda(self.model)
    results = train(model, self.train_loader, self.valid_loader, 1, self.device)
    print(f"Train result client {self.id}: {results}")

  def test(self):
    loss,result = test(self.model, self.valid_loader, self.device)
    print(f"Test result client {self.id}: {loss, result}")
    return result

## Server

In [20]:
class FedAvg():
  def __init__(self):
    self.globalmodel = trans_to_cuda(SelfAttentionNetwork(opt, n_node))
    self.rounds = 0
    self.params = {}

    # check if CUDA is available
    if torch.cuda.is_available():
      self.device = 'cuda'
    else:
       self.device = 'cpu'


  def aggregate(self, round):
    #v1:update the aggregate to save the model with round and date indicator
    modelparams = []
    for i in self.params.keys():
      modelparams.append(self.params[i])

    avg_weights = {}
    for name in modelparams[0].keys():
      avg_weights[name] = torch.mean(torch.stack([w[name] for w in modelparams]), dim = 0)

    self.globalmodel.load_state_dict(avg_weights)

    #current timestamp
    current_time = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    # filename = f"{path_glob_m}/global_model_round_{round}_{current_time}.pth"
    # torch.save(self.globalmodel.state_dict(), filename)

  def clientstrain(self, clientconfig):
    clients = clientconfig
    for i in clients.keys():
      test_client = Client(clients[i])
      test_client.model = copy.deepcopy(self.globalmodel)
      test_client.model = trans_to_cuda(test_client.model)
      test_client.train()
      # test_client.test()
      self.params[i] = test_client.model.state_dict()

  def initiate_FL(self, clientconfig, serverdata):
    clients = clientconfig
    print("Round: {}".format(self.rounds))

    print("Obtaining Weights!!")
    self.clientstrain(clients)

    #### Aggregate model
    print("Aggregating Model!!")
    self.aggregate(self.rounds)

    #### Replace parameters with global model parameters
    for i in self.params.keys():
        self.params[i] = self.globalmodel.state_dict()


    servertest = serverdata
    loss, results = test(self.globalmodel, servertest, self.device)
    print("Round {} metrics:".format(self.rounds))
    print("Server Loss = {}".format(loss))
    print("Server Recall = {}".format(results['recall']))
    print("Round {} finished!".format(self.rounds))
    self.rounds += 1
    return clients, results['recall']

## Main

In [21]:
numrounds = 20

In [22]:
clients = {}

for i in range(len(train_data)):
  clients[i] = {"id": i, "val_size": 0.25, "batch_size": opt.batchSize, "local_epoch": 1}
  clients[i]['train_data'] = Data(train_data[i], shuffle=True)
  clients[i]['test_data'] = Data(valid_data[i], shuffle=False)
  print(f"client: {i}")
  print(f"Number of batches in the dataloader train: {len(clients[i]['train_data'])}")
  print(f"Number of batches in the dataloader test: {len(clients[i]['test_data'])}")

serverdata = Data(valid_data[0], shuffle=False)
server = FedAvg() ### initialize server

allrecall = []
for i in range(numrounds):
  clients, recall = server.initiate_FL(clients, serverdata)
  allrecall.append(recall)

print("\n")
print("-" * 50)
print("Recall of all rounds: {}".format(allrecall))

client: 0
Number of batches in the dataloader train: 6
Number of batches in the dataloader test: 1
client: 1
Number of batches in the dataloader train: 18
Number of batches in the dataloader test: 4
client: 2
Number of batches in the dataloader train: 2
Number of batches in the dataloader test: 1
client: 3
Number of batches in the dataloader train: 6
Number of batches in the dataloader test: 1
client: 4
Number of batches in the dataloader train: 7
Number of batches in the dataloader test: 1
client: 5
Number of batches in the dataloader train: 2
Number of batches in the dataloader test: 1
client: 6
Number of batches in the dataloader train: 1
Number of batches in the dataloader test: 1
client: 7
Number of batches in the dataloader train: 26
Number of batches in the dataloader test: 1
client: 8
Number of batches in the dataloader train: 10
Number of batches in the dataloader test: 2
client: 9
Number of batches in the dataloader train: 3
Number of batches in the dataloader test: 1
client:

Train result client 0: {'recall': 100.0, 'hit': 100.0, 'mrr': 100.0}
start training:  2023-12-10 03:08:18.834933
Parameter name: transformerEncoderLayer.self_attn.in_proj_weight
Parameter shape: torch.Size([600, 200])
Gradient: None
Parameter name: transformerEncoderLayer.self_attn.in_proj_bias
Parameter shape: torch.Size([600])
Gradient: None
Parameter name: transformerEncoderLayer.self_attn.out_proj.weight
Parameter shape: torch.Size([200, 200])
Gradient: None
Parameter name: transformerEncoderLayer.self_attn.out_proj.bias
Parameter shape: torch.Size([200])
Gradient: None
Parameter name: transformerEncoderLayer.linear1.weight
Parameter shape: torch.Size([800, 200])
Gradient: None
Parameter name: transformerEncoderLayer.linear1.bias
Parameter shape: torch.Size([800])
Gradient: None
Parameter name: transformerEncoderLayer.linear2.weight
Parameter shape: torch.Size([200, 800])
Gradient: None
Parameter name: transformerEncoderLayer.linear2.bias
Parameter shape: torch.Size([200])
Gradient



Parameter name: transformerEncoderLayer.self_attn.in_proj_weight
Parameter shape: torch.Size([600, 200])
Gradient: None
Parameter name: transformerEncoderLayer.self_attn.in_proj_bias
Parameter shape: torch.Size([600])
Gradient: None
Parameter name: transformerEncoderLayer.self_attn.out_proj.weight
Parameter shape: torch.Size([200, 200])
Gradient: None
Parameter name: transformerEncoderLayer.self_attn.out_proj.bias
Parameter shape: torch.Size([200])
Gradient: None
Parameter name: transformerEncoderLayer.linear1.weight
Parameter shape: torch.Size([800, 200])
Gradient: None
Parameter name: transformerEncoderLayer.linear1.bias
Parameter shape: torch.Size([800])
Gradient: None
Parameter name: transformerEncoderLayer.linear2.weight
Parameter shape: torch.Size([200, 800])
Gradient: None
Parameter name: transformerEncoderLayer.linear2.bias
Parameter shape: torch.Size([200])
Gradient: None
Parameter name: transformerEncoderLayer.norm1.weight
Parameter shape: torch.Size([200])
Gradient: None
Par

## Test to All Clients

In [23]:
final_model = server.globalmodel
final_model = trans_to_cuda(final_model)

recall_clients = []
mrr_clients = []
loss_clients = []

# loop for each client
for i in range(len(train_data)):
    print(f"Testing on user {i}...")
    local_test = valid_data[i]
    testloader = Data(local_test, shuffle=False)

    # Evaluate the network
    loss, results = test(final_model, testloader, DEVICE)

    print(f"Recall@5: {results['recall']:.4f}")
    print(f"MRR@5: {results['mrr']:.4f}")

    recall_clients.append(results['recall'])
    mrr_clients.append(results['mrr'])
    loss_clients.append(loss) 

print(f"Average Recall@5: {np.mean(recall_clients):.4f}")
print(f"Average MRR@5: {np.mean(mrr_clients):.4f}")

Testing on user 0...
start predicting:  2023-12-10 03:08:40.169467
Recall@5: 0.0000
MRR@5: 0.0000
Testing on user 1...
start predicting:  2023-12-10 03:08:40.175450
Recall@5: 25.0000
MRR@5: 25.0000
Testing on user 2...
start predicting:  2023-12-10 03:08:40.182431
Recall@5: 0.0000
MRR@5: 0.0000
Testing on user 3...
start predicting:  2023-12-10 03:08:40.188415
Recall@5: 100.0000
MRR@5: 100.0000
Testing on user 4...
start predicting:  2023-12-10 03:08:40.197391
Recall@5: 0.0000
MRR@5: 0.0000
Testing on user 5...
start predicting:  2023-12-10 03:08:40.203376
Recall@5: 0.0000
MRR@5: 0.0000
Testing on user 6...
start predicting:  2023-12-10 03:08:40.209387
Recall@5: 0.0000
MRR@5: 0.0000
Testing on user 7...
start predicting:  2023-12-10 03:08:40.214371
Recall@5: 0.0000
MRR@5: 0.0000
Testing on user 8...
start predicting:  2023-12-10 03:08:40.221353
Recall@5: 0.0000
MRR@5: 0.0000
Testing on user 9...
start predicting:  2023-12-10 03:08:40.227311
Recall@5: 100.0000
MRR@5: 20.0000
Testing on 