Sequence Recommendation System With Graph Neural Network

Import Libraries

In [5]:
import pandas as pd
import pickle
import os
import datetime
import time
import csv
import operator
import numpy as np
import torch
from torch import nn
from torch.nn import Module, Parameter
import torch.nn.functional as F
import math

1. Data Cleaning

In [None]:
# Merge every CSV file of the dataset folder into a single CSV file
folder_path = "amazon_dataset_2018"
expected_columns = ["rating", "reviewerID", "product_id", "date"]
frames = []
# Sort the listing so the merged row order is reproducible: os.listdir()
# returns entries in arbitrary order.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".csv"):
        continue
    filepath = os.path.join(folder_path, filename)
    try:
        df = pd.read_csv(filepath)
    except pd.errors.ParserError:
        print(f"Error parsing file: {filename}")
        continue
    # Report and skip files that lack a required column instead of aborting
    # the whole merge with a KeyError.
    missing = [col for col in expected_columns if col not in df.columns]
    if missing:
        print(f"Skipping file {filename}: missing columns {missing}")
        continue
    frames.append(df[expected_columns])

# Concatenate once at the end: growing a DataFrame with concat inside the loop
# copies all previously merged rows on every iteration.
if frames:
    all_data = pd.concat(frames, ignore_index=True)
else:
    all_data = pd.DataFrame(columns=expected_columns)
all_data.to_csv("merged_data.csv", index=False)
print("Files merged successfully!")

In [None]:
# Keep only reviewers that have at least 2 and fewer than 20 ratings
data = pd.read_csv("merged_data.csv")
user_vote_count = data.groupby("reviewerID")["reviewerID"].count().reset_index(name="vote_count")
# Both comparisons must be parenthesised: `&` binds tighter than `>=`, so
# `count >= 2 & (count < 20)` is evaluated as `count >= (2 & (count < 20))`,
# which is `count >= 0` and keeps every reviewer.
in_range = (user_vote_count['vote_count'] >= 2) & (user_vote_count['vote_count'] < 20)
kept_reviewers = user_vote_count.loc[in_range, 'reviewerID']
filtered_data = data[data['reviewerID'].isin(kept_reviewers)]
filtered_data.to_csv("filtered_data.csv", index=False)

In [None]:
# Read the filtered ratings and group the product ids by reviewerID
dataset = 'filtered_data.csv'
print("-- Starting @ %ss" % datetime.datetime.now())
reviewer_id = {}   # reviewerID -> product ids, in file order
rating_date = {}   # reviewerID -> date of the last row of that reviewer's most recent run of rows
ctr = 0            # number of rows read
curid = -1
curdate = None
# newline='' lets the csv module do its own newline handling, as its
# documentation requires for files passed to a reader.
with open(dataset, "r", newline='') as f:
    reader = csv.DictReader(f, delimiter=',')
    for row in reader:
        revid = row['reviewerID']
        # A change of reviewer closes the previous run of rows: remember the
        # date carried by the last row of that run.
        if curdate and not curid == revid:
            rating_date[curid] = curdate
        curid = revid
        curdate = row['date']
        reviewer_id.setdefault(revid, []).append(row['product_id'])
        ctr += 1
# Close the final run. Skipped for an empty file, which would otherwise
# record the placeholder entry {-1: None}.
if ctr:
    rating_date[curid] = curdate
print("-- Reading data @ %ss" % datetime.datetime.now())
 

In [None]:
# Split reviewers into a training set and a test set based on the date recorded for each reviewer.
dates = list(rating_date.items())
if not dates:
    raise ValueError("rating_date is empty: there is nothing to split")

# Parse every date once; the (reviewerID, int date) pairs are reused below.
timed_sess = [(rev, int(date)) for rev, date in dates]
maxdate = max(timed_sess, key=operator.itemgetter(1))[1]
# 86400 * 7 * 4 * 18 is 18 four-week periods if the dates are Unix timestamps
# in seconds -- TODO confirm the unit of the 'date' column.
splitdate = maxdate - 86400 * 7 * 4 * 18
print('Splitting date', splitdate)

# Reviewers dated exactly at splitdate go to the test set. With strict `<` and
# `>` on both sides they were silently dropped from both sets.
tra_sess = [sess for sess in timed_sess if sess[1] < splitdate]
tes_sess = [sess for sess in timed_sess if sess[1] >= splitdate]

tra_sess = sorted(tra_sess, key=operator.itemgetter(1))
tes_sess = sorted(tes_sess, key=operator.itemgetter(1))
print(len(tra_sess))
print(len(tes_sess))
print(tra_sess[:3])
print(tes_sess[:3])
print("-- Splitting train set and test set @ %ss" % datetime.datetime.now())


In [None]:
# Create sequences representing products that have been reviewed by reviewerID
item_dict = {}
def obtian_tra():
    """Encode the training sessions as sequences of integer item indices.

    Walks the module-level ``tra_sess`` list of (reviewerID, date) pairs, looks
    up each reviewer's products in ``reviewer_id`` and maps every product to an
    index in the shared ``item_dict``. New products receive consecutive indices
    starting at 1. Sequences shorter than 2 items are dropped, although their
    items still stay registered in ``item_dict``.

    Prints the next unused index and returns ``(ids, dates, sequences)`` as
    three parallel lists.
    """
    kept_ids, kept_dates, kept_seqs = [], [], []
    next_index = 1
    for sess_id, sess_date in tra_sess:
        encoded = []
        for raw_item in reviewer_id[sess_id]:
            if raw_item not in item_dict:
                item_dict[raw_item] = next_index
                next_index += 1
            encoded.append(item_dict[raw_item])
        if len(encoded) >= 2:
            kept_ids.append(sess_id)
            kept_dates.append(sess_date)
            kept_seqs.append(encoded)
    print(next_index)
    return kept_ids, kept_dates, kept_seqs

def obtian_tes():
    """Encode the test sessions with the item indices already in ``item_dict``.

    Walks the module-level ``tes_sess`` list of (reviewerID, date) pairs.
    Products that are not present in ``item_dict`` are skipped, and sequences
    left with fewer than 2 items are dropped.

    Returns ``(ids, dates, sequences)`` as three parallel lists.
    """
    kept_ids, kept_dates, kept_seqs = [], [], []
    for sess_id, sess_date in tes_sess:
        encoded = [item_dict[raw_item]
                   for raw_item in reviewer_id[sess_id]
                   if raw_item in item_dict]
        if len(encoded) >= 2:
            kept_ids.append(sess_id)
            kept_dates.append(sess_date)
            kept_seqs.append(encoded)
    return kept_ids, kept_dates, kept_seqs

# Order matters: obtian_tra() fills item_dict, and obtian_tes() only keeps
# items that are already present in it.
tra_ids, tra_dates, tra_seqs = obtian_tra()
tes_ids, tes_dates, tes_seqs = obtian_tes()


In [None]:
# Process sequences and corresponding dates of product reviews
def process_seqs(iseqs, idates):
    """Expand every sequence into (prefix, next item) training examples.

    For a sequence of length n, each proper non-empty prefix ``seq[:-k]``
    (k = 1 .. n-1) becomes an input whose label is ``seq[-k]``; longer prefixes
    are emitted first.

    Returns four parallel lists: the prefixes, the date of the originating
    sequence, the labels, and the position of the originating sequence in
    ``iseqs``.
    """
    prefixes, prefix_dates, labels, origin = [], [], [], []
    for position, (sequence, when) in enumerate(zip(iseqs, idates)):
        for cut in range(1, len(sequence)):
            labels.append(sequence[-cut])
            prefixes.append(sequence[:-cut])
            prefix_dates.append(when)
            origin.append(position)
    return prefixes, prefix_dates, labels, origin

# Expand the encoded sequences into (prefix, label) examples for both splits.
tr_seqs, tr_dates, tr_labs, tr_ids = process_seqs(tra_seqs, tra_dates)
te_seqs, te_dates, te_labs, te_ids = process_seqs(tes_seqs, tes_dates)
# Only the prefixes and their labels are stored; dates and ids are not part of the tuples.
tra = (tr_seqs, tr_labs)
tes = (te_seqs, te_labs)
# Number of generated training / test examples.
print(len(tr_seqs))
print(len(te_seqs))


In [None]:
# Report the average sequence length over the training and test sets,
# then store the processed data
# (named total_len rather than `all` so the builtin all() is not shadowed)
total_len = sum(len(seq) for seq in tra_seqs) + sum(len(seq) for seq in tes_seqs)
n_seqs = len(tra_seqs) + len(tes_seqs)
print('avg length: ', total_len / n_seqs if n_seqs else 0.0)

os.makedirs('amz', exist_ok=True)
# Write each artefact inside a `with` block so the file is flushed and closed
# even if pickling fails.
for payload, path in ((tra, 'amz/train.txt'),
                      (tes, 'amz/test.txt'),
                      (tra_seqs, 'amz/all_train_seq.txt')):
    with open(path, 'wb') as f:
        pickle.dump(payload, f)

print('Done.')


2. Processing input data

In [1]:
# Divide the training data set into train data and validation data
def split_validation(train_set, valid_portion):
    """Randomly split ``(inputs, labels)`` into a training and a validation part.

    The example indices are shuffled with ``np.random.shuffle``. The first
    ``round(n * (1 - valid_portion))`` shuffled examples form the training part
    and the remainder the validation part.

    Returns ``(train_x, train_y), (valid_x, valid_y)`` as lists.
    """
    xs, ys = train_set
    total = len(xs)
    order = np.arange(total, dtype='int32')
    np.random.shuffle(order)
    cut = int(np.round(total * (1. - valid_portion)))
    train_idx, valid_idx = order[:cut], order[cut:]

    valid_part = ([xs[k] for k in valid_idx], [ys[k] for k in valid_idx])
    train_part = ([xs[k] for k in train_idx], [ys[k] for k in train_idx])
    return train_part, valid_part

In [2]:
# Prepare input data for machine learning models
def data_masks(all_usr_pois, item_tail):
    """Right-pad every sequence to the longest length and build matching masks.

    ``item_tail`` is a list that is repeated once per missing position (the
    caller in this file passes ``[0]``). A mask holds 1 for every original
    position and 0 for every padded one.

    Returns ``(padded_sequences, masks, max_length)``.
    """
    us_lens = list(map(len, all_usr_pois))
    len_max = max(us_lens)
    us_pois, us_msks = [], []
    for upois, le in zip(all_usr_pois, us_lens):
        missing = len_max - le
        us_pois.append(upois + item_tail * missing)
        us_msks.append([1] * le + [0] * missing)
    return us_pois, us_msks, len_max


In [3]:
# Prepare and create batches of data for use in training machine learning models
class Data():
    """Holds padded session sequences and serves them as graph batches.

    Parameters
    ----------
    data : sequence
        ``data[0]`` is the list of item-index sequences and ``data[1]`` the
        list of targets. ``data[2]`` is used as the targets instead when the
        ``dynamic`` option is truthy.
    shuffle : bool
        When true, ``generate_batch`` reshuffles the examples on every call.
    graph : object
        Stored unchanged on ``self.graph``; not used by this class.
    opt : None, dict or object
        Options container. Only ``dynamic`` is read, either as a key or as an
        attribute, and it defaults to False when absent.
    """

    def __init__(self, data, shuffle=False, graph=None, opt=None):
        inputs, mask, len_max = data_masks(data[0], [0])
        self.inputs = np.asarray(inputs)
        self.mask = np.asarray(mask)
        self.len_max = len_max
        # `opt.dynamic` raised AttributeError for the default opt=None and for
        # a plain dict, so the option is looked up tolerantly instead.
        dynamic = self._read_option(opt, 'dynamic', False)
        self.targets = np.asarray(data[2] if dynamic else data[1])
        self.length = len(inputs)
        self.shuffle = shuffle
        self.graph = graph

    @staticmethod
    def _read_option(opt, name, default=None):
        """Return option ``name`` from a dict or attribute container, else ``default``."""
        if opt is None:
            return default
        if isinstance(opt, dict):
            return opt.get(name, default)
        return getattr(opt, name, default)

    def generate_batch(self, batch_size):
        """Return a list of index arrays, one per batch.

        Every batch holds ``batch_size`` consecutive indices except the last,
        which holds whatever remains. When ``self.shuffle`` is set, inputs,
        masks and targets are permuted in place first. An empty dataset yields
        an empty list.
        """
        if self.length == 0:
            return []
        if self.shuffle:
            shuffled_arg = np.arange(self.length)
            np.random.shuffle(shuffled_arg)
            self.inputs = self.inputs[shuffled_arg]
            self.mask = self.mask[shuffled_arg]
            self.targets = self.targets[shuffled_arg]
        n_batch = -(-self.length // batch_size)  # ceiling division
        slices = np.split(np.arange(n_batch * batch_size), n_batch)
        # The last slice may run past the data; trim it to the real length.
        slices[-1] = slices[-1][:(self.length - batch_size * (n_batch - 1))]
        return slices

    def get_slice(self, i):
        """Build the session graphs for the examples selected by index array ``i``.

        Returns ``(alias_inputs, A, items, mask, targets)``:

        * ``items``: per example, its sorted unique item ids padded with 0 up
          to the largest node count in the batch;
        * ``A``: per example, an array of shape (max_n_node, 2 * max_n_node)
          holding the normalised incoming adjacency next to the normalised
          outgoing adjacency;
        * ``alias_inputs``: per example, the position in ``items`` of every
          element of the padded sequence;
        * ``mask`` and ``targets``: the selected rows of ``self.mask`` and
          ``self.targets``.
        """
        inputs, mask, targets = self.inputs[i], self.mask[i], self.targets[i]
        # One np.unique call per sequence gives both the node list and the
        # position of every sequence element inside that list.
        uniques = [np.unique(u_input, return_inverse=True) for u_input in inputs]
        max_n_node = max(len(node) for node, _ in uniques)

        items, A, alias_inputs = [], [], []
        for u_input, (node, inverse) in zip(inputs, uniques):
            inverse = inverse.reshape(-1)
            items.append(node.tolist() + (max_n_node - len(node)) * [0])
            u_A = np.zeros((max_n_node, max_n_node))
            for pos in range(len(u_input) - 1):
                # Item 0 is the padding value: the real sequence ends here.
                if u_input[pos + 1] == 0:
                    break
                u_A[inverse[pos], inverse[pos + 1]] = 1

            # Normalise by in-degree / out-degree; zero degrees are replaced
            # by 1 so the division is always defined.
            u_sum_in = np.sum(u_A, 0)
            u_sum_in[np.where(u_sum_in == 0)] = 1
            u_A_in = np.divide(u_A, u_sum_in)
            u_sum_out = np.sum(u_A, 1)
            u_sum_out[np.where(u_sum_out == 0)] = 1
            u_A_out = np.divide(u_A.transpose(), u_sum_out)
            A.append(np.concatenate([u_A_in, u_A_out]).transpose())
            alias_inputs.append(inverse.tolist())

        return alias_inputs, A, items, mask, targets


3. Build Model

In [4]:
# The PositionEmbedding class adds, concatenates or looks up learned position embeddings for the elements of an input sequence
class PositionEmbedding(nn.Module):
    """Learned position embedding with three combination modes.

    * ``MODE_ADD``: ``x`` is (batch, seq_len, embedding_dim); the first
      ``seq_len`` rows of the table are added to it.
    * ``MODE_CONCAT``: same input; the rows are concatenated to ``x`` along
      the last dimension.
    * ``MODE_EXPAND``: ``x`` holds integer positions; they are clamped to
      ``[-num_embeddings, num_embeddings]``, shifted to be non-negative and
      looked up in a table of ``2 * num_embeddings + 1`` rows.
    """
    MODE_EXPAND = 'MODE_EXPAND'
    MODE_ADD = 'MODE_ADD'
    MODE_CONCAT = 'MODE_CONCAT'

    def __init__(self,
                 num_embeddings,
                 embedding_dim,
                 mode=MODE_ADD):
        super(PositionEmbedding, self).__init__()
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        self.mode = mode
        if self.mode == self.MODE_EXPAND:
            self.weight = nn.Parameter(torch.Tensor(num_embeddings * 2 + 1, embedding_dim))
        else:
            self.weight = nn.Parameter(torch.Tensor(num_embeddings, embedding_dim))
        # torch.Tensor(rows, cols) only allocates memory; without this call
        # the table would start from whatever values that memory contained.
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise the embedding table with Xavier-normal values."""
        torch.nn.init.xavier_normal_(self.weight)

    def forward(self, x):
        """Apply the position embedding to ``x`` according to ``self.mode``.

        Raises NotImplementedError when ``self.mode`` is not one of the three
        MODE_* constants.
        """
        if self.mode == self.MODE_EXPAND:
            indices = torch.clamp(x, -self.num_embeddings, self.num_embeddings) + self.num_embeddings
            # .long() keeps the indices on the device of `x`;
            # .type(torch.LongTensor) would always move them to the CPU.
            return F.embedding(indices.long(), self.weight)
        batch_size, seq_len = x.size()[:2]
        embeddings = self.weight[:seq_len, :].view(1, seq_len, self.embedding_dim)
        if self.mode == self.MODE_ADD:
            return x + embeddings
        if self.mode == self.MODE_CONCAT:
            return torch.cat((x, embeddings.repeat(batch_size, 1, 1)), dim=-1)
        raise NotImplementedError('Unknown mode: %s' % self.mode)

    def extra_repr(self):
        """Return the constructor arguments for the module's repr."""
        return 'num_embeddings={}, embedding_dim={}, mode={}'.format(
            self.num_embeddings, self.embedding_dim, self.mode,
        )


NameError: name 'nn' is not defined

In [6]:
# Residual class implements a residual module
class Residual(Module):
    """Two-layer feed-forward block with a skip connection.

    Computes ``x + d2(dropout(relu(d1(x))))``, where both linear layers map
    ``hidden_size`` features to ``hidden_size`` features.

    Parameters
    ----------
    hidden_size : int
        Size of the last dimension of the input (default 120).
    dropout : float
        Dropout probability applied between the two layers (default 0.2).
    """

    def __init__(self, hidden_size=120, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.d1 = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
        self.d2 = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
        self.dp = nn.Dropout(p=dropout)
        # Set to False to bypass the dropout layer entirely, in any mode.
        self.drop = True

    def forward(self, x):
        """Return ``x`` plus the transformed ``x``; the shape is unchanged."""
        transformed = F.relu(self.d1(x))
        if self.drop:
            transformed = self.dp(transformed)
        return x + self.d2(transformed)

In [7]:
# MultiHeadedAttention class is used to learn complex relationships in input data
class MultiHeadedAttention(nn.Module):
    """Scaled dot-product attention with ``n_head`` heads.

    Parameters
    ----------
    n_head : int
        Number of heads; must divide ``n_feat``.
    n_feat : int
        Feature size of queries, keys, values and of the output.
    dropout_rate : float
        Dropout probability applied to the attention weights.
    """

    def __init__(self, n_head, n_feat, dropout_rate):
        super(MultiHeadedAttention, self).__init__()
        assert n_feat % n_head == 0
        self.d_k = n_feat // n_head
        self.h = n_head
        self.linear_q = nn.Linear(n_feat, n_feat)
        self.linear_k = nn.Linear(n_feat, n_feat)
        self.linear_v = nn.Linear(n_feat, n_feat)
        self.linear_out = nn.Linear(n_feat, n_feat)
        # Attention weights of the most recent forward call (before dropout).
        self.attn = None
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, query, key, value, mask):
        """Attend from ``query`` over ``key`` / ``value``.

        ``query``, ``key`` and ``value`` are (batch, time, n_feat). ``mask``
        may be None; otherwise positions where it equals 0 are excluded. It
        gets a head dimension inserted at index 1 and must then broadcast
        against the (batch, head, time_q, time_k) score tensor.

        Returns a tensor of shape (batch, time_q, n_feat).
        """
        n_batch = query.size(0)
        q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
        k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
        v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            mask = mask.unsqueeze(1).eq(0)
            # Masked positions must be pushed to the minimum BEFORE the
            # softmax. Zeroing them only afterwards leaves the remaining
            # weights summing to less than 1.
            min_value = torch.finfo(scores.dtype).min
            scores = scores.masked_fill(mask, min_value)
            # The second fill makes masked weights exactly 0.
            self.attn = torch.softmax(scores, dim=-1).masked_fill(mask, 0.0)
        else:
            self.attn = torch.softmax(scores, dim=-1)
        p_attn = self.dropout(self.attn)
        x = torch.matmul(p_attn, v)
        x = x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)
        return self.linear_out(x)

In [6]:
# The GNN class implements a propagation module on the graph
class GNN(Module):
    """Gated graph propagation applied ``step`` times to node states.

    The parameters created with ``torch.Tensor(...)`` are allocated but not
    initialised here; something else has to fill them before use (in this file
    ``SessionGraph.reset_parameters`` does so).
    """

    def __init__(self, hidden_size, step=1):
        super().__init__()
        self.step = step
        self.hidden_size = hidden_size
        self.input_size = 2 * hidden_size
        self.gate_size = 3 * hidden_size
        # Gate weights and biases: three gates stacked along the first axis.
        self.w_ih = Parameter(torch.Tensor(self.gate_size, self.input_size))
        self.w_hh = Parameter(torch.Tensor(self.gate_size, self.hidden_size))
        self.b_ih = Parameter(torch.Tensor(self.gate_size))
        self.b_hh = Parameter(torch.Tensor(self.gate_size))
        # Biases added to the incoming / outgoing messages.
        self.b_iah = Parameter(torch.Tensor(self.hidden_size))
        self.b_oah = Parameter(torch.Tensor(self.hidden_size))
        self.linear_edge_in = nn.Linear(hidden_size, hidden_size, bias=True)
        self.linear_edge_out = nn.Linear(hidden_size, hidden_size, bias=True)
        # Not used by GNNCell; kept so the parameter set stays the same.
        self.linear_edge_f = nn.Linear(hidden_size, hidden_size, bias=True)

    def GNNCell(self, A, hidden):
        """Run one propagation step.

        ``A`` is sliced along its last axis into two blocks of width
        ``A.shape[1]``: the first multiplies the "in" projection of
        ``hidden``, the second the "out" projection. The two messages drive a
        gated update of ``hidden``, whose shape is preserved.
        """
        n_nodes = A.shape[1]
        adj_in = A[:, :, :n_nodes]
        adj_out = A[:, :, n_nodes: 2 * n_nodes]
        msg_in = torch.matmul(adj_in, self.linear_edge_in(hidden)) + self.b_iah
        msg_out = torch.matmul(adj_out, self.linear_edge_out(hidden)) + self.b_oah

        gates_from_msg = F.linear(torch.cat([msg_in, msg_out], 2), self.w_ih, self.b_ih)
        gates_from_state = F.linear(hidden, self.w_hh, self.b_hh)
        m_reset, m_update, m_cand = gates_from_msg.chunk(3, 2)
        s_reset, s_update, s_cand = gates_from_state.chunk(3, 2)

        reset = torch.sigmoid(m_reset + s_reset)
        update = torch.sigmoid(m_update + s_update)
        candidate = torch.tanh(m_cand + reset * s_cand)
        return hidden - update * (hidden - candidate)

    def forward(self, A, hidden):
        """Apply ``GNNCell`` ``self.step`` times and return the final states."""
        for _ in range(self.step):
            hidden = self.GNNCell(A, hidden)
        return hidden


In [7]:
# Graph neural network (GNN) model is used for training
class SessionGraph(Module):
    """Session-graph recommender: item embedding, GNN propagation and a scoring head.

    Parameters
    ----------
    opt : dict or object
        Hyper-parameters, read either as keys or as attributes. Accepted
        names: ``hiddenSize``/``hidden_size``, ``batchSize``/``batch_size``,
        ``nonhybrid`` (default False), ``step``, ``lr``/``learning_rate``,
        ``l2``, ``lr_dc_step`` and ``lr_dc``.
    n_node : int
        Number of rows of the item embedding table. Row 0 is excluded from
        scoring, so scores cover ``n_node - 1`` items.
    len_max : int
        Number of rows of the position-embedding table.

    The optimizer and the LR scheduler are created here and exposed as
    ``self.optimizer`` / ``self.scheduler``.
    """

    _REQUIRED = object()

    @staticmethod
    def _opt(opt, *names, default=_REQUIRED):
        """Return the first of ``names`` found in ``opt`` as a key or an attribute.

        Falls back to ``default`` when given; otherwise raises AttributeError.
        """
        for name in names:
            if isinstance(opt, dict):
                if name in opt:
                    return opt[name]
            elif hasattr(opt, name):
                return getattr(opt, name)
        if default is SessionGraph._REQUIRED:
            raise AttributeError('missing option: %s' % ' / '.join(names))
        return default

    def __init__(self, opt, n_node, len_max):
        super(SessionGraph, self).__init__()
        self.hidden_size = self._opt(opt, 'hiddenSize', 'hidden_size')
        self.len_max = len_max
        self.n_node = n_node
        self.batch_size = self._opt(opt, 'batchSize', 'batch_size')
        self.nonhybrid = self._opt(opt, 'nonhybrid', default=False)
        self.embedding = nn.Embedding(self.n_node, self.hidden_size)
        self.gnn = GNN(self.hidden_size, step=self._opt(opt, 'step'))
        self.linear_one = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
        self.linear_two = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
        self.linear_three = nn.Linear(self.hidden_size, 1, bias=False)
        self.linear_transform = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=True)
        # NOTE(review): Residual() is built with its own default width, which
        # must equal self.hidden_size for compute_scores(residual=True).
        self.rn = Residual()
        # No unconditional .cuda() here: it raises on hosts without CUDA and
        # left this sub-module on a different device than the rest of the
        # model. Move the whole model to the target device after construction.
        self.multihead_attn = nn.MultiheadAttention(self.hidden_size, 1)
        # Not used by compute_scores; kept so the parameter set is unchanged.
        self.pe = PositionEmbedding(len_max, self.hidden_size)
        self.loss_function = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(
            self.parameters(),
            lr=self._opt(opt, 'lr', 'learning_rate'),
            weight_decay=self._opt(opt, 'l2'))
        self.scheduler = torch.optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=self._opt(opt, 'lr_dc_step'),
            gamma=self._opt(opt, 'lr_dc'))
        self.reset_parameters()

    def reset_parameters(self):
        """Draw every parameter uniformly from ``[-1/sqrt(hidden), 1/sqrt(hidden)]``."""
        stdv = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, stdv)

    def compute_scores(self, hidden, mask, self_att=True, residual=True, k_blocks=4):
        """Score every item (embedding rows 1..n_node-1) for each session.

        ``hidden`` is (batch, seq_len, hidden_size) and ``mask`` is
        (batch, seq_len), with 1 on real positions. With ``self_att`` the
        sequence goes through ``k_blocks`` rounds of self-attention, each
        optionally followed by the residual block, and the session vector is a
        fixed 0.52 / 0.48 blend of the attended and the original last-item
        states. Otherwise a soft-attention readout is used.

        Returns a (batch, n_node - 1) tensor.
        """
        rows = torch.arange(mask.shape[0], device=hidden.device)
        last = torch.sum(mask, 1) - 1  # position of the last real item
        ht = hidden[rows, last]
        if self_att:
            attn_output = hidden
            # NOTE(review): no key_padding_mask is passed, so padded positions
            # take part in the attention -- confirm this is intended.
            for _ in range(k_blocks):
                # nn.MultiheadAttention expects (seq_len, batch, hidden) here.
                attn_output = attn_output.transpose(0, 1)
                attn_output, attn_output_weights = self.multihead_attn(attn_output, attn_output, attn_output)
                attn_output = attn_output.transpose(0, 1)
                if residual:
                    attn_output = self.rn(attn_output)
            hn = attn_output[rows, last]
            a = 0.52 * hn + (1 - 0.52) * ht
        else:
            q1 = self.linear_one(ht).view(ht.shape[0], 1, ht.shape[1])
            q2 = self.linear_two(hidden)
            alpha = self.linear_three(torch.sigmoid(q1 + q2))
            a = torch.sum(alpha * hidden * mask.view(mask.shape[0], -1, 1).float(), 1)
            if not self.nonhybrid:
                a = self.linear_transform(torch.cat([a, ht], 1))
        b = self.embedding.weight[1:]  # row 0 is skipped
        scores = torch.matmul(a, b.transpose(1, 0))
        return scores

    def forward(self, inputs, A):
        """Embed the node ids in ``inputs`` and propagate them over ``A`` with the GNN."""
        hidden = self.embedding(inputs)
        hidden = self.gnn(A, hidden)
        return hidden



In [8]:
# trans_to_cuda and trans_to_cpu to convert variables (tensor) between GPU device (CUDA) and CPU
def trans_to_cuda(variable):
    """Return ``variable.cuda()`` when CUDA is available, else ``variable`` unchanged."""
    return variable.cuda() if torch.cuda.is_available() else variable


def trans_to_cpu(variable):
    """Return ``variable.cpu()`` when CUDA is available, else ``variable`` unchanged."""
    if not torch.cuda.is_available():
        return variable
    return variable.cpu()

In [9]:
# The forward function runs one batch through the model and returns its targets and scores.
def forward(model, i, data):
    """Run the model on the batch selected by ``i`` and score it.

    ``data.get_slice(i)`` provides the batch. The node ids and the adjacency
    go through ``model(items, A)``, the resulting node states are rearranged
    into sequence order with ``alias_inputs``, and
    ``model.compute_scores(seq_hidden, mask)`` produces the scores.

    Returns ``(targets, scores)``, where ``targets`` is passed through
    unchanged from ``get_slice``.
    """
    alias_inputs, A, items, mask, targets = data.get_slice(i)
    # Integer tensors are created directly as int64. Going through
    # torch.Tensor(...) first would round-trip via float32 and corrupt ids
    # above 2**24. np.asarray also stacks the per-example arrays in one step.
    alias_inputs = trans_to_cuda(torch.as_tensor(np.asarray(alias_inputs), dtype=torch.long))
    items = trans_to_cuda(torch.as_tensor(np.asarray(items), dtype=torch.long))
    A = trans_to_cuda(torch.as_tensor(np.asarray(A), dtype=torch.float))
    mask = trans_to_cuda(torch.as_tensor(np.asarray(mask), dtype=torch.long))
    hidden = model(items, A)
    # seq_hidden[b, t] = hidden[b, alias_inputs[b, t]], for all rows at once.
    gather_index = alias_inputs.unsqueeze(-1).expand(-1, -1, hidden.size(-1))
    seq_hidden = torch.gather(hidden, 1, gather_index)
    return targets, model.compute_scores(seq_hidden, mask)

In [10]:
# train_test, used to train and evaluate the model on the training and testing datasets
def train_test(model, train_data, test_data):
    """Train ``model`` for one epoch, then evaluate it.

    Training iterates over ``train_data.generate_batch(model.batch_size)`` and
    minimises ``model.loss_function`` with ``model.optimizer``;
    ``model.scheduler`` is stepped once after the epoch's updates. Targets are
    1-based item indices, so they are shifted by -1 to line up with the score
    columns.

    Returns ``(hit, mrr)`` in percent, computed over the top-20 scored items
    of every example in ``test_data``.
    """
    print('start training: ', datetime.datetime.now())
    model.train()
    total_loss = 0.0
    slices = train_data.generate_batch(model.batch_size)
    log_every = int(len(slices) / 5 + 1)
    for j, i in enumerate(slices):
        model.optimizer.zero_grad()
        targets, scores = forward(model, i, train_data)
        # Built directly as int64 to avoid a lossy float32 round-trip.
        targets = trans_to_cuda(torch.as_tensor(np.asarray(targets), dtype=torch.long))
        loss = model.loss_function(scores, targets - 1)
        loss.backward()
        model.optimizer.step()
        # .item() detaches the value; summing the tensors themselves keeps
        # every batch's autograd graph alive for the whole epoch.
        total_loss += loss.item()
        if j % log_every == 0:
            print('[%d/%d] Loss: %.4f' % (j, len(slices), loss.item()))
    print('\tLoss:\t%.3f' % total_loss)
    # The scheduler is stepped after the optimizer updates. Stepping it first
    # skips the initial value of the learning-rate schedule.
    model.scheduler.step()

    print('start predicting: ', datetime.datetime.now())
    model.eval()
    hit, mrr = [], []
    slices = test_data.generate_batch(model.batch_size)
    with torch.no_grad():  # no gradients are needed for evaluation
        for i in slices:
            targets, scores = forward(model, i, test_data)
            top_k = min(20, scores.size(1))
            sub_scores = trans_to_cpu(scores.topk(top_k)[1]).detach().numpy()
            for score, target in zip(sub_scores, targets):
                rank = np.where(score == target - 1)[0]
                hit.append(len(rank) > 0)
                mrr.append(0 if len(rank) == 0 else 1 / (rank[0] + 1))
    hit = np.mean(hit) * 100
    mrr = np.mean(mrr) * 100
    return hit, mrr


4. Training and Testing

In [11]:
# Define hyperparameters
batch_size = 50
hidden_size = 120
num_epochs = 30
learning_rate = 0.001
lr_dc = 0.1          # factor applied to the learning rate at every decay step
lr_dc_step = 3       # number of scheduler steps between two decays
l2 = 1e-5            # weight decay passed to the optimizer
step = 1             # number of GNN propagation steps
patience = 10
validation = True
valid_portion = 0.1
dynamic = False
nonhybrid = False


class Options(dict):
    """A dict whose keys can also be read as attributes (``opt.key``).

    The notebook indexes the options (``opt['validation']``) while the model
    and data classes read attributes (``opt.hiddenSize``, ``opt.dynamic``);
    this container serves both.
    """

    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None


opt = Options({
    'batch_size': batch_size,
    'hidden_size': hidden_size,
    'num_epochs': num_epochs,
    'learning_rate': learning_rate,
    'lr_dc': lr_dc,
    'lr_dc_step': lr_dc_step,
    'l2': l2,
    'step': step,
    'patience': patience,
    'validation': validation,
    'valid_portion': valid_portion,
    'dynamic': dynamic,
    # Names under which SessionGraph reads the same settings.
    'batchSize': batch_size,
    'hiddenSize': hidden_size,
    'lr': learning_rate,
    'nonhybrid': nonhybrid,
})


In [12]:
# Load training data and test data
# NOTE: pickle.load executes code embedded in the file; only load files
# produced by this notebook's own preprocessing cells.
with open("amz/train.txt", 'rb') as f:
    train_data = pickle.load(f)
if opt['validation']:
    # Hold out part of the training examples and evaluate on them instead of
    # the stored test set.
    train_data, valid_data = split_validation(train_data, opt['valid_portion'])
    test_data = valid_data
else:
    with open("amz/test.txt", 'rb') as f:
        test_data = pickle.load(f)
train_data = Data(train_data, shuffle=True, opt=opt)
test_data = Data(test_data, shuffle=False, opt=opt)

In [None]:
# Train the model and evaluate performance
# n_node is the number of rows of the item embedding table and must be larger
# than the largest item index in the data.
# NOTE(review): presumably the item counter printed by obtian_tra() -- verify
# whenever the dataset changes.
n_node = 2412095
# Data exposes the padded sequence length as `len_max`; it has no `data`
# attribute. The model is moved to the same device that forward() uses for its
# input tensors.
model = trans_to_cuda(SessionGraph(opt, n_node, max(train_data.len_max, test_data.len_max)))

start = time.time()
best_result = [0, 0]   # best [Recall@20, MRR@20] seen so far
best_epoch = [0, 0]    # epoch at which each best value was reached
bad_counter = 0        # epochs that improved neither metric
for epoch in range(num_epochs):
    print('-------------------------------------------------------')
    print('epoch: ', epoch)
    hit, mrr = train_test(model, train_data, test_data)
    flag = 0
    if hit >= best_result[0]:
        best_result[0] = hit
        best_epoch[0] = epoch
        flag = 1
    if mrr >= best_result[1]:
        best_result[1] = mrr
        best_epoch[1] = epoch
        flag = 1
    print('Best Result:')
    print('\tRecall@20:\t%.4f\tMMR@20:\t%.4f\tEpoch:\t%d,\t%d' % (best_result[0], best_result[1], best_epoch[0], best_epoch[1]))
    # Early stopping: the counter is cumulative and is not reset by a good epoch.
    bad_counter += 1 - flag
    if bad_counter >= patience:
        break
print('-------------------------------------------------------')
end = time.time()
print("Run time: %f s" % (end - start))