In [None]:
import argparse
import os
import sys
import time

import numpy as np
import pandas as pd
import scipy.sparse as sp
import tensorflow as tf

In [None]:
# Load the two domain CSV files into DataFrames (hard-coded /content paths).
a_fashion=pd.read_csv('/content/AMAZON_FASHION.csv')
a_craft=pd.read_csv('/content/Arts_Crafts_and_Sewing.csv')

FileNotFoundError: ignored

In [None]:
import tensorflow as tf
import numpy as np
import math


class PPGN(object):
    """TF1 graph-mode model that scores users against the items of two domains.

    One user embedding table and two item tables (suffixes ``_s`` and ``_t``) are
    stacked in the node order [items_s, users, items_t], propagated through the
    normalised adjacency matrix, and the output of every propagation layer is
    concatenated into the final embeddings. Scores come from a dot product
    (``args.NCForMF == 'MF'``) or from a per-domain MLP (``'NCF'``).

    The complete graph, including the optimiser, is built in ``__init__``.

    Fields read from ``args``: ``regularizer_rate``, ``embedding_size``,
    ``gnn_layers``, ``mlp_layers``, ``dropout_message``, ``NCForMF``, ``lr`` and
    ``topK``.
    """

    def __init__(self, args, iterator, norm_adj_mat, num_users, num_items_s, num_items_t, is_training):
        """Store the configuration and build the whole graph.

        Args:
            args: namespace with the hyper-parameters listed in the class docstring.
            iterator: object whose ``get_next()`` yields a dict with the keys
                'user', 'item_s', 'item_t', 'label_s' and 'label_t'.
            norm_adj_mat: sparse matrix supporting row slicing and ``tocoo()``.
            num_users: number of user rows.
            num_items_s: number of item rows of the ``_s`` domain.
            num_items_t: number of item rows of the ``_t`` domain.
            is_training: selects the branch taken by ``step``.
        """
        self.args = args
        self.iterator = iterator
        self.norm_adj_mat = norm_adj_mat
        self.num_users = num_users
        self.num_items_s = num_items_s
        self.num_items_t = num_items_t
        self.is_training = is_training
        # Number of row slices the adjacency matrix is cut into before the
        # sparse matmul.
        self.n_fold = 100

        self.get_data()
        self.all_weights = self.init_weights()
        self.item_embeddings_s, self.user_embeddings, self.item_embeddings_t = self.creat_gcn_embedd()
        self.inference()
        self.saver = tf.train.Saver(tf.global_variables())

    def get_data(self):
        """Bind the iterator's next element to the input tensors of the graph."""
        sample = self.iterator.get_next()
        self.user, self.item_s, self.item_t = sample['user'], sample['item_s'], sample['item_t']
        self.label_s = tf.cast(sample['label_s'], tf.float32)
        self.label_t = tf.cast(sample['label_t'], tf.float32)

    def init_weights(self):
        """Create the embedding tables and per-layer weights.

        Returns:
            dict mapping variable name to variable.
        """
        all_weights = dict()
        # NOTE(review): the first positional argument of
        # tf.truncated_normal_initializer is the mean, so this is mean=0.01 with
        # the default stddev -- confirm whether stddev=0.01 was intended.
        initializer = tf.truncated_normal_initializer(0.01)
        # NOTE(review): regularizers are attached to the variables, but this
        # class never adds the collected regularization losses to self.loss --
        # confirm that is intended.
        regularizer = tf.contrib.layers.l2_regularizer(self.args.regularizer_rate)

        all_weights['user_embeddings'] = tf.get_variable(
            'user_embeddings', (self.num_users, self.args.embedding_size), tf.float32, initializer, regularizer)
        all_weights['item_embeddings_s'] = tf.get_variable(
            'item_embeddings_s', (self.num_items_s, self.args.embedding_size), tf.float32, initializer, regularizer)
        all_weights['item_embeddings_t'] = tf.get_variable(
            'item_embeddings_t', (self.num_items_t, self.args.embedding_size), tf.float32, initializer, regularizer)

        # Layer widths: the embedding size followed by every propagation layer.
        self.layers_plus = [self.args.embedding_size] + self.args.gnn_layers

        for k in range(len(self.layers_plus) - 1):
            fan_in, fan_out = self.layers_plus[k], self.layers_plus[k + 1]
            all_weights['W_gc_%d' % k] = tf.get_variable(
                'W_gc_%d' % k, (fan_in, fan_out), tf.float32, initializer, regularizer)
            all_weights['b_gc_%d' % k] = tf.get_variable(
                'b_gc_%d' % k, fan_out, tf.float32, tf.zeros_initializer(), regularizer)
            # The W_bi/b_bi variables are created (and therefore saved) but are
            # not read anywhere else in this class.
            all_weights['W_bi_%d' % k] = tf.get_variable(
                'W_bi_%d' % k, (fan_in, fan_out), tf.float32, initializer, regularizer)
            all_weights['b_bi_%d' % k] = tf.get_variable(
                'b_bi_%d' % k, fan_out, tf.float32, tf.zeros_initializer(), regularizer)

        return all_weights

    def creat_gcn_embedd(self):
        """Propagate the stacked embeddings through every layer.

        Returns:
            (item_embeddings_s, user_embeddings, item_embeddings_t); each row is
            the concatenation of the initial embedding and all layer outputs.
        """
        A_fold_hat = self._split_A_hat(self.norm_adj_mat)
        embeddings = tf.concat([self.all_weights['item_embeddings_s'], self.all_weights['user_embeddings'],
                                self.all_weights['item_embeddings_t']], axis=0)
        all_embeddings = [embeddings]

        for k in range(len(self.layers_plus) - 1):
            # Multiply fold by fold, then stitch the row blocks back together.
            temp_embedd = [tf.sparse_tensor_dense_matmul(A_fold_hat[f], embeddings) for f in range(self.n_fold)]

            embeddings = tf.concat(temp_embedd, axis=0)
            embeddings = tf.nn.leaky_relu(tf.matmul(embeddings, self.all_weights['W_gc_%d' % k])
                                          + self.all_weights['b_gc_%d' % k])
            # The second argument is passed as 1 - dropout rate (TF1 keep_prob).
            # NOTE(review): this op is unconditional, so it is also active when
            # is_training is False -- confirm.
            embeddings = tf.nn.dropout(embeddings, 1 - self.args.dropout_message)

            all_embeddings += [embeddings]

        all_embeddings = tf.concat(all_embeddings, axis=1)
        item_embeddings_s, user_embeddings, item_embeddings_t = tf.split(
            all_embeddings, [self.num_items_s, self.num_users, self.num_items_t], axis=0)

        return item_embeddings_s, user_embeddings, item_embeddings_t

    def _split_A_hat(self, X):
        """Cut ``X`` into ``n_fold`` consecutive row blocks as sparse tensors."""
        fold_len = math.ceil((X.shape[0]) / self.n_fold)
        A_fold_hat = [self._convert_sp_mat_to_sp_tensor(X[i_fold * fold_len:(i_fold + 1) * fold_len])
                      for i_fold in range(self.n_fold)]

        return A_fold_hat

    def _convert_sp_mat_to_sp_tensor(self, X):
        """Convert a sparse matrix to a float32 ``tf.SparseTensor``."""
        coo = X.tocoo().astype(np.float32)
        # column_stack keeps the integer dtype even for an empty block; the
        # deprecated np.mat([...]) form yields a float array in that case.
        indices = np.column_stack((coo.row, coo.col)).astype(np.int64)

        return tf.SparseTensor(indices, coo.data, coo.shape)

    @staticmethod
    def _weighted_loss(labels, logits, name):
        """Mean sigmoid cross-entropy where examples labelled 1.0 weigh 5x."""
        per_example = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits, name=name)
        # 5.0 where the label equals 1.0 and 1.0 elsewhere, computed as a
        # vectorised mask instead of a per-element tf.map_fn/tf.cond.
        weights = 1.0 + 4.0 * tf.cast(tf.equal(labels, 1.0), tf.float32)
        return tf.reduce_mean(tf.multiply(per_example, weights))

    def _top_indices(self, logits):
        """Indices of the ``topK`` largest scores, clamped to the batch size."""
        # A batch shorter than topK (e.g. the last training batch) would make
        # tf.nn.top_k fail, so k never exceeds the number of scores.
        k = tf.minimum(self.args.topK, tf.shape(logits)[0])
        _, indices = tf.nn.top_k(tf.sigmoid(logits), k)
        return indices

    def inference(self):
        """Build the scoring heads, the loss, the optimiser and the top-k ops.

        Raises:
            ValueError: if ``args.NCForMF`` is neither 'MF' nor 'NCF'.
        """
        initializer = tf.truncated_normal_initializer(0.01)
        regularizer = tf.contrib.layers.l2_regularizer(self.args.regularizer_rate)

        with tf.name_scope('embedding'):
            user_embedding = tf.nn.embedding_lookup(self.user_embeddings, self.user)
            item_embedding_s = tf.nn.embedding_lookup(self.item_embeddings_s, self.item_s)
            item_embedding_t = tf.nn.embedding_lookup(self.item_embeddings_t, self.item_t)

        with tf.name_scope('propagation'):
            if self.args.NCForMF == 'MF':
                self.logits_dense_s = tf.reduce_sum(tf.multiply(user_embedding, item_embedding_s), 1)
                self.logits_dense_t = tf.reduce_sum(tf.multiply(user_embedding, item_embedding_t), 1)
            elif self.args.NCForMF == 'NCF':
                a_s = tf.concat([user_embedding, item_embedding_s], axis=-1, name='inputs_s')
                a_t = tf.concat([user_embedding, item_embedding_t], axis=-1, name='inputs_t')

                for i, units in enumerate(self.args.mlp_layers):
                    dense_s = tf.layers.dense(a_s, units, tf.nn.relu, kernel_initializer=initializer,
                                              kernel_regularizer=regularizer, name='dense_s_%d' % i)
                    # NOTE(review): tf.layers.dropout is called without a
                    # `training` argument -- confirm whether dropout is meant to
                    # be active here.
                    a_s = tf.layers.dropout(dense_s, self.args.dropout_message)

                    dense_t = tf.layers.dense(a_t, units, tf.nn.relu, kernel_initializer=initializer,
                                              kernel_regularizer=regularizer, name='dense_t_%d' % i)
                    a_t = tf.layers.dropout(dense_t, self.args.dropout_message)

                self.logits_dense_s = tf.layers.dense(inputs=a_s,
                                                      units=1,
                                                      kernel_initializer=initializer,
                                                      kernel_regularizer=regularizer,
                                                      name='logits_dense_s')
                self.logits_dense_t = tf.layers.dense(inputs=a_t,
                                                      units=1,
                                                      kernel_initializer=initializer,
                                                      kernel_regularizer=regularizer,
                                                      name='logits_dense_t')
            else:
                raise ValueError("args.NCForMF must be 'MF' or 'NCF', got %r" % (self.args.NCForMF,))

            # Flatten to one score per example. Unlike tf.squeeze this keeps a
            # batch of a single example as a length-1 vector.
            self.logits_s = tf.reshape(self.logits_dense_s, [-1])
            self.logits_t = tf.reshape(self.logits_dense_t, [-1])

            self.loss_s = self._weighted_loss(self.label_s, self.logits_s, 'loss_s')
            self.loss_t = self._weighted_loss(self.label_t, self.logits_t, 'loss_t')

            self.loss = self.loss_s + self.loss_t

            # The learning rate is read once, here, as a Python number.
            self.optimizer = tf.train.AdamOptimizer(self.args.lr).minimize(self.loss)

            self.label_replica_s, self.label_replica_t = self.label_s, self.label_t

            self.indice_s = self._top_indices(self.logits_s)
            self.indice_t = self._top_indices(self.logits_t)

    def step(self, sess):
        """Run one batch.

        Returns:
            The batch loss when ``is_training`` is true (the optimiser op is run
            as well); otherwise ``(prediction_s, label_s, prediction_t,
            label_t)`` where each prediction is the label vector gathered at the
            top-k score indices.
        """
        if self.is_training:
            label_s, indice_s, label_t, indice_t, loss, optim = sess.run(
                [self.label_replica_s, self.indice_s, self.label_replica_t, self.indice_t, self.loss,
                 self.optimizer])

            return loss
        else:
            label_s, indice_s, label_t, indice_t = sess.run(
                [self.label_replica_s, self.indice_s, self.label_replica_t, self.indice_t])
            prediction_s = np.take(label_s, indice_s)
            prediction_t = np.take(label_t, indice_t)

            return prediction_s, label_s, prediction_t, label_t

In [None]:
import numpy as np


def mrr(gt_item, pred_items):
    """Return the reciprocal rank of ``gt_item`` in ``pred_items``.

    Args:
        gt_item: the item to look for.
        pred_items: ranked predictions, best first; any array-like (a plain
            list works as well as a NumPy array).

    Returns:
        ``1 / (position + 1)`` for the first occurrence, or 0 when the item is
        not among the predictions.
    """
    # Converting first makes the element-wise comparison valid for lists too,
    # and a single search replaces the membership test plus lookup.
    matches = np.where(np.asarray(pred_items) == gt_item)[0]
    if matches.size == 0:
        return 0
    return np.reciprocal(float(matches[0] + 1))


def hit(gt_item, pred_items):
    """Return 1 when ``gt_item`` occurs in ``pred_items``, otherwise 0."""
    found = gt_item in pred_items
    return int(found)


def ndcg(gt_item, pred_items):
    """Return the discounted gain of ``gt_item`` within ``pred_items``.

    Args:
        gt_item: the item to look for.
        pred_items: ranked predictions, best first; any array-like (a plain
            list works as well as a NumPy array).

    Returns:
        ``1 / log2(position + 2)`` for the first occurrence, or 0 when the item
        is not among the predictions.
    """
    # Converting first makes the element-wise comparison valid for lists too.
    matches = np.where(np.asarray(pred_items) == gt_item)[0]
    if matches.size == 0:
        return 0
    return np.reciprocal(np.log2(matches[0] + 2))

In [None]:
# Display the DataFrame read from Arts_Crafts_and_Sewing.csv.
a_craft

Unnamed: 0,reviewerID,asin,overall,unixReviewTime
0,A2QG1KI2V93K3Q,0449819906,5.0,1446595200
1,A278N8QX9TY2OS,0449819906,4.0,1428192000
2,ARXZDLB2JBPPR,0449819906,4.0,1389398400
3,A2BQ7YGPNCQSO4,0449819906,4.0,1386460800
4,A1Z3IQAG4CZ0W4,0471749915,3.0,1377993600
...,...,...,...,...
322479,A9S8PIDJNOWRW,B01HJDXUDO,5.0,1485820800
322480,A2J0UN0HC0O7FH,B01HJGIQFI,4.0,1537401600
322481,A21ZGKCPACGDLB,B01HJGIQFI,5.0,1509667200
322482,A1NPWXX6SE7O32,B01HJGIQFI,5.0,1497830400


In [None]:
import os
import numpy as np
import pandas as pd
from itertools import count
from collections import defaultdict
from multiprocessing import Pool


class Dataset:
    """Indexed interactions of one CSV file plus train/test samples with negatives.

    Attributes set by the constructor:
        path, name: directory of the CSV and the file-name part before the
            first underscore.
        udict, idict: raw reviewerID / asin -> contiguous integer index.
        num_users, num_items: sizes of those two indices.
        pos_dict: user index -> set of item indices found in the CSV.
        train_dict, test_dict: dicts with the list-valued keys 'user', 'item'
            and 'label'. In the train dict labels are 1/0; in the test dict the
            'label' list repeats the item ids.
        train_df, test_df: only set when the data is (re)built, not when the
            cached .npy files are loaded.
    """

    def __init__(self, file_path, args):
        """Load the cached samples next to ``file_path`` or rebuild them.

        Args:
            file_path: '/'-separated path of a CSV with 'reviewerID' and 'asin'
                columns.
            args: namespace providing ``data_rebuild``, ``processor_num``,
                ``test_size``, ``train_neg_num`` and ``test_neg_num``.
        """
        self.args = args
        self.path = '/'.join(file_path.split('/')[:-1])
        self.name = file_path.split('/')[-1].split('_')[0]
        self.train_npy_path = file_path.replace('.csv', '_train.npy')
        self.test_npy_path = file_path.replace('.csv', '_test.npy')

        cached = os.path.exists(self.train_npy_path) and os.path.exists(self.test_npy_path)
        if not cached or args.data_rebuild:
            self._bulid_data(file_path)
        else:
            df, self.num_users, self.num_items = self._load_data(file_path)
            self.pos_dict = self._construct_pos_dict(df)
            # The caches are pickled dicts written by np.save in this class;
            # NumPy refuses to unpickle them unless allow_pickle=True is given.
            # Only load cache files you created yourself.
            self.train_dict = np.load(self.train_npy_path, allow_pickle=True).item()
            self.test_dict = np.load(self.test_npy_path, allow_pickle=True).item()

    def _bulid_data(self, file_path):
        """Build every derived structure from the CSV and write the caches."""
        df, self.num_users, self.num_items = self._load_data(file_path)
        self.pos_dict = self._construct_pos_dict(df)
        self.train_df, self.test_df = self._split_train_test(df)
        self.train_dict = self._construct_train(self.train_df)
        self.test_dict = self._construct_test(self.test_df)

    @staticmethod
    def _build_index(values):
        """Map each distinct value to its rank in sorted order.

        The result is a defaultdict, so looking up an unseen key later hands
        out the next free index.
        """
        next_index = count(0)
        index = defaultdict(lambda: next(next_index))
        for value in sorted(values.tolist()):
            index[value]  # the first lookup of a key assigns the next index
        return index

    def _load_data(self, file_path):
        """Read the CSV and replace raw ids by integer indices.

        Returns:
            (df, num_users, num_items) where ``df`` has the columns 'uidx' and
            'iidx'.
        """
        # Select by name rather than by position so that an extra leading
        # index column (e.g. 'Unnamed: 0') does not shift the selection.
        df = pd.read_csv(file_path, sep=',', usecols=['reviewerID', 'asin'])

        udict = self._build_index(df['reviewerID'])
        idict = self._build_index(df['asin'])

        self.udict = udict
        self.idict = idict

        df['uidx'] = df['reviewerID'].map(lambda x: udict[x])
        df['iidx'] = df['asin'].map(lambda x: idict[x])
        del df['reviewerID'], df['asin']
        print('Load %s data successfully with %d users, %d products and %d interactions.'
              % (self.name, len(udict), len(idict), df.shape[0]))

        return df, len(udict), len(idict)

    def _construct_pos_dict(self, df):
        """Collect, per user index, the set of item indices present in ``df``."""
        # A negative dictionary is not built because it would cost huge memory.
        pos_dict = defaultdict(set)
        for user, item in zip(df['uidx'], df['iidx']):
            pos_dict[user].add(item)

        return pos_dict

    def _split_train_test(self, df):
        """Hold out ``args.test_size`` random rows per user as the test set.

        Both parts are also written as '<name>_train_df.csv' and
        '<name>_test_df.csv' into ``self.path``.

        Returns:
            (train_df, test_df), both with a fresh 0..n-1 index.
        """
        print('Spliting data of train and test...')
        # One pass over the per-user groups (ascending user index). The previous
        # approach shipped the whole DataFrame to a worker once per user and
        # filtered it again each time.
        test_df = pd.concat(
            [rows.sample(n=self.args.test_size) for _, rows in df.groupby('uidx', sort=True)])
        train_df = df.drop(test_df.index)

        train_df = train_df.reset_index(drop=True)
        test_df = test_df.reset_index(drop=True)

        train_df.to_csv(self.path + '/%s_train_df.csv' % self.name, index=False)
        test_df.to_csv(self.path + '/%s_test_df.csv' % self.name, index=False)

        return train_df, test_df

    def _construct_samples(self, df, neg_num, train, npy_path):
        """Add ``neg_num`` negatives to every row of ``df`` and cache the result.

        Returns:
            dict with the list-valued keys 'user', 'item' and 'label'.
        """
        # Each task only receives the positives of its own user, which keeps
        # the data pickled per task small.
        nargs = [(user, item, self.num_items, {user: self.pos_dict[user]}, neg_num, train)
                 for user, item in zip(df['uidx'], df['iidx'])]
        # Re-seed every worker: forked workers would otherwise start from the
        # same NumPy random state and draw correlated negatives.
        with Pool(self.args.processor_num, initializer=self._reseed_worker) as pool:
            res_list = pool.map(self._add_negtive, nargs)

        users, items, labels = [], [], []
        for batch_users, batch_items, batch_labels in res_list:
            users.extend(batch_users)
            items.extend(batch_items)
            labels.extend(batch_labels)

        data_dict = {'user': users, 'item': items, 'label': labels}
        np.save(npy_path, data_dict)

        return data_dict

    def _construct_train(self, df):
        """Build the training samples (labels 1/0) and save them as .npy."""
        print('Adding negative data to train_df...')
        return self._construct_samples(df, self.args.train_neg_num, True, self.train_npy_path)

    def _construct_test(self, df):
        """Build the test samples (labels are item ids) and save them as .npy."""
        print('Adding negative data to test_df...')
        return self._construct_samples(df, self.args.test_neg_num, False, self.test_npy_path)

    # The static functions below are designed for multiprocessing tasks.
    @staticmethod
    def _reseed_worker():
        """Pool initializer: give the worker a fresh NumPy random state."""
        np.random.seed()

    @staticmethod
    def _split(args):
        """Sample ``test_size`` rows of one user from ``df``.

        ``args`` is the tuple ``(user, df, test_size)``. Kept for compatibility;
        ``_split_train_test`` no longer calls it.
        """
        user, df, test_size = args
        sample_test = df[df['uidx'] == user].sample(n=test_size)

        return sample_test

    @staticmethod
    def _add_negtive(args):
        """Return one positive sample preceded by ``neg_num`` sampled negatives.

        ``args`` is the tuple ``(user, item, num_items, pos_dict, neg_num,
        train)``. Negatives are drawn without replacement from the item ids in
        ``range(num_items)`` that are not in ``pos_dict[user]``.

        Returns:
            (users, items, labels), each of length ``neg_num + 1`` with the
            positive last. Labels are 0/1 when ``train`` is true and equal to
            the item ids otherwise.
        """
        user, item, num_items, pos_dict, neg_num, train = args
        neg_set = set(range(num_items)).difference(pos_dict[user])
        neg_sample_list = np.random.choice(list(neg_set), neg_num, replace=False).tolist()

        users = [user] * (len(neg_sample_list) + 1)
        items = neg_sample_list + [item]
        if train:
            labels = [0] * len(neg_sample_list) + [1]
        else:
            labels = list(items)

        return (users, items, labels)

In [None]:
import sys
# sys.path entries are searched as directories, so the folder that contains
# metrics.py has to be added rather than the file itself.
sys.path.append('/content')
import metrics
def train(dataset_s, dataset_t, args):
    """Train a PPGN model on two datasets and evaluate it after every epoch.

    The combined train/test dicts are loaded from '<path>/cross_<s>_<t>_*.npy'
    when both files exist and ``args.cross_data_rebuild`` is false; otherwise
    they are rebuilt with ``cross_data_build``. Progress and per-epoch HR / MRR /
    NDCG are printed for both datasets, followed by the best values.

    Args:
        dataset_s: dataset providing ``path``, ``name``, ``num_users`` and
            ``num_items`` (used as the ``_s`` side of the model).
        dataset_t: same for the ``_t`` side.
        args: namespace providing ``cross_data_rebuild``, ``batch_size``,
            ``test_size``, ``test_neg_num`` and ``epochs``, plus whatever
            ``PPGN``, ``cross_data_build`` and ``load_mat`` read.
    """
    with tf.compat.v1.Session() as sess:
        cross_tag = '/cross_' + '_'.join([dataset_s.name, dataset_t.name])
        train_path = dataset_s.path + cross_tag + '_train.npy'
        test_path = dataset_t.path + cross_tag + '_test.npy'
        if os.path.exists(train_path) and os.path.exists(test_path) and not args.cross_data_rebuild:
            print('Loading cross data..')
            # The files hold pickled dicts, which np.load only reads with
            # allow_pickle=True. Only load files you created yourself.
            train_dict = np.load(train_path, allow_pickle=True).item()
            test_dict = np.load(test_path, allow_pickle=True).item()
        else:
            print('Building cross data..')
            train_dict, test_dict = cross_data_build(dataset_s, dataset_t, args, train_path, test_path)
        print('Get cross data successfully.')

        norm_adj_mat = load_mat(dataset_s, dataset_t, args)

        print('Loading train data from train_dict...')
        train_data = tf.data.Dataset.from_tensor_slices(train_dict)
        train_data = train_data.shuffle(buffer_size=len(train_dict['user'])).batch(args.batch_size)
        print('Loading test data from test_dict...')
        test_data = tf.data.Dataset.from_tensor_slices(test_dict)
        # Test data must not be shuffled: each batch of
        # (test_size + test_neg_num) rows is one evaluation case and
        # evaluate() reads the ground-truth item from the last row.
        test_data = test_data.batch(args.test_size + args.test_neg_num)

        # One re-initialisable iterator serves both the train and test data.
        iterator = tf.data.Iterator.from_structure(train_data.output_types, train_data.output_shapes)

        model = PPGN(args, iterator, norm_adj_mat, dataset_s.num_users, dataset_s.num_items, dataset_t.num_items, True)

        print("Creating model with fresh parameters...")
        sess.run(tf.global_variables_initializer())

        count = 0       # training steps completed so far (across epochs)
        loss = 0        # loss accumulated since the last progress line
        last_count = 0  # value of `count` at the last progress line
        hr_s_list, mrr_s_list, ndcg_s_list = [], [], []
        hr_t_list, mrr_t_list, ndcg_t_list = [], [], []
        for epoch in range(1, args.epochs + 1):
            print('=' * 30 + ' EPOCH %d ' % epoch + '=' * 30)
            ################################## TRAINING ################################
            # NOTE(review): PPGN.inference creates the optimizer once from
            # args.lr, so these assignments probably do not change the step
            # size of the already-built optimizer -- confirm.
            if 6 > epoch > 3:
                model.args.lr = 1e-4
            if epoch >= 6:
                model.args.lr = 1e-5
            sess.run(model.iterator.make_initializer(train_data))
            model.is_training = True
            start_time = time.time()

            try:
                while True:
                    loss += model.step(sess)
                    # Count only completed steps; the call that raises
                    # OutOfRangeError must not be counted.
                    count += 1
                    if count % 1000 == 0:
                        print('Epoch %d, step %d, with average loss of %.4f in last %d steps;'
                              % (epoch, count, loss / (count - last_count), count - last_count))
                        loss = 0
                        last_count = count
            except tf.errors.OutOfRangeError:
                print("Epoch %d, finish training " % epoch + "took " +
                      time.strftime("%H: %M: %S", time.gmtime(time.time() - start_time)) + ';')

            ################################## TESTING ################################
            sess.run(model.iterator.make_initializer(test_data))
            model.is_training = False
            start_time = time.time()
            HR_s, MRR_s, NDCG_s = [], [], []
            HR_t, MRR_t, NDCG_t = [], [], []

            # Every test batch is evaluated; no batch is fetched and dropped
            # before the loop.
            try:
                while True:
                    predictions_s, labels_s, predictions_t, labels_t = model.step(sess)
                    hr_s, mrr_s, ndcg_s = evaluate(predictions_s, labels_s)
                    hr_t, mrr_t, ndcg_t = evaluate(predictions_t, labels_t)
                    HR_s.append(hr_s)
                    MRR_s.append(mrr_s)
                    NDCG_s.append(ndcg_s)
                    HR_t.append(hr_t)
                    MRR_t.append(mrr_t)
                    NDCG_t.append(ndcg_t)
            except tf.errors.OutOfRangeError:
                hr_s = np.array(HR_s).mean()
                mrr_s = np.array(MRR_s).mean()
                ndcg_s = np.array(NDCG_s).mean()
                hr_t = np.array(HR_t).mean()
                mrr_t = np.array(MRR_t).mean()
                ndcg_t = np.array(NDCG_t).mean()
                hr_s_list.append(hr_s)
                mrr_s_list.append(mrr_s)
                ndcg_s_list.append(ndcg_s)
                hr_t_list.append(hr_t)
                mrr_t_list.append(mrr_t)
                ndcg_t_list.append(ndcg_t)
                print("Epoch %d, finish testing " % epoch + "took: " +
                      time.strftime("%H: %M: %S", time.gmtime(time.time() - start_time)) + ';')
                print('Epoch %d, %s HR is %.4f, MRR is %.4f, NDCG is %.4f;' %
                      (epoch, dataset_s.name, hr_s, mrr_s, ndcg_s))
                print('Epoch %d, %s HR is %.4f, MRR is %.4f, NDCG is %.4f;' %
                      (epoch, dataset_t.name, hr_t, mrr_t, ndcg_t))

        print('=' * 30 + 'Finish training' + '=' * 30)
        print('%s best HR is %.4f, MRR is %.4f, NDCG is %.4f;' %
              (dataset_s.name, max(hr_s_list), max(mrr_s_list), max(ndcg_s_list)))
        print('%s best HR is %.4f, MRR is %.4f, NDCG is %.4f;' %
              (dataset_t.name, max(hr_t_list), max(mrr_t_list), max(ndcg_t_list)))

In [None]:
def _str2bool(value):
    """Convert a command-line string such as 'true' or '0' to a bool.

    ``type=bool`` would turn every non-empty string, including 'False', into
    True.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % (value,))


# Command-line options for preprocessing, the model and training.
parser = argparse.ArgumentParser()

parser.add_argument('--cross_data_rebuild', type=_str2bool, default=False,
                    help='whether to rebuild cross data')
parser.add_argument('--data_rebuild', type=_str2bool, default=False,
                    help='whether to rebuild train/test dataset')
parser.add_argument('--mat_rebuild', type=_str2bool, default=False,
                    help='whether to rebuild` adjacent mat')
parser.add_argument('--processor_num', type=int, default=12,
                    help='number of processors when preprocessing data')
parser.add_argument('--batch_size', type=int, default=256,
                    help='size of mini-batch')
parser.add_argument('--train_neg_num', type=int, default=4,
                    help='number of negative samples per training positive sample')
parser.add_argument('--test_size', type=int, default=1,
                    help='size of sampled test data')
parser.add_argument('--test_neg_num', type=int, default=99,
                    help='number of negative samples for test')
parser.add_argument('--epochs', type=int, default=20,
                    help='the number of epochs')
# The layer lists take one or more integers, e.g. `--gnn_layers 32 16 8`.
parser.add_argument('--gnn_layers', nargs='+', type=int, default=[32,32,16,16,8],
                    help='the unit list of layers')
parser.add_argument('--mlp_layers', nargs='+', type=int, default=[32,16,8],
                    help='the unit list of layers')
parser.add_argument('--embedding_size', type=int, default=8,
                    help='the size for embedding user and item')
parser.add_argument('--topK', type=int, default=10,
                    help='topk for evaluation')
parser.add_argument('--regularizer_rate', type=float, default=0.01,
                    help='the regularizer rate')
parser.add_argument('--lr', type=float, default=0.001,
                    help='learning rate')
parser.add_argument('--dropout_message', type=float, default=0.1,
                    help='dropout rate of message')
parser.add_argument('--NCForMF', type=str, default='NCF',
                    help='method to propagate embeddings')

# parse_known_args ignores options it does not know instead of exiting, which
# matters when the interpreter was started with foreign arguments (notebooks).
args, unknown = parser.parse_known_args()


In [None]:
# Build (or load the cached) samples for AMAZON_FASHION.csv; passed to train() as dataset_s.
dataset_s = Dataset('/content/AMAZON_FASHION.csv', args)


Load AMAZON data successfully with 119600 users, 65465 products and 163306 interactions.
Spliting data of train and test...
Adding negative data to train_df...
Adding negative data to test_df...


In [None]:
# Build (or load the cached) samples for Arts_Crafts_and_Sewing.csv; passed to train() as dataset_t.
dataset_t = Dataset('/content/Arts_Crafts_and_Sewing.csv', args)


Load Arts data successfully with 119600 users, 96735 products and 322484 interactions.
Spliting data of train and test...
Adding negative data to train_df...
Adding negative data to test_df...


In [None]:
# Run training and the per-epoch evaluation on the two datasets.
train(dataset_s, dataset_t, args)

Building cross data..


AssertionError: ignored

In [None]:
def evaluate(predictions, labels):
    """Score one ranked prediction list against its ground-truth item.

    The ground truth is the last entry of ``labels`` converted to int.

    Returns:
        (hit, mrr, ndcg) as computed by the ``metrics`` module.
    """
    target = int(labels[-1])
    scorers = (metrics.hit, metrics.mrr, metrics.ndcg)
    hr, mrr, ndcg = [score(target, predictions) for score in scorers]
    return hr, mrr, ndcg


def cross_data_build(dataset_s, dataset_t, args, train_path, test_path):
    # multiprocessing
    with Pool(args.processor_num) as pool:
        nargs = [(user, dataset_s.pos_dict, dataset_t.pos_dict, dataset_s.num_items, dataset_t.num_items,
                  args.train_neg_num) for user in range(dataset_s.num_users)]
        extend_list = pool.map(_cross_build, nargs)

    for (extend_users, extend_items, extend_labels, flag) in extend_list:
        if flag == 't':
            dataset_t.train_dict['user'].extend(extend_users)
            dataset_t.train_dict['item'].extend(extend_items)
            dataset_t.train_dict['label'].extend(extend_labels)
        elif flag == 's':
            dataset_s.train_dict['user'].extend(extend_users)
            dataset_s.train_dict['item'].extend(extend_items)
            dataset_s.train_dict['label'].extend(extend_labels)

    train_dict_s, test_dict_s = dataset_s.train_dict, dataset_s.test_dict
    train_dict_t, test_dict_t = dataset_t.train_dict, dataset_t.test_dict

    q_s = np.argsort(np.array(train_dict_s['user']))
    q_t = np.argsort(np.array(train_dict_t['user']))

    users_s = np.array(train_dict_s['user'])[q_s].tolist()
    users_t = np.array(train_dict_t['user'])[q_t].tolist()
    assert users_s == users_t
    users = users_s

    items_s = np.array(train_dict_s['item'])[q_s].tolist()
    labels_s = np.array(train_dict_s['label'])[q_s].tolist()

    items_t = np.array(train_dict_t['item'])[q_t].tolist()
    labels_t = np.array(train_dict_t['label'])[q_t].tolist()

    train_dict = {'user': users, 'item_s': items_s, 'item_t': items_t,'label_s': labels_s, 'label_t':labels_t}
    np.save(train_path, train_dict)

    assert test_dict_s['user'] == test_dict_t['user']
    test_dict = {'user': test_dict_s['user'], 'item_s': test_dict_s['item'], 'item_t': test_dict_t['item'],
                 'label_s': test_dict_s['label'], 'label_t':test_dict_t['label']}
    np.save(test_path, test_dict)

    return train_dict, test_dict


def _cross_build(args):
    user, posdict_s, posdict_t ,num_items_s, num_items_t, per_neg_num = args

    num_item_s = len(posdict_s[user])
    num_item_t = len(posdict_t[user])
    if num_item_s > num_item_t:
        flag = 't'
        pos_num = num_item_s - num_item_t
        neg_num = per_neg_num * pos_num
        pos_set = set(posdict_t[user])
        neg_set = set(range(num_items_t)) - pos_set
    elif num_item_t > num_item_s:
        flag = 's'
        pos_num = num_item_t-num_item_s
        neg_num = per_neg_num * pos_num
        pos_set = set(posdict_s[user])
        neg_set = set(range(num_items_s)) - pos_set
    else:
        return [],[],[],''

    extend_users = (pos_num + neg_num)*[user]
    extend_items_neg = np.random.choice(list(neg_set), neg_num, replace=True)
    extend_items_pos = np.random.choice(list(pos_set), pos_num, replace=True)
    extend_items = np.concatenate([extend_items_neg, extend_items_pos])
    extend_labels = neg_num*[0] + pos_num*[1]

    return extend_users, extend_items, extend_labels, flag


def _interaction_matrix(train_dict, num_users, num_items):
    """Binary user x item matrix of the samples labelled 1 in ``train_dict``."""
    users = np.asarray(train_dict['user'], dtype=np.int64)
    items = np.asarray(train_dict['item'], dtype=np.int64)
    # Positives are selected by label, not by their position in the list, so
    # the result does not depend on how the samples are ordered.
    positive = np.asarray(train_dict['label']) == 1
    rows, cols = users[positive], items[positive]
    matrix = sp.coo_matrix((np.ones(rows.size, dtype=np.float32), (rows, cols)),
                           shape=(num_users, num_items)).tocsr()
    # The conversion sums duplicate (user, item) pairs; reset them to 1.
    matrix.data[:] = 1.0
    return matrix


def load_mat(dataset_s, dataset_t, args):
    """Load or build the row-normalised adjacency matrix of the joint graph.

    The matrix is cached at '<dataset_s.path>/cross_<s>_<t>_norm_adj_mat.npz'
    and rebuilt when that file is missing or ``args.mat_rebuild`` is true. Nodes
    are ordered [items of dataset_s, users, items of dataset_t]; users are
    linked to the items of their positive training samples and every node gets
    a self loop before normalisation.

    Returns:
        The normalised sparse adjacency matrix.
    """
    norm_adj_path = '%s/cross_%s_%s_norm_adj_mat.npz' % (dataset_s.path, dataset_s.name, dataset_t.name)
    if os.path.exists(norm_adj_path) and not args.mat_rebuild:
        print('Loading adjacent mats...')
        norm_adj_mat = sp.load_npz(norm_adj_path)
    else:
        print('Building adjacent mats..')
        num_users = dataset_s.num_users

        R_s = _interaction_matrix(dataset_s.train_dict, num_users, dataset_s.num_items)
        R_t = _interaction_matrix(dataset_t.train_dict, num_users, dataset_t.num_items)

        # Block layout (rows and columns: items_s, users, items_t).
        plain_adj_mat = sp.bmat([[None, R_s.T, None],
                                 [R_s, None, R_t],
                                 [None, R_t.T, None]], format='csr', dtype=np.float32)

        norm_adj_mat = normalized_adj_single(plain_adj_mat + sp.eye(plain_adj_mat.shape[0]))

        sp.save_npz(norm_adj_path, norm_adj_mat)

    print('Get adjacent mats successfully.')

    return norm_adj_mat


def normalized_adj_single(adj):
    """Scale every row of ``adj`` by the inverse of its row sum.

    Rows whose sum is zero are left as all zeros.

    Args:
        adj: sparse matrix; integer matrices are accepted as well.

    Returns:
        The sparse product ``diag(1 / rowsum) . adj``.
    """
    rowsum = np.asarray(adj.sum(1)).flatten()
    if not np.issubdtype(rowsum.dtype, np.floating):
        # Integer sums cannot be inverted in place (negative integer powers
        # raise), so switch to floating point first.
        rowsum = rowsum.astype(np.float64)

    # Invert only the non-zero sums; this avoids divide-by-zero warnings and
    # leaves the zero rows at 0.
    d_inv = np.zeros_like(rowsum)
    nonzero = rowsum != 0
    d_inv[nonzero] = 1.0 / rowsum[nonzero]
    d_mat_inv = sp.diags(d_inv)

    norm_adj = d_mat_inv.dot(adj)
    return norm_adj