In [65]:
import pickle
import json
import time, datetime
import numpy as np
import torch

def read_pickle(file):
    with open(file, 'rb') as f:
        ret = pickle.load(f)
    return ret

def read_json(file):
    with open(file, 'r') as f:
        ret = [json.loads(line) for line in f]
    return ret

def write_pickle(file, data):
    with open(file, 'wb') as fw:
        pickle.dump(data, fw)

In [69]:
filepath = '../yelp_dataset/filtered/'
jsonpath = '../yelp_dataset/json/'

In [70]:
reviews = read_pickle(filepath+'reviews.pickle')
# reviews = read_pickle(filepath+'reviews-small.pickle')
print(len(reviews))

742969


In [71]:
# users_comp = read_pickle(filepath+'users-complete.pickle')
users = read_pickle(filepath+'users-complete.pickle')
# users = read_pickle(filepath+'users-small.pickle')
print(len(users))

12642


In [72]:
# busi_comp = read_pickle(filepath+'businesses-complete.pickle')
businesses = read_pickle(filepath+'businesses-complete.pickle')
# businesses = read_pickle(filepath+'businesses-small.pickle')
print(len(businesses))

12852


In [73]:
print(type(reviews))
print(reviews[0].keys())
print(len(reviews))
print(type(users))
for u in users:
    print(u.keys())
    break

<class 'list'>
dict_keys(['useful', 'text', 'review_id', 'date', 'business_id', 'cool', 'funny', 'user_id', 'stars'])
742969
<class 'list'>
dict_keys(['elite', 'compliment_writer', 'funny', 'cool', 'compliment_photos', 'useful', 'compliment_hot', 'compliment_cool', 'compliment_list', 'yelping_since', 'compliment_note', 'compliment_profile', 'compliment_more', 'review_count', 'user_id', 'compliment_funny', 'compliment_plain', 'friends', 'name', 'compliment_cute', 'fans', 'average_stars'])


In [10]:
uinds = [i for i in range(len(users))]
uid2ind = {user['user_id']:ind for user, ind in zip(users, uinds)}
ind2uid = {ind:user['user_id'] for user, ind in zip(users, uinds)}

In [11]:
b_inds = [i for i in range(len(businesses))]
bid2ind = {business['business_id']:ind for business, ind in zip(businesses, b_inds)}
ind2bid = {ind:business['business_id'] for business, ind in zip(businesses, b_inds)}

In [12]:
cities = set(busi['city'] for busi in businesses)
c_inds = [i for i in range(len(cities))]
ct_id2ind = {city:ind for city, ind in zip(cities, c_inds)}
ind2ct_id = {ind:city for city, ind in zip(cities, c_inds)}

In [13]:
print(len(cities))
print(ct_id2ind)

7
{'Summerlin': 0, 'Dallas': 1, 'Henderson': 5, 'North Las Vegas': 2, 'Spring Valley': 3, 'Las Vegas': 4, 'Boulder City': 6}


In [14]:
categories = set(category.strip() for busi in businesses for category in busi['categories'].split(','))
ca_inds = [i for i in range(len(categories))]
ca_id2ind = {category:ind for category, ind in zip(categories, ca_inds)}
ind2ca_id = {ind:category for category, ind in zip(categories, ca_inds)}

In [15]:
print(len(categories))
print(ca_id2ind)

255
{'Dive Bars': 0, 'Botanical Gardens': 1, 'Conveyor Belt Sushi': 2, 'Tapas Bars': 232, 'Burgers': 3, 'Gift Shops': 16, 'Lounges': 6, 'American (Traditional)': 7, 'Farmers Market': 8, 'Festivals': 9, 'International Grocery': 10, 'Gastropubs': 11, 'Soul Food': 12, 'Malaysian': 13, 'Barbers': 5, 'Soup': 46, 'Baby Gear & Furniture': 171, 'Fast Food': 14, 'Piano Bars': 15, 'Aquariums': 17, 'Performing Arts': 126, 'Live/Raw Food': 18, 'Home & Garden': 81, 'Pasta Shops': 19, 'Shopping': 20, 'Sporting Goods': 21, 'Venues & Event Spaces': 22, 'Food Trucks': 24, 'Wholesale Stores': 25, 'Pick Your Own Farms': 26, 'Modern European': 83, 'Bars': 27, 'Computers': 28, 'Steakhouses': 29, 'Landmarks & Historical Buildings': 31, 'Hot Pot': 35, 'Health Retreats': 87, 'Public Services & Government': 34, 'Tasting Classes': 191, 'Ethical Grocery': 37, 'Himalayan/Nepalese': 50, 'Seafood': 38, 'Do-It-Yourself Food': 40, 'Brasseries': 41, 'Latin American': 42, 'Southern': 44, 'Japanese': 45, 'Mobile Phones'

In [16]:
def dataset_split(reviews, userid_to_num, businessid_to_num, train_ratio, valid_ratio, test_ratio, n_neg_sample):
    selected_reviews = []
    
    for review in reviews:
        filtered_review = {}
        filtered_review['user_id'] = userid_to_num[review['user_id']]
        filtered_review['business_id'] = businessid_to_num[review['business_id']]
        filtered_review['rate'] = 1.0
        filtered_review['timestamp'] = time.mktime(datetime.datetime.strptime(review['date'], '%Y-%m-%d %H:%M:%S').timetuple())
        selected_reviews.append(filtered_review)
        
    selected_reviews_sorted = sorted(selected_reviews, key=lambda k: k['timestamp']) # use the earlier data to train and the later data to test
    n_reviews = len(selected_reviews_sorted)
    train_size = int(n_reviews*train_ratio)
    valid_size = int(n_reviews*valid_ratio)
    train_data = [selected_reviews_sorted[index] for index in range(train_size)]
    valid_data = [selected_reviews_sorted[index] for index in range(train_size, train_size+valid_size)]
    test_data = [selected_reviews_sorted[index] for index in range(train_size+valid_size, n_reviews)]
    
    selected_users = set()
    selected_businesses = set()
    for review in train_data:
        selected_users.add(review['user_id'])
        selected_businesses.add(review['business_id'])
        
    eval_datas = [valid_data, test_data]
#     selected_eval_datas = [[] for _ in range(len(eval_datas))]
    selected_eval_datas = [[], []]
    for eval_index in range(len(eval_datas)):
        eval_data = eval_datas[eval_index]
        for review in eval_data:
            if review['user_id'] in selected_users and review['business_id'] in selected_businesses:
                selected_eval_datas[eval_index].append(review)
    selected_valid_data, selected_test_data = selected_eval_datas
    
    data_list = [train_data, selected_valid_data, selected_test_data]
#     data_for_user_list = [{} for _ in range(len(data_list))]
    data_for_user_list = [{}, {}, {}]
    train_data_for_item = set()
    for index in range(len(data_list)):
        data = data_list[index]
        data_for_user = data_for_user_list[index]
        for review in data:
            user = review['user_id']
            item = review['business_id']
            if index == 0:
                train_data_for_item.add(item)
            if user not in data_for_user:
                data_for_user[user] = [item]
            else:
                data_for_user[user].append(item)
    train_data_for_user, valid_data_for_user, test_data_for_user = data_for_user_list # dictionary of user_id:[item_id]
    
    with_neg_list = [valid_data_for_user, test_data_for_user]
#     data_with_neg_list = [[] for _ in range(len(with_neg_list))]
    data_with_neg_list = [[], []]
    for index in range(len(with_neg_list)):
        current_data = with_neg_list[index]
        for user in current_data.keys():
            if user not in selected_users:
                continue
            user_eval = {} # a dict
            business_set = selected_businesses - set(train_data_for_user[user]) - set(current_data[user]) # items not existed in this user's records
            sample_businesses = np.random.choice(list(business_set), size=n_neg_sample, replace=False)    # sample is random.choice
            user_eval['user_id'] = user
            user_eval['pos_business_id'] = current_data[user]
            user_eval['neg_business_id'] = list(sample_businesses)
            data_with_neg_list[index].append(user_eval)
    valid_with_neg, test_with_neg = data_with_neg_list
    
    return train_data, selected_valid_data, selected_test_data, valid_with_neg, test_with_neg

In [17]:
# get adjs
def get_adj_matrix(uid2ind, bid2ind, city_id2ind, cat_id2ind, users, businesses, reviews):
    """
    metapaths: UB, UUB, UBUB, UBCaB, UBCiB
    """
    tot_users = len(uid2ind)  # tot for total
    tot_business = len(bid2ind)
    tot_city = len(city_id2ind)
    tot_category = len(cat_id2ind)
    print(tot_users, tot_business, tot_city, tot_category)
    #relation U-U
    adj_UU = np.zeros([tot_users, tot_users])
    adj_UB = np.zeros([tot_users, tot_business])
    adj_BCa = np.zeros([tot_business, tot_category])
    adj_BCi = np.zeros([tot_business, tot_city])
    print(adj_BCi.shape)
    for user in users:
        if user['user_id'] not in uid2ind:
            continue
        user_id = uid2ind[user['user_id']]
        for friend in user['friends'].split(','):
            friend = friend.strip()
            if friend in uid2ind:
                friend_id = uid2ind[friend]
                adj_UU[user_id][friend_id] = 1
                adj_UU[friend_id][user_id] = 1
    #relation U-P-B
    for review in reviews:
#         user_id = uid2ind[review['user_id']]
#         business_id = bid2ind[review['business_id']]
        user_id = review['user_id']
        business_id = review['business_id']
        adj_UB[user_id][business_id] = 1
    #relation B_Ca B_Ci
    for business in businesses:
        if business['business_id'] not in bid2ind:
            continue
        business_id = bid2ind[business['business_id']]
        city_id = city_id2ind[business['city']]
        print("business_id: %d, city_id: %d" % (business_id, city_id))
        adj_BCi[business_id][city_id] = 1
        
        # more than one category for a business
        for category in business['categories'].split(','):
            category = category.strip()
            category_id = cat_id2ind[category]
            adj_BCa[business_id][category_id] = 1

    #metapath
    adj_UUB = adj_UU.dot(adj_UB)

    adj_UBU = adj_UB.dot(adj_UB.T)

    adj_UBUB = adj_UBU.dot(adj_UB)

    adj_UBCa = adj_UB.dot(adj_BCa)
    adj_UBCaB = adj_UBCa.dot(adj_BCa.T)

    adj_UBCi = adj_UB.dot(adj_BCi)
    adj_UBCiB = adj_UBCi.dot(adj_BCi.T)

#     adj_UCaB = adj_UCa.dot(adj_CaB)
    
#     adj_UCiB = adj_UCi.dot(adj_CiB)
    
    return adj_UB, adj_UUB, adj_UBUB, adj_UBCaB, adj_UBCiB


In [18]:
train_data, valid_data, test_data, valid_with_neg_sample, test_with_neg_sample \
    = dataset_split(reviews, uid2ind, bid2ind, 0.8, 0.1, 0.1, 50)

In [22]:
print(type(train_data))
print(train_data[0])
print(reviews[0])
print(len(train_data))

<class 'list'>
{'business_id': 181, 'rate': 1.0, 'user_id': 21, 'timestamp': 1116877827.0}
{'review_id': 'kbtscdyz6lvrtGjD1quQTg', 'cool': 0, 'user_id': 'FIk4lQQu1eTe2EpzQ4xhBA', 'stars': 4.0, 'date': '2011-11-30 02:11:15', 'business_id': '8mIrX_LrOnAqWsB5JrOojQ', 'funny': 0, 'useful': 0, 'text': 'Like walking back in time, every Saturday morning my sister and I was in a bowling league and after we were done, we\'d spend a few quarters playing the pin ball machines until our mother came to pick us up.\n\nMy sister was daring and play the machines hard, she was afraid of that "tilt" showing up and freezing the game.  I, on the other hand was a bit more gentler and wanted to make sure I got my quarter\'s worth.\n\nThis place has rows and rows of machines, some are really old and some are more of a mid 80\'s theme.  There is even a Ms pac man!  It was fun to spend an afternoon playing the machines and remembering all the fun of my early teen years.'}
36692


In [23]:
path = '../yelp_dataset/rates/'
filenames = ['train_data', 'valid_data', 'test_data', 'valid_with_neg_sample', 'test_with_neg_sample']
objs = [train_data, valid_data, test_data, valid_with_neg_sample, test_with_neg_sample]
for file, obj in zip(filenames, objs):
    write_pickle(path+file+'.pickle', obj)

In [24]:
# get adj matrices
adj_UB, adj_UUB, adj_UBUB, adj_UBCaB, adj_UBCiB \
    = get_adj_matrix(uid2ind, bid2ind, ct_id2ind, ca_id2ind, users, businesses, train_data)

648 637 7 255
(637, 7)
business_id: 0, city_id: 5
business_id: 1, city_id: 4
business_id: 2, city_id: 5
business_id: 3, city_id: 4
business_id: 4, city_id: 4
business_id: 5, city_id: 4
business_id: 6, city_id: 4
business_id: 7, city_id: 4
business_id: 8, city_id: 4
business_id: 9, city_id: 4
business_id: 10, city_id: 5
business_id: 11, city_id: 4
business_id: 12, city_id: 4
business_id: 13, city_id: 4
business_id: 14, city_id: 4
business_id: 15, city_id: 4
business_id: 16, city_id: 4
business_id: 17, city_id: 4
business_id: 18, city_id: 4
business_id: 19, city_id: 4
business_id: 20, city_id: 4
business_id: 21, city_id: 4
business_id: 22, city_id: 4
business_id: 23, city_id: 4
business_id: 24, city_id: 4
business_id: 25, city_id: 4
business_id: 26, city_id: 4
business_id: 27, city_id: 4
business_id: 28, city_id: 4
business_id: 29, city_id: 4
business_id: 30, city_id: 4
business_id: 31, city_id: 4
business_id: 32, city_id: 4
business_id: 33, city_id: 4
business_id: 34, city_id: 4
busines

business_id: 405, city_id: 4
business_id: 406, city_id: 4
business_id: 407, city_id: 5
business_id: 408, city_id: 4
business_id: 409, city_id: 4
business_id: 410, city_id: 4
business_id: 411, city_id: 4
business_id: 412, city_id: 4
business_id: 413, city_id: 4
business_id: 414, city_id: 4
business_id: 415, city_id: 4
business_id: 416, city_id: 4
business_id: 417, city_id: 4
business_id: 418, city_id: 4
business_id: 419, city_id: 4
business_id: 420, city_id: 4
business_id: 421, city_id: 4
business_id: 422, city_id: 4
business_id: 423, city_id: 4
business_id: 424, city_id: 4
business_id: 425, city_id: 4
business_id: 426, city_id: 4
business_id: 427, city_id: 4
business_id: 428, city_id: 4
business_id: 429, city_id: 4
business_id: 430, city_id: 4
business_id: 431, city_id: 4
business_id: 432, city_id: 4
business_id: 433, city_id: 4
business_id: 434, city_id: 5
business_id: 435, city_id: 4
business_id: 436, city_id: 5
business_id: 437, city_id: 4
business_id: 438, city_id: 4
business_id: 4

In [25]:
print(adj_UB.shape)
print(adj_UBCaB.shape)
print(adj_UBCaB[0:100][0:100])

(648, 637)
(648, 637)
[[395.  92. 110. ... 361. 216. 454.]
 [100.  14.  21. ...  92.  58. 107.]
 [182.  46.  54. ... 195. 102. 214.]
 ...
 [ 91.  26.  22. ...  79.  47.  95.]
 [387.  86. 109. ... 353. 224. 415.]
 [114.  70.  19. ... 170.  65. 116.]]


In [26]:
adjs = [adj_UB, adj_UUB, adj_UBUB, adj_UBCaB, adj_UBCiB]
filenames = ['adj_UB', 'adj_UUB', 'adj_UBUB', 'adj_UBCaB', 'adj_UBCiB']
path = '../yelp_dataset/adjs/'
for adj, file in zip(adjs, filenames):
    write_pickle(path+file+'.pickle', adj)

In [27]:
filenames = ['uid2ind', 'bid2ind', 'ct_id2ind', 'ca_id2ind', 'ind2uid', 'ind2bid', 'ind2ct_id', 'ind2ca_id']
maps = [uid2ind, bid2ind, ct_id2ind, ca_id2ind, ind2uid, ind2bid, ind2ct_id, ind2ca_id]
path = '../yelp_dataset/adjs/'
for mapping, file in zip(maps, filenames):
    write_pickle(path+file+'.pickle', mapping)

In [8]:
# load dictionaries
path = '../yelp_dataset/adjs/'
uid2ind = read_pickle(path+'uid2ind.pickle')
bid2ind = read_pickle(path+'bid2ind.pickle')
ct_id2ind = read_pickle(path+'ct_id2ind.pickle')
ca_id2ind = read_pickle(path+'ca_id2ind.pickle')
ind2uid = read_pickle(path+'ind2uid.pickle')
ind2bid = read_pickle(path+'ind2bid.pickle')
ind2ct_id = read_pickle(path+'ind2ct_id.pickle')
ind2ca_id = read_pickle(path+'ind2ca_id.pickle')

In [30]:
del reviews

NameError: name 'reviews' is not defined

In [112]:
train_data = read_pickle('../yelp_dataset/rates/train_data.pickle')
print(type(train_data))

<class 'list'>


In [113]:
print(train_data[0])

{'business_id': 181, 'rate': 1.0, 'user_id': 21, 'timestamp': 1116877827.0}


In [99]:
def make_embedding(user_features, item_features):
    user_concat = torch.cat(user_features, 1)
    item_concat = torch.cat(item_features, 1)
    X = []
    for user in user_concat:
        tmp = [torch.cat([user,item], 0).unsqueeze(0) for item in item_concat]
        print("tmp[0].shape", tmp[0].shape)
        tmp = torch.cat(tmp, 0)
        X.append(tmp)
    X = torch.cat(X, 0)
    return X

In [3]:
def load_feature(feature_path, metapaths):
    user_features = [read_pickle(feature_path+metapath+'_user.pickle') for metapath in metapaths]
    item_features = [read_pickle(feature_path+metapath+'_item.pickle') for metapath in metapaths]
        
    return user_features, item_features


In [106]:
def make_labels(Y, n_user, n_item):
    r"""
    Parameter
    ---------
    Y: list of dict
        saves the interaction information in COO form
    
    Return
    ------
    ret: torch.tensor
        still in COO form
    """
    indices = np.array(([y['user_id'] for y in Y], [y['business_id'] for y in Y]))
    values = np.array([1. for y in Y])
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    ret = torch.sparse_coo_tensor(indices, values, size=(n_user,n_item),
                                  dtype=torch.float32, device=device, requires_grad=False)
    return ret

In [115]:
Y = make_labels(train_data, 648, 637)
print(Y.shape)
print(Y)
print(type(Y))
dense = Y.to_dense()
x = [0, 1, 2, 3]
y = [0, 1, 2, 3]
print(dense)
dense[x, y]
# print(dense[99])

torch.Size([648, 637])
tensor(indices=tensor([[ 21,  99,  99,  ..., 530, 568, 118],
                       [181, 569, 290,  ...,  34,   9, 201]]),
       values=tensor([1., 1., 1.,  ..., 1., 1., 1.]),
       device='cuda:0', size=(648, 637), nnz=36692, layout=torch.sparse_coo)
<class 'torch.Tensor'>
tensor([[0., 0., 0.,  ..., 1., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')


tensor([0., 0., 0., 0.], device='cuda:0')

In [4]:
featurepath = '../yelp_dataset/mf_features/'
metapaths = ['UB', 'UUB', 'UBUB', 'UBCaB', 'UBCiB']

In [8]:
user_features, item_features = load_feature(featurepath, metapaths)

In [104]:
# this is for test
user_features = tuple(torch.Tensor(np.zeros((3, 3))) for i in range(2)) # two 3*3 matrices
item_features = tuple(torch.Tensor(np.ones((5, 3))) for i in range(2))  # two 5*3 matrices

print("user_features:", user_features)

user_concat = torch.cat(user_features, 1)
print(user_concat[:, 0:2])
print("user_concat:", user_concat)
item_concat = torch.cat(item_features, 1)
print("item_concat:", item_concat)
user = user_concat[0]
item = item_concat[0]
print(user, item)
user = user_concat[0].view(1, 2, 3)
item = item_concat[0].view(1, 2, 3)
print(user, item)
ui_concat = torch.cat([user, item], 1)
print("ui_concat:", ui_concat)

user_features: (tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]), tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]))
tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])
user_concat: tensor([[0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]])
item_concat: tensor([[1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.]])
tensor([0., 0., 0., 0., 0., 0.]) tensor([1., 1., 1., 1., 1., 1.])
tensor([[[0., 0., 0.],
         [0., 0., 0.]]]) tensor([[[1., 1., 1.],
         [1., 1., 1.]]])
ui_concat: tensor([[[0., 0., 0.],
         [0., 0., 0.],
         [1., 1., 1.],
         [1., 1., 1.]]])


In [100]:
X = make_embedding(user_features, item_features)
print(X)

tmp[0].shape torch.Size([1, 12])
tmp[0].shape torch.Size([1, 12])
tmp[0].shape torch.Size([1, 12])
tensor([[0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.]])


In [61]:
print(X.shape)

torch.Size([648, 637, 100])


In [6]:
def filter_rare_node(users, businesses, reviews, user_threshold, business_threshold, friend_threshold):
    continue_filter = True
    filtered_users = set()
    filtered_businesses = set()
    while(continue_filter):
        continue_filter = False
        user_interact_num = {}
        business_interact_num = {}
        user_business_interact = set()
        for review in reviews:
            if not review['date']:
                continue
            user_id = review['user_id']
            business_id = review['business_id']
            user_business = str(user_id)+str(business_id)
            if user_business not in user_business_interact:
                user_interact_num[user_id] = user_interact_num.get(user_id, 0) + 1
                business_interact_num[business_id] = business_interact_num.get(business_id, 0) + 1
                user_business_interact.add(user_business)
        filtered_review_users = set(u for u in user_interact_num.keys() if user_interact_num[u]>=user_threshold)
        filtered_review_businesses = set(b for b in business_interact_num.keys() if business_interact_num[b]>=business_threshold)
        if (filtered_users != filtered_review_users) or (filtered_businesses != filtered_review_businesses):
            continue_filter = True
        # filter step 2
        #filter user and business
        user_friends_dict = {}
        for user in users:
            user_id = user['user_id']
            if user_id not in filtered_review_users:
                continue
            if not user['friends']:
                continue
            filtered_friends = [friend.strip() for friend in user['friends'].split(',') if friend.strip() in filtered_review_users]
            if len(filtered_friends) >= friend_threshold:
                user_friends_dict[user_id] = filtered_friends
        continue_inside = True
        while (continue_inside):
            friends = {}
            continue_inside = False
            for user, user_friends in user_friends_dict.items():
                filtered_friends = [friend for friend in user_friends if friend in user_friends_dict]
                if len(filtered_friends) >= friend_threshold:
                    friends[user] = filtered_friends
                else:
                    continue_inside = True
            user_friends_dict = deepcopy(friends)
        filtered_users = set(user_friends_dict.keys())
        filtered_businesses_list = []
        for business in businesses:
            business_id = business['business_id']
            if business_id not in filtered_review_businesses:
                continue
            if not business['categories']:
                continue
            if not business['city']:
                continue
            filtered_businesses_list.append(business_id)
        filtered_businesses = set(filtered_businesses_list)
        filtered_review = []
        user_business_interact = set()
        for review in reviews:
            if not review['date']:
                continue
            if (review['user_id'] in filtered_users) and (review['business_id'] in filtered_businesses):
                user_id = review['user_id']
                business_id = review['business_id']
                user_business = str(user_id) + str(business_id)
                if user_business not in user_business_interact:
                    filtered_review.append(review)
                    user_business_interact.add(user_business)
        reviews = deepcopy(filtered_review)
        print(len(list(filtered_users)))
        print(len(list(filtered_businesses)))
        print(len(reviews))
        print('filter loop')
    print('filter complete')
    return filtered_users, filtered_businesses, filtered_review


In [24]:
from copy import deepcopy
users_small, busi_small, reviews_small = filter_rare_node(users, businesses, reviews, 40, 44, 4)
print(len(users_small))
print(len(busi_small))
print(len(reviews_small))

5647
5153
349913
filter loop
3613
3426
236230
filter loop
2707
2445
174828
filter loop
2095
1953
139223
filter loop
1727
1574
115435
filter loop
1451
1350
100099
filter loop
1277
1222
89362
filter loop
1171
1082
80145
filter loop
1032
986
71864
filter loop
946
881
65722
filter loop
864
816
61480
filter loop
818
784
58698
filter loop
796
764
57019
filter loop
770
743
55129
filter loop
751
725
53627
filter loop
735
709
52327
filter loop
713
703
51216
filter loop
711
690
50583
filter loop
691
689
49752
filter loop
691
672
49025
filter loop
679
672
48564
filter loop
679
659
48005
filter loop
662
659
47346
filter loop
662
644
46705
filter loop
652
644
46319
filter loop
652
638
46064
filter loop
649
638
45947
filter loop
649
637
45904
filter loop
648
637
45865
filter loop
648
637
45865
filter loop
filter complete
648
637
45865


In [20]:
users_small = []
for user in users_comp:
    for u in users:
        if user['user_id'] == u:
            users_small.append(user)
print(len(users_small))
print(users_small[0].keys())

648
dict_keys(['elite', 'yelping_since', 'friends', 'average_stars', 'review_count', 'compliment_cute', 'compliment_note', 'user_id', 'compliment_hot', 'compliment_more', 'compliment_cool', 'cool', 'compliment_plain', 'compliment_funny', 'compliment_writer', 'fans', 'compliment_photos', 'compliment_list', 'name', 'compliment_profile', 'useful', 'funny'])


In [21]:
busi_small = []
for busi in busi_comp:
    for b in businesses:
        if busi['business_id'] == b:
            busi_small.append(busi)
print(len(busi_small))
print(busi_small[0].keys())

637
dict_keys(['categories', 'is_open', 'hours', 'attributes', 'address', 'longitude', 'name', 'state', 'postal_code', 'latitude', 'business_id', 'review_count', 'stars', 'city'])


In [22]:
write_pickle('../yelp_dataset/filtered/users-small.pickle', users_small)

In [23]:
write_pickle('../yelp_dataset/filtered/businesses-small.pickle', busi_small)

In [27]:
write_pickle('../yelp_dataset/filtered/reviews-small.pickle', reviews_small)

In [26]:
# test sparsity
adj = read_pickle('../yelp_dataset/adjs/adj_UBCiB.pickle')
print(type(adj))
# for i in adj_UB

<class 'numpy.ndarray'>


In [27]:
import scipy.sparse as sp
sparse = sp.csr_matrix(adj)

In [28]:
nnz = sparse.nnz
size = adj.shape[0] * adj.shape[1]
print(nnz / size)

0.9474751439037153


注意到除了UB的稠密度只有0.08，其余的都有至少0.85以上，说明metapath的邻接矩阵实际上很稠密

In [62]:
x = torch.ones(10)*10
x[3] = 7
x = x.unsqueeze(1)
print(x)
V = torch.ones(10, 5)
V[:, 3] *= 3
print(V)
# print(torch.matmul(x, V))
out = torch.mul(x, V).sum(1, keepdim=True)
print(out)
out_t = out.sum(0).squeeze()
print(out_t.size())
print(out_t)

tensor([[10.],
        [10.],
        [10.],
        [ 7.],
        [10.],
        [10.],
        [10.],
        [10.],
        [10.],
        [10.]])
tensor([[1., 1., 1., 3., 1.],
        [1., 1., 1., 3., 1.],
        [1., 1., 1., 3., 1.],
        [1., 1., 1., 3., 1.],
        [1., 1., 1., 3., 1.],
        [1., 1., 1., 3., 1.],
        [1., 1., 1., 3., 1.],
        [1., 1., 1., 3., 1.],
        [1., 1., 1., 3., 1.],
        [1., 1., 1., 3., 1.]])
tensor([[70.],
        [70.],
        [70.],
        [49.],
        [70.],
        [70.],
        [70.],
        [70.],
        [70.],
        [70.]])
torch.Size([])
tensor(679.)


In [64]:
out_t = out_t.repeat(1, 2)
print(out_t)
out_t[0][1] = -out_t[0][0]
print(out_t[0][1])
print(out_t)

tensor([[ 679., -679.,  679., -679.]])
tensor(-679.)


AttributeError: 'Tensor' object has no attribute 'astype'

In [141]:
valid_data = read_pickle('../yelp_dataset/rates/valid_with_neg_sample.pickle')

In [146]:
pos = [[y['user_id'], pos_id, 1] for y in valid_data for pos_id in y['pos_business_id']]
neg = [[y['user_id'], neg_id, 0] for y in valid_data for neg_id in y['neg_business_id']]
pos

[[0,
  [515,
   507,
   187,
   165,
   567,
   526,
   365,
   204,
   591,
   520,
   534,
   97,
   618,
   93,
   577,
   338],
  1],
 [1, [29], 1],
 [2,
  [10,
   615,
   73,
   497,
   540,
   37,
   398,
   356,
   255,
   513,
   251,
   233,
   12,
   325,
   625,
   484,
   520,
   353],
  1],
 [3, [200, 471, 503, 415, 256, 602, 456, 177], 1],
 [4, [493, 395, 310], 1],
 [5, [22], 1],
 [7,
  [596,
   122,
   47,
   345,
   204,
   576,
   535,
   75,
   565,
   600,
   569,
   228,
   528,
   306,
   31,
   154,
   333,
   43,
   254,
   439],
  1],
 [9, [376, 527, 515], 1],
 [14, [225, 57, 162, 494, 54, 171], 1],
 [15, [515, 413, 294, 27, 305, 401, 6, 197, 552, 14, 585], 1],
 [17, [389, 109, 22, 176], 1],
 [18, [173, 49, 332, 183, 513, 589, 311, 546, 337, 274, 23, 106, 357], 1],
 [19, [493, 362, 345, 204, 395], 1],
 [22, [74, 598, 494, 159, 295, 117, 61, 320], 1],
 [23, [348, 75, 227, 93, 484, 197, 302, 107], 1],
 [25, [314, 156], 1],
 [26,
  [583, 426, 126, 445, 135, 171, 22