In [1]:
import math
import numpy as np
import pandas as pd
import multiprocessing as mp

## 1. Load Data

In [2]:
yelp_lv_rvs = pd.read_csv('../../dataset/las_vegas/las_vegas_review_with_stars_time_scaled_with_db_id.csv')

In [3]:
yelp_lv_rvs[:5]

Unnamed: 0,db_id,review_id,user_db_id,business_db_id,stars,year,stars_time_scaled
0,3,---3OXpexMp0oAg77xWfYA,999269,92729,5,2012,2.5
1,6,---94vtJ_5o_nikEs6hUjg,313272,122971,5,2014,4.403985
2,8,---D6-P4MpS86LYldBfX7w,735101,160943,4,2016,3.928055
3,20,---WDP9kwKyVQiw9GTgNmQ,1045600,12131,1,2014,0.880797
4,22,---zHMCae68gIbSbtXxD5w,971613,15470,4,2015,3.810297


## 2. Calculate User2Business Maps

In [4]:
users = yelp_lv_rvs.user_db_id.unique()

In [5]:
def calc_user_bizes(user):
    user_bizes = dict()
    user_bizes[user] = dict()
    rvs = yelp_lv_rvs[yelp_lv_rvs.user_db_id==user]
    for index, row in rvs.iterrows():
        biz = row['business_db_id']
        user_bizes[user][biz] = dict(stars_time_scaled=row['stars_time_scaled'], year=row['year'])
    return user_bizes

In [6]:
pool = mp.Pool(processes=12)
user_bizes = pool.map(calc_user_bizes, users)
pool.close()
pool.join()

In [7]:
user_biz_maps = {user_biz.keys()[0]: user_biz.values()[0] for user_biz in user_bizes}

In [8]:
user_biz_maps[76887]

{5813: {'stars_time_scaled': 3.8102965072897335, 'year': 2015},
 26820: {'stars_time_scaled': 3.8102965072897335, 'year': 2015},
 30707: {'stars_time_scaled': 2.8577223804672998, 'year': 2015},
 58373: {'stars_time_scaled': 2.8577223804672998, 'year': 2015},
 62410: {'stars_time_scaled': 3.8102965072897335, 'year': 2015},
 77066: {'stars_time_scaled': 1.9051482536448667, 'year': 2015},
 105514: {'stars_time_scaled': 3.8102965072897335, 'year': 2015},
 123345: {'stars_time_scaled': 3.8102965072897335, 'year': 2015},
 124443: {'stars_time_scaled': 4.762870634112167, 'year': 2015},
 148480: {'stars_time_scaled': 3.8102965072897335, 'year': 2015},
 149926: {'stars_time_scaled': 2.8577223804672998, 'year': 2015},
 154973: {'stars_time_scaled': 4.762870634112167, 'year': 2015}}

## 3. Calculate User2User Similarities

这一步的计算将会耗费很多时间，请参考博客说明

In [9]:
alpha = 1.0
def time_attenuation(delta):
    return 1 / (1 + alpha * abs(delta))

In [10]:
def _calc_similarity(bizes_a, bizes_b):
    similarity = 0
    comm_bizes = set(bizes_a.keys()) & set(bizes_b.keys()) # Common businesses
    if len(comm_bizes) < 2: # Filter out those with number of common businesses less than 2
        return similarity

    bizes_ab_product = 0
    biz_a_length = 0
    biz_b_length = 0
    for biz in comm_bizes:
        stars_a = bizes_a[biz]['stars_time_scaled']
        stars_b = bizes_b[biz]['stars_time_scaled']
        year_a = bizes_a[biz]['year']
        year_b = bizes_b[biz]['year']
        bizes_ab_product += stars_a * stars_b * time_attenuation(year_a - year_b)
        biz_a_length += pow(stars_a, 2)
        biz_b_length += pow(stars_b, 2)

    similarity = bizes_ab_product / (math.sqrt(biz_a_length) * math.sqrt(biz_b_length))
    return similarity

In [11]:
calculated = dict()

In [12]:
def calc_similarity(user_a):
    user_similarities = list()
    bizes_a = user_biz_maps[user_a]
    for user_b in users:
        if user_a == user_b:
            continue
        if (user_a, user_b) in calculated or (user_b, user_a) in calculated:
            continue
        bizes_b = user_biz_maps[user_b]
        similarity = _calc_similarity(bizes_a, bizes_b)
        if similarity:
            user_similarities.append([user_a, user_b, similarity])
            calculated[(user_a, user_b)] = True

    return user_similarities

In [13]:
pool = mp.Pool(processes=12)
user_similarities = pool.map(calc_similarity, users)
pool.close()
pool.join()

## 4. Join Results Together and Save

In [14]:
all_user_similarities = list()
for similarities in user_similarities:
    all_user_similarities.extend(similarities)

In [15]:
len(all_user_similarities)

55134458

In [16]:
similarity_df = pd.DataFrame(all_user_similarities)

In [17]:
similarity_df.to_csv('../../dataset/las_vegas/las_vegas_user2user_similarities.csv', index=False)