In [1]:
import json
import pandas as pd

In [2]:
# Weights used when combining the two between-collection periods into one trust
# score: index 0 weights the first (earlier) period, index 1 the second (later) one.
trust_score_weight = [0.3, 0.7]

# Compute Foursquare trust score for each tuple

In [72]:
def open_fs_json(fs_file_name):
    """Read a Foursquare json file.

    input: foursquare file name
    output: json_formatted restaurant tuples (whatever json.load parses from the file)
    """
    # Pin the encoding: without it open() falls back to the platform default,
    # which can mis-decode non-ASCII text (e.g. venue names) on some systems.
    with open(fs_file_name, encoding='utf-8') as fs_json:
        return json.load(fs_json)

In [73]:
# load foursquare restaurant tuples for each data collection day: 0324, 0331, 0406
fs_restaurants_0324 = open_fs_json('Semiar_FS_0324.json')
fs_restaurants_0331 = open_fs_json('Semiar_FS_0331.json')
fs_restaurants_0406 = open_fs_json('Semiar_FS_0406.json')

In [6]:
def json_to_checkin(fs_restaurants):
    """Collect the check-in count of every Foursquare restaurant tuple.

    input: json_formatted restaurant tuples
    output: list of check-in counts, in the same order as the input
    """
    return [
        venue_record['response']['venue']['stats']['checkinsCount']
        for venue_record in fs_restaurants
    ]

In [7]:
# generate foursquare restaurant checkin lists for each data collection day: 0324, 0331, 0406
restaurant_checkins_list_0324 = json_to_checkin(fs_restaurants_0324)
restaurant_checkins_list_0331 = json_to_checkin(fs_restaurants_0331)
restaurant_checkins_list_0406 = json_to_checkin(fs_restaurants_0406)

In [8]:
def checkin_to_diff(checkins_lists, durations):
    """Compute per-day check-in changes between consecutive collection dates.

    input: checkins_lists - one list of check-in counts per collection date;
               position i of every list is paired with position i of the next
           durations - length (number of days) of each period, where
               durations[k] is the period between checkins_lists[k] and
               checkins_lists[k+1]
    output: one list per period, holding (later - earlier) / duration for each
            paired position
    """
    daily_diffs_per_period = []
    for period in range(len(checkins_lists) - 1):
        earlier = checkins_lists[period]
        later = checkins_lists[period + 1]
        # zip pairs the two snapshots position by position (stops at the shorter one)
        daily_diffs_per_period.append(
            [(after - before) / durations[period] for before, after in zip(earlier, later)]
        )
    return daily_diffs_per_period

In [9]:
# one check-in list per data collection day, ordered 0324, 0331, 0406
checkins_lists = [restaurant_checkins_list_0324, restaurant_checkins_list_0331, restaurant_checkins_list_0406]
# number of days in each period between consecutive collection days:
# 0324 -> 0331 is 7 days, 0331 -> 0406 is 6 days
durations = [7,6]

In [10]:
# lists of checkins differences
checkins_diffs_list = checkin_to_diff(checkins_lists, durations)

In [11]:
# zipped difference together
checkins_diffs_zipped = list(zip(*checkins_diffs_list))

In [12]:
# Foursquare
# trust value for each tuple: weighted sum of the absolute daily check-in
# differences of the two collection periods
trust_scores_list_FS = [
    trust_score_weight[0]*abs(period_diffs[0]) + trust_score_weight[1]*abs(period_diffs[1])
    for period_diffs in checkins_diffs_zipped
]

In [14]:
# trust score for the first 5 restaurants
trust_scores_list_FS[0:5]

[0.04285714285714285, 0.0, 0.2333333333333333, 0.11666666666666665, 0.0]

In [18]:
# dataframe to save trust scores
trust_scores_df = pd.DataFrame({'trust_score_FS':trust_scores_list_FS})

In [20]:
trust_scores_df.to_pickle('trust_scores_df')

# Compute Yelp trust score for each tuple

In [3]:
with open('Semiar_Yelp_0331.json') as Yelp_json:
    Yelp_restaurants = json.load(Yelp_json)

In [16]:
'id' in Yelp_restaurants[4]

False

In [10]:
def open_Yelp_json(Yelp_file_name):
    """Read a Yelp json file.

    input: Yelp file name
    output: json_formatted restaurant tuples (whatever json.load parses from the file)
    """
    # Pin the encoding: without it open() falls back to the platform default,
    # which can mis-decode non-ASCII text on some systems.
    with open(Yelp_file_name, encoding='utf-8') as Yelp_json:
        return json.load(Yelp_json)

In [11]:
# load Yelp restaurant tuples for each data collection day: 0324, 0331, 0406
Yelp_restaurants_0324 = open_Yelp_json('Semiar_Yelp_0324.json')
Yelp_restaurants_0331 = open_Yelp_json('Semiar_Yelp_0331.json')
Yelp_restaurants_0406 = open_Yelp_json('Semiar_Yelp_0406.json')

In [21]:
def availability_Yelp(Yelp_json_list):
    """Flag which Yelp restaurant tuples are available on each date.

    A tuple counts as available when it contains an 'id' key.

    input: list of Yelp json contents, one entry per date
    output: one list of booleans per date, aligned with that date's tuples
    """
    return [
        ['id' in record for record in daily_records]
        for daily_records in Yelp_json_list
    ]

In [22]:
Yelp_json_list = [Yelp_restaurants_0324, Yelp_restaurants_0331, Yelp_restaurants_0406]
Yelp_availability = availability_Yelp(Yelp_json_list)

In [25]:
# zip availability
Yelp_availability_zipped = list(zip(*Yelp_availability))

In [30]:
# For every restaurant, list the date indices (positions within its
# availability tuple) on which its Yelp record was available.
# Plain truthiness replaces the `== True` comparison and enumerate replaces
# the manual range(len(...)) indexing.
idx_to_check = [
    [date_idx for date_idx, is_available in enumerate(availability) if is_available]
    for availability in Yelp_availability_zipped
]

In [35]:
def diff_Yelp(Yelp_tuple_0, Yelp_tuple_1):
    """Decide whether a pair of Yelp tuples earns a trust score.

    input: two Yelp json tuples to compare
    output: 1 if their 'image_url' or their 'photos' differ, otherwise 0
    """
    # 'photos' is only compared when the image urls are equal (short-circuit or)
    changed = (Yelp_tuple_0['image_url'] != Yelp_tuple_1['image_url']) or (
        Yelp_tuple_0['photos'] != Yelp_tuple_1['photos']
    )
    return 1 if changed else 0

In [37]:
# Yelp trust score per restaurant, based on the dates its record was available:
#   3 dates -> weighted sum of the two consecutive diff_Yelp results
#   2 dates -> the single diff_Yelp result, unweighted
#   fewer   -> 0
trust_scores_list_Yelp = []
for restaurant_idx in range(len(Yelp_restaurants_0324)):
    available_dates = idx_to_check[restaurant_idx]
    # this restaurant's record on each date it was available
    records = [Yelp_json_list[date_idx][restaurant_idx] for date_idx in available_dates]
    if len(available_dates) == 3:
        trust_score_Yelp = (
            trust_score_weight[0] * diff_Yelp(records[0], records[1])
            + trust_score_weight[1] * diff_Yelp(records[1], records[2])
        )
    elif len(available_dates) == 2:
        trust_score_Yelp = diff_Yelp(records[0], records[1])
    else:
        trust_score_Yelp = 0
    trust_scores_list_Yelp.append(trust_score_Yelp)

In [42]:
trust_scores_df = pd.read_pickle('trust_scores_df')

In [44]:
se = pd.Series(trust_scores_list_Yelp)
trust_scores_df['trust_score_Yelp'] = se.values

In [46]:
se = pd.Series(idx_to_check)
trust_scores_df['idx_to_check_Yelp'] = se.values

In [54]:
trust_scores_list_FS = trust_scores_df['trust_score_FS'].tolist()

In [58]:
sorted(trust_scores_list_FS, reverse = True)[0:5]

[49.10238095238095,
 9.72857142857143,
 1.9166666666666665,
 1.5857142857142856,
 1.5]

In [56]:
# Min-max normalize trust_score_FS.
# Scores above the cutoff are treated as outliers (49.10238095238095 and
# 9.72857142857143 in the collected data): they are clamped to 1 and excluded
# when choosing the maximum used for scaling. Deriving that maximum from the
# data replaces a hard-coded 1.9166666666666665, which is the largest
# non-outlier score in the collected data.
FS_OUTLIER_CUTOFF = 2
min_trust_score_FS = min(trust_scores_list_FS)
max_trust_score_FS = max(
    score for score in trust_scores_list_FS if score <= FS_OUTLIER_CUTOFF
)
score_range_FS = max_trust_score_FS - min_trust_score_FS

trust_scores_list_FS_norm = []
for trust_score in trust_scores_list_FS:
    if trust_score > FS_OUTLIER_CUTOFF:
        trust_scores_list_FS_norm.append(1)
    else:
        trust_score_norm = (float(trust_score) - min_trust_score_FS)/score_range_FS
        trust_scores_list_FS_norm.append(trust_score_norm)

In [59]:
se = pd.Series(trust_scores_list_FS_norm)
trust_scores_df['trust_score_FS_norm'] = se.values

In [60]:
trust_scores_df.to_pickle('trust_scores_df')

# Merge Foursquare and Yelp

In [65]:
# nullify trust_score_Yelp with no Yelp records
# Rows whose idx_to_check_Yelp list is empty have no Yelp record on any date.
# Assign through a single .loc call so the write lands in trust_scores_df
# itself; chained indexing (df[col][row] = ...) goes through an intermediate
# Series, triggers SettingWithCopyWarning and is not guaranteed to update the frame.
has_no_yelp_record = (
    trust_scores_df['idx_to_check_Yelp'].map(lambda dates: not dates).astype(bool)
)
# NaN is what a missing value becomes in this numeric column
trust_scores_df.loc[has_no_yelp_record, 'trust_score_Yelp'] = float('nan')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [69]:
str(trust_scores_df['trust_score_Yelp'][4] ) == 'nan'

True

In [82]:
# Price tier and rating of every venue in the 0406 Foursquare data; a venue
# lacking the field gets None so both lists stay aligned with the restaurants.
fs_venues_recent = [restaurant['response']['venue'] for restaurant in fs_restaurants_0406]
Foursquare_prices_recent = [
    venue['price']['tier'] if 'price' in venue else None
    for venue in fs_venues_recent
]
Foursquare_ratings_recent = [
    venue['rating'] if 'rating' in venue else None
    for venue in fs_venues_recent
]

In [90]:
se = pd.Series(Foursquare_prices_recent)
trust_scores_df['FS_prices_recent'] = se.values

In [91]:
se = pd.Series(Foursquare_ratings_recent)
trust_scores_df['FS_ratings_recent'] = se.values

In [93]:
trust_scores_df.to_pickle('trust_scores_df')

In [96]:
Yelp_restaurants_0324[0]['price']

'$'

In [97]:
Yelp_restaurants_0324[0]['rating']

2.5

In [98]:
trust_scores_df

Unnamed: 0,trust_score_FS,trust_score_Yelp,idx_to_check_Yelp,trust_score_FS_norm,FS_prices_recent,FS_ratings_recent
0,0.042857,0.0,"[0, 1, 2]",0.022360,1.0,6.8
1,0.000000,0.0,"[0, 1, 2]",0.000000,1.0,6.0
2,0.233333,0.0,"[0, 1, 2]",0.121739,1.0,7.3
3,0.116667,0.7,"[0, 1, 2]",0.060870,1.0,5.5
4,0.000000,,[],0.000000,1.0,6.0
5,0.000000,0.0,"[0, 1, 2]",0.000000,1.0,7.2
6,0.000000,0.0,"[0, 1, 2]",0.000000,1.0,6.9
7,0.000000,1.0,"[0, 2]",0.000000,1.0,5.3
8,0.116667,0.0,"[0, 1, 2]",0.060870,1.0,5.1
9,0.000000,0.0,"[0, 1, 2]",0.000000,1.0,7.3


In [105]:
# Price and rating from each restaurant's latest available Yelp record
# (the last date index in idx_to_check_Yelp); None when there is no record
# or the field is missing/empty.
Yelp_prices_recent = []
Yelp_ratings_recent = []
for restaurant_idx in range(len(trust_scores_df)):
    available_dates = trust_scores_df['idx_to_check_Yelp'][restaurant_idx]
    if not available_dates:
        Yelp_prices_recent.append(None)
        Yelp_ratings_recent.append(None)
        continue

    latest_record = Yelp_json_list[available_dates[-1]][restaurant_idx]

    # the price is stored by its length, i.e. the number of characters in it
    Yelp_price = latest_record['price'] if 'price' in latest_record else None
    Yelp_prices_recent.append(len(Yelp_price) if Yelp_price else None)

    Yelp_rating = latest_record['rating'] if 'rating' in latest_record else None
    Yelp_ratings_recent.append(Yelp_rating if Yelp_rating else None)

In [107]:
len(Yelp_prices_recent)

1738

In [108]:
len(Yelp_ratings_recent)

1738

In [109]:
se = pd.Series(Yelp_prices_recent)
trust_scores_df['Yelp_prices_recent'] = se.values

In [110]:
se = pd.Series(Yelp_ratings_recent)
trust_scores_df['Yelp_ratings_recent'] = se.values

In [112]:
trust_scores_df.to_pickle('trust_scores_df')