In [1]:
import json
import pandas as pd

In [2]:
# weights of the two observation intervals inside a trust score:
# index 0 -> first interval (0324 -> 0331), index 1 -> second, more recent interval (0331 -> 0406)
trust_score_weight = [0.3, 0.7]
# small constant added to each source's trust score when the ratings are merged,
# so the weighted average stays defined when both trust scores are 0
smoother = 0.001

# Compute Foursquare trust score for each tuple

In [3]:
def open_fs_json(fs_file_name):
    """Load one Foursquare JSON dump from disk
    input: path of the foursquare json file
    output: the parsed JSON content (restaurant tuples)
    """
    with open(fs_file_name) as json_handle:
        # the file is closed automatically when the with-block is left
        return json.load(json_handle)

In [5]:
# load the foursquare restaurant tuples of every data collection day: 0324, 0331, 0406
fs_file_names = [
    'data/Semiar_FS_0324.json',
    'data/Semiar_FS_0331.json',
    'data/Semiar_FS_0406.json',
]
fs_restaurants_0324, fs_restaurants_0331, fs_restaurants_0406 = [
    open_fs_json(fs_file_name) for fs_file_name in fs_file_names
]

In [6]:
def json_to_checkin(fs_restaurants):
    """Collect the check-in count of every restaurant tuple
    input: json_formatted restaurant tuples
    output: list with one checkinsCount per restaurant, in input order
    """
    return [
        venue_record['response']['venue']['stats']['checkinsCount']
        for venue_record in fs_restaurants
    ]

In [7]:
# one foursquare check-in list per data collection day: 0324, 0331, 0406
(restaurant_checkins_list_0324,
 restaurant_checkins_list_0331,
 restaurant_checkins_list_0406) = [
    json_to_checkin(daily_restaurants)
    for daily_restaurants in (fs_restaurants_0324, fs_restaurants_0331, fs_restaurants_0406)
]

In [8]:
def checkin_to_diff(checkins_lists, durations):
    """Average daily check-in change between consecutive collection days
    input: all checkins lists (one per collection day), length (number of days) of each duration
    output: one list per pair of consecutive days, holding the per-restaurant
            check-in difference divided by the length of that duration
    """
    checkins_diffs_list = []
    # walk over (day k, day k+1) pairs of check-in lists
    consecutive_days = zip(checkins_lists, checkins_lists[1:])
    for interval_idx, (earlier_counts, later_counts) in enumerate(consecutive_days):
        daily_diffs = [
            (later - earlier) / durations[interval_idx]
            for earlier, later in zip(earlier_counts, later_counts)
        ]
        checkins_diffs_list.append(daily_diffs)
    return checkins_diffs_list

In [9]:
# check-in snapshots in collection order: 0324, 0331, 0406
checkins_lists = [restaurant_checkins_list_0324, restaurant_checkins_list_0331, restaurant_checkins_list_0406]
# length in days of each interval between consecutive snapshots (0324 -> 0331, 0331 -> 0406);
# checkin_to_diff divides each interval's difference by the matching entry
durations = [7,6]

In [10]:
# lists of checkins differences
checkins_diffs_list = checkin_to_diff(checkins_lists, durations)

In [11]:
# zipped difference together
# (transpose: one tuple per restaurant, holding its daily difference for each interval)
checkins_diffs_zipped = list(zip(*checkins_diffs_list))

In [12]:
# Foursquare
# trust value of a tuple = weighted sum of the absolute daily differences of both intervals
weight_first, weight_second = trust_score_weight[0], trust_score_weight[1]
trust_scores_list_FS = [
    weight_first * abs(diffs[0]) + weight_second * abs(diffs[1])
    for diffs in checkins_diffs_zipped
]

In [13]:
# trust score for the first 5 restaurants
trust_scores_list_FS[0:5]

[0.04285714285714285, 0.0, 0.2333333333333333, 0.11666666666666665, 0.0]

In [14]:
# dataframe to save trust scores
trust_scores_df = pd.DataFrame({'trust_score_FS':trust_scores_list_FS})

In [15]:
trust_scores_df.to_pickle('trust_scores_df')

# Compute Yelp trust score for each tuple

In [18]:
with open('data/Semiar_Yelp_0331.json') as Yelp_json:
    Yelp_restaurants = json.load(Yelp_json)

In [19]:
'id' in Yelp_restaurants[4]

False

In [20]:
def open_Yelp_json(Yelp_file_name):
    """Load one Yelp JSON dump from disk
    input: path of the Yelp json file
    output: the parsed JSON content (restaurant tuples)
    """
    with open(Yelp_file_name) as json_handle:
        # the file is closed automatically when the with-block is left
        return json.load(json_handle)

In [21]:
# load the Yelp restaurant tuples of every data collection day: 0324, 0331, 0406
Yelp_file_names = [
    'data/Semiar_Yelp_0324.json',
    'data/Semiar_Yelp_0331.json',
    'data/Semiar_Yelp_0406.json',
]
Yelp_restaurants_0324, Yelp_restaurants_0331, Yelp_restaurants_0406 = [
    open_Yelp_json(Yelp_file_name) for Yelp_file_name in Yelp_file_names
]

In [22]:
def availability_Yelp(Yelp_json_list):
    """Flag, per collection day, which Yelp restaurant entries are available
    input: list of Yelp json contents (one per collection day)
    output: one list of booleans per day; True when the entry contains an 'id'
    """
    return [
        ['id' in restaurant_entry for restaurant_entry in daily_entries]
        for daily_entries in Yelp_json_list
    ]

In [23]:
# Yelp snapshots in collection order: index 0 -> 0324, 1 -> 0331, 2 -> 0406
Yelp_json_list = [Yelp_restaurants_0324, Yelp_restaurants_0331, Yelp_restaurants_0406]
# per day, one boolean per restaurant entry: True when the entry contains an 'id'
Yelp_availability = availability_Yelp(Yelp_json_list)

In [24]:
# zip availability
# (transpose: one tuple per restaurant, holding its availability flag for each day)
Yelp_availability_zipped = list(zip(*Yelp_availability))

In [25]:
# for every restaurant, the indices of the collection days on which its Yelp entry is available
# (the flags are plain booleans produced by availability_Yelp)
idx_to_check = [
    [date_idx for date_idx, is_available in enumerate(availability) if is_available]
    for availability in Yelp_availability_zipped
]

In [26]:
def diff_Yelp(Yelp_tuple_0, Yelp_tuple_1):
    """Tell whether two Yelp tuples of the same restaurant differ in their pictures
    input: Yelp json tuples
    output: 1 when 'image_url' or 'photos' changed (add a trust score), 0 otherwise
    """
    # 'photos' is only compared when 'image_url' is unchanged
    if Yelp_tuple_0['image_url'] != Yelp_tuple_1['image_url']:
        return 1
    if Yelp_tuple_0['photos'] != Yelp_tuple_1['photos']:
        return 1
    return 0

In [27]:
# Yelp trust score per restaurant, based on picture changes between the days
# on which the restaurant was available
trust_scores_list_Yelp = []
for restaurant_idx in range(len(Yelp_restaurants_0324)):
    available_days = idx_to_check[restaurant_idx]
    # the restaurant's Yelp tuple on each available day, oldest first
    snapshots = [Yelp_json_list[day_idx][restaurant_idx] for day_idx in available_days]
    if len(snapshots) == 3:
        # available on all days: weight both consecutive comparisons
        trust_score_Yelp = trust_score_weight[0] * diff_Yelp(snapshots[0], snapshots[1]) \
                           + trust_score_weight[1] * diff_Yelp(snapshots[1], snapshots[2])
    elif len(snapshots) == 2:
        # only two days available: a single, unweighted comparison
        trust_score_Yelp = diff_Yelp(snapshots[0], snapshots[1])
    else:
        # fewer than two days available: nothing to compare
        trust_score_Yelp = 0
    trust_scores_list_Yelp.append(trust_score_Yelp)

In [28]:
trust_scores_df = pd.read_pickle('trust_scores_df')

In [29]:
se = pd.Series(trust_scores_list_Yelp)
trust_scores_df['trust_score_Yelp'] = se.values

In [30]:
se = pd.Series(idx_to_check)
trust_scores_df['idx_to_check_Yelp'] = se.values

In [31]:
trust_scores_list_FS = trust_scores_df['trust_score_FS'].tolist()

In [32]:
sorted(trust_scores_list_FS, reverse = True)[0:5]

[49.10238095238095,
 9.72857142857143,
 1.9166666666666665,
 1.5857142857142856,
 1.5]

In [33]:
# Treat 49.10238095238095 and 9.72857142857143 as outliers
# normalize trust_score_FS
# Scores above FS_OUTLIER_THRESHOLD are clipped to 1; all other scores are min-max scaled.
FS_OUTLIER_THRESHOLD = 2
min_trust_score_FS = min(trust_scores_list_FS)
# Largest non-outlier score, derived from the data so it always agrees with the
# threshold (previously a literal pasted from an earlier output).
max_trust_score_FS = max(
    (score for score in trust_scores_list_FS if score <= FS_OUTLIER_THRESHOLD),
    default=min_trust_score_FS,
)
score_range_FS = max_trust_score_FS - min_trust_score_FS
trust_scores_list_FS_norm = []
for trust_score in trust_scores_list_FS:
    if trust_score > FS_OUTLIER_THRESHOLD:
        trust_scores_list_FS_norm.append(1)
    elif score_range_FS == 0:
        # every non-outlier score is identical: avoid dividing by zero
        trust_scores_list_FS_norm.append(0.0)
    else:
        trust_scores_list_FS_norm.append((trust_score - min_trust_score_FS) / score_range_FS)

In [34]:
se = pd.Series(trust_scores_list_FS_norm)
trust_scores_df['trust_score_FS_norm'] = se.values

In [35]:
trust_scores_df.to_pickle('trust_scores_df')

# Merge Foursquare and Yelp

In [36]:
# nullify trust_score_Yelp with no Yelp records
# (an empty idx_to_check_Yelp list means the restaurant was not available on any day)
no_Yelp_record = trust_scores_df['idx_to_check_Yelp'].map(len) == 0
# A single .loc assignment writes into the frame itself. The chained form
# df[col][i] = ... triggers SettingWithCopyWarning and may only modify a temporary copy.
trust_scores_df.loc[no_Yelp_record, 'trust_score_Yelp'] = float('nan')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [37]:
# price tier and rating of every restaurant in the most recent Foursquare snapshot (0406);
# None where the venue has no 'price' / 'rating' entry
venues_0406 = [restaurant['response']['venue'] for restaurant in fs_restaurants_0406]
Foursquare_prices_recent = [
    venue['price']['tier'] if 'price' in venue else None
    for venue in venues_0406
]
Foursquare_ratings_recent = [venue.get('rating') for venue in venues_0406]

In [38]:
se = pd.Series(Foursquare_prices_recent)
trust_scores_df['FS_prices_recent'] = se.values

In [39]:
se = pd.Series(Foursquare_ratings_recent)
trust_scores_df['FS_ratings_recent'] = se.values

In [40]:
trust_scores_df.to_pickle('trust_scores_df')

In [41]:
# price level and rating of every restaurant, taken from its most recent available Yelp tuple
Yelp_prices_recent = []
Yelp_ratings_recent = []
for restaurant_idx, available_days in enumerate(trust_scores_df['idx_to_check_Yelp']):
    if not available_days:
        # never available on Yelp: nothing to report
        Yelp_prices_recent.append(None)
        Yelp_ratings_recent.append(None)
        continue
    # last entry of the list = latest day on which the restaurant was available
    latest_record = Yelp_json_list[available_days[-1]][restaurant_idx]
    # the price level is the length of the 'price' value; missing or empty values become None
    Yelp_price = latest_record.get('price')
    Yelp_prices_recent.append(len(Yelp_price) if Yelp_price else None)
    # missing or falsy ratings become None
    Yelp_rating = latest_record.get('rating')
    Yelp_ratings_recent.append(Yelp_rating if Yelp_rating else None)

In [42]:
se = pd.Series(Yelp_prices_recent)
trust_scores_df['Yelp_prices_recent'] = se.values

In [43]:
se = pd.Series(Yelp_ratings_recent)
trust_scores_df['Yelp_ratings_recent'] = se.values

In [44]:
trust_scores_df.to_pickle('trust_scores_df')

In [45]:
# let Yelp rating go to the same range as Foursquare
# (ratings are doubled; presumably a 5-point vs a 10-point scale -- TODO confirm)
# The column is recomputed from the Yelp_ratings_recent list rather than scaled in place:
# - re-running this cell no longer doubles the ratings a second time
# - the whole column is assigned at once, avoiding the chained df[col][i] *= 2 form
#   that triggers SettingWithCopyWarning and may only modify a temporary copy
trust_scores_df['Yelp_ratings_recent'] = (pd.Series(Yelp_ratings_recent) * 2).values

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [46]:
trust_scores_df.to_pickle('trust_scores_df')

In [47]:
def _merge_price(fs_price, yelp_price, fs_trust, yelp_trust):
    """Pick the price tier of the more trusted source.

    The Yelp price is ignored when it, or the Yelp trust score, is missing.
    Returns the only available price when one source is missing, None when
    both are missing; ties in trust go to Foursquare.
    """
    if pd.isna(yelp_price) or pd.isna(yelp_trust):
        return None if pd.isna(fs_price) else fs_price
    if pd.isna(fs_price):
        return yelp_price
    return fs_price if fs_trust >= yelp_trust else yelp_price


def _merge_rating(fs_rating, yelp_rating, fs_trust, yelp_trust):
    """Trust-weighted average of the Foursquare and Yelp ratings.

    The Yelp rating is ignored when it, or the Yelp trust score, is missing.
    Returns the only available rating when one source is missing, None when
    both are missing. `smoother` is added to each weight so the average stays
    defined when both trust scores are 0.
    """
    if pd.isna(yelp_rating) or pd.isna(yelp_trust):
        return None if pd.isna(fs_rating) else fs_rating
    # The Foursquare *rating* decides this branch (the earlier code tested the price column).
    if pd.isna(fs_rating):
        return yelp_rating
    denominator = fs_trust + yelp_trust + 2*smoother
    return (fs_trust + smoother)/denominator*fs_rating \
        + (yelp_trust + smoother)/denominator*yelp_rating


merge_columns = ['trust_score_FS_norm', 'trust_score_Yelp',
                 'FS_prices_recent', 'Yelp_prices_recent',
                 'FS_ratings_recent', 'Yelp_ratings_recent']
prices_merge = []
ratings_merge = []
# plain tuples in merge_columns order, one per restaurant
for fs_trust, yelp_trust, fs_price, yelp_price, fs_rating, yelp_rating \
        in trust_scores_df[merge_columns].itertuples(index=False, name=None):
    prices_merge.append(_merge_price(fs_price, yelp_price, fs_trust, yelp_trust))
    ratings_merge.append(_merge_rating(fs_rating, yelp_rating, fs_trust, yelp_trust))

In [48]:
se = pd.Series(prices_merge)
trust_scores_df['prices_merge'] = se.values

In [49]:
se = pd.Series(ratings_merge)
trust_scores_df['ratings_merge'] = se.values

In [50]:
trust_scores_df.to_pickle('trust_scores_df')

# Preview of Merged Price Tiers and Ratings

In [51]:
trust_scores_df['prices_merge'][0:5]

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: prices_merge, dtype: float64

In [52]:
trust_scores_df['ratings_merge'][0:5]

0    6.726109
1    4.000000
2    7.289494
3    5.040551
4    6.000000
Name: ratings_merge, dtype: float64

Unnamed: 0,trust_score_FS,trust_score_Yelp,idx_to_check_Yelp,trust_score_FS_norm,FS_prices_recent,FS_ratings_recent,Yelp_prices_recent,Yelp_ratings_recent,prices_merge,ratings_merge
0,0.042857,0.0,"[0, 1, 2]",0.022360,1.0,6.8,1.0,5.0,1.0,6.726109
1,0.000000,0.0,"[0, 1, 2]",0.000000,1.0,6.0,1.0,2.0,1.0,4.000000
2,0.233333,0.0,"[0, 1, 2]",0.121739,1.0,7.3,1.0,6.0,1.0,7.289494
3,0.116667,0.7,"[0, 1, 2]",0.060870,1.0,5.5,1.0,5.0,1.0,5.040551
4,0.000000,,[],0.000000,1.0,6.0,,,1.0,6.000000
5,0.000000,0.0,"[0, 1, 2]",0.000000,1.0,7.2,1.0,5.0,1.0,6.100000
6,0.000000,0.0,"[0, 1, 2]",0.000000,1.0,6.9,1.0,8.0,1.0,7.450000
7,0.000000,1.0,"[0, 2]",0.000000,1.0,5.3,1.0,4.0,1.0,4.001297
8,0.116667,0.0,"[0, 1, 2]",0.060870,1.0,5.1,1.0,6.0,1.0,5.114315
9,0.000000,0.0,"[0, 1, 2]",0.000000,1.0,7.3,1.0,7.0,1.0,7.150000
