In [15]:
import pandas as pd
import numpy as np
import networkx as nx
import os
import json
import string
from tqdm.notebook import tqdm

In [2]:
bussi_file = 'yelp_academic_dataset_business.json'
user_file = 'yelp_academic_dataset_user.json'
review_file = 'yelp_academic_dataset_review.json'

In [3]:
def parse_file(filename):
    with open(filename) as inputfile:
        for line in inputfile:
            yield json.loads(line)

In [4]:
busi_df = pd.DataFrame.from_records([b for b in parse_file(bussi_file)])

In [5]:
rest_df = busi_df[busi_df.apply(lambda row: isinstance(row['categories'], str) and 'Restaurants' in row['categories'], axis=1)]
rest_df.set_index('business_id', inplace=True)

In [6]:
rest_df

Unnamed: 0_level_0,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
pQeaRpvuhoEqudo3uymHIQ,The Empanadas House,404 E Green St,Champaign,IL,61820,40.110446,-88.233073,4.5,5,1,"{'RestaurantsAttire': 'u'casual'', 'Restaurant...","Ethnic Food, Food Trucks, Specialty Food, Impo...","{'Monday': '11:30-14:30', 'Tuesday': '11:30-14..."
CsLQLiRoafpJPJSkNX2h5Q,Middle East Deli,4508 E Independence Blvd,Charlotte,NC,28205,35.194894,-80.767442,3.0,5,0,"{'RestaurantsGoodForGroups': 'True', 'OutdoorS...","Food, Restaurants, Grocery, Middle Eastern",
eBEfgOPG7pvFhb2wcG9I7w,Philthy Phillys,"15480 Bayview Avenue, unit D0110",Aurora,ON,L4G 7J1,44.010962,-79.448677,4.5,4,1,"{'RestaurantsTableService': 'False', 'Restaura...","Restaurants, Cheesesteaks, Poutineries","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'..."
lu7vtrp_bE9PnxWfA8g4Pg,Banzai Sushi,300 John Street,Thornhill,ON,L3T 5W4,43.820492,-79.398466,4.5,7,1,"{'GoodForKids': 'True', 'RestaurantsTakeOut': ...","Japanese, Fast Food, Food Court, Restaurants",
9sRGfSVEfLhN_km60YruTA,Apadana Restaurant,13071 Yonge Street,Richmond Hill,ON,L4E 1A5,43.947011,-79.454862,3.0,3,1,"{'Ambience': '{'touristy': False, 'hipster': F...","Persian/Iranian, Turkish, Middle Eastern, Rest...","{'Tuesday': '12:0-21:0', 'Wednesday': '12:0-21..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
xVpE01l6ZXdEtVf5PkRpDg,Julep,829 E Washington Ave,Madison,WI,53703,43.081022,-89.374006,4.0,95,0,"{'NoiseLevel': 'u'average'', 'BikeParking': 'T...","Nightlife, Diners, Bars, Southern, Restaurants","{'Monday': '16:0-22:0', 'Tuesday': '16:0-22:0'..."
BAVuLTDmpSzDCk37A5HjtQ,Bruegger's Bagels,27045 Lorain Rd,North Olmsted,OH,44070,41.416568,-81.921396,3.0,13,0,"{'NoiseLevel': 'u'average'', 'BikeParking': 'F...","Restaurants, Sandwiches, Food, Breakfast & Bru...","{'Monday': '5:30-15:0', 'Tuesday': '5:30-15:0'..."
hskVqZCPqy-omm9CHi44xQ,Domino's,"6420 Rea Rd, Suite B1",Charlotte,NC,28226,35.078538,-80.818358,2.0,16,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Restaurants, Pizza, Sandwiches, Fast Food, Chi...","{'Monday': '10:30-0:0', 'Tuesday': '10:30-0:0'..."
9Q0fPWAjUweoFDk0kafuzQ,Nishi Sushi,9750 Weston Road,Vaughan,ON,L4H 2P2,43.838555,-79.559823,4.0,5,0,"{'Ambience': '{'romantic': False, 'intimate': ...","Japanese, Sushi Bars, Restaurants","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'..."


In [7]:
rest_cat = {}
for index, row in rest_df.iterrows():
    s = [x.strip() for x in row['categories'].split(',')]
    for c in s:
        if c not in rest_cat:
            rest_cat[c] = 0
        rest_cat[c] += 1
category_df = pd.DataFrame(rest_cat.items(), columns=['category', 'count'])

In [11]:
category_df.sort_values(by='count', inplace=True, ascending=False)
category_df

Unnamed: 0,category,count
6,Restaurants,63944
5,Food,16480
57,Nightlife,9400
52,Bars,9004
13,Fast Food,8106
...,...,...
430,Furniture Repair,1
432,Audio/Visual Equipment Rental,1
665,Health Retreats,1
667,Door Sales/Installation,1


In [22]:
# save the category_df to disk as a look dict 
category_df.to_csv('categories.csv', index=False, columns=['category', 'count'])

In [23]:
user_df = pd.DataFrame.from_records([u for u in parse_file(user_file)])

In [24]:
user_df.set_index('user_id', inplace=True)

In [25]:
review_df = pd.DataFrame.from_records([r for r in parse_file(review_file)])

In [26]:
review_df.set_index('business_id', inplace=True)

In [27]:
toronto_rest = rest_df[rest_df['city'] == 'Toronto']

In [28]:
mid_toronto_rest = toronto_rest[(toronto_rest['review_count'] >= 100) & (toronto_rest['review_count'] < 1000)]

In [29]:
mid_toronto_rest

Unnamed: 0_level_0,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0QjROMVW9ACKjhSEfHqNCQ,Mi Mi Restaurant,688 Gerrard Street E,Toronto,ON,M4M 1Y3,43.666376,-79.348773,4.0,116,1,"{'RestaurantsTakeOut': 'True', 'Alcohol': 'u'b...","Vietnamese, Restaurants","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'..."
37kk0IW6jL7ZlxZF6k2QBg,Edulis,169 Niagara Street,Toronto,ON,M5V,43.641948,-79.406580,4.0,115,1,"{'RestaurantsGoodForGroups': 'False', 'Restaur...","Restaurants, Spanish, French","{'Wednesday': '18:0-23:0', 'Thursday': '18:0-2..."
Nxg73OigmRQQq0d1pKtkUQ,Xe Lua Restaurant,"254 Spadina Avenue, 2nd Floor",Toronto,ON,M5T 2C2,43.651700,-79.397987,3.0,177,1,"{'Alcohol': 'u'beer_and_wine'', 'RestaurantsDe...","Vietnamese, Restaurants","{'Monday': '10:0-0:0', 'Tuesday': '10:0-0:0', ..."
K5Q2vkF5UpytV9Q1rB-5Yg,Akira Back,"80 Blue Jays Way, 2nd Floor",Toronto,ON,M5V 2G3,43.645297,-79.392397,4.0,124,1,"{'WiFi': ''no'', 'RestaurantsDelivery': 'False...","Restaurants, Japanese, Korean, Asian Fusion",
q0hAKzn_LmyUiScCuWS4Hg,Korean Grill House,754 Yonge Street,Toronto,ON,M4Y 2B6,43.669261,-79.386833,3.0,122,1,"{'BusinessParking': '{'garage': False, 'street...","Korean, Restaurants",
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pthcdg09sHQ9M9Tj5P9gfQ,C'est What,67 Front Street E,Toronto,ON,M5E 1B5,43.648248,-79.373359,3.5,323,1,"{'RestaurantsDelivery': 'False', 'RestaurantsP...","Nightlife, Bars, Burgers, Comfort Food, Pubs, ...","{'Monday': '0:0-0:0', 'Tuesday': '11:30-17:0',..."
IE1lzZvdD9UnGeB1kXjuOQ,Aroma Espresso Bar,500 Bloor Street W,Toronto,ON,M5S 1Y3,43.665523,-79.410272,3.5,147,0,"{'RestaurantsReservations': 'False', 'WiFi': '...","Sandwiches, Coffee & Tea, Restaurants, Food, B...","{'Monday': '7:30-22:0', 'Tuesday': '7:30-22:0'..."
2auigv5pF08RjJ8CL1dIVw,O&B Canteen,330 King Street W,Toronto,ON,M5V 3X2,43.646693,-79.390210,3.0,291,1,"{'RestaurantsGoodForGroups': 'True', 'Alcohol'...","American (New), Restaurants, Canadian (New), C...","{'Monday': '8:0-21:0', 'Tuesday': '8:0-22:0', ..."
thzyiQZb16zD8wMliaEfRQ,Sushi Supreme,1995 Yonge Street,Toronto,ON,M4S 1Z8,43.700617,-79.396762,4.0,104,1,"{'Ambience': '{'romantic': False, 'intimate': ...","Sushi Bars, Restaurants, Japanese","{'Monday': '12:0-23:0', 'Tuesday': '12:0-23:0'..."


In [30]:
def np_encoder(obj):
    if isinstance(obj, np.integer):
        return int(obj)
    elif isinstance(obj, np.floating):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, datetime.datetime):
        return obj.__str__()

In [37]:
def convert_category_name(cat_name, prefix):
    return prefix + cat_name.translate(str.maketrans('', '', string.punctuation + ' '))
# test string
print(convert_category_name('Haha x & Y '))

TypeError: convert_category_name() missing 1 required positional argument: 'prefix'

In [38]:
category_df['colName'] = category_df.apply(lambda row: convert_category_name(row['category'], 'ca'), axis=1)
category_df

Unnamed: 0,category,count,colName
6,Restaurants,63944,caRestaurants
5,Food,16480,caFood
57,Nightlife,9400,caNightlife
52,Bars,9004,caBars
13,Fast Food,8106,caFastFood
...,...,...,...
430,Furniture Repair,1,caFurnitureRepair
432,Audio/Visual Equipment Rental,1,caAudioVisualEquipmentRental
665,Health Retreats,1,caHealthRetreats
667,Door Sales/Installation,1,caDoorSalesInstallation


In [42]:
def construct_graph(rest_df, n_rest, user_sample_size, output_dir):
    node_mapping = {}
    g = nx.Graph()
    node_idx = 0
    edge_idx = 0
    
    # Select all restaurants of a neighborhood (identified by postal code)
    groups = rest_df.groupby('postal_code')
    cum_sum = groups.size().sort_values(ascending=False).cumsum()
    postal_codes = cum_sum[cum_sum <= n_rest].index.to_list()
    if len(postal_codes) > 0:
        restaurants = pd.concat([groups.get_group(p) for p in postal_codes])
        leftover = n_rest - cum_sum.loc[postal_codes[-1]]
    else:
        restaurants = pd.DataFrame()
        leftover = n_rest
    
    if leftover > 0:
        next_postal_code = cum_sum[cum_sum > n_rest][:1].index.to_list()[0]
        postal_codes.append(next_postal_code)
        restaurants = pd.concat([restaurants, groups.get_group(next_postal_code).sample(leftover)])
    print('postal codes: ', postal_codes, leftover)

    for bs_id, a in tqdm(restaurants.iterrows(), total=len(restaurants)):
        # Add a restaurant node
        g.add_node(node_idx, original_id=bs_id, label=a['name'], type='restaurant', **a.to_dict())
        # Add category columns to this restaurant node
        temp_cat_dict = {}
        for x in a['categories'].split(','):
            temp_cat_dict[x.strip()] = True
        for cat_idx, cat_row in category_df.iterrows():
            g.nodes[node_idx][cat_row['colName']] = 1 if cat_row['category'] in temp_cat_dict else 0

        node_mapping[bs_id] = node_idx
        node_idx += 1

        reviews = review_df.loc[bs_id]
        # print(bs_id, a['postal_code'], a['name'], '\t review count:', len(reviews))
        sample_reviews = reviews.sample(min(user_sample_size, len(reviews)))
        for idx, r in sample_reviews.iterrows():
            user_id = r['user_id']
            u = user_df.loc[user_id]
            if user_id not in node_mapping:
                # Add a user node
                g.add_node(node_idx, original_id=r['user_id'], label=u['name'], type='user', **u.to_dict())
                node_mapping[user_id] = node_idx
                node_idx += 1
            # else:
                # print('user found: ', user_id)

            # Add an edge
            g.add_edge(node_mapping[bs_id], node_mapping[user_id], star=r['stars'], date=r['date'], text=r['text'])
    
    print('# nodes: ', g.number_of_nodes(), '\t # edges: ', g.number_of_edges())
    json_data = nx.node_link_data(g)
    if not os.path.exists(output_dir):
        print('Create directory: ', output_dir)
        os.makedirs(output_dir)
    json.dump(json_data, open(os.path.join(output_dir, 'graph.json'), 'w'), indent=2, allow_nan=False, default=np_encoder)
    return g

In [43]:
construct_graph(toronto_rest, 20, 50, 'graphs/toronto-rest-20-user-50')

postal codes:  ['M5T 1L1'] 20


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


# nodes:  693 	 # edges:  741


<networkx.classes.graph.Graph at 0x7faf9bbd9e50>

In [44]:
construct_graph(toronto_rest, 50, 50, 'graphs/toronto-rest-50-user-50')

postal codes:  ['M5T 1L1', 'M1V 5N1'] 15


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


# nodes:  1265 	 # edges:  1438


<networkx.classes.graph.Graph at 0x7faf9bbdc950>

In [45]:
construct_graph(toronto_rest, 100, 100, 'graphs/toronto-rest-100-user-100')

postal codes:  ['M5T 1L1', 'M1V 5N1', 'M5T', 'M5V 1J5'] 3


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


# nodes:  2832 	 # edges:  3498


<networkx.classes.graph.Graph at 0x7faf9bbd3790>

In [46]:
construct_graph(toronto_rest, 200, 150, 'graphs/toronto-rest-200-user-150')

postal codes:  ['M5T 1L1', 'M1V 5N1', 'M5T', 'M5V 1J5', 'M5V', 'M6A 2T9', 'M4K 1P1', 'M5B 2H1'] 5


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


# nodes:  7098 	 # edges:  9366


<networkx.classes.graph.Graph at 0x7faf9bb1f310>