In [74]:
import datetime
import json
import os

import networkx as nx
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Relative paths of the three Yelp dataset dumps used by this notebook
# (business, user and review records respectively).
bussi_file, user_file, review_file = (
    'yelp_academic_dataset_business.json',
    'yelp_academic_dataset_user.json',
    'yelp_academic_dataset_review.json',
)

In [3]:
def parse_file(filename):
    """Lazily yield one decoded JSON object per non-blank line of ``filename``.

    The file is opened as UTF-8 explicitly so that parsing does not depend on
    the platform's default text encoding, and whitespace-only lines (such as a
    stray empty line in the dump) are skipped instead of raising
    ``json.JSONDecodeError``.

    Args:
        filename: Path to a file holding one JSON document per line.

    Yields:
        The object returned by ``json.loads`` for each non-blank line, in file
        order.

    Raises:
        OSError: If the file cannot be opened (raised on first iteration,
            because this is a generator).
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filename, encoding='utf-8') as inputfile:
        for line in inputfile:
            # Skip blank / whitespace-only lines; json.loads('') would raise.
            if line.strip():
                yield json.loads(line)

In [4]:
# Materialise every parsed business record, then build the frame in one call.
busi_df = pd.DataFrame.from_records(list(parse_file(bussi_file)))

In [19]:
# Keep only the businesses whose `categories` string mentions 'Restaurants',
# indexed by `business_id`.
# `.str.contains(..., regex=False, na=False)` does a plain substring match and
# counts missing / non-string categories as "not a restaurant", replacing the
# row-wise `apply`. Chaining `set_index` builds a new frame rather than
# mutating the filtered slice in place.
rest_df = (
    busi_df[busi_df['categories'].str.contains('Restaurants', regex=False, na=False)]
    .set_index('business_id')
)

In [7]:
# Read all user records into memory and build the user table from them.
user_df = pd.DataFrame.from_records(list(parse_file(user_file)))

In [17]:
# Index users by `user_id` so a user can be fetched with `user_df.loc[user_id]`.
# Guarded so that re-running this cell is a no-op: once the column has been
# moved into the index, a second `set_index('user_id')` would raise KeyError.
if user_df.index.name != 'user_id':
    user_df = user_df.set_index('user_id')

In [8]:
# Read all review records into memory and build the review table from them.
review_df = pd.DataFrame.from_records(list(parse_file(review_file)))

In [18]:
# Index reviews by `business_id` so all reviews of one business can be fetched
# with `.loc`. Guarded so that re-running this cell is a no-op: once the column
# has been moved into the index, a second `set_index('business_id')` would
# raise KeyError.
if review_df.index.name != 'business_id':
    review_df = review_df.set_index('business_id')

In [20]:
# Restaurants whose `city` field is exactly 'Toronto'.
toronto_rest = rest_df.loc[rest_df['city'].eq('Toronto')]

In [21]:
# Toronto restaurants with at least 100 and fewer than 1000 reviews.
mid_toronto_rest = toronto_rest.loc[
    toronto_rest['review_count'].ge(100) & toronto_rest['review_count'].lt(1000)
]

In [22]:
# Display the Toronto restaurants that have between 100 and 999 reviews.
mid_toronto_rest

Unnamed: 0_level_0,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0QjROMVW9ACKjhSEfHqNCQ,Mi Mi Restaurant,688 Gerrard Street E,Toronto,ON,M4M 1Y3,43.666376,-79.348773,4.0,116,1,"{'RestaurantsTakeOut': 'True', 'Alcohol': 'u'b...","Vietnamese, Restaurants","{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'..."
37kk0IW6jL7ZlxZF6k2QBg,Edulis,169 Niagara Street,Toronto,ON,M5V,43.641948,-79.406580,4.0,115,1,"{'RestaurantsGoodForGroups': 'False', 'Restaur...","Restaurants, Spanish, French","{'Wednesday': '18:0-23:0', 'Thursday': '18:0-2..."
Nxg73OigmRQQq0d1pKtkUQ,Xe Lua Restaurant,"254 Spadina Avenue, 2nd Floor",Toronto,ON,M5T 2C2,43.651700,-79.397987,3.0,177,1,"{'Alcohol': 'u'beer_and_wine'', 'RestaurantsDe...","Vietnamese, Restaurants","{'Monday': '10:0-0:0', 'Tuesday': '10:0-0:0', ..."
K5Q2vkF5UpytV9Q1rB-5Yg,Akira Back,"80 Blue Jays Way, 2nd Floor",Toronto,ON,M5V 2G3,43.645297,-79.392397,4.0,124,1,"{'WiFi': ''no'', 'RestaurantsDelivery': 'False...","Restaurants, Japanese, Korean, Asian Fusion",
q0hAKzn_LmyUiScCuWS4Hg,Korean Grill House,754 Yonge Street,Toronto,ON,M4Y 2B6,43.669261,-79.386833,3.0,122,1,"{'BusinessParking': '{'garage': False, 'street...","Korean, Restaurants",
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pthcdg09sHQ9M9Tj5P9gfQ,C'est What,67 Front Street E,Toronto,ON,M5E 1B5,43.648248,-79.373359,3.5,323,1,"{'RestaurantsDelivery': 'False', 'RestaurantsP...","Nightlife, Bars, Burgers, Comfort Food, Pubs, ...","{'Monday': '0:0-0:0', 'Tuesday': '11:30-17:0',..."
IE1lzZvdD9UnGeB1kXjuOQ,Aroma Espresso Bar,500 Bloor Street W,Toronto,ON,M5S 1Y3,43.665523,-79.410272,3.5,147,0,"{'RestaurantsReservations': 'False', 'WiFi': '...","Sandwiches, Coffee & Tea, Restaurants, Food, B...","{'Monday': '7:30-22:0', 'Tuesday': '7:30-22:0'..."
2auigv5pF08RjJ8CL1dIVw,O&B Canteen,330 King Street W,Toronto,ON,M5V 3X2,43.646693,-79.390210,3.0,291,1,"{'RestaurantsGoodForGroups': 'True', 'Alcohol'...","American (New), Restaurants, Canadian (New), C...","{'Monday': '8:0-21:0', 'Tuesday': '8:0-22:0', ..."
thzyiQZb16zD8wMliaEfRQ,Sushi Supreme,1995 Yonge Street,Toronto,ON,M4S 1Z8,43.700617,-79.396762,4.0,104,1,"{'Ambience': '{'romantic': False, 'intimate': ...","Sushi Bars, Restaurants, Japanese","{'Monday': '12:0-23:0', 'Tuesday': '12:0-23:0'..."


In [49]:
def np_encoder(obj):
    """``default=`` hook for ``json.dump`` that converts NumPy / date values.

    ``json`` calls this only for objects it cannot serialise natively.

    Args:
        obj: The object the JSON encoder could not handle.

    Returns:
        A plain Python equivalent: ``bool`` for ``np.bool_``, ``int`` for NumPy
        integers, ``float`` for NumPy floats, a (nested) ``list`` for arrays,
        and ``str(obj)`` for ``datetime.date`` / ``datetime.datetime`` values.

    Raises:
        TypeError: If ``obj`` is of an unsupported type. This is what the
            ``json`` module expects from a ``default`` hook; implicitly
            returning ``None`` would silently write ``null`` in place of the
            value.
    """
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    # datetime.datetime is a subclass of datetime.date; both are listed for clarity.
    if isinstance(obj, (datetime.datetime, datetime.date)):
        return str(obj)
    raise TypeError('Object of type {} is not JSON serializable'.format(type(obj).__name__))

In [77]:
def _select_restaurants(rest_df, n_rest, random_state=None):
    """Pick up to ``n_rest`` restaurants, filling up postal code by postal code.

    Postal codes are ordered from most to fewest restaurants. Every postal code
    whose running total still fits within ``n_rest`` contributes all of its
    restaurants; the next postal code is randomly sub-sampled to supply the
    remainder.

    Returns:
        A ``(restaurants, postal_codes, leftover)`` tuple: the selected rows,
        the postal codes they were taken from, and the number of rows that
        still had to be filled after the fully included postal codes.
    """
    groups = rest_df.groupby('postal_code')
    cum_sum = groups.size().sort_values(ascending=False).cumsum()
    postal_codes = cum_sum[cum_sum <= n_rest].index.to_list()
    frames = [groups.get_group(p) for p in postal_codes]
    leftover = n_rest - cum_sum.loc[postal_codes[-1]] if postal_codes else n_rest

    if leftover > 0:
        overflow = cum_sum[cum_sum > n_rest]
        # If n_rest exceeds the number of available restaurants there is no
        # further postal code to draw from; keep what was selected so far
        # instead of failing with an IndexError.
        if len(overflow) > 0:
            next_postal_code = overflow.index[0]
            postal_codes.append(next_postal_code)
            frames.append(
                groups.get_group(next_postal_code).sample(n=leftover, random_state=random_state)
            )

    # Concatenate once; fall back to an empty slice when nothing was selected.
    restaurants = pd.concat(frames) if frames else rest_df.iloc[0:0]
    return restaurants, postal_codes, leftover


def construct_graph(rest_df, n_rest, user_sample_size, output_dir, random_state=None):
    """Build a restaurant-user review graph and save it as ``graph.json``.

    Restaurants become nodes of ``type='restaurant'``, reviewers become nodes
    of ``type='user'``, and each sampled review becomes an edge carrying the
    review's ``star``, ``date`` and ``text``. Reviews and users are read from
    the notebook-level ``review_df`` (indexed by ``business_id``) and
    ``user_df`` (indexed by ``user_id``).

    Args:
        rest_df: Restaurant table indexed by business id, with at least the
            ``postal_code`` and ``name`` columns.
        n_rest: Number of restaurants to include.
        user_sample_size: Maximum number of reviews sampled per restaurant.
        output_dir: Directory that receives ``graph.json``; created if missing.
        random_state: Optional seed (or NumPy random state) forwarded to every
            ``DataFrame.sample`` call so a run can be reproduced. ``None``
            (the default) keeps the previous unseeded behaviour.

    Returns:
        The constructed ``networkx.Graph``.

    Raises:
        KeyError: If a sampled review references a user missing from
            ``user_df``.
    """
    if isinstance(random_state, (int, np.integer)):
        # Share one generator across all .sample() calls so the whole run is
        # reproducible without every call drawing from an identical stream.
        random_state = np.random.RandomState(random_state)

    node_mapping = {}
    g = nx.Graph()
    node_idx = 0

    # Select all restaurants of a neighborhood (identified by postal code)
    restaurants, postal_codes, leftover = _select_restaurants(rest_df, n_rest, random_state)
    print('postal codes: ', postal_codes, leftover)

    for bs_id, a in tqdm(restaurants.iterrows(), total=len(restaurants)):
        # Add a restaurant node
        g.add_node(node_idx, original_id=bs_id, label=a['name'], type='restaurant', **a.to_dict())
        node_mapping[bs_id] = node_idx
        node_idx += 1

        # A restaurant without any review stays in the graph as an isolated node.
        if bs_id not in review_df.index:
            continue
        # List indexer: always a DataFrame, even when there is exactly one review
        # (a scalar label would return a Series in that case).
        reviews = review_df.loc[[bs_id]]
        sample_reviews = reviews.sample(
            n=min(user_sample_size, len(reviews)), random_state=random_state
        )
        for _, r in sample_reviews.iterrows():
            user_id = r['user_id']
            if user_id not in node_mapping:
                # Add a user node (the user table is only consulted for unseen users)
                u = user_df.loc[user_id]
                g.add_node(node_idx, original_id=user_id, label=u['name'], type='user', **u.to_dict())
                node_mapping[user_id] = node_idx
                node_idx += 1

            # Add an edge
            g.add_edge(node_mapping[bs_id], node_mapping[user_id], star=r['stars'], date=r['date'], text=r['text'])

    print('# nodes: ', g.number_of_nodes(), '\t # edges: ', g.number_of_edges())
    json_data = nx.node_link_data(g)
    if not os.path.exists(output_dir):
        print('Create directory: ', output_dir)
        os.makedirs(output_dir, exist_ok=True)
    # Context manager guarantees the file is flushed and closed.
    with open(os.path.join(output_dir, 'graph.json'), 'w') as outputfile:
        json.dump(json_data, outputfile, indent=2, allow_nan=False, default=np_encoder)
    return g

In [78]:
# 20 Toronto restaurants, at most 50 sampled reviews per restaurant.
# NOTE(review): this passes `toronto_rest`, not the `mid_toronto_rest` subset
# defined earlier in the notebook — confirm that is intended.
construct_graph(
    toronto_rest,
    n_rest=20,
    user_sample_size=50,
    output_dir='graphs/toronto-rest-20-user-50',
)

postal codes:  ['M5T 1L1'] 20


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))


# nodes:  637 	 # edges:  680


<networkx.classes.graph.Graph at 0xa1baafed0>

In [79]:
# 50 Toronto restaurants, at most 50 sampled reviews per restaurant.
construct_graph(
    toronto_rest,
    n_rest=50,
    user_sample_size=50,
    output_dir='graphs/toronto-rest-50-user-50',
)

postal codes:  ['M5T 1L1', 'M1V 5N1'] 15


HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))


# nodes:  1225 	 # edges:  1401
Create directory:  graphs/toronto-rest-50-user-50


<networkx.classes.graph.Graph at 0xa1ba926d0>

In [80]:
# 100 Toronto restaurants, at most 100 sampled reviews per restaurant.
construct_graph(
    toronto_rest,
    n_rest=100,
    user_sample_size=100,
    output_dir='graphs/toronto-rest-100-user-100',
)

postal codes:  ['M5T 1L1', 'M1V 5N1', 'M5T', 'M5V 1J5'] 3


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


# nodes:  2902 	 # edges:  3557
Create directory:  graphs/toronto-rest-100-user-100


<networkx.classes.graph.Graph at 0xa1ba96110>

In [81]:
# 200 Toronto restaurants, at most 150 sampled reviews per restaurant.
construct_graph(
    toronto_rest,
    n_rest=200,
    user_sample_size=150,
    output_dir='graphs/toronto-rest-200-user-150',
)

postal codes:  ['M5T 1L1', 'M1V 5N1', 'M5T', 'M5V 1J5', 'M5V', 'M6A 2T9', 'M4K 1P1', 'M5B 2H1'] 5


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


# nodes:  7074 	 # edges:  9285
Create directory:  graphs/toronto-rest-200-user-150


<networkx.classes.graph.Graph at 0xa1bab7e10>