In [78]:
import ast    
import json
import random
import pickle
from collections import defaultdict

In [17]:
def get_hierarchy(hierarchy_file='Taxonomy_100'):
    """Load the category hierarchy and index it by node title.

    Every line of ``hierarchy_file`` is parsed with ``ast.literal_eval`` and
    is expected to yield a dict that has a ``'title'`` key.

    Args:
        hierarchy_file: Path of the taxonomy file, one Python literal per line.

    Returns:
        dict mapping each title to the rest of its node dict (the ``'title'``
        entry itself is removed from the stored value).  If a title occurs
        more than once, the last occurrence wins.

    Raises:
        KeyError: If a parsed node has no ``'title'`` key.
        ValueError, SyntaxError: If a line is not a valid Python literal.
    """
    with open(hierarchy_file) as taxonomy:
        parsed_nodes = [ast.literal_eval(raw_line) for raw_line in taxonomy]

    hierarchy = {}
    for entry in parsed_nodes:
        # pop() both reads the title and strips it from the stored node.
        hierarchy[entry.pop('title')] = entry
    return hierarchy

In [21]:
def get_reviews(yelp_review_file='yelp_academic_dataset_review.json'):
    """Group review texts by the business they were written for.

    The file is read as JSON Lines: each line is decoded with ``json.loads``
    and must provide ``'business_id'`` and ``'text'`` keys.

    Args:
        yelp_review_file: Path of the review file, one JSON object per line.

    Returns:
        ``defaultdict(list)`` mapping business_id to the list of its review
        texts, in file order.
    """
    reviews_by_business = defaultdict(list)
    with open(yelp_review_file) as review_stream:
        for record in map(json.loads, review_stream):
            reviews_by_business[record['business_id']].append(record['text'])
    return reviews_by_business

In [22]:
%%time
# Read the review file (default path) into a business_id -> [review text] map.
bz_reviews = get_reviews()

CPU times: user 30.4 s, sys: 2.48 s, total: 32.9 s
Wall time: 34 s


In [24]:
# Number of distinct business_ids that have at least one review.
len(bz_reviews)

174567

In [61]:
def get_bz_categories(nodes, yelp_bz_file='yelp_academic_dataset_business.json'):
    """Map each business to the subset of its categories found in ``nodes``.

    The file is read as JSON Lines.  Each record's ``'categories'`` value is
    treated as a single ``', '``-separated string.

    Args:
        nodes: Container of known category titles; only membership tests
            (``cat in nodes``) are performed on it.
        yelp_bz_file: Path of the business file, one JSON object per line.

    Returns:
        dict mapping business_id to the list of its categories that appear in
        ``nodes``, in their original order.  Businesses with a falsy
        ``'categories'`` value, or with no category in ``nodes``, are omitted.
    """
    kept = {}
    with open(yelp_bz_file) as business_stream:
        for raw_line in business_stream:
            business = json.loads(raw_line)

            # Businesses without any categories are dropped.
            category_string = business['categories']
            if not category_string:
                continue

            # Keep only the categories that exist in the hierarchy tree.
            known = []
            for label in category_string.split(', '):
                if label in nodes:
                    known.append(label)
            if not known:
                continue

            kept[business['business_id']] = known
    return kept

In [62]:
# NOTE(review): `nodes` is not assigned in any cell visible here - presumably
# it came from `nodes = get_hierarchy()` in a cell that is no longer present.
# Confirm and restore that cell so the notebook survives Restart & Run All.
bz_cats = get_bz_categories(nodes)

In [79]:
def merge_review_cats(bz_cats, bz_reviews, n_max_reviews=3, save_dir='train_data.pickle'):
    """Join each business's categories with a sample of its review texts.

    Only businesses present in both ``bz_cats`` and ``bz_reviews`` are kept.
    When a business has more than ``n_max_reviews`` reviews, that many are
    drawn with ``random.sample`` (so the result depends on the state of the
    ``random`` module); otherwise all of its reviews are used.

    Args:
        bz_cats: Mapping business_id -> list of categories.
        bz_reviews: Mapping business_id -> list of review strings.
        n_max_reviews: Maximum number of reviews concatenated per business.
        save_dir: Path of the pickle *file* to write (despite the name it is
            not a directory).  Pass a falsy value to skip saving.

    Returns:
        dict mapping business_id to
        ``{'categories': [...], 'reviews': '<reviews joined by a space>'}``.
    """
    # Iteration is driven by bz_cats, so businesses without categories are
    # never included even if they have reviews.
    data = {}
    for bid in bz_cats:
        if bid in bz_reviews:
            reviews = bz_reviews[bid]
            if len(reviews) > n_max_reviews:
                reviews = random.sample(reviews, n_max_reviews)
            data[bid] = {'categories': bz_cats[bid], 'reviews': ' '.join(reviews)}
    if save_dir:
        # pickle emits bytes, so the file must be opened in binary mode;
        # text mode ('w') raises TypeError on Python 3.
        with open(save_dir, 'wb') as file:
            pickle.dump(data, file)
    return data

In [80]:
# Build the merged dataset with the defaults: up to 3 reviews per business,
# also written to 'train_data.pickle'.
# NOTE(review): no random.seed(...) call is visible in this notebook, so the
# sampled reviews may differ from run to run - confirm whether that is intended.
data = merge_review_cats(bz_cats, bz_reviews)