In [1]:
import json
import os
import random

In [2]:
# Directory holding the raw Yelp dataset files read below (user.json,
# business.json, review.json). The path is relative, so it is resolved
# against the current working directory.
yelp_path = '../../narre_all_data/yelp/yelp_dataset'

In [3]:
def load_user2num(min_num=20, max_num=100):
    """
    Load the users whose review count lies within an inclusive range.

    Reads ``user.json`` from the module-level ``yelp_path`` directory,
    parsing one JSON object per line.

    Args:
        min_num: Smallest ``review_count`` (inclusive) a user may have.
        max_num: Largest ``review_count`` (inclusive) a user may have.

    Returns:
        dict mapping ``user_id`` to ``review_count`` for every user kept.

    Raises:
        OSError: If ``user.json`` cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``user_id`` or ``review_count``.
    """
    user2num = {}
    with open(os.path.join(yelp_path, 'user.json'), 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline at the end of the
            # file) rather than letting json.loads raise on them.
            if not line.strip():
                continue

            user = json.loads(line)
            user_id = user['user_id']
            num = user['review_count']

            if min_num <= num <= max_num:
                user2num[user_id] = num

    return user2num

In [4]:
def load_item2num(min_num=20, max_num=100):
    """
    Load the items (businesses) whose review count lies within an inclusive range.

    Reads ``business.json`` from the module-level ``yelp_path`` directory,
    parsing one JSON object per line.

    Args:
        min_num: Smallest ``review_count`` (inclusive) an item may have.
        max_num: Largest ``review_count`` (inclusive) an item may have.

    Returns:
        dict mapping ``business_id`` to ``review_count`` for every item kept.

    Raises:
        OSError: If ``business.json`` cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``business_id`` or ``review_count``.
    """
    item2num = {}
    with open(os.path.join(yelp_path, 'business.json'), 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline at the end of the
            # file) rather than letting json.loads raise on them.
            if not line.strip():
                continue

            item = json.loads(line)
            item_id = item['business_id']
            num = item['review_count']

            if min_num <= num <= max_num:
                item2num[item_id] = num

    return item2num

In [5]:
def load_review(users, items, sample_num=200000):
    """
    Load the reviews written by the given users about the given items.

    Reads ``review.json`` from the module-level ``yelp_path`` directory,
    parsing one JSON object per line. A review is skipped when its ``date``
    string compares ``>= '2018'``, so only reviews dated before 2018 are kept.

    Args:
        users: Iterable of user ids whose reviews should be kept.
        items: Iterable of business ids whose reviews should be kept.
        sample_num: Currently unused. Random sub-sampling is disabled, so
            every matching review is returned; the parameter is kept so
            existing callers continue to work.

    Returns:
        list of dicts with the keys ``reviewerID``, ``asin``, ``reviewText``
        and ``overall``, one per kept review, in file order.

    Raises:
        OSError: If ``review.json`` cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the fields read below.
    """
    # Materialise the ids as sets so each membership test is O(1) whatever
    # the caller passes (list, dict view, generator, ...).
    wanted_users = set(users)
    wanted_items = set(items)

    datas = []
    with open(os.path.join(yelp_path, 'review.json'), 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines rather than letting json.loads raise on them.
            if not line.strip():
                continue

            item = json.loads(line)
            user_id = item['user_id']
            item_id = item['business_id']
            review = item['text']
            rating = item['stars']
            date = item['date']

            # Lexicographic comparison: any date string starting with a year
            # of 2018 or later sorts at or after '2018'.
            if date >= '2018':
                continue

            if user_id in wanted_users and item_id in wanted_items:
                datas.append({'reviewerID': user_id, 'asin': item_id, 'reviewText': review, 'overall': rating})

    return datas

In [6]:
# Select users and businesses with the default review-count bounds
# (between 20 and 100 reviews, inclusive).
user2num = load_user2num()
item2num = load_item2num()

In [7]:
# Keep only the reviews whose user and business were both selected above.
datas = load_review(user2num.keys(), item2num.keys())

In [8]:
def preprocess(min_review_num=5, reviews=None):
    """
    Drop reviews whose user or item has fewer than ``min_review_num`` reviews.

    Review counts are computed once over the input, and filtering is a single
    pass: after removal, a surviving user or item may be left with fewer than
    ``min_review_num`` reviews.

    Prints the total and to-be-deleted counts for users and items.

    Args:
        min_review_num: Minimum number of reviews a user and an item must have
            in the input for their reviews to be kept.
        reviews: Optional iterable of review dicts carrying the keys
            ``reviewerID`` and ``asin``. When omitted, the notebook-level
            ``datas`` list is used, as before.

    Returns:
        list of the review dicts that were kept, in input order.
    """
    # Fall back to the notebook-level ``datas`` for existing callers, and
    # materialise the input because it is traversed twice.
    records = list(datas if reviews is None else reviews)

    users = {}
    items = {}
    for data in records:
        users[data['reviewerID']] = users.get(data['reviewerID'], 0) + 1
        items[data['asin']] = items.get(data['asin'], 0) + 1

    # Sets instead of lists: the membership tests below run once per review,
    # and a list lookup there would make the filter quadratic.
    deleteUsers = {user for user, num in users.items() if num < min_review_num}
    deleteItems = {item for item, num in items.items() if num < min_review_num}

    print('user num: %d, delete num: %d' % (len(users), len(deleteUsers)))
    print('item num: %d, delete num: %d' % (len(items), len(deleteItems)))

    return [
        data for data in records
        if data['reviewerID'] not in deleteUsers and data['asin'] not in deleteItems
    ]

In [9]:
# Discard reviews whose user or item has fewer than 10 reviews in `datas`.
tempDatas = preprocess(10)

user num: 110021, delete num: 96976
item num: 43268, delete num: 23028


In [10]:
# Number of distinct users, then distinct items, left after filtering.
print(len({rec['reviewerID'] for rec in tempDatas}))
print(len({rec['asin'] for rec in tempDatas}))

13045
19745


In [11]:
# Dump the filtered reviews as JSON Lines (one object per line) into the
# parent directory of `yelp_path`.
with open(os.path.join(yelp_path, '../', 'yelp.json'), 'w', encoding='utf-8') as f:
    for data in tempDatas:
        print(json.dumps(data), file=f)