In [1]:
import json
import os
import pickle
from tqdm import tqdm
from typing import *
import random
import numpy as np
import shutil
from multiprocessing import Pool
# Fixed seed for both NumPy's and the standard library's global RNGs so that
# random operations in this notebook are reproducible on a fresh kernel run.
seed = 1945 # seed value
np.random.seed(seed)
random.seed(seed)

In [2]:
# Cities that each have a dedicated "<city>_test.json" file.
cities = ['Boston', 'Chicago', 'Los Angeles', 'New York', 'San Francisco']

# Layout of the raw dataset on disk, relative to the working directory.
base_dir = os.path.join("data", "yelp-vistanet")
raw_dir = os.path.join(base_dir, "raw")
raw_train_file, raw_valid_file = (
    os.path.join(raw_dir, file_name) for file_name in ("train.json", "valid.json")
)
raw_test_files = [
    os.path.join(raw_dir, "test", city + "_test.json")
    for city in cities
]

# Display the resolved paths as the cell output.
base_dir, raw_dir, raw_train_file, raw_valid_file, raw_test_files

('data/yelp-vistanet',
 'data/yelp-vistanet/raw',
 'data/yelp-vistanet/raw/train.json',
 'data/yelp-vistanet/raw/valid.json',
 ['data/yelp-vistanet/raw/test/Boston_test.json',
  'data/yelp-vistanet/raw/test/Chicago_test.json',
  'data/yelp-vistanet/raw/test/Los Angeles_test.json',
  'data/yelp-vistanet/raw/test/New York_test.json',
  'data/yelp-vistanet/raw/test/San Francisco_test.json'])

In [3]:
# Root directory of the photo files. check_photo() looks below it for
# "<first two characters of the id>/<id>.jpg".
photos_dir = os.path.join(base_dir, "photos")
photos_dir

'data/yelp-vistanet/photos'

In [5]:
def check_photo(_id:str):
    """Return True if the image file for photo ``_id`` exists on disk.

    The file is looked up at ``<photos_dir>/<first two chars of _id>/<_id>.jpg``.
    """
    shard = _id[:2]
    file_name = _id + ".jpg"
    return os.path.exists(os.path.join(photos_dir, shard, file_name))

In [9]:
def read_reviews(file_path:str, clean_data:bool) -> List[Dict[str, Any]]:
    """Load reviews from a JSON-lines file or from a pickle file.

    For a ``.json`` file, every line is parsed as one JSON object and reduced
    to a dict with the keys ``_id``, ``Text``, ``Photos`` (list of photo ids),
    ``Captions`` (list of captions, index-aligned with ``Photos``) and
    ``Rating``. For a ``.pickle`` file, the unpickled object is returned as-is.

    Args:
        file_path: Path to the input file; must end in ``.json`` or ``.pickle``.
        clean_data: Only used for ``.json`` input. When true, a photo and its
            caption are kept only if ``check_photo`` reports the photo exists.

    Returns:
        The list of review dicts (or whatever object the pickle file holds).

    Raises:
        RuntimeError: If ``file_path`` has neither of the supported extensions.
    """
    if file_path.endswith(".json"):
        reviews = []
        with open(file_path, 'r', encoding="utf-8") as f:
            for line in tqdm(f, "Read json"):
                review = json.loads(line)
                imgs = []
                captions = []
                for photo in review['Photos']:
                    _id = photo['_id']
                    caption = photo["Caption"]
                    # Without cleaning every photo is kept; with cleaning only
                    # photos whose file is present are kept.
                    if not clean_data or check_photo(_id):
                        imgs.append(_id)
                        captions.append(caption)
                reviews.append({'_id': review['_id'],
                                'Text': review['Text'],
                                'Photos': imgs,
                                'Captions': captions,
                                'Rating': review['Rating']})
        return reviews

    # Previously this branch tested an undefined name (`file_pathle`), so every
    # non-JSON path raised NameError instead of being handled.
    if file_path.endswith(".pickle"):
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # pickle files that come from a trusted source.
        with open(file_path, 'rb') as f:
            return pickle.load(f)

    raise RuntimeError("Illegal file path!")

In [10]:
# Every raw file to ingest: train first, then validation, then the per-city test files.
pathes = [raw_train_file, raw_valid_file] + list(raw_test_files)
pathes

['data/yelp-vistanet/raw/train.json',
 'data/yelp-vistanet/raw/valid.json',
 'data/yelp-vistanet/raw/test/Boston_test.json',
 'data/yelp-vistanet/raw/test/Chicago_test.json',
 'data/yelp-vistanet/raw/test/Los Angeles_test.json',
 'data/yelp-vistanet/raw/test/New York_test.json',
 'data/yelp-vistanet/raw/test/San Francisco_test.json']

In [11]:
# Pool the reviews of all raw files into one list. clean_data=True keeps only
# the photos for which check_photo() finds a file.
total = []
for path in pathes:
    total.extend(read_reviews(path, True))
len(total)

Read json: 35435it [00:09, 3791.53it/s] 
Read json: 2215it [00:00, 15372.57it/s]
Read json: 315it [00:00, 8599.75it/s]
Read json: 325it [00:00, 10662.26it/s]
Read json: 3730it [00:00, 19220.10it/s]
Read json: 1715it [00:00, 17650.33it/s]
Read json: 570it [00:00, 14839.91it/s]


44305

In [12]:
# Peek at the first pooled review to inspect the record layout.
total[0]

{'_id': 'VaGecZmwWgbneUqCjv_RQQ',
 'Text': "this place is really nice and quiet , the design made me feel like i was just entered into european house .|||the place is cozy and full of items floor to ceiling .|||i have ordered salad and pasta with alfredo sauce the pasta was really good how ever the salad was a bit boring since there was too much lettuce .|||the menu was n't clear and it was a bit difficult for me to read it .|||overall my experience was good the bad part was when i received the check and they added 20 % tip to the check .|||( dont get me wrong i would tip the guys how ever i think its my decision of how much i want to tip .",
 'Photos': ['e_RnludZr4L0d51KB7wLmA',
  'URRZ3k3weK4-eNbJLBlZNQ',
  'I4MAmxOVzx-m3EYgiA9yww'],
 'Captions': ['hoto of Benito One - New York, NY, United States',
  'Please let me decide how much tip I want to leave',
  'hoto of Benito One - New York, NY, United States'],
 'Rating': 2}

In [13]:
# Shuffle the pooled reviews in place, using the `random` module's global RNG
# (seeded in the first cell). Because `total` is mutated, re-running this cell
# on its own shuffles the already-shuffled list again.
random.shuffle(total)
total[0]

{'_id': 'eGMAM2WzRbGvgMevwM4ahw',
 'Text': "i always feel unsure giving a place 5 stars after only one visit but i feel like soba-ya would not disappoint .|||the 3 of us got their early on a sunday , pretty much right when it opened and i would say within an hour or so the place was filled and people were already waiting for a table .|||so unless you want an early dinner be prepared to have to wait .|||i 'm not a soba expert in any way shape or form but the mori was amazing .|||as someone who has eaten ramen all his life and has read about ramen and more or less immersed himself in the culture of it , i feel this is exactly what really good soba would be like .|||the menu can be a little daunting given how big it is but i guess you kinda just have to go in there with a game plan .|||the rest of our dinner ranged from fried chicken to sauteed duck to mushroom tempura , which btw i 'm i fell in love with and would go back just for that .|||service was prompt and super friendly .|||do n't

In [14]:
# Re-split the pooled reviews: the last fifth becomes the test set, the fifth
# before it the validation set, and everything else (including the remainder of
# the integer division) the training set.
dividing_point = len(total) // 5
# Use non-negative slice bounds. With negative bounds a dividing_point of 0
# (fewer than 5 reviews) turns `-0` into 0, which left "train" empty and put
# every review into "test".
train_end = len(total) - 2 * dividing_point
valid_end = len(total) - dividing_point
clear_data = {
    "train": total[:train_end],
    "valid": total[train_end:valid_end],
    "test": total[valid_end:],
}
len(clear_data["train"]), len(clear_data["valid"]), len(clear_data["test"])

(26583, 8861, 8861)

In [16]:
# Output directory and pickle file for the re-split dataset.
_622_base_dir = os.path.join(base_dir, "622data")
clear_data_file = os.path.join(_622_base_dir, "clear_data.pickle")
clear_data_file

'data/yelp-vistanet/622data/clear_data.pickle'

In [17]:
# Persist the train/valid/test split to disk.
# open(..., "wb") does not create missing parent directories, so make sure the
# output directory exists first; otherwise a fresh run fails with FileNotFoundError.
os.makedirs(os.path.dirname(clear_data_file) or ".", exist_ok=True)
with open(clear_data_file, "wb") as o:
    pickle.dump(clear_data, o, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:
def get_imgs(reviews):
    """Return one flat list with the ``"Photos"`` entries of all ``reviews``.

    The entries are concatenated in review order; a new list is returned and
    the input reviews are not modified.
    """
    return [photo_id for review in reviews for photo_id in review["Photos"]]

In [19]:
# Photo ids referenced by the reviews of every split, in train -> valid -> test order.
all_imgs_id = [
    img_id
    for split_name in ("train", "valid", "test")
    for img_id in get_imgs(clear_data[split_name])
]
len(all_imgs_id)

165427