In [22]:
import json
import os
import pickle
from tqdm import tqdm
from typing import *
import random
import numpy as np
import shutil
from multiprocessing import Pool
# Seed both NumPy's and the standard library's global random generators with
# the same fixed value.
seed = 1997 # seed value
np.random.seed(seed)
random.seed(seed)

In [32]:
# City names; each one is used to build the name of a per-city test file
# ("<city>_test.json").
cities = ['Boston', 'Chicago', 'Los Angeles', 'New York', 'San Francisco']

In [3]:
# Dataset root and the raw data directory beneath it.
base_dir = os.path.join("data","yelp-vistanet")
raw_dir = os.path.join(base_dir, "raw")
# Raw train/valid files, plus one raw test file per city under raw/test/.
raw_train_file = os.path.join(raw_dir, "train.json")
raw_valid_file = os.path.join(raw_dir, "valid.json")
raw_test_files = [os.path.join(raw_dir, "test", f"{city}_test.json") for city in cities]
# Display the resolved paths.
base_dir, raw_dir, raw_train_file, raw_valid_file, raw_test_files

('data/yelp-vistanet',
 'data/yelp-vistanet/raw',
 'data/yelp-vistanet/raw/train.json',
 'data/yelp-vistanet/raw/valid.json',
 ['data/yelp-vistanet/raw/test/Boston_test.json',
  'data/yelp-vistanet/raw/test/Chicago_test.json',
  'data/yelp-vistanet/raw/test/Los Angeles_test.json',
  'data/yelp-vistanet/raw/test/New York_test.json',
  'data/yelp-vistanet/raw/test/San Francisco_test.json'])

In [4]:
# Directory holding the raw photos.
raw_photos_dir = os.path.join(raw_dir, "photos")
# Destination directory for copied photos. Derived from base_dir (instead of
# re-spelling its components) so the two paths cannot drift apart.
light_photos_dir = os.path.join(base_dir, "light_photos")
# Display the resolved paths.
raw_photos_dir, light_photos_dir

('data/yelp-vistanet/raw/photos', 'data/yelp-vistanet/light_photos')

In [5]:
def check_photo(_id:str, mv:bool=False):
    """Report whether the raw photo for ``_id`` exists, optionally copying it.

    The photo is looked up at
    ``<raw_photos_dir>/<first two characters of _id>/<_id>.jpg``.

    Args:
        _id: Photo identifier; its first two characters name the sub-directory.
        mv: When true, copy the photo (the source file is left in place) to
            the same relative location under ``light_photos_dir``.

    Returns:
        ``True`` if the raw photo file exists, ``False`` otherwise.
    """
    path = os.path.join(raw_photos_dir, _id[:2], _id + ".jpg")
    if not os.path.exists(path):
        return False
    if mv:  # a copy was requested
        dest = os.path.join(light_photos_dir, _id[:2])
        # exist_ok avoids a check-then-create race when several worker
        # processes copy into the same sub-directory at the same time.
        os.makedirs(dest, exist_ok=True)
        shutil.copyfile(path, os.path.join(dest, _id + ".jpg"))
    return True

In [6]:
def read_reviews(file_path:str, clean_data:bool=False) -> List[Dict[str, Any]]:
    """Load reviews from a JSON-lines file or from a pickle file.

    For a ``.json`` path, every non-blank line is parsed as one review object
    and reduced to the keys ``_id``, ``Text``, ``Photos`` (list of photo ids),
    ``Captions`` (list of captions, parallel to ``Photos``) and ``Rating``.
    For a ``.pickle`` path, the unpickled object is returned as is.

    Args:
        file_path: Path ending in ``.json`` or ``.pickle``.
        clean_data: Only used for ``.json`` input. When true, a photo (and its
            caption) is kept only if ``check_photo`` reports that it exists.

    Returns:
        The list of review dictionaries (or whatever object the pickle holds).

    Raises:
        RuntimeError: If ``file_path`` has neither supported extension.
    """
    if file_path.endswith(".json"):
        reviews = []
        with open(file_path, 'r', encoding="utf-8") as f:
            for line in tqdm(f, "Read json"):
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                review = json.loads(line)
                imgs = []
                captions = []
                for photo in review['Photos']:
                    _id = photo['_id']
                    caption = photo["Caption"]
                    # When cleaning, drop photos that check_photo rejects.
                    if clean_data and not check_photo(_id, False):
                        continue
                    imgs.append(_id)
                    captions.append(caption)
                reviews.append({'_id': review['_id'],
                                'Text': review['Text'],
                                'Photos': imgs,
                                'Captions': captions,
                                'Rating': review['Rating']})
    elif file_path.endswith(".pickle"):
        with open(file_path, 'rb') as f:
            # NOTE(security): pickle.load can execute arbitrary code; only
            # use this branch on files produced by a trusted source.
            reviews = pickle.load(f)
    else:
        raise RuntimeError("Illegal file path!")
    return reviews

In [7]:
# All raw review files: train, valid, then every per-city test file.
pathes = [raw_train_file, raw_valid_file, *raw_test_files]
pathes

['data/yelp-vistanet/raw/train.json',
 'data/yelp-vistanet/raw/valid.json',
 'data/yelp-vistanet/raw/test/Boston_test.json',
 'data/yelp-vistanet/raw/test/Chicago_test.json',
 'data/yelp-vistanet/raw/test/Los Angeles_test.json',
 'data/yelp-vistanet/raw/test/New York_test.json',
 'data/yelp-vistanet/raw/test/San Francisco_test.json']

In [8]:
# Read every raw file with cleaning enabled and pool the reviews of all
# files into a single list, in file order.
total = []
for path in pathes:
    total.extend(read_reviews(path, True))
len(total)

Read json: 35435it [00:01, 24520.17it/s]
Read json: 2215it [00:00, 27590.83it/s]
Read json: 315it [00:00, 27188.66it/s]
Read json: 325it [00:00, 21412.28it/s]
Read json: 3730it [00:00, 25033.81it/s]
Read json: 1715it [00:00, 26424.91it/s]
Read json: 570it [00:00, 23662.62it/s]


44305

In [9]:
# Inspect the first pooled review.
total[0]

{'_id': 'VaGecZmwWgbneUqCjv_RQQ',
 'Text': "this place is really nice and quiet , the design made me feel like i was just entered into european house .|||the place is cozy and full of items floor to ceiling .|||i have ordered salad and pasta with alfredo sauce the pasta was really good how ever the salad was a bit boring since there was too much lettuce .|||the menu was n't clear and it was a bit difficult for me to read it .|||overall my experience was good the bad part was when i received the check and they added 20 % tip to the check .|||( dont get me wrong i would tip the guys how ever i think its my decision of how much i want to tip .",
 'Photos': ['e_RnludZr4L0d51KB7wLmA',
  'URRZ3k3weK4-eNbJLBlZNQ',
  'I4MAmxOVzx-m3EYgiA9yww'],
 'Captions': ['hoto of Benito One - New York, NY, United States',
  'Please let me decide how much tip I want to leave',
  'hoto of Benito One - New York, NY, United States'],
 'Rating': 2}

In [10]:
# Shuffle the pooled reviews in place using the seeded global `random` state.
# NOTE: because the shuffle is in place, running this cell more than once
# yields a different order than running it exactly once.
random.shuffle(total)
total[0]

{'_id': 'ZKAOXX0Zc3-G-s1c_uBTUw',
 'Text': "after reading these reviews , i do n't know if i had the displeasure of going into the same store as everybody else .|||it was my first time here and the only reason we went in was to purchase some of their roasted almonds ( in shell ) .|||i tried some while visiting an elderly couple who i used to live next to , back in glendale days , and i really enjoyed these nuts.so much so that i wanted to buy some to take home .|||from the outside , it looks super nice.costco status .|||i was waiting to be wowed by the spectacularness of the inside .|||walking in , i felt like i was in an asian supermarket ; similar to giant grocery stores that have little shops and restaurants inside .|||golden farms was big and stocked but it looked dirty .|||there were women mopping the floors but it looked like the gunk on the floor was pretty stuck on .|||all in all , i would have gave them three stars because they can not control who walks into the store and what

In [11]:
# Split the reviews 8:1:1 — the last tenth becomes "test", the tenth before
# it "valid", and everything earlier "train".
n_total = len(total)
dividing_point = n_total // 10
# Slice with explicit non-negative bounds: with fewer than 10 reviews
# dividing_point is 0, and negative-zero slices such as total[:-0] would leave
# "train" empty while total[-0:] would put every review into "test".
clear_data = {}
clear_data["train"] = total[:n_total - 2*dividing_point]
clear_data["valid"] = total[n_total - 2*dividing_point:n_total - dividing_point]
clear_data["test"] = total[n_total - dividing_point:]
len(clear_data["train"]), len(clear_data["valid"]), len(clear_data["test"])

(35445, 4430, 4430)

In [12]:
# Path of the pickle file that will hold the train/valid/test split.
clear_data_file = os.path.join(base_dir, "clear_data.pickle")
clear_data_file

'data/yelp-vistanet/clear_data.pickle'

In [14]:
# Serialize the split dictionary to clear_data_file using the highest
# pickle protocol available.
with open(clear_data_file, "wb") as out_file:
    pickle.dump(clear_data, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:
def get_imgs(reviews):
    """Return one flat list with the "Photos" entries of every review, in order."""
    return [photo_id for review in reviews for photo_id in review["Photos"]]

In [19]:
# Gather the photo ids of all three splits: train first, then valid, then test.
all_imgs_id = get_imgs(clear_data["train"] + clear_data["valid"] + clear_data["test"])
len(all_imgs_id)

165427

In [25]:
# Number of chunks the photo ids are divided into; the same value is used as
# the worker count of the process pool further down.
njobs = 10
# One (initially empty) bucket of photo ids per job.
split_imgs_id = [[] for _ in range(njobs)]
split_imgs_id

[[], [], [], [], [], [], [], [], [], []]

In [26]:
# Deal the photo ids round-robin into njobs chunks: chunk k receives the ids
# at positions k, k + njobs, k + 2*njobs, ...
# The chunks are rebuilt from scratch so this cell is idempotent; appending to
# the existing lists would duplicate every id if the cell were run twice.
split_imgs_id = [all_imgs_id[k::njobs] for k in range(njobs)]

In [27]:
def cp_imgs(imgs_id):
    """Call ``check_photo`` with its second argument set to ``True`` for each id.

    Returns the number of ids for which ``check_photo`` returned a true value.
    """
    return sum(check_photo(photo_id, True) for photo_id in imgs_id)

In [30]:
# Process the chunks in parallel: one asynchronous cp_imgs task per chunk.
# The context manager ensures the worker processes are cleaned up even if
# submitting a task raises.
with Pool(processes=njobs) as pool:
    result = [pool.apply_async(cp_imgs, (chunk,)) for chunk in split_imgs_id]
    pool.close()  # no further tasks will be submitted
    pool.join()   # wait until every submitted task has finished
result

[<multiprocessing.pool.ApplyResult at 0x7f8f50a5a150>,
 <multiprocessing.pool.ApplyResult at 0x7f8f50a5a210>,
 <multiprocessing.pool.ApplyResult at 0x7f8f50a5a750>,
 <multiprocessing.pool.ApplyResult at 0x7f8f50a5a810>,
 <multiprocessing.pool.ApplyResult at 0x7f8f50a5a890>,
 <multiprocessing.pool.ApplyResult at 0x7f8f50a5aa10>,
 <multiprocessing.pool.ApplyResult at 0x7f8f50a5ab10>,
 <multiprocessing.pool.ApplyResult at 0x7f8f50a5ac10>,
 <multiprocessing.pool.ApplyResult at 0x7f8f50a5ad10>,
 <multiprocessing.pool.ApplyResult at 0x7f8f50a5a9d0>]

In [31]:
# Retrieve the value returned by each asynchronous task, in submission order.
[job.get() for job in result]

[16543, 16543, 16543, 16543, 16543, 16543, 16543, 16542, 16542, 16542]