In [1]:
import sys
sys.path.append("/home/ly/workspace/mmsa")

import json
import os
import pickle
import collections
import numpy as np
from typing import *
from tqdm import tqdm
from copy import deepcopy
from transformers import ElectraTokenizer
# Seed NumPy's global random generator so any NumPy-based randomness in this
# session is repeatable.
seed = 1024
np.random.seed(seed)

In [2]:
# Root directory of the dataset; photo, feature and output paths in the cells
# below are all joined onto it.
base_dir = os.path.join("data","yelp-vistanet")

In [3]:
def load_data(path="data/yelp-vistanet/clear_data.pickle"):
    """Load a pickled dataset from disk.

    Args:
        path: Location of the pickle file. The default is the file this
            notebook has always read, so a bare ``load_data()`` call keeps
            working unchanged.

    Returns:
        The object stored in the pickle file.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # NOTE(security): unpickling can execute arbitrary code, so only point
    # this at files you produced yourself or otherwise trust.
    with open(path, "rb") as source:
        return pickle.load(source)

In [4]:
class YelpElectraDataBuilder(object):
    """Convert a review's text into per-sentence token-id sequences.

    A review is one string whose sentences are separated by
    ``SENT_DELIMITER``. Every non-blank sentence becomes one sequence that
    starts with the tokenizer's ``cls_token`` and ends with its ``sep_token``.
    """

    def __init__(self, vocab_path='pretrained/electra_small/vocab.txt'):
        """Create the builder and its tokenizer.

        Args:
            vocab_path: Vocabulary file handed to ``ElectraTokenizer``.
                Defaults to the location the notebook has always used.
        """
        self.tokenizer = ElectraTokenizer(vocab_path)
        self.SENT_DELIMITER = '|||'

    def tokenize_and_padding(self, text):
        """Split ``text`` into sentences and tokenize each one.

        Despite the method name, no padding is applied here: the returned
        sequences keep their natural, differing lengths.

        Sentences that are empty or contain only whitespace are skipped, so
        no sequence made up solely of the two special tokens is produced.

        Args:
            text: Review text with sentences joined by ``SENT_DELIMITER``.

        Returns:
            A list with one token list per kept sentence, each of the form
            ``[cls_token, *sentence_tokens, sep_token]``.
        """
        # The special tokens are only stored in the lists, never mutated, so
        # they can be shared between sentences without copying.
        cls_token = self.tokenizer.cls_token
        sep_token = self.tokenizer.sep_token
        res = []
        for sent in text.split(self.SENT_DELIMITER):
            if not sent.strip():
                # Splitting can leave empty or blank pieces (e.g. a trailing
                # delimiter); they carry no content.
                continue
            res.append([cls_token, *self.tokenizer.tokenize(sent), sep_token])
        return res

    def build(self, text):
        """Return the token ids of every kept sentence in ``text``.

        Args:
            text: Review text with sentences joined by ``SENT_DELIMITER``.

        Returns:
            A list of id lists, one per sentence returned by
            ``tokenize_and_padding``.
        """
        tokens_list = self.tokenize_and_padding(text)
        return [self.tokenizer.convert_tokens_to_ids(tokens) for tokens in tokens_list]

In [5]:
# Load the pickled review dataset; the cells below index it by the split
# names "train", "valid" and "test".
data = load_data()

In [6]:
# A single builder (and therefore a single tokenizer) shared by every
# conversion run below.
builder = YelpElectraDataBuilder()

In [7]:
def photo_exist(_id):
    """Report whether the image file for a photo id is present on disk.

    The file is looked up at
    ``<base_dir>/light_photos/<first two characters of the id>/<id>.jpg``.

    Args:
        _id: Photo identifier string.

    Returns:
        True when that path exists, False otherwise.
    """
    shard = _id[:2]
    filename = _id + ".jpg"
    return os.path.exists(os.path.join(base_dir, "light_photos", shard, filename))
def build_electra_data(builder, reviews):
    """Convert raw reviews into tokenized records, with a progress bar.

    Args:
        builder: Object whose ``build(text)`` turns a review's text into
            token-id sequences.
        reviews: Iterable of dicts with "Text", "Photos" and "Rating" keys.

    Returns:
        A list with one dict per review: "Text" holds the builder's output,
        "Photos" keeps only the ids for which ``photo_exist`` is true, and
        "Rating" is copied through unchanged.
    """
    return [
        {
            "Text": builder.build(review["Text"]),
            "Photos": [photo_id for photo_id in review["Photos"] if photo_exist(photo_id)],
            "Rating": review["Rating"],
        }
        for review in tqdm(reviews)
    ]

In [8]:
%%time
# Tokenize every split and collect the results in a dict keyed by split name.
electra_data = {}
for key in ["train", "valid", "test"]:
    electra_data[key] = build_electra_data(builder, data[key])

100%|██████████| 35445/35445 [01:11<00:00, 493.56it/s]
100%|██████████| 4430/4430 [00:08<00:00, 502.84it/s]
100%|██████████| 4430/4430 [00:08<00:00, 504.90it/s]

CPU times: user 1min 29s, sys: 992 ms, total: 1min 30s
Wall time: 1min 29s





In [9]:
# Output file for the tokenized dataset; the bare name on the last line makes
# the notebook display the resulting path.
electra_data_path = os.path.join(base_dir, "electra_data.pickle")
electra_data_path

'data/yelp-vistanet/electra_data.pickle'

In [11]:
# Write the tokenized splits to disk using the newest pickle protocol this
# interpreter supports.
with open(electra_data_path, "wb") as w:
    pickle.dump(electra_data, w, protocol=pickle.HIGHEST_PROTOCOL)

In [17]:
def load_vgg_features(i):
    """Load the stored feature array for one photo id, if there is one.

    The array is read from
    ``<base_dir>/raw/photo_features/<first two characters of the id>/<id>.npy``
    (presumably VGG features, going by the function name).

    Original author's note: in practice every review has exactly three photos.

    Args:
        i: Photo identifier string.

    Returns:
        The array loaded with ``np.load``, or None when the file is missing.
    """
    feature_file = os.path.join(base_dir, "raw", "photo_features", i[:2], i + ".npy")
    if not os.path.exists(feature_file):
        return None
    return np.load(feature_file)
def build_electra_and_vgg_data(tokenizer, reviews: List[dict]):
    """Convert raw reviews into records of token ids plus photo features.

    Args:
        tokenizer: Object whose ``build(text)`` turns a review's text into
            token-id sequences. Despite the parameter name, the notebook
            passes a ``YelpElectraDataBuilder`` here.
        reviews: Review dicts with "Text", "Photos" and "Rating" keys.

    Returns:
        A list with one dict per review: "Text" holds the output of
        ``tokenizer.build``, "Photos" holds the arrays returned by
        ``load_vgg_features`` for the photo ids that have a feature file
        (ids without one are dropped), and "Rating" is copied through.
    """
    res = []
    for review in tqdm(reviews):
        # Use the object that was passed in rather than a notebook-level
        # global, so the function works with whatever the caller supplies.
        d = {"Text": tokenizer.build(review["Text"])}
        d["Photos"] = [
            features
            for features in map(load_vgg_features, review["Photos"])
            if features is not None
        ]
        d["Rating"] = review["Rating"]
        res.append(d)
    return res

In [19]:
%%time
# Build token ids plus photo features for every split, keyed by split name.
# The shared `builder` is what gets passed as the function's first argument.
electra_vgg_data = {}
for key in ["train", "valid", "test"]:
    electra_vgg_data[key] = build_electra_and_vgg_data(builder, data[key])

100%|██████████| 35445/35445 [22:38<00:00, 26.09it/s]
100%|██████████| 4430/4430 [02:48<00:00, 26.28it/s]
100%|██████████| 4430/4430 [02:46<00:00, 26.55it/s]

CPU times: user 6min 54s, sys: 27 s, total: 7min 21s
Wall time: 28min 13s





In [20]:
# Output file for the combined text + photo-feature dataset; the bare name on
# the last line makes the notebook display the resulting path.
electra_vgg_data_path = os.path.join(base_dir, "electra_vgg_data.pickle")
electra_vgg_data_path

'data/yelp-vistanet/electra_vgg_data.pickle'

In [22]:
%%time
with open(electra_vgg_data_path, "wb") as w:
    pickle.dump(electra_vgg_data, w, protocol=pickle.HIGHEST_PROTOCOL)

CPU times: user 1.74 s, sys: 2.06 s, total: 3.8 s
Wall time: 16.2 s
