In [1]:
from json import load
import sys
sys.path.append("/home/ly/workspace/mmsa")

import json
import os
import pickle
import collections
import numpy as np
from typing import *
from tqdm import tqdm

from models import bigru
from utils.tokenization import BasicTokenizer

# Fix NumPy's global RNG seed so the np.random.uniform rows drawn in
# get_yelp_glove_weight are reproducible from a fresh kernel.
seed = 1024
np.random.seed(seed)

In [2]:
# Root directory of the preprocessed Yelp (VistaNet) data. Everything this
# notebook caches (vocab pickle, GloVe .npy matrices, the index/feature
# dataset) is written below it. The assignment previously had no right-hand
# side, which is a SyntaxError; the value matches the path printed for
# glove_vgg_data_path and the directory load_data reads from.
base_dir = "data/yelp-vistanet"

In [3]:
def load_data(path:str="data/yelp-vistanet/clear_data.pickle"):
    """Load the cleaned review splits from a pickle file.

    Args:
        path: Location of the pickle. Defaults to the file this notebook has
            always read, so ``load_data()`` keeps working unchanged.

    Returns:
        Whatever object was pickled. The rest of the notebook indexes it with
        "train" / "valid" / "test" and reads "Text", "Photos" and "Rating"
        from each review.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at files produced by a trusted preprocessing run.
    with open(path, "rb") as r:
        return pickle.load(r)
class YelpSimpleTokenizer(BasicTokenizer):
    """Tokenizer for review text whose sentences are joined by ``|||``.

    Text is first split into sentences on ``SENT_DELIMITER``; every non-empty
    sentence is then tokenized by ``BasicTokenizer.tokenize``. When a
    vocabulary is supplied, ``to_idx`` maps the tokens to integer ids.

    Args:
        vocab: Optional token -> id mapping. Without it only ``tokenize`` can
            be used; ``to_idx`` asserts that it is present.
        do_lower_case: Forwarded unchanged to ``BasicTokenizer.__init__``.
    """

    def __init__(self, vocab:Optional[Dict[str, int]]=None, do_lower_case:bool=True) -> None:
        super().__init__(do_lower_case)
        self.SENT_DELIMITER = '|||'
        self.vocab = vocab
        # Id returned for tokens missing from ``vocab``: len(vocab) + 1. This
        # lines up with build_vocab_from_glove, which numbers tokens from 1
        # and appends "[UNK]" directly after them.
        self.UNK = len(vocab) + 1 if vocab is not None else None

    def tokenize(self, text:str) -> List[List[str]]:
        """Split ``text`` into sentences and tokenize each one.

        Returns:
            A 2-D list: one inner list of tokens per non-empty sentence.
        """
        res = []
        for sent in text.split(self.SENT_DELIMITER):
            # Splitting on the delimiter can yield empty strings; skip them.
            if len(sent) > 0:
                res.append(super().tokenize(sent))
        return res

    def _getidx(self, token:str) -> int:
        """Return the id of ``token``, or ``self.UNK`` when it is not in the vocab."""
        return self.vocab.get(token, self.UNK)

    def to_idx(self, text:str) -> List[List[int]]:
        """Tokenize ``text`` and replace every token by its vocabulary id.

        Returns:
            A 2-D list with the same layout as ``tokenize``: one inner list of
            ids per sentence.

        Raises:
            AssertionError: If the tokenizer was built without a vocab.
        """
        assert self.vocab is not None, "No vocab!"
        return [[self._getidx(token) for token in sent] for sent in self.tokenize(text)]

In [4]:
def count_word_freq(reviews:List[dict], freq_dict:Dict[str, int]=None) -> Dict[str, int]:
    """Count how often each token occurs in the "Text" field of the reviews.

    Args:
        reviews: Review dicts; only their "Text" entry is read.
        freq_dict: Optional running tally. When given it is updated in place
            and returned, which lets several splits share one dictionary.

    Returns:
        Mapping from token to occurrence count.
    """
    counts = {} if freq_dict is None else freq_dict
    splitter = YelpSimpleTokenizer(do_lower_case=True)
    for review in tqdm(reviews, "Count word frequency"):
        for sentence in splitter.tokenize(review["Text"]):
            for word in sentence:
                if word in counts:
                    counts[word] += 1
                else:
                    counts[word] = 1
    return counts

In [5]:
# Load the pickled review splits once; later cells index the result by
# "train" / "valid" / "test".
data = load_data()

In [6]:
# Count tokens over the training split, then keep adding the validation split
# to the same dictionary. The test split is not counted here.
freq_dict = count_word_freq(data["train"])
freq_dict = count_word_freq(data["valid"], freq_dict)

Count word frequency: 100%|██████████| 35445/35445 [00:58<00:00, 610.79it/s]
Count word frequency: 100%|██████████| 4430/4430 [00:07<00:00, 616.02it/s]


In [7]:
def load_vocab_file(path:str):
    """Read a vocabulary file that lists one token per line.

    Args:
        path: Text file to read.

    Returns:
        ``(token2idx, idx2token)``: an ``OrderedDict`` mapping each stripped
        line to its 0-based line number, and the list of stripped lines in
        file order. If a token appears more than once, ``token2idx`` keeps
        the last line number.
    """
    token2idx = collections.OrderedDict()
    idx2token = []
    # Decode explicitly as UTF-8 (as load_glove_weight does) instead of
    # relying on the platform's default encoding.
    with open(path, "r", encoding="utf-8") as r:
        for idx, line in enumerate(tqdm(r)):
            key = line.strip()
            idx2token.append(key)
            token2idx[key] = idx
    return token2idx, idx2token

In [8]:
# Load the full GloVe vocabulary. NOTE(review): both names are reassigned by
# the later build_vocab_from_glove call, so this cell mainly checks that the
# vocabulary file is readable.
token2idx, idx2token = load_vocab_file("pretrained/glove6B/vocab.txt")

400000it [00:00, 1236673.22it/s]


In [11]:
def build_vocab_from_glove(freq_dict:Dict[str, int]):
    """Build the corpus vocabulary restricted to tokens present in GloVe.

    Args:
        freq_dict: Token -> occurrence count for the corpus.

    Returns:
        ``(token2idx, idx2token, glove_idx)`` where

        * ``token2idx`` maps each kept token to an id starting at 1, most
          frequent token first;
        * ``idx2token`` is ``["[PAD]", <kept tokens...>, "[UNK]"]``;
        * ``glove_idx`` holds, per kept token and in the same order, its line
          number in the GloVe vocabulary file (used to pick embedding rows).
    """
    glove_rows, _unused = load_vocab_file(os.path.join("pretrained", "glove6B", "vocab.txt"))
    # Keep only tokens GloVe knows, then order them by descending frequency.
    kept = [pair for pair in freq_dict.items() if pair[0] in glove_rows]
    kept.sort(key=lambda pair: pair[1], reverse=True)
    print(f"There are {len(kept)} words in vocab.")

    token2idx = collections.OrderedDict()
    idx2token = ["[PAD]"]
    glove_idx = []
    for position, (word, _count) in enumerate(kept, start=1):
        token2idx[word] = position
        idx2token.append(word)
        glove_idx.append(glove_rows[word])
    idx2token.append("[UNK]")
    return token2idx, idx2token, glove_idx

In [12]:
# Build the corpus vocabulary from the train + valid frequencies; this
# replaces the token2idx / idx2token loaded from the raw GloVe vocab file.
token2idx, idx2token, glove_idx = build_vocab_from_glove(freq_dict)

400000it [00:00, 1267564.38it/s]


There are 42822 words in vocab.


In [13]:
def save_vocab(token2idx, idx2token, glove_idx):
    """Pickle the three vocabulary structures into one file.

    The objects are stored in a dict under the keys "token2idx", "idx2token"
    and "glove_idx" and written to ``<base_dir>/glove_vocab.pickle`` with the
    highest pickle protocol available.
    """
    target = os.path.join(base_dir, "glove_vocab.pickle")
    bundle = dict(zip(("token2idx", "idx2token", "glove_idx"),
                      (token2idx, idx2token, glove_idx)))
    with open(target, "wb") as sink:
        pickle.dump(bundle, sink, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# Persist the vocabulary so later cells (and other notebooks) can reload it.
save_vocab(token2idx, idx2token, glove_idx)

In [15]:
def load_glove_vocab():
    """Return the object pickled at ``<base_dir>/glove_vocab.pickle``.

    save_vocab writes a dict with the keys "token2idx", "idx2token" and
    "glove_idx" to that location.
    """
    with open(os.path.join(base_dir, "glove_vocab.pickle"), "rb") as source:
        stored = pickle.load(source)
    return stored

In [16]:
# Reload the vocabulary from disk; the remaining cells read it through `vocab`.
vocab = load_glove_vocab()

In [17]:
def load_glove_weight(d:int, num:int=400000):
    """Read a GloVe 6B text file into a dense float32 matrix.

    The file ``pretrained/glove6B/glove.6B.<d>d.txt`` is expected to hold one
    entry per line: the token followed by ``d`` whitespace-separated numbers.
    Row ``i`` of the result is the vector on line ``i`` of the file.

    Args:
        d: Embedding dimension; selects the file and the matrix width.
        num: Number of lines the file must contain. Defaults to 400000, the
            value that used to be hard-coded here.

    Returns:
        ``np.ndarray`` of shape ``(num, d)`` and dtype float32.

    Raises:
        ValueError: If the file has fewer than ``num`` lines (previously the
            missing rows were silently left as uninitialised memory), or if a
            line does not carry exactly ``d`` numbers.
        IndexError: If the file has more than ``num`` lines.
    """
    path = os.path.join("pretrained", "glove6B", "glove.6B." + str(d) + "d.txt")
    weight = np.empty((num, d), dtype=np.float32)
    rows_read = 0
    with open(path, "r", encoding='utf-8') as r:
        # Stream the file line by line rather than calling readlines(), which
        # would hold the whole text file in memory next to the matrix.
        # `total` keeps the progress bar's percentage display.
        for i, line in enumerate(tqdm(r, "Load glove", total=num)):
            values = line.split()
            weight[i] = np.asarray(values[1:], dtype=np.float32)
            rows_read = i + 1
    if rows_read != num:
        raise ValueError(f"Expected {num} vectors in {path}, found {rows_read}.")
    return weight

In [18]:
# Size check: idx2token has two more entries than token2idx / glove_idx
# because "[PAD]" and "[UNK]" are only added to idx2token.
len(vocab["token2idx"]), len(vocab["idx2token"]), len(vocab["glove_idx"])

(42822, 42824, 42822)

In [19]:
def get_yelp_glove_weight(d:int, _uniform:float=0.1):
    """Return the embedding matrix for the corpus vocabulary, building it once.

    The result is cached at ``<base_dir>/glove6B<d>d.npy``; when that file
    exists it is loaded and returned without any recomputation.

    Row layout of a freshly built matrix (``n`` = number of vocabulary tokens):

    * row 0: zeros, for "[PAD]";
    * rows 1..n: the GloVe vectors selected through ``vocab["glove_idx"]``;
    * row n+1: the mean of those vectors, for "[UNK]";
    * rows n+2..n+11: ten rows drawn from U(-_uniform, _uniform) with NumPy's
      global RNG (their intended use is not visible in this function).

    Args:
        d: GloVe dimension to load.
        _uniform: Half-width of the uniform range for the ten extra rows.

    Returns:
        float32 array of shape ``(n + 12, d)``.
    """
    cache_path = os.path.join(base_dir, "glove6B" + str(d) + "d.npy")
    if os.path.exists(cache_path):
        return np.load(cache_path)

    full_glove = load_glove_weight(d)
    vocab_bundle = load_glove_vocab()
    n_words = len(vocab_bundle["token2idx"])
    pad_row = 0
    unk_row = n_words + 1
    n_extra = 10

    matrix = np.empty((unk_row + 1 + n_extra, d), dtype=np.float32)
    matrix[pad_row] = np.zeros(d, dtype=np.float32)
    corpus_vectors = full_glove[vocab_bundle["glove_idx"]]
    matrix[pad_row + 1:unk_row] = corpus_vectors
    matrix[unk_row] = corpus_vectors.mean(axis=0)
    matrix[unk_row + 1:] = np.random.uniform(-_uniform, _uniform, size=(n_extra, d))

    np.save(cache_path, matrix)
    return matrix

In [20]:
# Build (or load from cache) the embedding matrix for each GloVe 6B dimension.
# NOTE(review): `w` is overwritten on every iteration; what persists are the
# .npy files that get_yelp_glove_weight saves under base_dir.
all_d = [50, 100, 200, 300]
for d in all_d:
    w = get_yelp_glove_weight(d)

Load glove: 100%|██████████| 400000/400000 [00:04<00:00, 95054.50it/s] 
Load glove: 100%|██████████| 400000/400000 [00:07<00:00, 52383.67it/s]
Load glove: 100%|██████████| 400000/400000 [00:12<00:00, 30890.69it/s]
Load glove: 100%|██████████| 400000/400000 [00:18<00:00, 21389.14it/s]


In [22]:
# Tokenizer backed by the corpus vocabulary, so to_idx can map tokens to ids.
glove_tokenizer = YelpSimpleTokenizer(vocab["token2idx"], do_lower_case=True)

In [None]:
# Spot check: nested id lists (one list per sentence) for the first training review.
glove_tokenizer.to_idx(data["train"][0]["Text"])

In [24]:
def load_vgg_features(i): # In practice every review has exactly three photos
    """Load the saved feature array of photo ``i``, or None if it is missing.

    Features are looked up at
    ``<base_dir>/raw/photo_features/<first two characters of i>/<i>.npy``.

    Args:
        i: Photo id string.

    Returns:
        The array stored in the .npy file, or ``None`` when no such file exists.
    """
    feature_file = os.path.join(base_dir, "raw", "photo_features", i[:2], i + ".npy")
    if not os.path.exists(feature_file):
        return None
    return np.load(feature_file)
def build_glove_and_vgg_data(tokenizer, reviews:List[dict]):
    """Convert raw reviews into id sequences plus loaded photo features.

    Args:
        tokenizer: Object exposing ``to_idx(text)``.
        reviews: Review dicts with "Text", "Photos" (photo ids) and "Rating".

    Returns:
        One dict per review, in input order, with

        * "Text": ``tokenizer.to_idx`` applied to the review text;
        * "Photos": features returned by load_vgg_features for each photo id,
          skipping ids for which it returns None;
        * "Rating": copied from the input review.
    """
    converted = []
    for review in tqdm(reviews):
        token_ids = tokenizer.to_idx(review["Text"])
        candidates = (load_vgg_features(photo_id) for photo_id in review["Photos"])
        photos = [feat for feat in candidates if feat is not None]
        converted.append({"Text": token_ids, "Photos": photos, "Rating": review["Rating"]})
    return converted

In [25]:
%%time
# Convert every split to id sequences + photo features. Timed because this is
# the slowest step of the notebook (it loads one .npy file per photo).
glove_vgg_data = {}
for key in ["train", "valid", "test"]:
    glove_vgg_data[key] = build_glove_and_vgg_data(glove_tokenizer, data[key])

100%|██████████| 35445/35445 [01:24<00:00, 420.35it/s]
100%|██████████| 4430/4430 [00:10<00:00, 430.21it/s]
100%|██████████| 4430/4430 [00:10<00:00, 415.06it/s]

CPU times: user 1min 42s, sys: 3.21 s, total: 1min 45s
Wall time: 1min 45s





In [26]:
# Destination file for the converted dataset; echoed so the path is visible.
glove_vgg_data_path = os.path.join(base_dir, "glove_vgg_data.pickle")
glove_vgg_data_path

'data/yelp-vistanet/glove_vgg_data.pickle'

In [None]:
# Write all three converted splits into a single pickle file.
with open(glove_vgg_data_path, "wb") as w:
    pickle.dump(glove_vgg_data, w, protocol=pickle.HIGHEST_PROTOCOL)

In [163]:
# Collect the number of successfully loaded photos per review across all
# splits; the final expression shows how many reviews were inspected.
imgs_num = []
for key in ["train", "valid", "test"]:
    for review in glove_vgg_data[key]:
        imgs_num.append(len(review["Photos"]))
len(imgs_num)

44305

In [164]:
# Convert to an array so the counts can be compared element-wise.
a = np.array(imgs_num)

In [165]:
# Number of reviews with exactly three loaded photos; equal to len(imgs_num)
# when every review has three.
(a == 3).sum()

44305