In [1]:
import os
import pickle
import jsonlines
import pandas as pd
import numpy as np
import json
import copy
from tqdm import tqdm

In [2]:
data = json.load(open("./handled/item2attributes_B.json", "r"))

In [3]:
len(data)

2847

In [4]:
example_dict = {}
for item_dict in tqdm(data.values()):
    example_dict.update(item_dict)

100%|██████████| 2847/2847 [00:00<00:00, 492602.76it/s]


In [5]:
id_map = json.load(open("./handled/id_map.json", "r"))["item_dict"]["1"]["str2id"]
title_data = {}
for key, value in tqdm(data.items()):
    title_data[id_map[key]] = value["title"][:100]

100%|██████████| 2847/2847 [00:00<00:00, 696522.60it/s]


In [6]:
title_list = []
for id in range(1, len(id_map)+1):
    if id not in title_data.keys():
        title_list.append("no name")
    else:
        title_list.append(title_data[id])

assert len(title_list) == len(id_map)

with open("./handled/title_B.pkl", "wb") as f:
    pickle.dump(title_list, f)

In [7]:
# the number of items that do not have name
print("the number of items that do not have name: {}".format(len(id_map.values()) - len(data)))

the number of items that do not have name: 5


In [8]:
example_dict.keys()

dict_keys(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2', 'brand', 'feature', 'rank', 'also_view', 'main_cat', 'similar_item', 'date', 'price', 'asin', 'imageURL', 'imageURLHighRes', 'details'])

In [9]:
example_dict["description"][0]

'BRENDA BLETHYN is Vera Stanhope, a sharp detective with a messy life Blethyn is wonderful. ???The Times (U.K.) A first-rate mystery series ???Library Journal Captivating ???Library Bookwatch Classy crime drama ???The Daily Telegraph (U.K.) Two-time Oscar nominee Brenda Blethyn (Pride & Prejudice, Secrets & Lies) returns as Vera Stanhope???a police detective with a disheveled exterior, a sharp tongue, and an uncanny ability to solve crimes. Tough, dedicated, and more than a little irreverent, DCI Stanhope is assisted by her long-suffering sergeant, Joe Ashworth (David Leon, RocknRolla); DC Kenny Lockhart (Jon Morrison, High Times); and forensic pathologist Billy Cartwright (Paul Ritter, The Eagle). Inspired by Ann Cleeves???s bestselling mysteries, Vera is set in the Northumberland of the original books. In four new feature-length dramas, Stanhope delves into a world of shadows and suppressed passions where everyone has something to hide. Guest stars include Rose Leslie (Game of Throne

In [10]:
def get_attri(item_str, attri, item_info):

    if attri not in item_info.keys() or len(item_info[attri]) > 100:
        new_str = item_str.replace(f"<{attri.upper()}>", "unknown")
    else:
        new_str = item_str.replace(f"<{attri.upper()}>", str(item_info[attri]))

    return new_str

In [11]:
def get_feat(item_str, feat, item_info):

    if feat not in item_info.keys():
        return ""
    
    assert isinstance(item_info[feat], list)
    feat_str = ""
    for meta_feat in item_info[feat]:
        feat_str = feat_str + meta_feat + "; "
    new_str = item_str.replace(f"<{feat.upper()}>", feat_str)

    if len(new_str) > 128: # avoid exceed the input length limitation
        return new_str[:128]

    return new_str

In [12]:
prompt_template = "The movies and TV item has following attributes: \n name is <TITLE>; brand is <BRAND>; price is <PRICE>, rating is <DATE>, price is <PRICE>. \n"
feat_template = "The item has following features: <CATEGORY>. \n"
desc_template = "The item has following descriptions: <DESCRIPTION>. \n"

In [13]:
item_data = {}
for key, value in tqdm(data.items()):
    item_str = copy.deepcopy(prompt_template)
    item_str = get_attri(item_str, "title", value)
    item_str = get_attri(item_str, "brand", value)
    item_str = get_attri(item_str, "date", value)
    # item_str = get_attri(item_str, "rank", value)
    item_str = get_attri(item_str, "price", value)

    feat_str = copy.deepcopy(feat_template)
    feat_str = get_feat(feat_str, "category", value)
    desc_str = copy.deepcopy(desc_template)
    desc_str = get_feat(desc_str, "description", value)
    
    item_data[key] = item_str + feat_str + desc_str

100%|██████████| 2847/2847 [00:00<00:00, 107392.47it/s]


In [14]:
len_list = []
for item_str in item_data.values():
    len_list.append(len(item_str))
np.mean(len_list)

362.1022128556375

In [15]:
json.dump(item_data, open("./handled/item_str_B_truncate.json", "w"))

In [16]:
# convert to jsonline
def save_data(data_path, data):
    '''write all_data list to a new jsonl'''
    with jsonlines.open("./handled/"+ data_path, "w") as w:
        for meta_data in data:
            w.write(meta_data)

id_map = json.load(open("./handled/id_map.json", "r"))["item_dict"]["1"]["str2id"]
json_data = []
for key, value in item_data.items():
    json_data.append({"input": value, "target": "", "item": key, "item_id": id_map[key]})

json_data = sorted(json_data, key=lambda x: x["item_id"])
save_data("item_str_B_truncate.jsonline", json_data)

In [17]:
import requests
import json

In [18]:
def get_response(prompt):
    url = "https://chatapi.littlewheat.com/v1/embeddings"

    payload = json.dumps({
    "model": "text-embedding-3-large",
    "input": prompt
    })
    headers = {
    'Authorization': 'Bearer sk-L7XOrSKuS5sfsoNYkmj3rSyLahhwBdtG1iefI8xTXFaJ8Oh0',
    'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
    'Content-Type': 'application/json'
    }

    response = requests.request("POST", url, headers=headers, data=payload)
    re_json = json.loads(response.text)

    return re_json["data"][0]["embedding"]

In [19]:
value_list = []

for key, value in tqdm(item_data.items()):
    if len(value) > 4096:
        value_list.append(key)

100%|██████████| 2847/2847 [00:00<00:00, 1365486.96it/s]


In [20]:
if os.path.exists("./handled/item_emb_B.pkl"):    # check whether some item emb exist in cache
    item_emb = pickle.load(open("./handled/item_emb_B.pkl", "rb"))
else:
    item_emb = {}

In [21]:
count = 1
while 1:    # avoid broken due to internet connection
    # 如果数据全部跑完，退出循环
    if len(item_emb) == len(item_data):
        break
        
    try:
        for key, value in tqdm(item_data.items()):
            if key not in item_emb.keys():
                # 截断过长文本
                if len(value) > 4096:
                    value = value[:4095]
                
                # 获取响应
                item_emb[key] = get_response(value)
                
                # 【修改点】每隔 10 次存储一次
                if count % 10 == 0:
                    pickle.dump(item_emb, open("./handled/item_emb_A.pkl", "wb"))
                
                count += 1
                
    except:
        # 如果报错崩溃，依然执行一次保存，作为兜底
        pickle.dump(item_emb, open("./handled/item_emb_A.pkl", "wb"))

 39%|███▉      | 1114/2847 [55:59<1:27:06,  3.02s/it]
100%|██████████| 2847/2847 [1:13:28<00:00,  1.55s/it]  


In [22]:
id_map = json.load(open("./handled/id_map.json", "r"))["item_dict"]["1"]["id2str"]
emb_list = []
for id in range(1, len(id_map)+1):
    try:    # 有一个物品没有属性，给其赋0向量
        meta_emb = item_emb[id_map[str(id)]]
    except:
        meta_emb = [0] * len(list(item_emb.values())[0])
    emb_list.append(meta_emb)

emb_list = np.array(emb_list)
pickle.dump(emb_list, open("./handled/itm_emb_np_B.pkl", "wb"))

In [None]:
# 确保LLM embedding和物品的数量是相同的
assert len(emb_list) == len(id_map)