In [6]:
import json
import pickle
from scipy.sparse import coo_matrix
import os
import numpy as np
from sklearn.model_selection import train_test_split

# 加载ui_dict.json
with open('../datasets/yelp_tiny/ui_dict.json', 'r') as f:
    ui_dict = json.load(f)

# 创建用户和项目的映射
user2id = {user: idx for idx, user in enumerate(ui_dict.keys())}
item_set = set(item for items in ui_dict.values() for item in items.keys())
item2id = {item: idx for idx, item in enumerate(item_set)}

# 构建数据集并采样 1/3 的交互
rows, cols, data = [], [], []
for user, items in ui_dict.items():
    for item in items.keys():
        rows.append(user2id[user])
        cols.append(item2id[item])
        data.append(1)  # 所有评分置为1

# 随机采样 1/3 的交互
total_interactions = len(data)
sample_size = total_interactions // 3
sample_indices = np.random.choice(total_interactions, sample_size, replace=False)
rows = [rows[i] for i in sample_indices]
cols = [cols[i] for i in sample_indices]
data = [data[i] for i in sample_indices]

# 划分训练集、验证集和测试集
indices = np.arange(len(data))
train_idx, test_idx = train_test_split(indices, test_size=0.3, random_state=42)
val_idx, test_idx = train_test_split(test_idx, test_size=1/3, random_state=42)

train_rows = [rows[i] for i in train_idx]
train_cols = [cols[i] for i in train_idx]
train_data = [data[i] for i in train_idx]

val_rows = [rows[i] for i in val_idx]
val_cols = [cols[i] for i in val_idx]
val_data = [data[i] for i in val_idx]

test_rows = [rows[i] for i in test_idx]
test_cols = [cols[i] for i in test_idx]
test_data = [data[i] for i in test_idx]

# 构建稀疏矩阵
train_matrix = coo_matrix((train_data, (train_rows, train_cols)))
val_matrix = coo_matrix((val_data, (val_rows, val_cols)))
test_matrix = coo_matrix((test_data, (test_rows, test_cols)))

# 保存稀疏矩阵和映射文件
os.makedirs('../mydatasets/yelp/', exist_ok=True)
with open('../mydatasets/yelp/trnMat.pkl', 'wb') as f:
    pickle.dump(train_matrix, f)
with open('../mydatasets/yelp/valMat.pkl', 'wb') as f:
    pickle.dump(val_matrix, f)
with open('../mydatasets/yelp/tstMat.pkl', 'wb') as f:
    pickle.dump(test_matrix, f)
with open('../mydatasets/yelp/user2id.json', 'w') as f:
    json.dump(user2id, f)
with open('../mydatasets/yelp/item2id.json', 'w') as f:
    json.dump(item2id, f)

In [7]:
import pickle

def ds_info(path):
    with open(path, 'rb') as f:
        matrix = pickle.load(f)
    print('Matrix shape:', matrix.shape)
    print('Number of non-zero entries:', matrix.nnz)

ds_info('../mydatasets/yelp/trnMat.pkl')
ds_info('../mydatasets/yelp/valMat.pkl')
ds_info('../mydatasets/yelp/tstMat.pkl')

Matrix shape: (37397, 32491)
Number of non-zero entries: 165008
Matrix shape: (37395, 32490)
Number of non-zero entries: 47145
Matrix shape: (37390, 32490)
Number of non-zero entries: 23573


In [8]:
import numpy as np
from safetensors.torch import load_file

# 加载多模态特征文件
image_embs = load_file('../datasets/yelp_tiny/item_image_emb.safetensors')
text_embs = load_file('../datasets/yelp_tiny/item_text_embs.safetensors')

# 筛选出交互数据中的项目并按照item_id顺序堆叠
valid_items = set(item2id.keys())

image_features = []
text_features = []
for item, idx in sorted(item2id.items(), key=lambda x: x[1]):
    if item in valid_items:
        image_features.append(image_embs[item].numpy())
        text_features.append(text_embs[item].numpy())

image_features = np.stack(image_features, axis=0)  # (item_num, dim)
text_features = np.stack(text_features, axis=0)  # (item_num, dim)

# 保存为.npy格式
np.save('../mydatasets/yelp/image_feat.npy', image_features)
np.save('../mydatasets/yelp/text_feat.npy', text_features)

In [None]:
import numpy as np

def modal_info(path):
    data = np.load(path)
    print('Data shape:', data.shape)
    print('Data type:', data.dtype)

modal_info('../mydatasets/yelp/image_feat.npy')
modal_info('../mydatasets/yelp/text_feat.npy')

: 