In [6]:
import json
import pickle
from scipy.sparse import coo_matrix
import os
import numpy as np
from sklearn.model_selection import train_test_split

# Load ui_dict.json: a mapping from each user to a dict keyed by item.
with open('../datasets/yelp_tiny/ui_dict.json', 'r') as f:
    ui_dict = json.load(f)

# Build contiguous integer ids for users and items.
# Users keep the key order of the JSON file. Items are collected into a set
# (to de-duplicate) and then sorted: iterating a set of strings directly gives
# an order that can change between interpreter runs, which would make the
# saved item2id mapping and anything indexed by item id non-reproducible.
user2id = {user: idx for idx, user in enumerate(ui_dict.keys())}
item_set = set(item for items in ui_dict.values() for item in items.keys())
item2id = {item: idx for idx, item in enumerate(sorted(item_set))}

# Flatten the interactions into (row, col, value) triplets.
rows, cols, data = [], [], []
for user, items in ui_dict.items():
    for item in items.keys():
        rows.append(user2id[user])
        cols.append(item2id[item])
        data.append(1)  # every interaction value is set to 1

# Randomly keep one third of the interactions (sampling without replacement).
# A locally seeded generator makes the sampled subset identical on every run
# and independent of whatever state the global NumPy RNG happens to be in.
total_interactions = len(data)
sample_size = total_interactions // 3
sample_rng = np.random.default_rng(42)
sample_indices = sample_rng.choice(total_interactions, sample_size, replace=False)
rows = [rows[i] for i in sample_indices]
cols = [cols[i] for i in sample_indices]
data = [data[i] for i in sample_indices]

# Split the sampled interactions into train / validation / test.
# 30% is held out first; one third of that hold-out becomes the test set and
# the remainder the validation set (roughly 70% / 20% / 10% overall).
indices = np.arange(len(data))
train_idx, holdout_idx = train_test_split(indices, test_size=0.3, random_state=42)
val_idx, test_idx = train_test_split(holdout_idx, test_size=1/3, random_state=42)


def _gather(seq, positions):
    """Return the elements of ``seq`` at ``positions`` as a new list."""
    return [seq[pos] for pos in positions]


train_rows, train_cols, train_data = (_gather(seq, train_idx) for seq in (rows, cols, data))
val_rows, val_cols, val_data = (_gather(seq, val_idx) for seq in (rows, cols, data))
test_rows, test_cols, test_data = (_gather(seq, test_idx) for seq in (rows, cols, data))

# Build the sparse interaction matrices.
# The shape is passed explicitly. When it is omitted, coo_matrix infers the
# shape from the largest row/column index present in that particular split,
# so the three splits can come out with different shapes, and users/items
# that lost all their interactions during sampling would be dropped from the
# matrix even though they still have an id in user2id / item2id.
matrix_shape = (len(user2id), len(item2id))
train_matrix = coo_matrix((train_data, (train_rows, train_cols)), shape=matrix_shape)
val_matrix = coo_matrix((val_data, (val_rows, val_cols)), shape=matrix_shape)
test_matrix = coo_matrix((test_data, (test_rows, test_cols)), shape=matrix_shape)

# Persist the split matrices (pickle) and the id mappings (JSON).
os.makedirs('../mydatasets/yelp/', exist_ok=True)

for pkl_path, split_matrix in (
    ('../mydatasets/yelp/trnMat.pkl', train_matrix),
    ('../mydatasets/yelp/valMat.pkl', val_matrix),
    ('../mydatasets/yelp/tstMat.pkl', test_matrix),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(split_matrix, f)

for json_path, id_mapping in (
    ('../mydatasets/yelp/user2id.json', user2id),
    ('../mydatasets/yelp/item2id.json', item2id),
):
    with open(json_path, 'w') as f:
        json.dump(id_mapping, f)

In [7]:
import pickle

def ds_info(path):
    """Print the shape and non-zero count of a pickled matrix.

    Args:
        path: Path of a pickle file whose content exposes ``shape`` and ``nnz``.

    Returns:
        The ``nnz`` value of the loaded matrix.
    """
    with open(path, 'rb') as fh:
        mat = pickle.load(fh)
    print('Matrix shape:', mat.shape)
    nonzero_count = mat.nnz
    print('Number of non-zero entries:', nonzero_count)
    return nonzero_count

# Report every ifashion split, then the interaction count summed over all of them.
files = [
    '../mydatasets/ifashion/trnMat.pkl',
    '../mydatasets/ifashion/valMat.pkl',
    '../mydatasets/ifashion/tstMat.pkl',
]
all_its = sum(ds_info(split_path) for split_path in files)
print('Total number of interactions:', all_its)

Matrix shape: (38403, 20000)
Number of non-zero entries: 133967
Matrix shape: (38403, 20000)
Number of non-zero entries: 38276
Matrix shape: (38396, 19999)
Number of non-zero entries: 19139
Total number of interactions: 191382


In [8]:
import numpy as np
from safetensors.torch import load_file

# Load the multimodal item feature files (tensors looked up by raw item key).
image_embs = load_file('../datasets/yelp_tiny/item_image_emb.safetensors')
text_embs = load_file('../datasets/yelp_tiny/item_text_embs.safetensors')

# Order the raw item keys by their integer id so that the stacked rows follow
# id order. NOTE(review): item2id is not defined in this cell; it must already
# exist in the kernel from the preprocessing cell.
ordered_items = [item for item, _ in sorted(item2id.items(), key=lambda kv: kv[1])]
# Retained for any later cell that reads it; every ordered item is in this set
# by construction, so it is no longer used as a filter here.
valid_items = set(ordered_items)

# Fail up front with a descriptive error when an item has no embedding,
# instead of a bare KeyError in the middle of the stacking loop.
missing_items = [item for item in ordered_items
                 if item not in image_embs or item not in text_embs]
if missing_items:
    raise KeyError(
        f"{len(missing_items)} item(s) have no image/text embedding, "
        f"e.g. {missing_items[:5]}"
    )

image_features = np.stack([image_embs[item].numpy() for item in ordered_items], axis=0)  # (item_num, dim)
text_features = np.stack([text_embs[item].numpy() for item in ordered_items], axis=0)  # (item_num, dim)

# Save in .npy format.
np.save('../mydatasets/yelp/image_feat.npy', image_features)
np.save('../mydatasets/yelp/text_feat.npy', text_features)

In [None]:
import numpy as np

def modal_info(path):
    """Load the array stored at ``path`` with ``np.load`` and print its shape and dtype."""
    arr = np.load(path)
    for label, value in (('Data shape:', arr.shape), ('Data type:', arr.dtype)):
        print(label, value)

# Inspect both exported feature files.
for feat_path in ('../mydatasets/yelp/image_feat.npy', '../mydatasets/yelp/text_feat.npy'):
    modal_info(feat_path)

: 

## 给师妹缩小的数据集

In [14]:
import numpy as np
import pickle
import os
from scipy.sparse import coo_matrix

# Seed NumPy's global RNG so the np.random.choice draws of users and items
# further down in this cell give the same selection on every run.
np.random.seed(42)

def load_mat(file_path) -> coo_matrix:
    """Unpickle and return the object stored at ``file_path``.

    The return annotation states ``coo_matrix``; the loaded object is returned
    as-is, without any type check or conversion.
    """
    with open(file_path, 'rb') as fh:
        loaded = pickle.load(fh)
    return loaded

# Load the full sports splits and convert them to CSR, which supports the
# row/column index-array slicing used below.
trnMat = load_mat('../Datasets/sports/trnMat.pkl').tocsr()
valMat = load_mat('../Datasets/sports/valMat.pkl').tocsr()
tstMat = load_mat('../Datasets/sports/tstMat.pkl').tocsr()

image_feat = np.load('../Datasets/sports/image_feat.npy')
text_feat = np.load('../Datasets/sports/text_feat.npy')

# Size of the reduced dataset.
num_users = 18164
num_items = 14514

all_users = np.arange(trnMat.shape[0])
all_items = np.arange(trnMat.shape[1])

# Randomly choose the users and items to keep (without replacement).
# np.random.choice raises ValueError if more are requested than exist.
selected_users = np.random.choice(all_users, num_users, replace=False)
selected_items = np.random.choice(all_items, num_items, replace=False)
print(f"Demo: {selected_users[0]}, {selected_items[0]}")

# Apply the same user/item selection, in the same order, to every split.
# Row i / column j of each sub-matrix is selected_users[i] / selected_items[j].
# The result is converted back to COO so the saved files have the same sparse
# format that load_mat is annotated to return for the source files.
train_sub = trnMat[selected_users, :][:, selected_items].tocoo()
valid_sub = valMat[selected_users, :][:, selected_items].tocoo()
test_sub = tstMat[selected_users, :][:, selected_items].tocoo()

# Select the matching rows of the multimodal features.
# Assumes row k of each feature array belongs to item column k - TODO confirm.
image_feat_sub = image_feat[selected_items, :]
text_feat_sub = text_feat[selected_items, :]

os.makedirs('../Datasets/sports_tiny/', exist_ok=True)
with open('../Datasets/sports_tiny/trnMat.pkl', 'wb') as f:
    pickle.dump(train_sub, f)

with open('../Datasets/sports_tiny/valMat.pkl', 'wb') as f:
    pickle.dump(valid_sub, f)

with open('../Datasets/sports_tiny/tstMat.pkl', 'wb') as f:
    pickle.dump(test_sub, f)

np.save('../Datasets/sports_tiny/image_feat.npy', image_feat_sub)
np.save('../Datasets/sports_tiny/text_feat.npy', text_feat_sub)

print("done")

  return pickle.load(f)


Demo: 647, 16170
done


In [12]:
import pickle

def ds_info(path):
    """Print the shape and non-zero count of a pickled matrix.

    Args:
        path: Path of a pickle file whose content exposes ``shape`` and ``nnz``.

    Returns:
        The ``nnz`` value of the loaded matrix.
    """
    with open(path, 'rb') as fh:
        mat = pickle.load(fh)
    print('Matrix shape:', mat.shape)
    nonzero_count = mat.nnz
    print('Number of non-zero entries:', nonzero_count)
    return nonzero_count

In [13]:
# Print the shape and non-zero count of each split of the sports_tiny dataset.
# The last call is a bare expression, so the notebook also echoes its return
# value as the cell output.
ds_info('../Datasets/sports_tiny/trnMat.pkl')
ds_info('../Datasets/sports_tiny/valMat.pkl')
ds_info('../Datasets/sports_tiny/tstMat.pkl')

Matrix shape: (18164, 14514)
Number of non-zero entries: 88175
Matrix shape: (18164, 14514)
Number of non-zero entries: 16132
Matrix shape: (18164, 14514)
Number of non-zero entries: 15401


15401

检查前后数据是否一致

In [16]:
# Check that the reduced image features are exactly the selected rows of the
# original features. The first selected item is read from `selected_items`
# (defined by the sub-sampling cell, which must have run in this kernel)
# rather than from an index copied by hand out of an earlier printout.
origin = np.load('../Datasets/sports/image_feat.npy')
print(origin[selected_items[0]])
new = np.load('../Datasets/sports_tiny/image_feat.npy')
print(new[0])

# Verify every row, not only the one printed above; raises AssertionError on
# any mismatch in shape or values.
np.testing.assert_array_equal(origin[selected_items], new)

[1.48689997 0.         2.39369988 ... 0.         0.         0.43309999]
[1.48689997 0.         2.39369988 ... 0.         0.         0.43309999]


In [4]:
# Print the shape and non-zero count of each split of the full sports dataset.
# The last call is a bare expression, so the notebook also echoes its return
# value as the cell output.
ds_info('../Datasets/sports/trnMat.pkl')
ds_info('../Datasets/sports/valMat.pkl')
ds_info('../Datasets/sports/tstMat.pkl')

Matrix shape: (35598, 18357)
Number of non-zero entries: 218409
Matrix shape: (35598, 18357)
Number of non-zero entries: 40029
Matrix shape: (35598, 18357)
Number of non-zero entries: 37899


  matrix = pickle.load(f)


37899