# 处理 tiny 数据集

## ifashion

### 编码映射

先为用户编号，再为项目编号：项目编号紧接在最后一个用户编号之后，二者共用同一个连续的编号空间。

In [22]:
import os
import json

# Load the raw interaction dictionary; per its annotation it maps
# user -> {item -> rating}.
with open('ifashion_tiny/ui_dict.json', 'r', encoding='utf-8') as file:
    ui_dict: dict[str, dict[str, int]] = json.load(file)

user2id: dict[str, int] = {}
item_set: set[str] = set()

# Users are numbered 0..num_users-1 in the order they appear in ui_dict.json.
for user, item_ratings in ui_dict.items():
    user2id[user] = len(user2id)
    item_set.update(item_ratings.keys())

assert user2id, "ui_dict.json contains no users"

# Largest user id; item ids start right after it.
max_user_id = len(user2id) - 1

# Item ids continue after the user ids. The set is sorted first because the
# iteration order of a set of strings changes between interpreter runs
# (hash randomization), which would make item2id.json irreproducible.
item2id: dict[str, int] = {
    item: max_user_id + 1 + offset
    for offset, item in enumerate(sorted(item_set))
}

print("first uid:", list(user2id.values())[0])
print("last uid:", list(user2id.values())[-1])
print("first iid:", list(item2id.values())[0])

os.makedirs("ifashion_tiny/remap_col", exist_ok=True)
with open("ifashion_tiny/remap_col/user2id.json", 'w', encoding='utf-8') as f1:
    json.dump(user2id, f1, ensure_ascii=False)
with open("ifashion_tiny/remap_col/item2id.json", 'w', encoding='utf-8') as f2:
    json.dump(item2id, f2, ensure_ascii=False)

first uid: 0
last uid: 38402
first iid: 38403


In [23]:
# Reload both mapping files from disk and report how many entries each holds.
with open("ifashion_tiny/remap_col/user2id.json", 'r', encoding='utf-8') as user_fp:
    uid = json.loads(user_fp.read())
print(len(uid))

with open("ifashion_tiny/remap_col/item2id.json", 'r', encoding='utf-8') as item_fp:
    iid = json.loads(item_fp.read())
print(len(iid))

38403
20000


### 多模态特征

图像特征

In [25]:
import json
import os
from safetensors.torch import load_file
from tqdm import tqdm
import torch

with open("ifashion_tiny/remap_col/item2id.json", 'r', encoding='utf-8') as f2:
    item2id: dict[str, int] = json.load(f2)

image_id2embs: dict[int, torch.Tensor] = {}
image_tensors = load_file("ifashion_ds/ifashion_image.safetensors")

# Keep only the embeddings of items present in the tiny dataset, keyed by
# their remapped id; a leading dimension of size 1 is squeezed away.
for item in tqdm(image_tensors, desc='image embs'):
    if item not in item2id:
        continue
    image_id2embs[item2id[item]] = torch.squeeze(image_tensors[item], dim=0)

# Rows are stacked in ascending item-id order, so row i belongs to the i-th
# smallest item id only if every remapped item has an embedding.
sorted_keys = sorted(image_id2embs)
missing = set(item2id.values()).difference(image_id2embs)
assert not missing, f"{len(missing)} items have no image embedding"
sorted_tensors = [image_id2embs[key] for key in sorted_keys]
result_tensor = torch.stack(sorted_tensors, dim=0)  # (all_item_num, dim)

print(result_tensor.shape)
os.makedirs('ifashion_tiny/mmgcn', exist_ok=True)
torch.save(result_tensor, 'ifashion_tiny/mmgcn/v_feat.pt')

image embs: 100%|██████████| 51939/51939 [00:00<00:00, 507248.37it/s]


torch.Size([20000, 512])


文本特征

In [26]:
import json
import os
from safetensors.torch import load_file
from tqdm import tqdm
import torch

with open("ifashion_tiny/remap_col/item2id.json", 'r', encoding='utf-8') as f2:
    item2id: dict[str, int] = json.load(f2)

text_id2embs: dict[int, torch.Tensor] = {}
text_tensors = load_file("ifashion_ds/ifashion_text.safetensors")

# Keep only the embeddings of items present in the tiny dataset, keyed by
# their remapped id; a leading dimension of size 1 is squeezed away.
for item in tqdm(text_tensors, desc='text embs'):
    if item not in item2id:
        continue
    text_id2embs[item2id[item]] = torch.squeeze(text_tensors[item], dim=0)

# Rows are stacked in ascending item-id order, so row i belongs to the i-th
# smallest item id only if every remapped item has an embedding.
sorted_keys = sorted(text_id2embs)
missing = set(item2id.values()).difference(text_id2embs)
assert not missing, f"{len(missing)} items have no text embedding"
sorted_tensors = [text_id2embs[key] for key in sorted_keys]
result_tensor = torch.stack(sorted_tensors, dim=0)  # (all_item_num, dim)

print(result_tensor.shape)
os.makedirs('ifashion_tiny/mmgcn', exist_ok=True)
torch.save(result_tensor, 'ifashion_tiny/mmgcn/t_feat.pt')

text embs: 100%|██████████| 51939/51939 [00:00<00:00, 601323.16it/s]


torch.Size([20000, 1024])


### 交互数据

训练集就是把 `train.txt` 中逐行的交互记录映射成 `[user_id, item_id]` 编号对后存为 `.npy`，交互内容与原文件一致（不保留评分）。

In [27]:
import numpy as np
import json

with open("ifashion_tiny/remap_col/user2id.json", 'r', encoding='utf-8') as f1:
    user2id: dict[str, int] = json.load(f1)

with open("ifashion_tiny/remap_col/item2id.json", 'r', encoding='utf-8') as f2:
    item2id: dict[str, int] = json.load(f2)

user_item_pairs = []

# Each non-empty line holds three whitespace-separated fields:
# user, item, rating. Only the first two are kept, as remapped ids.
with open('ifashion_tiny/train.txt', 'r', encoding='utf-8') as train_file:
    for line in train_file:
        if not line.strip():
            continue  # tolerate blank / trailing empty lines
        user, item, _ratings = line.split()
        user_item_pairs.append([user2id[user], item2id[item]])

# Fixed dtype and an explicit two-column shape keep the saved file identical
# across platforms and well-formed even when no pairs were read.
user_item_array = np.array(user_item_pairs, dtype=np.int64).reshape(-1, 2)
print(user_item_array.shape)

(251914, 2)


In [28]:
# Persist the (user_id, item_id) training pairs built in the previous cell.
np.save('ifashion_tiny/mmgcn/train.npy', user_item_array)

测试/验证集由于作者使用了一种特殊的格式（每个用户一行，形如 `[user_id, item_id_1, item_id_2, ...]`，各行长度不一），所以只能手动处理了。

In [29]:
from collections import defaultdict

def v2list(interactions: dict[int, set]) -> dict[int, list[int]]:
    """Return a new dict with the same keys whose set values are turned into lists."""
    return {key: list(members) for key, members in interactions.items()}

def trans_ds(txt_path: str, save_path: str, user2id: dict[str, int], item2id: dict[str, int]):
    """Convert an interaction text file into a per-user ``.npy`` file.

    Each non-empty line of ``txt_path`` must hold three whitespace-separated
    fields: user, item, rating (the rating is ignored). The saved array is
    always one-dimensional with ``dtype=object``; every element is a list
    ``[user_id, item_id, ...]`` holding the remapped user id followed by that
    user's remapped item ids in ascending order.

    Args:
        txt_path: Path of the interaction text file to read.
        save_path: Destination path passed to ``np.save``.
        user2id: Mapping from raw user key to integer id.
        item2id: Mapping from raw item key to integer id.

    Raises:
        KeyError: If a user or item in the file is missing from the mappings.
        ValueError: If a non-empty line does not have exactly three fields.
    """
    items_by_user: dict[int, set[int]] = defaultdict(set)
    with open(txt_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            user, item, _ratings = line.split()
            items_by_user[user2id[user]].add(item2id[item])

    # Fill a pre-allocated 1-D object array element by element:
    # np.array(list_of_lists, dtype=object) would silently produce a 2-D
    # array whenever all users happen to have the same number of items.
    np_array = np.empty(len(items_by_user), dtype=object)
    for idx, (user, items) in enumerate(items_by_user.items()):
        # Sorted so the saved file does not depend on set iteration order.
        np_array[idx] = [int(user), *sorted(items)]

    np.save(save_path, np_array, allow_pickle=True)

    print(f"{txt_path} convert to {save_path}")

In [30]:
# Convert the validation split, then reload the saved file to check its shape.
val_npy_path = 'ifashion_tiny/mmgcn/val.npy'
trans_ds('ifashion_tiny/val.txt', val_npy_path, user2id, item2id)
val_rows = np.load(val_npy_path, allow_pickle=True)
print(val_rows.shape)

ifashion_tiny/val.txt convert to ifashion_tiny/mmgcn/val.npy
(17165,)


In [31]:
# Convert the test split, then reload the saved file to check its shape.
test_npy_path = 'ifashion_tiny/mmgcn/test.npy'
trans_ds('ifashion_tiny/test.txt', test_npy_path, user2id, item2id)
test_rows = np.load(test_npy_path, allow_pickle=True)
print(test_rows.shape)

ifashion_tiny/test.txt convert to ifashion_tiny/mmgcn/test.npy
(38372,)


In [32]:
%%bash
# Copy the generated files into the MMGCN data directory
# (-a archive mode, -v verbose, -c compare by checksum, -P progress + partial).
rsync -avcP ifashion_tiny/mmgcn/ ../../MMGCN/Data/ifashion/

sending incremental file list


t_feat.pt
     81,921,111 100%  402.55MB/s    0:00:00 (xfr#1, to-chk=5/7)
test.npy
        749,981 100%    2.58MB/s    0:00:00 (xfr#2, to-chk=4/7)
train.npy
      4,030,752 100%   13.35MB/s    0:00:00 (xfr#3, to-chk=3/7)
v_feat.pt
     40,961,111 100%  100.42MB/s    0:00:00 (xfr#4, to-chk=1/7)
val.npy
        252,322 100%  603.94kB/s    0:00:00 (xfr#5, to-chk=0/7)

sent 127,947,006 bytes  received 111 bytes  85,298,078.00 bytes/sec
total size is 131,027,702  speedup is 1.02


### 交互字典

In [33]:
import json
import numpy as np

with open("ifashion_tiny/remap_col/user2id.json", 'r', encoding='utf-8') as f1:
    user2id: dict[str, int] = json.load(f1)

with open("ifashion_tiny/remap_col/item2id.json", 'r', encoding='utf-8') as f2:
    item2id: dict[str, int] = json.load(f2)

with open("ifashion_tiny/ui_dict.json", 'r', encoding='utf-8') as f3:
    ui_dict: dict[str, dict[str, int]] = json.load(f3)

# NOTE(review): this uses every interaction in ui_dict.json. Confirm that is
# the intended content (e.g. training interactions only) for the consumer of
# user_item_dict.npy.
ui_list_dict: dict[int, list[int]] = {
    user2id[user]: [item2id[item] for item in item_ratings]
    for user, item_ratings in ui_dict.items()
}

# Wrapping a dict in np.array(..., dtype=object) yields a 0-d object array.
# A dedicated name is used so the training array `user_item_array` from the
# earlier cell is not overwritten (re-running its save cell would otherwise
# write this dict into train.npy).
user_item_dict_array = np.array(ui_list_dict, dtype=object)
np.save('ifashion_tiny/mmgcn/user_item_dict.npy', user_item_dict_array)

In [34]:
# Reload the saved dictionary file; a dict wrapped by np.array is stored as a
# 0-d object array, so the printed shape is ().
saved_dict_array = np.load('ifashion_tiny/mmgcn/user_item_dict.npy', allow_pickle=True)
print(saved_dict_array.shape)

()


In [35]:
%%bash
# Sync again so the newly written user_item_dict.npy reaches the MMGCN data
# directory (-c compares by checksum, so unchanged files are not re-sent).
rsync -avcP ifashion_tiny/mmgcn/ ../../MMGCN/Data/ifashion/

sending incremental file list


user_item_dict.npy
      1,569,817 100%  488.61MB/s    0:00:00 (xfr#1, to-chk=2/7)

sent 1,570,535 bytes  received 35 bytes  1,047,046.67 bytes/sec
total size is 129,485,094  speedup is 82.44
