In [1]:
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import pickle
import json
from data_process import New_Amazon, Amazon_meta
from collections import defaultdict

把两个独立的Amazon数据读进来

In [2]:
domain_A = "Clothing_Shoes_and_Jewelry"
domain_B = "Sports_and_Outdoors"
domain_C = "AMAZON_FASHION"


In [None]:
data_A = New_Amazon(domain_A, 0)
data_B = New_Amazon(domain_B, 0)
data_C = New_Amazon(domain_C, 0)

In [3]:
data_A = pickle.load(open(f'./processed_data/data_{domain_A}_processed.pkl', 'rb'))
data_B = pickle.load(open(f'./processed_data/data_{domain_B}_processed.pkl', 'rb'))
data_C = pickle.load(open(f'./processed_data/data_{domain_C}_processed.pkl', 'rb'))

In [4]:
# 给每个交互添加domain id
new_data_A, new_data_B, new_data_C = [], [], []
for inter in tqdm(data_A):
    new_inter = list(inter)
    new_data_A.append(new_inter)
for inter in tqdm(data_B):
    new_inter = list(inter)
    new_data_B.append(new_inter)
for inter in tqdm(data_C):
    new_inter = list(inter)
    new_data_C.append(new_inter)

100%|██████████| 32292099/32292099 [00:26<00:00, 1232146.82it/s]
100%|██████████| 12980837/12980837 [00:02<00:00, 4673127.97it/s]
100%|██████████| 883636/883636 [00:09<00:00, 95621.87it/s]  


1. read_data: 把所有数据读出来，然后存到一个list中
2. filter: 单纯过滤掉交互过小的交互，返回的还是list
3. id_map: 制作user和item的dict映射，并拆掉list，变成一个用户的交互序列

In [5]:
def count_inter(data, t_min, t_max):
    
    user_count = {}
    item_count = {}
    for inter in data:
        user_id, item_id, time, _ = inter
        
        if user_id not in user_count.keys():
            user_count[user_id] = 1
        else:
            if time > t_min and time < t_max:
                user_count[user_id] += 1

        if item_id not in item_count.keys():
            item_count[item_id] = 1
        else:
            if time > t_min and time < t_max:
                item_count[item_id] += 1

    
    return user_count, item_count

In [6]:
def filter(data, user_minmum, item_minimum, t_min=1451577600, t_max=1459440000):   # 过滤掉交互少的数据
    
    user_count, item_count = count_inter(data, t_min=t_min, t_max=t_max)
    domain_set = {0: {"user": [], "item": []},
                  1: {"user": [], "item": []},
                  2: {"user": [], "item": []},
                  }
    new_data = []

    for inter in tqdm(data):
        user_id, item_id, time, domain_id = inter
        
        if item_count[item_id] > item_minimum and user_count[user_id] > user_minmum \
           and time > t_min and time < t_max:    # 只取2016-01-01到2016-01-15之间的数据
            
            new_data.append(inter)
            domain_set[domain_id]["user"].append(user_id)
            domain_set[domain_id]["item"].append(item_id)
    
    print("filter done!")

    return new_data, domain_set

In [7]:
def make_sequence(data):

    seq = {}
    domain_seq = {}

    for inter in tqdm(data):
        user_id, item_id, time, domain_id = inter
        if user_id not in seq.keys():
            seq[user_id] = [item_id]
            domain_seq[user_id] = [domain_id]
        else:
            seq[user_id].append(item_id)
            domain_seq[user_id].append(domain_id)

    return seq, domain_seq

In [8]:
def id_map(data, domain_set):
    
    final_data, final_domain = {}, {}
    temp_data = {}
    new_user_id = 1
    temp_item_count = {domain_id: len(set(domain_set[domain_id]["item"])) for domain_id in domain_set.keys()}
    item_count = {0: 1, 1: 1, 2: 1}
    item_dict = {
        0: {"str2id": {}, "id2str": {},},
        1: {"str2id": {}, "id2str": {},},
        2: {"str2id": {}, "id2str": {},},
    }
    user_dict = {"str2id": {}, "id2str": {},}

    for inter in tqdm(data):
        user_id, item_id, time, domain_id = inter
            
        if item_id not in item_dict[domain_id]["str2id"].keys():
            new_item_id = item_count[domain_id]
            item_dict[domain_id]["str2id"][item_id] = new_item_id
            item_dict[domain_id]["id2str"][new_item_id] = item_id
            item_count[domain_id] += 1
        
        if user_id not in user_dict["str2id"].keys():
            user_dict["str2id"][user_id] = new_user_id
            user_dict["id2str"][new_user_id] = user_id
            temp_data[new_user_id] = [(item_dict[domain_id]["str2id"][item_id], domain_id, time)]
            new_user_id += 1
        else:
            temp_data[user_dict["str2id"][user_id]].append((item_dict[domain_id]["str2id"][item_id], domain_id, time))

    print("map done!")

    for user_id, inter in tqdm(temp_data.items()):

        inter.sort(key=lambda x: x[2])
        final_data[user_id] = [temp_tuple[0] for temp_tuple in inter]
        final_domain[user_id] = [temp_tuple[1] for temp_tuple in inter]

    print("sort done!")
    
    return final_data, final_domain, user_dict, item_dict, item_count


In [9]:
# K-core user_core item_core
def check_Kcore(data, user_core, item_core):

    user_count = {}
    item_count = {}
    for inter in data:
        user_id, item_id, time, _ = inter
        
        if user_id not in user_count.keys():
            user_count[user_id] = 1
        else:
            user_count[user_id] += 1

        if item_id not in item_count.keys():
            item_count[item_id] = 1
        else:
            item_count[item_id] += 1

    for _, num in user_count.items():
        if num < user_core:
            return user_count, item_count, False
    for _, num in item_count.items():
        if num < item_core:
            return user_count, item_count, False
        
    return user_count, item_count, True

In [10]:
# 循环过滤 K-core
def filter_Kcore(data, user_core, item_core): # user 接所有items
    
    user_count, item_count, isKcore = check_Kcore(data, user_core, item_core)
    
    new_data = data

    while not isKcore:

        temp_data = []
        domain_set = {
            0: {"user": [], "item": []},
            1: {"user": [], "item": []},
            2: {"user": [], "item": []},
        }

        for inter in tqdm(new_data):
            user_id, item_id, time, domain_id = inter
            
            if item_count[item_id] > item_core and user_count[user_id] > user_core:    # 只取2016-01-01到2016-01-15之间的数据
                
                temp_data.append(inter)
                domain_set[domain_id]["user"].append(user_id)
                domain_set[domain_id]["item"].append(item_id)
        user_count, item_count, isKcore = check_Kcore(temp_data, user_core, item_core)

        new_data = temp_data

    print("K-core filter done!")

    return new_data, domain_set

In [11]:
def filter_time(data, t_min=1451577600, t_max=1459440000):   # 过滤掉交互少的数据
    

    new_data = []

    for inter in tqdm(data):
        _, _, time, _ = inter
        
        if time > t_min and time < t_max:    # 只取2016-01-01到2016-01-15之间的数据
            
            new_data.append(inter)

    print("filter time done!")

    return new_data

In [None]:
data_A[0]

In [12]:
all_data = new_data_A + new_data_B + new_data_C
# new_data, domain_set = filter(all_data, user_minmum=10, item_minimum=10)
all_data = filter_time(all_data, t_min=1514736000, t_max=1577808000)
new_data, domain_set = filter_Kcore(all_data, user_core=5, item_core=3)
final_data, final_domain, user_dict, item_dict, item_count = id_map(new_data, domain_set)
item_count = {domain_id: len(set(domain_set[domain_id]["item"])) for domain_id in domain_set.keys()}
item_dict["item_count"] = item_count

100%|██████████| 46156572/46156572 [00:04<00:00, 9912639.63it/s] 


filter time done!


100%|██████████| 4186440/4186440 [00:01<00:00, 2459320.56it/s]
100%|██████████| 504148/504148 [00:00<00:00, 1812700.84it/s]
100%|██████████| 323739/323739 [00:00<00:00, 1952516.15it/s]
100%|██████████| 229437/229437 [00:00<00:00, 1961110.17it/s]
100%|██████████| 197405/197405 [00:00<00:00, 2066732.85it/s]
100%|██████████| 171180/171180 [00:00<00:00, 2122508.52it/s]
100%|██████████| 159643/159643 [00:00<00:00, 2128314.88it/s]
100%|██████████| 148208/148208 [00:00<00:00, 2190069.78it/s]
100%|██████████| 142486/142486 [00:00<00:00, 2251756.18it/s]
100%|██████████| 136458/136458 [00:00<00:00, 2245164.58it/s]
100%|██████████| 133026/133026 [00:00<00:00, 2303937.61it/s]
100%|██████████| 129272/129272 [00:00<00:00, 2323604.19it/s]
100%|██████████| 127003/127003 [00:00<00:00, 2373647.35it/s]
100%|██████████| 124567/124567 [00:00<00:00, 2362928.59it/s]
100%|██████████| 123054/123054 [00:00<00:00, 2392839.39it/s]
100%|██████████| 121390/121390 [00:00<00:00, 2346470.53it/s]
100%|██████████| 12042

K-core filter done!


100%|██████████| 114653/114653 [00:00<00:00, 837107.05it/s]


map done!


100%|██████████| 10868/10868 [00:00<00:00, 397638.58it/s]

sort done!





In [13]:
final_data 

{1: [1, 1470, 1511, 2043, 2313, 2795],
 2: [1, 1074, 1961, 2795, 3213, 789],
 3: [2480, 2230, 1, 2110, 2795, 1604, 2828],
 4: [2, 9, 57, 142, 143, 150, 222, 223, 224, 263, 264],
 5: [2,
  9,
  57,
  142,
  143,
  150,
  222,
  223,
  224,
  263,
  264,
  607,
  809,
  1142,
  1344,
  1400,
  1415,
  1960,
  1962,
  1963,
  2023,
  2029,
  2031,
  2057,
  2076,
  2124,
  2142,
  2165,
  2585,
  720],
 6: [2,
  9,
  57,
  142,
  143,
  150,
  222,
  223,
  224,
  263,
  264,
  607,
  809,
  1142,
  1344,
  1400,
  1415,
  1960,
  1962,
  1963,
  2023,
  2029,
  2031,
  2057,
  2076,
  2124,
  2142,
  2165,
  2585,
  720],
 7: [2,
  9,
  57,
  142,
  143,
  150,
  222,
  223,
  224,
  263,
  264,
  607,
  809,
  1142,
  1344,
  1400,
  1415,
  1960,
  1962,
  1963,
  2023,
  2029,
  2031,
  2057,
  2076,
  2124,
  2142,
  2165,
  2585,
  720],
 8: [2,
  9,
  57,
  142,
  143,
  150,
  222,
  223,
  224,
  263,
  264,
  575,
  1400,
  1415,
  1960,
  1962,
  1963,
  2023,
  2029,
  2031,
 

In [20]:
# book和movie两个domain交集的用户数量
len(set(domain_set[0]["user"]) & set(domain_set[2]["user"])), len(set(domain_set[0]["user"])), len(set(domain_set[1]["user"])), len(set(domain_set[2]["user"]))

(276, 10473, 4605, 339)

In [None]:
# domain A中物品数量, domian B中物品数量, 用户数量
len(item_dict[0]["str2id"]), len(item_dict[1]["str2id"]), len(user_dict["str2id"])

In [None]:
# 验证map是否能对上
print(item_count)
max(item_dict[0]["str2id"].values()), max(item_dict[1]["str2id"].values())

把所有数据先存下来
可以使用final_domain去进行数据筛选

In [16]:
with open("./handled/id_map.json", "w") as f:
    json.dump({"user_dict": user_dict, "item_dict": item_dict}, f)
with open("./handled/amazon_all.pkl", "wb") as f:
    pickle.dump((final_data, final_domain), f)

In [17]:
with open("./handled/id_map.json", "r") as f:
    map_dict = json.load(f)
user_dict = map_dict["user_dict"]
item_dict = map_dict["item_dict"]

with open("./handled/amazon_all.pkl", "rb") as f:
    final_data, final_domain = pickle.load(f)

筛选book-movie两个domain

这里选的是book和movie两个domain

In [40]:
item_dict['2']

{'str2id': {'B000GHMRLW': 1,
  'B000GHRZN2': 2,
  'B000JOOR7O': 3,
  'B000KPIHQ4': 4,
  'B000P0X15G': 5,
  'B000PHANNM': 6,
  'B000V0IBDM': 7,
  'B000YFSR5G': 8,
  'B000YFSR4W': 9,
  'B0014F8TIU': 10,
  'B0017U1KBK': 11,
  'B001IKJOLW': 12,
  'B00201ER88': 13,
  'B0058YEJ5K': 14,
  'B0014F7B98': 15,
  'B009MA34NY': 16,
  'B00CZ5067A': 17,
  'B00DQYPSJU': 18,
  'B0092UF54A': 19,
  'B005AGO4LU': 20,
  'B00I0VHS10': 21,
  'B00OIWG7IU': 22,
  'B00PSM5ENS': 23,
  'B00T6NJGAW': 24,
  'B00U5QQCI8': 25,
  'B00XT15P8E': 26,
  'B00XTM0ZPG': 27,
  'B00YP2TNZ2': 28,
  'B00ZW3SCF0': 29,
  'B010RGY55M': 30,
  'B010RRWKT4': 31,
  'B014IBJKNO': 32,
  'B01AHD2NQS': 33,
  'B01BVBRD74': 34,
  'B01DW59Z46': 35,
  'B01FO04LGE': 36,
  'B01FQ114LG': 37,
  'B01G5VFO58': 38,
  'B01GEJG5M2': 39},
 'id2str': {'1': 'B000GHMRLW',
  '2': 'B000GHRZN2',
  '3': 'B000JOOR7O',
  '4': 'B000KPIHQ4',
  '5': 'B000P0X15G',
  '6': 'B000PHANNM',
  '7': 'B000V0IBDM',
  '8': 'B000YFSR5G',
  '9': 'B000YFSR4W',
  '10': 'B0014F8TIU

In [38]:
final_domain

{1: [0, 0, 0, 0, 0, 0],
 2: [0, 0, 0, 0, 0, 1],
 3: [0, 0, 0, 0, 0, 0, 0],
 4: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 5: [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1],
 6: [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1],
 7: [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1],
 8: [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  0,
  0],
 9: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 10: [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1],
 11: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 

In [None]:
## 先筛选final_data和final_domain
bm_data, bm_domain = {}, {}
for user_id, inter in tqdm(final_domain.items()):
    inter = np.array(inter)

    inter_data = np.array(final_data[user_id])
    bm_data[user_id] = inter_data[np.where(np.logical_or(inter==0, inter == 1,inter==2))]
    bm_domain[user_id] = inter[np.where(np.logical_or(inter==0, inter == 1,inter==2))]

  0%|          | 0/10868 [00:00<?, ?it/s]

100%|██████████| 10868/10868 [00:00<00:00, 163027.15it/s]


In [46]:
## 先筛选final_data和final_domain
bm_data, bm_domain = {}, {}

# 分析したいドメインIDをリストで定義
TARGET_DOMAIN_IDS = [0, 1, 2] 

for user_id, inter in tqdm(final_domain.items()):
    inter = np.array(inter)
    inter_data = np.array(final_data[user_id])
    
    # 修正点：np.isinを使い、interの各要素がTARGET_DOMAIN_IDSに含まれるかチェック
    mask = np.isin(inter, TARGET_DOMAIN_IDS)
    
    bm_data[user_id] = inter_data[mask]
    bm_domain[user_id] = inter[mask]

100%|██████████| 10868/10868 [00:00<00:00, 52688.78it/s]


In [47]:
len(bm_data)

10868

In [42]:
domain_stats = []
for inter in bm_domain.values():
    domain_stats.append(np.mean(inter))

In [None]:
# 统计两个domain中overlap的用户
domain_stats = np.array(domain_stats)
domain_stats[domain_stats==0].shape[0], domain_stats[domain_stats==1].shape[0], domain_stats.shape[0]

In [None]:
# 统计整体序列的长度
inter_len = []
for inter in bm_data.values():
    inter_len.append(len(inter))
print(np.mean(inter_len))
plt.hist(inter_len, bins=30)

In [None]:
min(inter_len)

In [None]:
# 统计物品的交互次数
item_freq = {
        0: np.zeros(item_count[0]+1),
        1: np.zeros(item_count[1]+1),
    }
for user_id in tqdm(final_data.keys()):
    seq = final_data[user_id]
    domain_seq = final_domain[user_id]
    for i in range(len(seq)):
        item_freq[domain_seq[i]][seq[i]] += 1

In [47]:
# 方便画频率分布直方图
item_freq[0][item_freq[0]>30] = 30
item_freq[1][item_freq[1]>30] = 30

In [None]:
np.mean(item_freq[0]), np.mean(item_freq[1])

In [None]:
plt.hist(item_freq[0], bins=30)

In [None]:
plt.hist(item_freq[1], bins=30)

In [None]:
inter_len = np.array(inter_len)
len(inter_len[inter_len>200]) / len(inter_len)

In [52]:
with open("./handled/cloth_sport.pkl", "wb") as f:
    pickle.dump((bm_data, bm_domain), f)

In [48]:
with open("./handled/cloth_sport_fashion.pkl", "wb") as f:
    pickle.dump((bm_data,bm_domain),f)

In [53]:
# 统计重复交互的问题
# _, i_counts = np.unique(bm_data[0], return_counts=True)
# np.sum(i_counts), len(i_counts)

get attributes

In [61]:
def get_attribute_Amazon(meta_infos, datamaps, attribute_core):

    attributes = defaultdict(int)
    # 做映射
    attribute2id = {}
    id2attribute = {}
    attributeid2num = defaultdict(int)
    attribute_id = 1
    items2attributes = {}
    attribute_lens = []

    for iid, attributes in meta_infos.items():
        item_id = datamaps['item2id'][iid]
        items2attributes[item_id] = []
        for attribute in attributes:
            if attribute not in attribute2id:
                attribute2id[attribute] = attribute_id
                id2attribute[attribute_id] = attribute
                attribute_id += 1
            attributeid2num[attribute2id[attribute]] += 1
            items2attributes[item_id].append(attribute2id[attribute])
        attribute_lens.append(len(items2attributes[item_id]))
    print(f'before delete, attribute num:{len(attribute2id)}')
    print(f'attributes len, Min:{np.min(attribute_lens)}, Max:{np.max(attribute_lens)}, Avg.:{np.mean(attribute_lens):.4f}')
    # 更新datamap
    datamaps['attribute2id'] = attribute2id
    datamaps['id2attribute'] = id2attribute
    datamaps['attributeid2num'] = attributeid2num
    return len(attribute2id), np.mean(attribute_lens), datamaps, items2attributes

In [4]:
from data_process import parse_meta
import json

In [None]:
import gzip
def parse_meta(path): # for Amazon
    g = gzip.open(path, 'rb')
    inter_list = []
    for l in tqdm(g):
        json_str = l.decode()
        inter_list.append(json.loads(l))

    return inter_list


In [7]:
def Amazon_meta(dataset_name, data_maps):
    datas = {}
    meta_flie = './raw/meta_' + str(dataset_name) + '.json.gz'
    item_asins = list(data_maps['str2id'].keys())

    for info in tqdm(parse_meta(meta_flie)):
        if info['asin'] not in item_asins:
            continue
        datas[info['asin']] = info
    return datas

In [None]:
meta_data_A = Amazon_meta(domain_A, item_dict["0"])
# meta_data_B = Amazon_meta(domain_B, item_dict["1"])

In [None]:
len(meta_data_A)

In [72]:
json_str = json.dumps(meta_data_A)
with open("./handled/item2attributes_A.json", 'w') as out:
    out.write(json_str)

In [8]:
meta_data_B = Amazon_meta(domain_B, item_dict["1"])

962300it [01:58, 8105.48it/s] 
100%|██████████| 962300/962300 [00:07<00:00, 127379.36it/s]


In [9]:
json_str = json.dumps(meta_data_B)
with open("./handled/item2attributes_B.json", 'w') as out:
    out.write(json_str)

In [None]:
meta_data_C = Amazon_meta(domain_C, item_dict["2"])


In [None]:
json_str = json.dumps(meta_data_C)
with open("./handled/item2attributes_C.json", 'w') as out:
    out.write(json_str)