In [9]:
import os

import pandas as pd

In [10]:
# ======== Step 1. Load raw data ========
# Every input is a tab-separated file with a header row, stored under `dir_path`.
dir_path = './amazon-book-LightKG/'

train_inter, valid_inter, test_inter, kg, link = (
    pd.read_csv(dir_path + filename, sep='\t')
    for filename in (
        'Amazon-book.train.inter',  # training interactions
        'Amazon-book.valid.inter',  # validation interactions
        'Amazon-book.test.inter',   # test interactions
        'Amazon-book.kg',           # (head, relation, tail) triples
        'Amazon-book.link',         # item id -> entity id links
    )
)

In [11]:
# Fold the validation split into the training split, then stack the test split
# on top to obtain the full interaction table used for re-indexing.
# NOTE: `train_inter` is rebound here, so re-running this cell without first
# re-running the load cell appends `valid_inter` a second time.
train_inter = pd.concat(objs=(train_inter, valid_inter), axis=0, ignore_index=True)
inter = pd.concat(objs=(train_inter, test_inter), axis=0, ignore_index=True)

In [12]:
# Directory that receives every generated file. It is kept as a plain string
# with a trailing slash because later cells build paths via `outdir_path + name`.
outdir_path = './amazon-book-reproduce/'
# Create it up front: `to_csv` and `open(..., 'w')` do not create missing
# parent directories, so a fresh checkout would otherwise fail on first write.
os.makedirs(outdir_path, exist_ok=True)

In [13]:
# ======== Step 2. Re-index users, items, entities, relations ========

# --- user_list.txt ---
# Users get consecutive ids (0, 1, 2, ...) in order of first appearance in `inter`.
unique_users = inter['user_id:token'].unique()
user2id = dict(zip(unique_users, range(len(unique_users))))

# Persist the original-id -> remapped-id table as a two-column TSV with header.
user_table = pd.DataFrame(list(user2id.items()), columns=['org_id', 'remap_id'])
user_table.to_csv(outdir_path + 'user_list.txt', sep='\t', index=False, header=True)

In [14]:
# --- relation_list.txt ---
# Relations get consecutive ids in order of first appearance in the KG file.
unique_relations = kg['relation_id:token'].unique()
rel2id = dict(zip(unique_relations, range(len(unique_relations))))

# Persist the original-id -> remapped-id table as a two-column TSV with header.
relation_table = pd.DataFrame(list(rel2id.items()), columns=['org_id', 'remap_id'])
relation_table.to_csv(outdir_path + 'relation_list.txt', sep='\t', index=False, header=True)

In [15]:
# --- item_list.txt / entity_list.txt ---
# Items and entities share one id space: entities of interacted items come
# first (in order of first appearance in `inter`), followed by every other
# entity found in the link file or the knowledge graph.
unique_items = inter['item_id:token'].unique()

# item id -> entity id. If an item occurs on several link rows, the last row wins.
item_entity_map = dict(zip(link['item_id:token'], link['entity_id:token']))

# Fail early with an explicit message instead of a bare KeyError mid-loop.
unlinked_items = [item for item in unique_items if item not in item_entity_map]
if unlinked_items:
    raise KeyError(
        f"{len(unlinked_items)} interacted item(s) have no entry in the link file, "
        f"e.g. {unlinked_items[:5]}"
    )

# All entities in first-appearance order (link file, then KG heads, then KG
# tails). Numbering from a plain set would tie the ids of non-item entities to
# hash ordering, which can differ between kernel sessions for string ids and
# would make entity_list.txt / kg.txt non-reproducible.
ordered_entities = pd.unique(
    pd.concat(
        [link['entity_id:token'], kg['head_id:token'], kg['tail_id:token']],
        ignore_index=True,
    )
)
unique_entities = set(ordered_entities)

# Reindex entities so that items come first.
entity2id = {}
item2id = {}  # keyed by the item's *entity* id (not the raw item id)
cnt = 0
for item in unique_items:
    entity = item_entity_map[item]
    # Two items may link to the same entity; keep the first id so the item id
    # range stays contiguous instead of leaving an unused id behind.
    if entity in entity2id:
        continue
    entity2id[entity] = cnt
    item2id[entity] = cnt
    cnt += 1

# Then append the remaining (non-item) entities.
for entity in ordered_entities:
    if entity not in entity2id:
        entity2id[entity] = cnt
        cnt += 1

pd.DataFrame(list(item2id.items()), columns=['org_id', 'remap_id']) \
  .to_csv(outdir_path + 'item_list.txt', sep='\t', index=False, header=True)

pd.DataFrame(list(entity2id.items()), columns=['org_id', 'remap_id']) \
  .to_csv(outdir_path + 'entity_list.txt', sep='\t', index=False, header=True)

In [16]:
# ======== Step 3. Build train.txt and test.txt ========
def _write_user_items(interactions, path):
    """Write one line per user to `path`: ``<user> <item> <item> ...``.

    Ids are the remapped ones: users via `user2id`, items via
    `item_entity_map` followed by `item2id`. Users appear in order of first
    occurrence in `interactions`, and each user's items keep row order.

    Args:
        interactions: DataFrame with 'user_id:token' and 'item_id:token' columns.
        path: Destination text file; overwritten if it exists.

    Returns:
        dict mapping remapped user id -> list of remapped item ids.

    Raises:
        KeyError: if a user or item is missing from the lookup tables.
    """
    user_items = {}
    # Column-wise zip avoids building a Series per row, as iterrows() does.
    for raw_user, raw_item in zip(interactions['user_id:token'],
                                  interactions['item_id:token']):
        user = user2id[raw_user]
        item = item2id[item_entity_map[raw_item]]
        user_items.setdefault(user, []).append(item)

    with open(path, 'w') as f:
        for user, items in user_items.items():
            f.write(f"{user} " + " ".join(map(str, items)) + "\n")
    return user_items


train_dict = _write_user_items(train_inter, outdir_path + 'train.txt')
test_dict = _write_user_items(test_inter, outdir_path + 'test.txt')

# ======== Step 4. Build kg.txt ========
# One "<head> <relation> <tail>" line per KG row, using remapped ids.
with open(outdir_path + 'kg.txt', 'w') as f:
    for head, relation, tail in zip(kg['head_id:token'],
                                    kg['relation_id:token'],
                                    kg['tail_id:token']):
        h = entity2id[head]
        r = rel2id[relation]
        t = entity2id[tail]
        f.write(f"{h} {r} {t}\n")

# The summary lists every file this notebook writes, including test.txt.
print("✅ Conversion complete! Files generated:\n- train.txt\n- test.txt\n- kg.txt\n- user_list.txt\n- item_list.txt\n- entity_list.txt\n- relation_list.txt")


✅ Conversion complete! Files generated:
- train.txt
- kg.txt
- user_list.txt
- item_list.txt
- entity_list.txt
- relation_list.txt
