In [1]:
import pandas as pd
import numpy as np

In [2]:
# Dataset identifiers used to build file paths under ./data/:
#   NAMES[0] -> data sub-directory and prefix of the interaction / output CSV files
#   NAMES[1] -> file stem of the knowledge-graph files (<stem>.link and <stem>.kg)
NAMES = ['book', 'Amazon-KG-5core-Books']

In [3]:
import pandas as pd

def leave_k_out_split(interaction_df: pd.DataFrame, val_k: int, test_k: int, k_core: int, user_col, item_col, time_col):
    """
    Chronological leave-k-out split preceded by iterative k-core filtering.

    The number of interactions held out for validation may differ from the
    number held out for test.

    Args:
        interaction_df: DataFrame of user-item interactions. It is not modified;
            all work is done on a copy.
        val_k (int): Interactions per user held out for validation (e.g. 1).
        test_k (int): Interactions per user held out for test (e.g. 2).
        k_core (int): K-core threshold. Must be strictly greater than
            ``val_k + test_k`` so every surviving user keeps training data.
        user_col: Name of the user column.
        item_col: Name of the item column.
        time_col: Name of the time column; it is parsed with ``pd.to_datetime``.

    Returns:
        A tuple ``(train_df, val_df, test_df, filtered_df)``. ``filtered_df`` is
        the k-core-filtered frame sorted by (user, time) with a fresh 0..n-1
        index; the three splits are disjoint row subsets of it that keep that
        index.

    Raises:
        ValueError: If ``val_k`` or ``test_k`` is negative, or if
            ``k_core <= val_k + test_k``.
    """
    # groupby.tail(-k) means "all rows except the first k", which would silently
    # produce a wrong split, so negative sizes are rejected up front.
    if val_k < 0 or test_k < 0:
        raise ValueError(f"val_k ({val_k}) và test_k ({test_k}) phải là số không âm!")

    # Every user must have at least one interaction left for training.
    if k_core <= (val_k + test_k):
        raise ValueError(f"k_core ({k_core}) phải lớn hơn tổng val_k + test_k ({val_k + test_k}) để đảm bảo tập Train có dữ liệu!")

    if val_k >= test_k:
        print(f"⚠️ Lưu ý: Bạn đang set val_k ({val_k}) >= test_k ({test_k}). Thường thì tập Val nên nhỏ hơn tập Test.")

    df_filtered = interaction_df.copy()

    #########################################################
    #### 1. Iterative k-core filtering
    #########################################################
    print(f"Bắt đầu lọc {k_core}-core...")
    while True:
        start_len = len(df_filtered)

        # Keep users with at least k_core interactions.
        user_counts = df_filtered[user_col].value_counts()
        valid_users = user_counts[user_counts >= k_core].index
        df_filtered = df_filtered[df_filtered[user_col].isin(valid_users)]

        # Keep items with at least k_core interactions; this can push some
        # users back under the threshold, hence the outer loop.
        item_counts = df_filtered[item_col].value_counts()
        valid_items = item_counts[item_counts >= k_core].index
        df_filtered = df_filtered[df_filtered[item_col].isin(valid_items)]

        # A full pass that removes nothing means the filtering has converged.
        if len(df_filtered) == start_len:
            break

    print(f"Sau khi lọc: Còn lại {df_filtered[user_col].nunique()} Users và {df_filtered[item_col].nunique()} Items.")

    #########################################################
    #### 2. Sort by user, then by time
    #########################################################
    # Take an explicit copy before assigning a column: df_filtered is the result
    # of boolean indexing, and writing into it directly is the chained-assignment
    # pattern pandas warns about (SettingWithCopyWarning).
    df_filtered = df_filtered.copy()
    df_filtered[time_col] = pd.to_datetime(df_filtered[time_col])
    df_filtered = df_filtered.sort_values(by=[user_col, time_col]).reset_index(drop=True)

    #########################################################
    #### 3. Split into train / validation / test
    #########################################################
    # The test_k most recent interactions of each user form the test set.
    test_indices = df_filtered.groupby(user_col).tail(test_k).index
    test_interaction_df = df_filtered.loc[test_indices]

    remaining_after_test = df_filtered.drop(test_indices)

    # The next val_k most recent interactions form the validation set.
    val_indices = remaining_after_test.groupby(user_col).tail(val_k).index
    val_interaction_df = df_filtered.loc[val_indices]

    # Everything older goes to the training set.
    train_indices = remaining_after_test.drop(val_indices).index
    train_interaction_df = df_filtered.loc[train_indices]

    return train_interaction_df, val_interaction_df, test_interaction_df, df_filtered

In [4]:
def build_graph_and_interactions(interactions: pd.DataFrame, link: pd.DataFrame, static_graph: pd.DataFrame):
    """
    Assign integer ids to users, entities and relations, and encode the
    knowledge graph and the interaction table with them.

    Id scheme:
        * users:     1 .. num_users, in order of first appearance in
                     ``interactions['user_id:token']``
        * entities:  num_users + 1 .. , in order of first appearance across the
                     link entities, then KG heads, then KG tails
        * relations: 1 .. , from the category codes of ``relation_id:token``

    Args:
        interactions: Frame with at least ``user_id:token``, ``item_id:token``
            and ``timestamp`` columns.
        link: Item-to-entity table with ``item_id:token`` and ``entity_id:token``.
        static_graph: Triples with ``head_id:token``, ``relation_id:token`` and
            ``tail_id:token``.

    Neither ``interactions`` nor ``static_graph`` is modified; the function
    works on copies.

    Returns:
        A tuple ``(static_graph, interactions, entity2id, user2id, item2entity,
        item2entity_id)``: the encoded graph (sorted by head and tail id), the
        encoded interactions (sorted by user and entity id), and the four
        lookup dictionaries. Rows containing any missing value are dropped from
        both frames, which removes interactions whose item has no linked entity.
    """
    # Copy the inputs so the id columns added below do not leak into the
    # caller's DataFrames.
    interactions = interactions.copy()
    static_graph = static_graph.copy()

    #########################################################
    #### 1. Build static knowledge graph
    #########################################################
    user_tokens = interactions['user_id:token'].unique()
    num_users = len(user_tokens)

    entity_tokens = link['entity_id:token'].unique()

    head_tokens = static_graph['head_id:token'].unique()
    tail_tokens = static_graph['tail_id:token'].unique()

    # Entity ids start right after the user id range so the two never collide.
    all_entity_tokens = pd.unique(np.concatenate([entity_tokens, head_tokens, tail_tokens]))
    entity2id = {entity: (num_users + idx + 1) for idx, entity in enumerate(all_entity_tokens)}

    static_graph['head_id'] = static_graph['head_id:token'].map(entity2id)
    static_graph['tail_id'] = static_graph['tail_id:token'].map(entity2id)

    # Category codes are 0-based; shift so relation ids start at 1.
    static_graph['relation_id'] = static_graph['relation_id:token'].astype('category').cat.codes
    static_graph['relation_id'] = static_graph['relation_id'] + 1

    static_graph = static_graph.dropna()
    # Explicit int64 rather than 'long', which follows the platform C long and
    # can be only 32 bits wide.
    static_graph = static_graph.astype({'head_id': 'int64', 'relation_id': 'int64', 'tail_id': 'int64'})
    static_graph = static_graph[['head_id', 'relation_id', 'tail_id',
             'head_id:token', 'relation_id:token', 'tail_id:token']]

    static_graph = static_graph.sort_values(by=['head_id', 'tail_id'])

    #########################################################
    #### 2. Build interactions
    #########################################################
    user2id = {user: (idx + 1) for idx, user in enumerate(user_tokens)}

    item2entity = dict(zip(link['item_id:token'], link['entity_id:token']))
    item2entity_id = {item: entity2id[entity] for item, entity in item2entity.items()}

    interactions['entity_id:token'] = interactions['item_id:token'].map(item2entity)
    interactions['user_id'] = interactions['user_id:token'].map(user2id)
    interactions['entity_id'] = interactions['item_id:token'].map(item2entity_id)

    # Items missing from the link table map to NaN and are dropped here.
    interactions = interactions.dropna()
    interactions = interactions.astype({'user_id': 'int64', 'entity_id': 'int64'})
    interactions = interactions[['user_id', 'entity_id', 'timestamp','user_id:token',
                                 'entity_id:token', 'item_id:token']]

    interactions = interactions.sort_values(by=['user_id', 'entity_id'])

    return static_graph, interactions, entity2id, user2id, item2entity, item2entity_id

In [5]:
def _encode_split(split_df, user2id, item2entity, item2entity_id):
    """
    Attach integer user/entity ids to one split (train, validation or test).

    Applies the same steps that build_graph_and_interactions applies to the
    full interaction table: map tokens to ids, drop rows with missing values
    (e.g. items that have no linked entity), cast the ids to integers, keep the
    six output columns and sort by user and entity id. The input frame is not
    modified.
    """
    encoded = split_df.copy()
    encoded['entity_id:token'] = encoded['item_id:token'].map(item2entity)
    encoded['user_id'] = encoded['user_id:token'].map(user2id)
    encoded['entity_id'] = encoded['item_id:token'].map(item2entity_id)

    # Without this, unlinked items would be written out with NaN / float ids
    # and the split files would disagree with the processed interactions file.
    encoded = encoded.dropna()
    encoded = encoded.astype({'user_id': 'int64', 'entity_id': 'int64'})

    encoded = encoded[['user_id', 'entity_id', 'timestamp', 'user_id:token',
                       'entity_id:token', 'item_id:token']]
    return encoded.sort_values(by=['user_id', 'entity_id'])


if __name__ == '__main__':
    name0 = NAMES[0]
    name1 = NAMES[1]

    # Load the raw interactions, the item-to-entity links and the KG triples.
    interactions = pd.read_csv(f'./data/{name0}/{name0}_interaction.csv', sep=',')
    link = pd.read_csv(f'./data/{name0}/{name1}.link', sep="\t")
    static_graph = pd.read_csv(f'./data/{name0}/{name1}.kg', sep="\t")

    # K-core filter, then hold out the latest interactions of every user.
    train_df, val_df, test_df, filtered_interactions = leave_k_out_split(
        interactions, val_k=1, test_k=2, k_core=8,
        user_col='user_id:token',
        item_col='item_id:token',
        time_col='timestamp')

    # Build the id mappings from the filtered interactions and the KG.
    static_graph, interactions, entity2id, user2id, item2entity, item2entity_id = build_graph_and_interactions(filtered_interactions, link, static_graph)

    static_graph.to_csv(f'./data/{name0}/{name0}_processed_static_graph.csv', index=False)
    interactions.to_csv(f'./data/{name0}/{name0}_processed_interactions.csv', index=False)

    # Encode each split with the same mappings (one helper instead of three
    # copy-pasted blocks).
    train_df = _encode_split(train_df, user2id, item2entity, item2entity_id)
    val_df = _encode_split(val_df, user2id, item2entity, item2entity_id)
    test_df = _encode_split(test_df, user2id, item2entity, item2entity_id)

    train_df.to_csv(f'./data/{name0}/{name0}_train_interactions.csv', index=False)
    val_df.to_csv(f'./data/{name0}/{name0}_val_interactions.csv', index=False)
    test_df.to_csv(f'./data/{name0}/{name0}_test_interactions.csv', index=False)

Bắt đầu lọc 8-core...
Sau khi lọc: Còn lại 2890 Users và 2068 Items.
