In [1]:
import pandas as pd

In [2]:
import os

for i in range(1, 11):
    name_folder = f'amazon_books_60core/amazon_books_60core_split_{i}'
    os.makedirs(name_folder)

In [6]:
# split data into train and test
from sklearn.model_selection import train_test_split

dataset = pd.read_csv('amazon_books_60core/amazon_books_60core/amazon_books_60core.inter', sep='\t')
amz_train, amz_test = train_test_split(dataset, test_size=0.2, random_state=42)
amz_valid = pd.DataFrame({'user_id:token': [], 'item_id:token': [], 'label:float': []})

display(dataset.head(5))

print(len(amz_train))
print(len(amz_test))

full_users = set(dataset['user_id:token'])
full_items = set(dataset['item_id:token'])

train_users = set(amz_train['user_id:token'])
train_items = set(amz_train['item_id:token'])

print(len(full_users), len(full_items))
print(len(train_users), len(train_items))

amz_train.to_csv('amazon_books_60core/amazon_books_60core/amazon_books_60core.train.inter', sep='\t', index=False)
amz_test.to_csv('amazon_books_60core/amazon_books_60core/amazon_books_60core.test.inter', sep='\t', index=False)
amz_valid.to_csv('amazon_books_60core/amazon_books_60core/amazon_books_60core.valid.inter', sep='\t', index=False)

Unnamed: 0,user_id:token,item_id:token,rating:float
0,A2S166WSCFIFP5,000100039X,5
1,A2XQ5LZHTD4AFT,000100039X,5
2,A1TT4CY55WLHAR,000100039X,5
3,A1NPNGWBVD9AK3,000100039X,5
4,AWLFVCT9128JV,000100039X,5


1172696
293175
22155 54458
22125 53959


In [7]:
def create_subsets_dataframe(dataframe, num_subsets):
    subsets = []
    subset_size = len(dataframe) // num_subsets

    shuffled_dataframe = dataframe.sample(frac=1, random_state=42).reset_index(drop=True)
    
    for i in range(1, num_subsets + 1):
        subset = shuffled_dataframe.head(i * subset_size)
        subset = subset.sort_values(by=['user_id:token','item_id:token'], ascending=[True, True])
        subsets.append(subset)
    
    return subsets


data = pd.read_csv('amazon_books_60core/amazon_books_60core.train.inter', sep='\t')
num_subsets = 10

resulting_subsets = create_subsets_dataframe(data, num_subsets)

for i, subset in enumerate(resulting_subsets):
    print(f'len {i+1}: {len(subset)}')
    subset.to_csv(f'amazon_books_60core_split_{i+1}/amazon_books_60core_split_{i+1}.train.inter', sep='\t', index=False)


len 1: 117269
len 2: 234538
len 3: 351807
len 4: 469076
len 5: 586345
len 6: 703614
len 7: 820883
len 8: 938152
len 9: 1055421
len 10: 1172690


In [13]:
import shutil

for i in range(1, 11):

    kg = pd.read_csv('amazon_books_60core/amazon_books_60core.kg', sep='\t')
    link = pd.read_csv('amazon_books_60core/amazon_books_60core.link', sep='\t')

    inter_k = pd.read_csv(f'amazon_books_60core_split_{i}/amazon_books_60core_split_{i}.train.inter', sep='\t')
    items_k = set(inter_k['item_id:token'])
    entity_k = set(link[link['item_id:token'].isin(items_k)]['entity_id:token'])
    
    kg_k = kg[kg['head_id:token'].isin(entity_k) | kg['tail_id:token'].isin(entity_k)].sort_values(by=['head_id:token'], ascending=[True])
    link_k = link[link['item_id:token'].isin(items_k)].sort_values(by=['item_id:token'], ascending=[True])

    kg_k.to_csv(f'amazon_books_60core_split_{i}/amazon_books_60core_split_{i}.kg', sep='\t', index=False)
    link_k.to_csv(f'amazon_books_60core_split_{i}/amazon_books_60core_split_{i}.link', sep='\t', index=False)

    shutil.copy('amazon_books_60core/amazon_books_60core.test.inter', f'amazon_books_60core_split_{i}/amazon_books_60core_split_{i}.test.inter')
    shutil.copy('amazon_books_60core/amazon_books_60core.valid.inter', f'amazon_books_60core_split_{i}/amazon_books_60core_split_{i}.valid.inter')

    print(i,'\t',len(kg_k))

1 	 76456
2 	 87352
3 	 91648
4 	 93372
5 	 94100
6 	 95102
7 	 95608
8 	 95928
9 	 96261
10 	 96365


In [18]:
def stats_split(dataset):
    interactions_df = pd.read_csv(dataset, sep='\t')

    num_users = interactions_df['user_id:token'].nunique()
    num_items = interactions_df['item_id:token'].nunique()

    num_likes = interactions_df[interactions_df['rating:float'] >= 4].shape[0]
    total_interactions = interactions_df.shape[0]
    percentage_likes = (num_likes / total_interactions) * 100

    sparsity = 1 - (total_interactions / (num_users * num_items))

    avg_rating_per_user = (total_interactions / num_users) * 100
    avg_rating_per_item = (total_interactions / num_items) * 100

    print(f"Number of Users: {num_users}")
    print(f"Number of Items: {num_items}")
    print(f"Percentage of Likes: {percentage_likes:.2f}%")
    print(f"Sparsity: {sparsity:.4f}")
    print(f"Average Rating per User: {avg_rating_per_user:.2f}")
    print(f"Average Rating per Item: {avg_rating_per_item:.2f}")

for i in range(1,11):
    print(f'\n-----\nSplit {i}')
    stats_split(f'amazon_books_60core/amazon_books_60core_split_{i}/amazon_books_60core_split_{i}.train.inter')



-----
Split 1
Number of Users: 20032
Number of Items: 37184
Percentage of Likes: 80.65%
Sparsity: 0.9998
Average Rating per User: 585.41
Average Rating per Item: 315.37

-----
Split 2
Number of Users: 21293
Number of Items: 44906
Percentage of Likes: 80.79%
Sparsity: 0.9998
Average Rating per User: 1101.48
Average Rating per Item: 522.29

-----
Split 3
Number of Users: 21669
Number of Items: 48287
Percentage of Likes: 80.62%
Sparsity: 0.9997
Average Rating per User: 1623.55
Average Rating per Item: 728.57

-----
Split 4
Number of Users: 21839
Number of Items: 50180
Percentage of Likes: 80.61%
Sparsity: 0.9996
Average Rating per User: 2147.88
Average Rating per Item: 934.79

-----
Split 5
Number of Users: 21944
Number of Items: 51372
Percentage of Likes: 80.63%
Sparsity: 0.9995
Average Rating per User: 2672.01
Average Rating per Item: 1141.37

-----
Split 6
Number of Users: 21999
Number of Items: 52159
Percentage of Likes: 80.64%
Sparsity: 0.9994
Average Rating per User: 3198.39
Averag