In [42]:
import pandas as pd

In [43]:
import os

for i in range(1, 11):
    name_folder = f'ml1m/movielens_split_{i}'
    os.makedirs(name_folder)

In [44]:
def create_subsets_dataframe(dataframe, num_subsets):
    subsets = []
    subset_size = len(dataframe) // num_subsets

    shuffled_dataframe = dataframe.sample(frac=1, random_state=42).reset_index(drop=True)
    
    for i in range(1, num_subsets + 1):
        subset = shuffled_dataframe.head(i * subset_size)
        subset = subset.sort_values(by=['user_id:token','item_id:token'], ascending=[True, True])
        subsets.append(subset)
    
    return subsets


data = pd.read_csv('/movielens/movielens.train.inter', sep='\t')
num_subsets = 10

resulting_subsets = create_subsets_dataframe(data, num_subsets)

# kg = pd.read_csv('')

for i, subset in enumerate(resulting_subsets):
    print(f'len {i+1}: {len(subset)}')
    # subset.to_csv(f'movielens_split_{i+1}')
    subset.to_csv(f'movielens_split_{i+1}/movielens_split_{i+1}.train.inter', sep='\t', index=False)


len 1: 75745
len 2: 151490
len 3: 227235
len 4: 302980
len 5: 378725
len 6: 454470
len 7: 530215
len 8: 605960
len 9: 681705
len 10: 757450


In [46]:
import shutil

for i in range(1, 11):

    kg = pd.read_csv('ml1m/movielens/movielens.kg', sep='\t')
    link = pd.read_csv('ml1m/movielens/movielens.link', sep='\t')

    inter_k = pd.read_csv(f'ml1m/movielens_split_{i}/movielens_split_{i}.train.inter', sep='\t')
    items_k = set(inter_k['item_id:token'])
    
    kg_k = kg[kg['head_id:token'].isin(items_k)].sort_values(by=['head_id:token'], ascending=[True])
    link_k = link[link['item_id:token'].isin(items_k)].sort_values(by=['item_id:token'], ascending=[True])

    kg_k.to_csv(f'ml1m/movielens_split_{i}/movielens_split_{i}.kg', sep='\t', index=False)
    link_k.to_csv(f'ml1m/movielens_split_{i}/movielens_split_{i}.link', sep='\t', index=False)

    shutil.copy('ml1m/movielens/movielens.test.inter', f'ml1m/movielens_split_{i}/movielens_split_{i}.test.inter')
    shutil.copy('ml1m/movielens/movielens.valid.inter', f'ml1m/movielens_split_{i}/movielens_split_{i}.valid.inter')

    print(i,'\t',len(kg_k))

1 	 66820
2 	 68661
3 	 69121
4 	 69439
5 	 69707
6 	 69890
7 	 70068
8 	 70186
9 	 70310
10 	 70341


In [54]:
def stats_split(dataset):
    
    interactions_df = pd.read_csv(dataset, sep='\t')

    num_users = interactions_df['user_id:token'].nunique()
    num_items = interactions_df['item_id:token'].nunique()

    num_likes = interactions_df[interactions_df['label:float'] == 1].shape[0]
    total_interactions = interactions_df.shape[0]
    percentage_likes = (num_likes / total_interactions) * 100

    sparsity = 1 - (total_interactions / (num_users * num_items))

    avg_rating_per_user = (total_interactions / num_users) * 100
    avg_rating_per_item = (total_interactions / num_items) * 100

    print(f"Number of Users: {num_users}")
    print(f"Number of Items: {num_items}")
    print(f"Percentage of Likes: {percentage_likes:.2f}%")
    print(f"Sparsity: {sparsity:.4f}")
    print(f"Average Rating per User: {avg_rating_per_user:.2f}")
    print(f"Average Rating per Item: {avg_rating_per_item:.2f}")

for i in range(1,11):
    print(f'\n-----\nSplit {i}')
    stats_split(f'movielens_split_{i}/movielens_split_{i}.train.inter')



-----
Split 1
Number of Users: 5878
Number of Items: 2925
Percentage of Likes: 57.24%
Sparsity: 0.9956
Average Rating per User: 1288.62
Average Rating per Item: 2589.57

-----
Split 2
Number of Users: 6023
Number of Items: 3050
Percentage of Likes: 57.15%
Sparsity: 0.9918
Average Rating per User: 2515.19
Average Rating per Item: 4966.89

-----
Split 3
Number of Users: 6036
Number of Items: 3093
Percentage of Likes: 57.16%
Sparsity: 0.9878
Average Rating per User: 3764.66
Average Rating per Item: 7346.75

-----
Split 4
Number of Users: 6036
Number of Items: 3118
Percentage of Likes: 57.22%
Sparsity: 0.9839
Average Rating per User: 5019.55
Average Rating per Item: 9717.13

-----
Split 5
Number of Users: 6036
Number of Items: 3140
Percentage of Likes: 57.23%
Sparsity: 0.9800
Average Rating per User: 6274.44
Average Rating per Item: 12061.31

-----
Split 6
Number of Users: 6036
Number of Items: 3156
Percentage of Likes: 57.14%
Sparsity: 0.9761
Average Rating per User: 7529.32
Average Rati