In [1]:
import pandas as pd
import numpy as np
from preprocess import k_user_interactions, k_core
from utils import load_raw_df, df_to_sparse, gini

In [2]:
def get_stat_dict(rating_matrix, top_ratio=0.05):
    """Summarise a user-by-item sparse rating matrix as a dict of statistics.

    Parameters
    ----------
    rating_matrix : scipy sparse matrix or sparse array
        2-D container whose rows are users and whose columns are items.
        Only ``.shape``, ``.nnz`` and ``.sum(axis=...)`` are used.
    top_ratio : float, default 0.05
        Fraction of the most popular items whose share of the ratings is
        reported under ``'Concen.'``. At least one item is always used.

    Returns
    -------
    dict
        Keys: ``'# Users'``, ``'# Items'``, ``'# Ratings'``,
        ``'# Ratings per user'``, ``'Sparsity'``, ``'Shape'``,
        ``'Gini User'``, ``'Gini Item'`` and ``'Concen.'``.

    Notes
    -----
    * ``'# Ratings'`` counts stored entries (``nnz``), while the popularity
      vectors are sums of the stored *values*. The two agree only when the
      matrix is binary -- NOTE(review): confirm what ``df_to_sparse`` stores.
    * A matrix with zero users or zero items makes the ratios undefined
      (division by zero); callers should skip such matrices.
    """
    NUM_USERS, NUM_ITEMS = rating_matrix.shape
    NUM_RATINGS = rating_matrix.nnz
    NUM_RATINGS_PER_USER = NUM_RATINGS / NUM_USERS

    DENSITY = NUM_RATINGS / (NUM_USERS * NUM_ITEMS)
    SPARSITY = 1 - DENSITY
    SHAPE = NUM_USERS / NUM_ITEMS

    # np.asarray handles both the np.matrix returned by sparse matrices and
    # the plain ndarray returned by sparse arrays (which has no `.A`).
    user_popularity = np.asarray(rating_matrix.sum(axis=1)).reshape(-1)
    item_popularity = np.asarray(rating_matrix.sum(axis=0)).reshape(-1)

    # `gini` is handed the values in ascending order, as in the original code.
    sorted_user_popularity = np.sort(user_popularity)
    sorted_item_popularity = np.sort(item_popularity)

    GINI_USER = gini(sorted_user_popularity)
    GINI_ITEM = gini(sorted_item_popularity)

    # Share of all ratings held by the top `top_ratio` of items.
    # Clamp to >= 1: a cutoff of 0 would turn the slice into `[-0:]`, i.e.
    # the whole array, and wrongly report a concentration of 1.0 for
    # catalogues with fewer than 1 / top_ratio items.
    num_top_items = max(1, int(NUM_ITEMS * top_ratio))
    CONCENTRATION = sorted_item_popularity[-num_top_items:].sum() / NUM_RATINGS

    ret = {
        '# Users': NUM_USERS,
        '# Items': NUM_ITEMS,
        '# Ratings': NUM_RATINGS,
        '# Ratings per user': NUM_RATINGS_PER_USER,
        'Sparsity': SPARSITY,
        'Shape': SHAPE,
        'Gini User': GINI_USER,
        'Gini Item': GINI_ITEM,
        'Concen.': CONCENTRATION
    }
    return ret


In [3]:
# One row per dataset: (name, path to the raw ratings file, column separator).
# Both lookup tables below are derived from this single listing so a dataset
# cannot end up with a path but no separator (or vice versa).
_dataset_specs = [
    ('ml-1m', 'data/ml-1m/ratings.dat', '::'),
    ('ml-10m', 'data/ml-10m/ratings.dat', '::'),
    ('ml-20m', 'data/ml-20m/ratings.csv', ','),
    ('ml-25m', 'data/ml-25m/ratings.csv', ','),
    ('amusic', 'data/amusic/Digital_Music_5.json', ','),
    ('epinions', 'data/epinions/ratings_data.txt', ' '),
    ('yelp2015', 'data/yelp2015/yelp.rating', '\t'),
    ('yelp2018', 'data/yelp2018/yelp.csv', ','),
    ('gowalla', 'data/gowalla/loc-gowalla_totalCheckins.txt', '\t'),
    ('citeulike', 'data/citeulike/users.dat', ' '),
    ('pinterest', 'data/pinterest/pinterest.csv', ','),
]

# dataset name -> raw file path
data_to_file = {name: path for name, path, _ in _dataset_specs}
# dataset name -> field separator for that file
data_to_sep = {name: sep for name, _, sep in _dataset_specs}


In [4]:
# Dataset names in the order they are declared in data_to_file.
dataset_list = [name for name in data_to_file]

In [5]:
# Cache of loaded raw frames, keyed by dataset name; None means "not loaded yet".
raw_df_dict = dict.fromkeys(data_to_file)

# Original

In [6]:
%%time
# Column order of the summary table; the metric names mirror the keys
# returned by get_stat_dict.
stat_columns = ['dataset', '# Users', '# Items', '# Ratings', '# Ratings per user',
                'Sparsity', 'Shape', 'Gini User', 'Gini Item', 'Concen.']

# Collect one dict per dataset and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
stat_records = []
for dataset in dataset_list:
    print(dataset)
    # Load each raw file only once; later cells reuse the cached frame.
    if raw_df_dict[dataset] is None:
        raw_df_dict[dataset] = load_raw_df(dataset, data_to_file, data_to_sep)
    raw_df = raw_df_dict[dataset]

    # Threshold 1 -- presumably a no-op filter that keeps every user with at
    # least one interaction; verify against preprocess.k_user_interactions.
    raw_df = k_user_interactions(raw_df, 1)

    num_unique_users = len(pd.unique(raw_df.user))
    num_unique_items = len(pd.unique(raw_df.item))

    shape = (num_unique_users, num_unique_items)
    rating_matrix = df_to_sparse(raw_df, shape)

    stat_dict = get_stat_dict(rating_matrix)
    stat_dict['dataset'] = dataset
    stat_records.append(stat_dict)

stat_df = pd.DataFrame(stat_records, columns=stat_columns)

ml-1m
initial user, item: 6040 3706
after filter : 6040 3706
ml-10m
initial user, item: 69878 10677
after filter : 69878 10677
ml-20m
initial user, item: 138493 26744
after filter : 138493 26744
ml-25m
initial user, item: 162541 59047
after filter : 162541 59047
  0%|          | 0/169781 [00:00<?, ?it/s]amusic
100%|██████████| 169781/169781 [00:02<00:00, 83557.14it/s]
initial user, item: 16566 11797
after filter : 16566 11797
epinions
initial user, item: 40163 139738
after filter : 40163 139738
yelp2015
initial user, item: 25677 25815
after filter : 25677 25815
yelp2018
initial user, item: 45919 45538
after filter : 45919 45538
gowalla
initial user, item: 107092 1280969
after filter : 107092 1280969
citeulike
initial user, item: 5551 16980
after filter : 5551 16980
pinterest
initial user, item: 55187 9916
after filter : 55187 9916
CPU times: user 8min 15s, sys: 52.7 s, total: 9min 8s
Wall time: 9min 7s


In [7]:
# Render the summary table assembled by the loop in the preceding cell.
stat_df

Unnamed: 0,dataset,# Users,# Items,# Ratings,# Ratings per user,Sparsity,Shape,Gini User,Gini Item,Concen.
0,ml-1m,6040,3706,1000209,165.597517,0.955316,1.62979,0.528624,0.633562,0.282754
1,ml-10m,69878,10677,10000054,143.10733,0.986597,6.544722,0.5707,0.805249,0.516455
2,ml-20m,138493,26744,20000263,144.41353,0.9946,5.17847,0.580701,0.902942,0.714144
3,ml-25m,162541,59047,25000095,153.807932,0.997395,2.752739,0.589469,0.941897,0.844548
4,amusic,16566,11797,145292,8.770494,0.999257,1.404255,0.3756,0.431506,0.24757
5,epinions,40163,139738,664823,16.553121,0.999882,0.287416,0.676261,0.693562,0.533856
6,yelp2015,25677,25815,696865,27.139658,0.998949,0.994654,0.45089,0.603678,0.351168
7,yelp2018,45919,45538,1183609,25.776019,0.999434,1.008367,0.426837,0.581005,0.345775
8,gowalla,107092,1280969,3981334,37.176764,0.999971,0.083602,0.662713,0.539023,0.362772
9,citeulike,5551,16980,204986,36.927761,0.997825,0.326914,0.470634,0.369647,0.209829


# 10 user interactions

In [8]:
%%time
# Column order of the summary table; the metric names mirror the keys
# returned by get_stat_dict.
stat_columns = ['dataset', '# Users', '# Items', '# Ratings', '# Ratings per user',
                'Sparsity', 'Shape', 'Gini User', 'Gini Item', 'Concen.']

# Collect one dict per dataset and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
stat_records = []
for dataset in dataset_list:
    print(dataset)
    # Load each raw file only once; later cells reuse the cached frame.
    if raw_df_dict[dataset] is None:
        raw_df_dict[dataset] = load_raw_df(dataset, data_to_file, data_to_sep)
    raw_df = raw_df_dict[dataset]

    # Threshold 10 -- presumably keeps only users with at least 10
    # interactions; verify against preprocess.k_user_interactions.
    raw_df = k_user_interactions(raw_df, 10)

    num_unique_users = len(pd.unique(raw_df.user))
    num_unique_items = len(pd.unique(raw_df.item))

    shape = (num_unique_users, num_unique_items)
    rating_matrix = df_to_sparse(raw_df, shape)

    stat_dict = get_stat_dict(rating_matrix)
    stat_dict['dataset'] = dataset
    stat_records.append(stat_dict)

stat_df = pd.DataFrame(stat_records, columns=stat_columns)

ml-1m
initial user, item: 6040 3706
after filter : 6040 3706
ml-10m
initial user, item: 69878 10677
after filter : 69878 10677
ml-20m
initial user, item: 138493 26744
after filter : 138493 26744
ml-25m
initial user, item: 162541 59047
after filter : 162541 59047
amusic
initial user, item: 16566 11797
after filter : 3951 11483
epinions
initial user, item: 40163 139738
after filter : 15786 132964
yelp2015
initial user, item: 25677 25815
after filter : 24930 25799
yelp2018
initial user, item: 45919 45538
after filter : 45842 45538
gowalla
initial user, item: 107092 1280969
after filter : 68709 1247158
citeulike
initial user, item: 5551 16980
after filter : 5551 16980
pinterest
initial user, item: 55187 9916
after filter : 55187 9916
CPU times: user 3min 14s, sys: 25.6 s, total: 3min 39s
Wall time: 3min 39s


In [9]:
# Render the summary table assembled by the loop in the preceding cell.
stat_df

Unnamed: 0,dataset,# Users,# Items,# Ratings,# Ratings per user,Sparsity,Shape,Gini User,Gini Item,Concen.
0,ml-1m,6040,3706,1000209,165.597517,0.955316,1.62979,0.528624,0.633562,0.282754
1,ml-10m,69878,10677,10000054,143.10733,0.986597,6.544722,0.5707,0.805249,0.516455
2,ml-20m,138493,26744,20000263,144.41353,0.9946,5.17847,0.580701,0.902942,0.714144
3,ml-25m,162541,59047,25000095,153.807932,0.997395,2.752739,0.589469,0.941897,0.844548
4,amusic,3951,11483,75044,18.993672,0.998346,0.344074,0.306667,0.450453,0.240486
5,epinions,15786,132964,580752,36.789054,0.999723,0.118724,0.481602,0.676767,0.519681
6,yelp2015,24930,25799,690381,27.69278,0.998927,0.966317,0.448181,0.603781,0.351037
7,yelp2018,45842,45538,1182917,25.804219,0.999433,1.006676,0.426709,0.581081,0.345821
8,gowalla,68709,1247158,3831386,55.762506,0.999955,0.055092,0.54579,0.536477,0.361238
9,citeulike,5551,16980,204986,36.927761,0.997825,0.326914,0.470634,0.369647,0.209829


# 20 user interactions

In [15]:
%%time
# Column order of the summary table; the metric names mirror the keys
# returned by get_stat_dict.
stat_columns = ['dataset', '# Users', '# Items', '# Ratings', '# Ratings per user',
                'Sparsity', 'Shape', 'Gini User', 'Gini Item', 'Concen.']

# Collect one dict per dataset and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
stat_records = []
for dataset in dataset_list:
    print(dataset)
    # Load each raw file only once; later cells reuse the cached frame.
    if raw_df_dict[dataset] is None:
        raw_df_dict[dataset] = load_raw_df(dataset, data_to_file, data_to_sep)
    raw_df = raw_df_dict[dataset]

    # Threshold 20 -- presumably keeps only users with at least 20
    # interactions; verify against preprocess.k_user_interactions.
    raw_df = k_user_interactions(raw_df, 20)

    num_unique_users = len(pd.unique(raw_df.user))
    num_unique_items = len(pd.unique(raw_df.item))

    shape = (num_unique_users, num_unique_items)
    rating_matrix = df_to_sparse(raw_df, shape)

    stat_dict = get_stat_dict(rating_matrix)
    stat_dict['dataset'] = dataset
    stat_records.append(stat_dict)

stat_df = pd.DataFrame(stat_records, columns=stat_columns)

ml-1m
initial user, item: 6040 3706
after filter : 6040 3706
ml-10m
initial user, item: 69878 10677
after filter : 69878 10677
ml-20m
initial user, item: 138493 26744
after filter : 138493 26744
ml-25m
initial user, item: 162541 59047
after filter : 162541 59047
amusic
initial user, item: 16566 11797
after filter : 1074 10116
epinions
initial user, item: 40163 139738
after filter : 8693 123330
yelp2015
initial user, item: 25677 25815
after filter : 9788 25373
yelp2018
initial user, item: 45919 45538
after filter : 17137 45447
gowalla
initial user, item: 107092 1280969
after filter : 47752 1183848
citeulike
initial user, item: 5551 16980
after filter : 3097 16792
pinterest
initial user, item: 55187 9916
after filter : 52190 9909
CPU times: user 3min 10s, sys: 24.4 s, total: 3min 34s
Wall time: 3min 34s


In [16]:
# Render the summary table assembled by the loop in the preceding cell.
stat_df

Unnamed: 0,dataset,# Users,# Items,# Ratings,# Ratings per user,Sparsity,Shape,Gini User,Gini Item,Concen.
0,ml-1m,6040,3706,1000209,165.597517,0.955316,1.62979,0.528624,0.633562,0.282754
1,ml-10m,69878,10677,10000054,143.10733,0.986597,6.544722,0.5707,0.805249,0.516455
2,ml-20m,138493,26744,20000263,144.41353,0.9946,5.17847,0.580701,0.902942,0.714144
3,ml-25m,162541,59047,25000095,153.807932,0.997395,2.752739,0.589469,0.941897,0.844548
4,amusic,1074,10116,37526,34.94041,0.996546,0.106168,0.267339,0.446593,0.229787
5,epinions,8693,123330,482849,55.544576,0.99955,0.070486,0.419244,0.652472,0.498334
6,yelp2015,9788,25373,489820,50.04291,0.998028,0.385764,0.397219,0.595894,0.33459
7,yelp2018,17137,45447,806078,47.037288,0.998965,0.377077,0.376848,0.585314,0.33805
8,gowalla,47752,1183848,3530010,73.923815,0.999938,0.040336,0.49457,0.528868,0.355783
9,citeulike,3097,16792,171391,55.340975,0.996704,0.184433,0.393914,0.38732,0.213891


# 10-core

In [12]:
%%time
# Column order of the summary table; the metric names mirror the keys
# returned by get_stat_dict.
stat_columns = ['dataset', '# Users', '# Items', '# Ratings', '# Ratings per user',
                'Sparsity', 'Shape', 'Gini User', 'Gini Item', 'Concen.']

# Collect one dict per dataset and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
stat_records = []
for dataset in dataset_list:
    print(dataset)
    # Load each raw file only once; later cells reuse the cached frame.
    if raw_df_dict[dataset] is None:
        raw_df_dict[dataset] = load_raw_df(dataset, data_to_file, data_to_sep)
    raw_df = raw_df_dict[dataset]

    # 10-core filtering -- presumably repeats user/item pruning until both
    # sides have at least 10 interactions; verify against preprocess.k_core.
    raw_df = k_core(raw_df, 10)

    num_unique_users = len(pd.unique(raw_df.user))
    num_unique_items = len(pd.unique(raw_df.item))
    # The filter can remove everything; get_stat_dict would then divide by
    # zero, so such datasets are left out of the table.
    if num_unique_users == 0 or num_unique_items == 0:
        print('skip', dataset)
        continue

    shape = (num_unique_users, num_unique_items)
    rating_matrix = df_to_sparse(raw_df, shape)

    stat_dict = get_stat_dict(rating_matrix)
    stat_dict['dataset'] = dataset
    stat_records.append(stat_dict)

stat_df = pd.DataFrame(stat_records, columns=stat_columns)

ml-1m
initial user, item: 6040 3706
cycle 1 U 6040 -> 6040
cycle 1 I 3706 -> 3260
False True
cycle 2 U 6040 -> 6040
cycle 2 I 3260 -> 3260
False False
ml-10m
initial user, item: 69878 10677
cycle 1 U 69878 -> 69878
cycle 1 I 10677 -> 9708
False True
cycle 2 U 69878 -> 69878
cycle 2 I 9708 -> 9708
False False
ml-20m
initial user, item: 138493 26744
cycle 1 U 138493 -> 138493
cycle 1 I 26744 -> 15451
False True
cycle 2 U 138493 -> 138493
cycle 2 I 15451 -> 15451
False False
ml-25m
initial user, item: 162541 59047
cycle 1 U 162541 -> 162541
cycle 1 I 59047 -> 24330
False True
cycle 2 U 162541 -> 162539
cycle 2 I 24330 -> 24330
True False
cycle 3 U 162539 -> 162539
cycle 3 I 24330 -> 24330
False False
amusic
initial user, item: 16566 11797
cycle 1 U 16566 -> 3951
cycle 1 I 11797 -> 1935
True True
cycle 2 U 3951 -> 1190
cycle 2 I 1935 -> 750
True True
cycle 3 U 1190 -> 554
cycle 3 I 750 -> 365
True True
cycle 4 U 554 -> 309
cycle 4 I 365 -> 225
True True
cycle 5 U 309 -> 186
cycle 5 I 225 -

In [13]:
# Render the summary table assembled by the loop in the preceding cell.
stat_df

Unnamed: 0,dataset,# Users,# Items,# Ratings,# Ratings per user,Sparsity,Shape,Gini User,Gini Item,Concen.
0,ml-1m,6040,3260,998539,165.321026,0.949288,1.852761,0.528514,0.586218,0.259408
1,ml-10m,69878,9708,9995471,143.041744,0.985266,7.197981,0.570571,0.786661,0.494494
2,ml-20m,138493,15451,19964833,144.157705,0.99067,8.963368,0.580155,0.836033,0.57837
3,ml-25m,162539,24330,24890566,153.135961,0.993706,6.6806,0.588092,0.871051,0.655785
4,epinions,10706,8945,300303,28.049972,0.996864,1.19687,0.403722,0.494997,0.305511
5,yelp2015,22087,14873,602517,27.279259,0.998166,1.48504,0.441275,0.512086,0.29351
6,yelp2018,39055,25033,988768,25.317322,0.998989,1.560141,0.416771,0.506528,0.288847
7,gowalla,29858,40988,1027464,34.411682,0.99916,0.728457,0.466591,0.434611,0.291539
8,citeulike,3710,6468,120324,32.432345,0.994986,0.573593,0.438803,0.305217,0.181751
9,pinterest,55164,9316,1460487,26.475364,0.997158,5.921426,0.141055,0.418826,0.181937
