The raw last.fm data files (`user_taggedartists.dat`, `user_artists.dat` and `user_friends.dat`) must be placed in the "raw_data" folder.  
All outputs of this notebook are saved in the "preproc_output" folder.

## Generate arms_feature

In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.decomposition import PCA
import networkx as nx

In [2]:
# Record the directory the notebook is run from; the output folder is built from it below.
import os
current_dir = os.getcwd()
current_dir

'/home/shpaik/21-aaai2022v2/Fig4/run_real_lastfm'

In [3]:
# Folder that receives every file written by this notebook.
real_data_dir = current_dir + "/" + "preproc_output"
# exist_ok=True creates the folder when missing and is a no-op when it is
# already there, without a separate (racy) existence check.
os.makedirs(real_data_dir, exist_ok=True)

In [4]:
# Load the tab-separated tagging events (one row per user/artist/tag assignment).
artist_tags = pd.read_csv("raw_data/user_taggedartists.dat", delimiter="\t")
artist_tags.head(3)

Unnamed: 0,userID,artistID,tagID,day,month,year
0,2,52,13,1,4,2009
1,2,52,15,1,4,2009
2,2,52,18,1,4,2009


In [5]:
# Ingredients of the tf-idf weighting (artists play the role of rows, tags of columns).

# tf: how many times each tag was assigned to each artist.
tf = (
    artist_tags
    .groupby(['artistID', 'tagID'])
    .size()
    .reset_index(name="tf")
)

# Artists that carry at least one tag, and how many of them there are.
artists_with_tags = artist_tags['artistID'].unique()
N = artists_with_tags.shape[0]

# idf: the ratio N / (1 + count); the logarithm is not taken here.
# NOTE(review): `count` is the number of tagging events per tag (rows of
# artist_tags), not the number of distinct artists carrying the tag -- confirm
# this is the intended document frequency.
idf = artist_tags.groupby(['tagID']).size().reset_index(name="count")
idf['idf'] = N / (1 + idf['count'])

# Quick look at both tables, including the row of tag 139 as a spot check.
for summary in (tf.head(), tf.dtypes, N, idf.dtypes, idf.head(), idf.loc[idf.tagID==139]):
    print(summary)

   artistID  tagID  tf
0         1    139   5
1         1    141   3
2         1    179   2
3         1    541   2
4         1    552   1
artistID    int64
tagID       int64
tf          int64
dtype: object
12523
tagID      int64
count      int64
idf      float64
dtype: object
   tagID  count         idf
0      1   1729    7.238728
1      2    212   58.793427
2      3     22  544.478261
3      4    301   41.466887
4      5    582   21.480274
     tagID  count    idf
130    139    174  71.56


In [6]:
# Attach each tag's count and idf ratio to every (artist, tag) tf row.
tfidf = tf.merge(idf, how="outer", on="tagID")
tfidf.head()

Unnamed: 0,artistID,tagID,tf,count,idf
0,1,139,5,174,71.56
1,4,139,6,174,71.56
2,8,139,8,174,71.56
3,372,139,1,174,71.56
4,376,139,5,174,71.56


In [7]:
# tf-idf weight = term frequency * natural log of the idf ratio.
log_idf = np.log(tfidf["idf"])
tfidf['tfidf'] = tfidf["tf"] * log_idf
tfidf.head()

Unnamed: 0,artistID,tagID,tf,count,idf,tfidf
0,1,139,5,174,71.56,21.352681
1,4,139,6,174,71.56,25.623218
2,8,139,8,174,71.56,34.16429
3,372,139,1,174,71.56,4.270536
4,376,139,5,174,71.56,21.352681


In [8]:
# Reshape the long (artist, tag, weight) table into a wide artist-by-tag table.
tfidf_long = tfidf[['artistID', 'tagID', 'tfidf']]
tfidf_fat = tfidf_long.pivot_table(index=["artistID"], columns=["tagID"])

In [9]:
# validation 1: for the first artist, show only the tags that have a weight
first_artist = tfidf_fat[0:1]
mask = first_artist.notna().to_numpy().ravel()
first_artist.loc[:, mask]

Unnamed: 0_level_0,tfidf,tfidf,tfidf,tfidf,tfidf,tfidf,tfidf
tagID,139,141,179,541,552,1219,2850
artistID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,21.352681,14.431048,6.823749,7.260375,7.825884,5.503497,6.257268


In [10]:
# validation 2: raw tagging events of artist 1, to compare against the
# weighted tags displayed in validation 1
is_artist_1 = artist_tags["artistID"] == 1
artist_tags.loc[is_artist_1]

Unnamed: 0,userID,artistID,tagID,day,month,year
3037,37,1,552,1,7,2008
28338,274,1,1219,1,5,2008
64127,681,1,139,1,8,2008
64128,681,1,141,1,8,2008
76492,785,1,2850,1,8,2010
132763,1545,1,139,1,1,2008
132764,1545,1,141,1,1,2008
132765,1545,1,179,1,1,2008
132766,1545,1,541,1,1,2008
154399,1730,1,139,1,10,2008


In [11]:
# Missing (artist, tag) weights are replaced by zero.
tfidf_fat = tfidf_fat.fillna(value=0)

In [12]:
# Dimensions of the wide tf-idf table: (rows = artists, columns = tags).
n_artists, n_tags = tfidf_fat.shape
print((n_artists, n_tags))

(12523, 9749)


In [13]:
# Extract the top 25 principal components of the artist-by-tag tf-idf matrix.
# random_state is pinned because, depending on the input shape, scikit-learn's
# default solver selection may pick a randomized SVD; without a fixed seed the
# fitted components (and every feature file derived from them) could differ
# between runs, while the rest of this notebook is explicitly seeded.
pca = PCA(n_components=25, random_state=0)
pca.fit(tfidf_fat)

PCA(n_components=25)

In [14]:
# Share of variance captured by each retained component, followed by their total.
variance_ratios = pca.explained_variance_ratio_
print(variance_ratios)
print(variance_ratios.sum())

[0.05992843 0.03966534 0.02953625 0.02530514 0.02410847 0.01898587
 0.01767703 0.01670187 0.0150542  0.01330877 0.01279762 0.01174547
 0.01138243 0.01094121 0.01089693 0.0102934  0.00953206 0.00941894
 0.00872982 0.00847102 0.00824964 0.00778215 0.00735721 0.00705921
 0.00677507]
0.4017035273700203


In [15]:
# Project every artist onto the components fitted above.
# transform() reuses the already-fitted model; fit_transform() would refit it,
# which repeats the expensive decomposition and can yield features that no
# longer correspond to the explained-variance figures reported for `pca`.
tfidf_pc25 = pca.transform(tfidf_fat)
tfidf_pc25.shape

(12523, 25)

In [16]:
# feature matrix: persist the 25-dimensional item features
np.save("preproc_output/item_features_full.npy", arr=tfidf_pc25)

In [17]:
# Alias for the PCA feature matrix; the sampling loop below indexes its rows
# through a boolean mask built on ind_items_full.
item_features_full = tfidf_pc25

In [18]:
# Sorted IDs of all artists that carry at least one tag.
ind_items_full = artist_tags['artistID'].unique()
ind_items_full = np.sort(ind_items_full)

# Row i of the feature matrix comes from row i of tfidf_fat, and later cells
# select feature rows with a mask over ind_items_full.  Fail loudly if the two
# orderings ever diverge instead of silently pairing items with wrong features.
assert np.array_equal(ind_items_full, tfidf_fat.index.to_numpy()), \
    "ind_items_full is not aligned with the rows of tfidf_fat"

np.save("preproc_output/item_IDs_full.npy", ind_items_full)
print(ind_items_full.shape[0])

12523


In [19]:
# Load the tab-separated user/artist table and call artists "items" from here on.
user_items = (
    pd.read_csv("raw_data/user_artists.dat", sep="\t")
    .rename(columns={'artistID': 'itemID'})
)

# Number of distinct users and distinct items, printed as shapes.
for id_column in ("userID", "itemID"):
    print(user_items[id_column].unique().shape)

(1892,)
(17632,)


In [20]:
# export userIDs: sorted distinct user IDs
user_IDs = np.unique(user_items["userID"].to_numpy())
print(user_IDs)
np.save("preproc_output/user_IDs_full.npy", user_IDs)

[   2    3    4 ... 2097 2099 2100]


In [21]:
# Keep one row per (user, item) pair, then drop items that carry no tags.
user_items = user_items[['userID', 'itemID']].drop_duplicates()
mask = user_items['itemID'].isin(artists_with_tags).to_numpy()
user_items = user_items[mask]

In [22]:
# Distinct users and items that remain after the filtering step.
for id_column in ("userID", "itemID"):
    print(user_items[id_column].unique().shape)

(1892,)
(12133,)


In [23]:
# Generate five replications (j = 1..5) of TIME_HORIZON rounds each.  Per round t:
#   1. draw one user with probability proportional to their number of rows
#      in user_items;
#   2. draw one of that user's items uniformly at random;
#   3. draw NUM_SAMP-1 further distinct items, weighted by item frequency, from
#      all items except the one chosen in step 2;
#   4. record the sorted candidate item IDs, their feature rows, the candidates
#      that belong to the user's own items ("rwd1" items) and the user ID.
# Every random draw is preceded by an explicit seed derived from (j, t).
NUM_SAMP = 25
num_items = 25  # not used in this cell; kept in case another cell reads it
TIME_HORIZON = 150000

user_freqs = user_items['userID'].value_counts(normalize=True,sort=True)
user_freqs_IDs = np.array(list(user_freqs.index))
item_freqs = user_items['itemID'].value_counts(normalize=True,sort=True)
item_freqs_IDs = np.array(list(item_freqs.index))

# Loop-invariant lookups, computed once instead of in each of the
# 5 * TIME_HORIZON rounds.  They hold exactly the values the per-round
# expressions would produce, so the sampled data is unchanged.
user_probs = user_freqs.to_numpy()
item_probs = item_freqs.to_numpy()
# userID -> that user's item IDs, in order of first appearance in user_items
items_by_user = user_items.groupby('userID')['itemID'].unique().to_dict()

for j in [1, 2, 3, 4, 5]:
    #initialize
    rwd1_items_list = np.empty(TIME_HORIZON, dtype=object)
    items_list = np.empty(TIME_HORIZON, dtype=object)
    item_features_list = np.empty(TIME_HORIZON, dtype=object)
    user_history = np.empty(TIME_HORIZON, dtype=object)
    print("j =", j)
    
    for t in range(TIME_HORIZON):
        # progress marker every 10000 rounds
        if t % 10000 == 0:
            print(t // 10000, end=" ")
        # sample one user
        np.random.seed(t+(j*TIME_HORIZON))
        SELECTED_USER_ind = np.random.choice(user_freqs_IDs.shape[0], 1, replace=False, p=user_probs).item()
        SELECTED_USER = user_freqs_IDs[SELECTED_USER_ind]
        # int(...) only normalises the dictionary key; the stored ID keeps its type
        items_of_SELECTED_USER = items_by_user[int(SELECTED_USER)]
        user_history[t] = SELECTED_USER
        
        # sample one item of the selected user (uniformly)
        np.random.seed(t+10+(j*TIME_HORIZON*2))
        ind = np.random.choice(items_of_SELECTED_USER.shape[0], 1, replace=False).item()
        one_item_from_SELECTED_USER = items_of_SELECTED_USER[ind]

        # exclude that item and renormalise the remaining item frequencies
        mask = (item_freqs_IDs != one_item_from_SELECTED_USER)
        tmp_item_freqs = item_probs[mask] / np.sum(item_probs[mask])
        tmp_item_freqs_IDs = item_freqs_IDs[mask]

        np.random.seed(t+20+(j*TIME_HORIZON*3))
        # sample NUM_SAMP-1 distinct item IDs from the remaining items
        other_chosen_items = np.random.choice(tmp_item_freqs_IDs, NUM_SAMP-1, replace=False, p=tmp_item_freqs)

        chosen_item_IDs = np.sort(np.concatenate( (one_item_from_SELECTED_USER, other_chosen_items) , axis=None))
        items_list[t] = chosen_item_IDs
        # boolean mask over ind_items_full marking the chosen items
        chosen_item_inds = np.isin(ind_items_full, chosen_item_IDs)

        # features
        item_features = item_features_full[chosen_item_inds, :]
        item_features_list[t] = item_features

        # rwd1_items: candidates that are among the selected user's items
        rwd1_items = np.intersect1d(chosen_item_IDs, items_of_SELECTED_USER)
        rwd1_items_list[t] = rwd1_items                         
        # END of t loop
    
    np.save("preproc_output/list_items_rwd1_"+str(j)+".npy", rwd1_items_list)
    np.save("preproc_output/list_items_"+str(j)+".npy", items_list)
    np.save("preproc_output/list_item_features_"+str(j)+".npy", item_features_list)
    np.save("preproc_output/list_user_history_"+str(j)+".npy", user_history)

j = 1
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 j = 2
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 j = 3
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 j = 4
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 j = 5
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 

# Create adjacency matrix

In [24]:
# Load the tab-separated friendship list (columns userID, friendID; one row per pair).
adjacency_thin = pd.read_csv('raw_data/user_friends.dat', sep="\t") 

In [25]:
# Preview the first ten friendship rows.
adjacency_thin.head(10)

Unnamed: 0,userID,friendID
0,2,275
1,2,428
2,2,515
3,2,761
4,2,831
5,2,909
6,2,1209
7,2,1210
8,2,1230
9,2,1327


In [26]:
# Build the undirected friendship graph: one edge per row of adjacency_thin.
friend_edges = list(adjacency_thin.to_records(index=False))
G = nx.Graph()
G.add_edges_from(friend_edges)

In [27]:
# Save the dense adjacency matrix of the friendship graph (not a Laplacian);
# rows and columns follow the order of user_IDs.
np.save("preproc_output/adjmtx_full.npy", nx.to_numpy_array(G, nodelist=user_IDs))