In [207]:
from collections import defaultdict, Counter
from tqdm.notebook import tqdm
import networkx as nx
import random
import math
import pickle
import os
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.io as scio
# Show the current working directory; the relative 'data/...' paths used below resolve against it.
os.getcwd()

'/home/tduricic/Development/workspace/original-repos/GraphRec-WWW19'

In [6]:
# Load the preprocessed Last.fm dataset.
# The context manager closes the file handle as soon as the pickle has been read,
# instead of leaving an anonymous open handle to the garbage collector.
# NOTE(security): pickle.load can execute arbitrary code - only load trusted files.
with open("data/lastfm/data/dataset.pickle", 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)

In [16]:
# Inspect the top-level keys of the loaded dataset.
dataset.keys()

dict_keys(['full', 'train', 'test', 'users', 'artist-tracks'])

In [36]:
# Number of entries in the dataset's user mapping.
len(dataset['users'])

3307

In [37]:
# Number of top-level entries (one per artist key) in the artist -> tracks mapping.
len(dataset['artist-tracks'])

28540

In [170]:
def get_discrete_cdf(values):
    """Return the sorted values together with their sorted cumulative shares.

    The share of a value is the running total of the ascending-sorted input up to
    (and including) the first position at which that value occurs, divided by the
    total of all values. Tied values therefore all get the share of the first
    tied position.

    Args:
        values: array-like of numbers (a 1-D playcount array in this notebook).

    Returns:
        Tuple ``(sorted_values, sorted_shares)``; both arrays are in ascending order.
    """
    ordered = np.sort(values)
    total = np.sum(values)
    running_totals = np.cumsum(ordered)

    # searchsorted defaults to side='left': the first index holding each value.
    first_positions = np.searchsorted(ordered, values)
    shares = running_totals[first_positions] / total
    return ordered, np.sort(shares)

def get_playcount_cdfs(values, cdfs):
    """Group CDF values by the playcount found at the same index.

    Args:
        values: sequence of playcounts.
        cdfs: sequence of CDF values, index-aligned with ``values``.

    Returns:
        Dict mapping each distinct playcount to the list of CDF values recorded
        for it, in order of appearance.
    """
    grouped = {}
    for position, playcount in enumerate(values):
        grouped.setdefault(playcount, []).append(cdfs[position])
    return grouped

def convert_playcount_cdfs_to_ratings(playcount_cdfs):
    """Convert per-playcount CDF lists into rounded ratings.

    Each rating is the mean of the playcount's CDF values, multiplied by 4 and
    rounded with the built-in ``round``.

    Args:
        playcount_cdfs: mapping of playcount -> list of CDF values.

    Returns:
        Dict mapping each playcount to its rounded rating.
    """
    return {
        playcount: round(4 * (np.mean(cdf_values)))
        for playcount, cdf_values in playcount_cdfs.items()
    }

# Reverse lookup: value -> key of dataset['users'] (i.e. id -> username).
users = dict(zip(dataset['users'].values(), dataset['users'].keys()))

# Track id -> "artist - track" label; the first label seen for an id is kept.
tracks = {}
for artist, artist_tracks in dataset['artist-tracks'].items():
    for track, track_id in artist_tracks.items():
        if track_id in tracks:
            continue
        tracks[track_id] = artist + ' - ' + track

# Collect, per user, the tracks they interacted with and the matching playcounts.
interaction_vectors = {}
skipped_edges = 0  # edges that do not connect a known user with a known track

for edge in tqdm(dataset['full'].edges):
    source = edge[0]
    target = edge[1]

    # Reset on every edge: without this, an edge whose endpoints do not resolve
    # would silently reuse the ids left over from the previous iteration (or the
    # `track_id` left bound by the track-label loop), corrupting the vectors.
    user_id = None
    track_id = None

    # The graph is undirected, so either endpoint may be the user or the track.
    if source in users:
        user_id = source
    if source in tracks:
        track_id = source
    if target in users:
        user_id = target
    if target in tracks:
        track_id = target

    if user_id is None or track_id is None:
        skipped_edges += 1
        continue

    playcount = dataset['full'].edges[edge]['scrobbles']

    if user_id not in interaction_vectors:
        interaction_vectors[user_id] = {'track_ids' : [], 'playcounts' : []}
    interaction_vectors[user_id]['track_ids'].append(track_id)
    interaction_vectors[user_id]['playcounts'].append(playcount)

if skipped_edges:
    print(f"Skipped {skipped_edges} edges that do not link a user to a track")

# Derive per-user ratings from playcounts: every playcount is mapped to a rating
# through the user's own playcount distribution.
for user_id, user_vector in tqdm(interaction_vectors.items()):
    user_playcounts = np.array(user_vector['playcounts'])

    # sorted values + sorted CDF -> grouped CDFs per playcount -> rounded ratings
    sorted_playcounts, sorted_cdf_values = get_discrete_cdf(user_playcounts)
    playcount_cdfs = get_playcount_cdfs(sorted_playcounts, sorted_cdf_values)
    playcount_ratings = convert_playcount_cdfs_to_ratings(playcount_cdfs)

    user_vector['playcount_ratings'] = playcount_ratings
    # One rating per interaction, in the same order as 'playcounts'.
    user_vector['ratings'] = [
        playcount_ratings[playcount_value] for playcount_value in user_playcounts
    ]

HBox(children=(IntProgress(value=0, max=3018209), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3307), HTML(value='')))




In [172]:
# Re-index users and tracks with dense, zero-based ids (in order of first
# appearance) and flatten the per-user vectors into (user, item, rating) triples.
user_item_ratings = []

user_mappings = {}
item_mappings = {}

user_id_counter = 0
item_id_counter = 0

for user_id, user_vector in tqdm(interaction_vectors.items()):
    if user_id not in user_mappings:
        user_mappings[user_id] = user_id_counter
        user_id_counter += 1
    user_id_mapping = user_mappings[user_id]

    for i, track_id in enumerate(user_vector['track_ids']):
        if track_id not in item_mappings:
            item_mappings[track_id] = item_id_counter
            item_id_counter += 1
        item_id_mapping = item_mappings[track_id]

        # 'ratings' is index-aligned with 'track_ids'.
        rating = user_vector['ratings'][i]
        user_item_ratings.append((user_id_mapping, item_id_mapping, rating))

HBox(children=(IntProgress(value=0, max=3307), HTML(value='')))




In [178]:
# Display the username -> id mapping stored in the dataset (prints every entry).
dataset['users']

{'Enrique-': 0,
 'EduardoMol': 1,
 'DemetriDyslexik': 2,
 'unicef41': 3,
 'losena': 4,
 'jpw130855': 5,
 'felipemusky': 6,
 'felipe_89': 7,
 'camiloei': 8,
 'EriF_JR': 9,
 'Backstage_Rock': 10,
 'nirvaana_': 11,
 'miladi': 12,
 'djchaco': 13,
 'dizzydjc': 14,
 'conversemanman': 15,
 'Yavedu': 16,
 'Param0rexx_': 17,
 'Jeff_Serozini': 18,
 'JCCAKES': 19,
 'EnricoFranchi': 20,
 'Ehsandiary': 21,
 'Creepsnight': 22,
 'zero-inch': 23,
 'violaceousest': 24,
 'the_edster': 25,
 'maikcuritiba': 26,
 'imyyy': 27,
 'chocobooo': 28,
 'c0rts': 29,
 'barkbarkdisco': 30,
 'WichitaQ': 31,
 'TheRootsLife': 32,
 'LeoMetal965': 33,
 'waltercabellon': 34,
 'pellitero': 35,
 'masud_saedi': 36,
 'corky64': 37,
 'alinzainescu': 38,
 'Vintovka': 39,
 'VRec': 40,
 'STxza': 41,
 'NaturalStudio': 42,
 'Lain12': 43,
 'KarenValensi': 44,
 'DJGabster': 45,
 'ASTOKALOSOU': 46,
 'thunder__': 47,
 'loohop15': 48,
 'kyliesaysparty': 49,
 'amakiell': 50,
 'aemea': 51,
 'Tott_Di': 52,
 'Nihilistic23': 53,
 'MarchuSykes

In [194]:
# Original social-network node id -> username, read from the .nodes file
# (first two whitespace-separated tokens per line: node id, username).
original_graph_nodes = {}

# Context managers close the files; iterating the handle streams line by line
# instead of materialising the whole file with readlines().
with open('data/lastfm/lastfm_sn/lastfm.nodes', 'r') as nodes_file:
    for line in nodes_file:
        tokens = line.split()
        if len(tokens) < 2:
            continue  # blank or incomplete line
        original_user_mapping = int(tokens[0])
        username = tokens[1]
        original_graph_nodes[original_user_mapping] = username

# Social edges expressed in the dense user ids from `user_mappings`.
final_social_edges = []
with open('data/lastfm/lastfm_sn/lastfm.edges', 'r') as edges_file:
    for line in edges_file:
        tokens = line.split()
        if len(tokens) < 2:
            continue  # blank or incomplete line
        original_user_mapping_source = int(tokens[0])
        original_user_mapping_target = int(tokens[1])

        username_source = original_graph_nodes[original_user_mapping_source]
        username_target = original_graph_nodes[original_user_mapping_target]

        # Keep only edges whose both endpoints are users of the dataset.
        if username_source in dataset['users'] and username_target in dataset['users']:
            source_user_id = dataset['users'][username_source]
            target_user_id = dataset['users'][username_target]

            # A user without any interaction has no dense id; such an edge cannot
            # be expressed in the re-indexed id space, so it is left out.
            if source_user_id not in user_mappings or target_user_id not in user_mappings:
                continue

            source_user_mapping = user_mappings[source_user_id]
            target_user_mapping = user_mappings[target_user_id]

            # Store both directions of every edge.
            final_social_edges.append((source_user_mapping, target_user_mapping))
            final_social_edges.append((target_user_mapping, source_user_mapping))

In [201]:
# Integer edge array with one (source, target) pair per row.
# reshape(-1, 2) keeps the two-column layout even when the edge list is empty
# (np.array([]) alone would yield a 1-D array of shape (0,)).
trustnetwork = np.array(final_social_edges, dtype=int).reshape(-1, 2)

array([[2835, 1552],
       [1426, 1418],
       [1801,  541],
       ...,
       [2867,  336],
       [ 697, 1767],
       [ 590, 1146]])

In [204]:
# Placeholder value written into the category and helpfulness columns.
category_id = -1
helpfulness = -1

# One row per interaction: [user, item, category, rating, helpfulness].
rating = np.array(
    [
        [user_idx, item_idx, category_id, score, helpfulness]
        for user_idx, item_idx, score in user_item_ratings
    ],
    dtype=int,
)

In [205]:
# Preview the assembled rating matrix.
rating

array([[     0,      0,     -1,      4,     -1],
       [     0,      1,     -1,      4,     -1],
       [     0,      2,     -1,      4,     -1],
       ...,
       [  3305, 242639,     -1,      0,     -1],
       [  3306, 144876,     -1,      4,     -1],
       [  3306, 144908,     -1,      2,     -1]])

In [208]:
# Persist both arrays as MATLAB .mat files, each stored under a variable name
# equal to its file stem.
scio.savemat(file_name='data/lastfm/trustnetwork.mat', mdict={'trustnetwork': trustnetwork})
scio.savemat(file_name='data/lastfm/rating.mat', mdict={'rating': rating})

In [211]:
# Read the rating file back in to check what was written.
lala = scio.loadmat(file_name='data/lastfm/rating.mat')

In [210]:
# Display the dictionary returned by loadmat.
lala

{'__header__': b'MATLAB 5.0 MAT-file Platform: posix, Created on: Wed Oct  5 22:42:12 2022',
 '__version__': '1.0',
 '__globals__': [],
 'rating': array([[     0,      0,     -1,      4,     -1],
        [     0,      1,     -1,      4,     -1],
        [     0,      2,     -1,      4,     -1],
        ...,
        [  3305, 242639,     -1,      0,     -1],
        [  3306, 144876,     -1,      4,     -1],
        [  3306, 144908,     -1,      2,     -1]])}

In [None]:
# Convert the reloaded 'rating' array into nested Python lists.
ff = lala['rating'].tolist()

In [218]:
# Look at the first row of the converted rating list.
ff[0]

{'__header__': b'MATLAB 5.0 MAT-file Platform: posix, Created on: Wed Oct  5 22:42:12 2022',
 '__version__': '1.0',
 '__globals__': [],
 'rating': array([[     0,      0,     -1,      4,     -1],
        [     0,      1,     -1,      4,     -1],
        [     0,      2,     -1,      4,     -1],
        ...,
        [  3305, 242639,     -1,      0,     -1],
        [  3306, 144876,     -1,      4,     -1],
        [  3306, 144908,     -1,      2,     -1]])}

{'__header__': b'MATLAB 5.0 MAT-file Platform: posix, Created on: Wed Oct  5 22:42:12 2022',
 '__version__': '1.0',
 '__globals__': [],
 'rating': array([[     0,      0,     -1,      4,     -1],
        [     0,      1,     -1,      4,     -1],
        [     0,      2,     -1,      4,     -1],
        ...,
        [  3305, 242639,     -1,      0,     -1],
        [  3306, 144876,     -1,      4,     -1],
        [  3306, 144908,     -1,      2,     -1]])}

In [3]:
import pickle

# Path of the pickled, pre-split dataset.
path_data = './data/' + 'lastfm' + '/' + 'lastfm' + '_sixty' + '.pkl'

# The context manager guarantees the handle is closed once the pickle is read;
# previously the file stayed open for the lifetime of the kernel.
# NOTE(security): pickle.load can execute arbitrary code - only load trusted files.
with open(path_data, 'rb') as data_file:
    # The pickle holds a 10-element sequence, unpacked in this fixed order.
    history_u_lists, history_ur_lists, history_v_lists, history_vr_lists, traindata, validdata, testdata, \
    social_adj_lists, item_adj_lists, ratings_list = pickle.load(data_file)

In [5]:
# Combine the train/validation/test splits into a single collection.
# NOTE(review): assumes the splits are Python lists, so `+` concatenates - confirm.
ratings = traindata + validdata + testdata

In [6]:
# Total number of rating entries across all three splits.
len(ratings)

3013693

In [7]:
# Element 0 of every rating entry is the user, element 1 is the item.
users = [entry[0] for entry in ratings]
items = [entry[1] for entry in ratings]

In [16]:
import networkx as nx

# Build an undirected graph from the social adjacency lists and record both
# endpoints of every edge in `users`.
G = nx.Graph()
for source, neighbours in social_adj_lists.items():
    for target in neighbours:
        users.extend((source, target))
        G.add_edge(source, target)

In [11]:
# Check the container type used for one user's social neighbours.
type(social_adj_lists[0])

set

In [14]:
# Number of distinct user ids collected in `users`.
len(set(users))

3302

In [15]:
# Number of distinct item ids collected in `items`.
len(set(items))

252009

In [17]:
# Number of edges in the undirected social graph.
len(G.edges)

142919