In [1]:
from pymongo import MongoClient
from collections import defaultdict, Counter
from tqdm.notebook import tqdm
import networkx as nx
import random
import math
import pickle

In [2]:
# Connect with pymongo's defaults (no URI/host is given) and select the
# 'music_recommender' database; `music` is the handle passed to load_graph.
client = MongoClient()
music = client['music_recommender']

In [3]:
def get_info_track(music, user):
    """Index a user's track history by ``(artist, track)``.

    Args:
        music: database handle exposing a ``user_track_history`` collection.
        user: username whose history document is looked up.

    Returns:
        dict mapping ``(artist, track)`` to ``(position, timestamp)``, where
        ``position`` is the entry's index in the document's ``tracks`` list.
        When a pair occurs more than once, the last occurrence wins.
        An empty dict is returned when the user has no history document.
    """
    history = music.user_track_history.find_one({'username': user})
    if history is None:
        # find_one yields None when nothing matches; treat that as "no history"
        # instead of failing on None['tracks'].
        return {}
    return {
        (entry['artist'], entry['track']): (position, entry['timestamp'])
        for position, entry in enumerate(history['tracks'])
    }

def _scrobble_count(raw):
    """Parse a scrobble count stored as text with ',' separators (e.g. '1,234')."""
    return int(raw.replace(',', ''))


def load_graph(music, max_songs=None, ignore=False):
    """Build the user-track graph from the database collections.

    Users get node ids ``0 .. n_users - 1``. Tracks are numbered after the
    users, following their position in the ``track_info`` cursor; titles that
    share a ``spotify_id`` share one node id.

    Args:
        music: database handle exposing ``user_scrobbles_history``,
            ``user_track_history``, ``track_info`` and
            ``track_spotify_features`` collections.
        max_songs: maximum number of entries of each user's scrobble list to
            process. ``None`` falls back to the number of indexed tracks.
            NOTE(review): that default looks like a stand-in for "no limit";
            confirm before relying on it.
        ignore: when True, scrobbles of tracks that were not indexed are
            skipped; when False, they are assigned a fresh node id.

    Returns:
        Tuple ``(g, users_ids, artist_ids, spotify)``:
        ``g`` is an ``nx.Graph`` whose user-track edges carry ``scrobbles``,
        ``pos`` and ``date``; ``users_ids`` maps username -> node id;
        ``artist_ids`` maps artist -> {track title -> node id};
        ``spotify`` maps spotify_id -> node id.
    """
    users_count = music.user_scrobbles_history.count_documents({"tracks": {"$not": {"$size": 0}}})
    track_count = music.track_info.count_documents({"spotify_id": {"$exists": True}})

    # Generate ids for the users: only users with scrobbles AND a track
    # history document are kept.
    print('Indexing users...')
    users = music.user_scrobbles_history.find({"tracks": {"$not": {"$size": 0}}}, {"username": True})
    users = [u['username'] for u in tqdm(users, total=users_count)
             if music.user_track_history.count_documents({'username': u['username']}) > 0]
    users_ids = {u: v for v, u in enumerate(users)}

    # Generate ids for the tracks
    valid_spotify = {x['spotify_id'] for x in music.track_spotify_features.find()}
    print('Indexing songs...')
    # artist -> titles that appear in at least one user's track history
    val_artist = defaultdict(set)
    for u in music.user_track_history.find({}):
        for t in u['tracks']:
            val_artist[t['artist']].add(t['track'])
    tracks = music.track_info.find({"spotify_id": {"$exists": True}},
                                   {"track": True, "artist": True, "_id": False, "spotify_id": True})
    artist_ids = defaultdict(dict)
    count = 0
    spotify = {}
    # Highest id handed out so far. Initialised before the loop so an empty
    # track cursor no longer leaves it undefined.
    last_id = len(users_ids) - 1
    for e, t in tqdm(enumerate(tracks, start=len(users_ids)), total=track_count):
        last_id = e
        if t['track'] not in val_artist.get(t['artist'], ()) or t['spotify_id'] not in valid_spotify:
            continue
        # The first title seen for a spotify_id fixes the id; later titles reuse it.
        if t['spotify_id'] not in spotify:
            spotify[t['spotify_id']] = e
        artist_ids[t['artist']][t['track']] = spotify[t['spotify_id']]
        count += 1
    if max_songs is None:
        max_songs = count

    # Generate graph
    print('Generating graph...')
    failed = 0
    g = nx.Graph()
    for u in tqdm(users):
        u = music.user_scrobbles_history.find_one({'username': u})
        info_tracks = get_info_track(music, u['username'])
        tracks = u['tracks']
        id_u = users_ids[u['username']]
        g.add_node(id_u)
        added = set()
        for i in range(min(max_songs, len(tracks))):
            t = tracks[i]
            if t['artist'] not in artist_ids or t['track'] not in artist_ids[t['artist']]:
                failed += 1
                if ignore:
                    continue
                last_id += 1
                artist_ids[t['artist']][t['track']] = last_id
            id_t = artist_ids[t['artist']][t['track']]
            scrobbles = _scrobble_count(t['scrobbles'])
            # (position, timestamp) from the track history, or None if absent
            known = info_tracks.get((t['artist'], t['track']))
            if id_t in added:
                # Edge already exists: accumulate plays and keep the lowest position.
                edge = g[id_u][id_t]
                edge['scrobbles'] += scrobbles
                if known is not None and edge['pos'] > known[0]:
                    edge['pos'] = known[0]
                    edge['date'] = known[1]
                continue
            # Otherwise create the edge.
            if known is None:
                g.add_edge(id_u, id_t, scrobbles=scrobbles, pos=float('inf'), date=None)
            else:
                g.add_edge(id_u, id_t, scrobbles=scrobbles, pos=known[0], date=known[1])
            added.add(id_t)
    print('Failed songs: {}'.format(failed))
    return g, users_ids, artist_ids, spotify

In [4]:
# Build the graph, skipping scrobbles whose track was not indexed (ignore=True).
g, users_ids, artist_ids, spotify_ids = load_graph(music, ignore=True)
# Alternative: keep unindexed tracks, which then receive fresh ids.
#g, users_ids, artist_ids, spotify_ids = load_graph(music)

Indexing users...


  0%|          | 0/3519 [00:00<?, ?it/s]

Indexing songs...


  0%|          | 0/361499 [00:00<?, ?it/s]

Generating graph...


  0%|          | 0/3309 [00:00<?, ?it/s]

Failed songs: 2519909


In [5]:
def get_connected_users(g, users_ids):
    """Group user nodes by the connected component of ``g`` they belong to.

    Args:
        g: graph supporting ``g[node]`` (neighbour iteration) and
            ``g.neighbors(node)``.
        users_ids: mapping whose values are the user node ids.

    Returns:
        List of sets of user node ids, one per component, largest first.
        Users with no neighbour outside the user set are dropped, and
        components left empty by that filter are omitted.
    """
    user_nodes = set(users_ids.values())
    visited = set()
    components = []
    for start in tqdm(user_nodes):
        # Iterative depth-first walk; nodes reached by an earlier walk are
        # skipped, so every component is collected exactly once.
        stack = [start]
        members = set()
        while stack:
            node = stack.pop()
            if node in visited:
                continue
            visited.add(node)
            if node in user_nodes:
                members.add(node)
            stack.extend(g[node])
        # Remove users without tracks (no neighbour that is a non-user node).
        trackless = {
            member for member in members
            if all(nb in user_nodes for nb in g.neighbors(member))
        }
        members = members - trackless
        if members:
            components.append(members)
    components.sort(key=len, reverse=True)
    return components

In [6]:
# Report, per component, a sample of its user ids and its share of all users (%).
components = get_connected_users(g, users_ids)
n_users = len(users_ids)
for c in components:
    preview = list(c)[:10]
    share = (len(c) / n_users) * 100
    print(preview)
    print(share)

  0%|          | 0/3309 [00:00<?, ?it/s]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
99.93955877908733


In [7]:
def sub_graph(g, users, users_ids, artist_ids):
    """Extract the sub-graph induced by ``users`` and renumber its nodes.

    Users are renumbered ``0 .. len(users) - 1`` in iteration order. Their
    tracks are renumbered after them, in ascending order of the old track id.

    Args:
        g: source graph; ``g[user]`` yields the user's tracks and
            ``g.edges[(user, track)]`` holds ``scrobbles``, ``pos``, ``date``.
        users: re-iterable collection of distinct user node ids to keep.
        users_ids: mapping username -> node id in ``g``.
        artist_ids: mapping artist -> {track title -> node id in ``g``}.

    Returns:
        Tuple ``(ng, n_users_ids, n_artist_ids, n_users_ids, new_track_id)``:
        the new graph, username -> new id, artist -> {title -> new id}, the
        user map again, and old track id -> new track id.
        NOTE(review): the user map is returned twice (positions 2 and 4);
        kept as-is so existing callers that unpack five values still work.
    """
    # Map to new user ids
    new_user_id = {u: e for e, u in enumerate(users)}
    n_users_ids = {name: new_user_id[old] for name, old in users_ids.items() if old in new_user_id}

    # Map to new track ids. The offset is the number of renumbered users, not
    # the number of usernames found, so track ids can never collide with a
    # user id even if some user node has no username entry.
    track_offset = len(new_user_id)
    old_tracks = set()
    for u in users:
        old_tracks.update(g[u])
    new_track_id = {t: e + track_offset for e, t in enumerate(sorted(old_tracks))}

    n_artist_ids = defaultdict(dict)
    for artist, titles in artist_ids.items():
        for title, old_id in titles.items():
            if old_id in new_track_id:
                n_artist_ids[artist][title] = new_track_id[old_id]

    # Generate graph
    ng = nx.Graph()
    for u in users:
        nu = new_user_id[u]
        # Register the user even without edges, as load_graph does, so every
        # id in the returned user map exists as a node.
        ng.add_node(nu)
        for t in g[u]:
            attrs = g.edges[(u, t)]
            ng.add_edge(nu, new_track_id[t],
                        scrobbles=attrs['scrobbles'],
                        pos=attrs['pos'],
                        date=attrs['date'])
    return ng, n_users_ids, n_artist_ids, n_users_ids, new_track_id

In [8]:
# Keep only the largest component (components is sorted largest first) and
# renumber its nodes; track_map maps old track ids to the new ones.
sub_g, sub_users_ids, sub_artist_ids, _, track_map = sub_graph(g, components[0], users_ids, artist_ids)

In [9]:
# Number of non-user (track) nodes in the sub-graph.
len(sub_g.nodes()) - len(sub_users_ids)


252014

In [10]:
# Number of users kept in the sub-graph.
len(sub_users_ids)

3307

In [11]:
def transfer_tracks(g_or, g_dest, user, tracks):
    """Copy the edges between ``user`` and each of ``tracks`` from ``g_or`` to ``g_dest``.

    Only the ``scrobbles``, ``pos`` and ``date`` edge attributes are carried
    over. ``g_dest`` is modified in place; nothing is returned.
    """
    copied_keys = ('scrobbles', 'pos', 'date')
    for track in tracks:
        source_attrs = g_or.edges[(user, track)]
        g_dest.add_edge(user, track, **{key: source_attrs[key] for key in copied_keys})

'''
def train_test(g, users_ids, test_split=0.3):
    refs = Counter()
    #Count references to songs
    for u in users_ids.values():
        songs = set(g[u])
        for s in songs:
            refs[s] += 1
    #Generate graph
    g_train = nx.Graph()
    g_test = nx.Graph()
    non_songs_users = 0
    count_train = 0
    count_test = 0
    for u in tqdm(users_ids.values()):
        tracks = list(g[u])
        tracks.sort(key=lambda x: g[u][x]['pos'])
        split = min(math.ceil(len(tracks) * test_split), len(tracks) - 1)
        assert split >= 0
        for i in range(split):
            if refs[tracks[i]] == 1:
                split = i
                break
            refs[tracks[i]] -= 1
        if split == 0:
            print(u)
            non_songs_users += 1
        train = tracks[split:]
        test = tracks[:split]
        transfer_tracks(g, g_train, u, train)
        transfer_tracks(g, g_test, u, test)  
        count_train += len(train)
        count_test += len(test)
    print(f'Users without songs in testing {non_songs_users}')
    print(f'Train data: {count_train} Test data: {count_test}')
    return g_train, g_test  

'''
def train_test(g, users_ids, test_split=0.3):
    """Split every user's track edges into a train graph and a test graph.

    For each user the tracks are ordered by ascending ``pos``. Walking that
    order, tracks are moved to the test side until ``split`` test tracks are
    collected, the tracks run out, or a track with ``pos == inf`` is reached.
    A track whose remaining reference count is 1 is kept in train instead.
    Everything not selected for test goes to train.

    Args:
        g: graph where ``g[user]`` maps each track to its edge attributes.
        users_ids: mapping whose values are the user node ids in ``g``.
        test_split: fraction of a user's tracks targeted for the test side.
            The target is capped at ``len(tracks) - 1``.

    Returns:
        Tuple ``(g_train, g_test)`` of ``nx.Graph`` objects.

    Raises:
        AssertionError: if a user has no tracks at all.
    """
    # Count references to songs: how many users are linked to each track.
    refs = Counter()
    for u in users_ids.values():
        refs.update(set(g[u]))

    # Generate graphs
    g_train = nx.Graph()
    g_test = nx.Graph()
    non_songs_users = 0
    count_train = 0
    count_test = 0
    train_set = set()
    test_set = set()
    unranked = float('inf')  # 'pos' value of edges that have no known position
    for u in tqdm(users_ids.values()):
        edges = g[u]
        tracks = sorted(edges, key=lambda x: edges[x]['pos'])
        split = min(math.ceil(len(tracks) * test_split), len(tracks) - 1)
        assert split >= 0, f'user {u} has no tracks'
        train = []
        test = []
        i = 0
        while i < len(tracks) and len(test) < split and edges[tracks[i]]['pos'] < unranked:
            ct = tracks[i]
            if refs[ct] == 1:
                # Last remaining reference to this track: keep it in train.
                train.append(ct)
            else:
                test.append(ct)
                refs[ct] -= 1
            i += 1
        if len(test) == 0:
            non_songs_users += 1
        # Everything after the stopping point stays in train.
        train.extend(tracks[i:])
        transfer_tracks(g, g_train, u, train)
        transfer_tracks(g, g_test, u, test)
        count_train += len(train)
        count_test += len(test)
        train_set.update(train)
        test_set.update(test)
    print(f'Users without songs in testing {non_songs_users}')
    print(f'Train data: {count_train} Test data: {count_test}')
    print(f'Train tracks data: {len(train_set)} Test tracks data: {len(test_set)}')
    return g_train, g_test

In [12]:
# Split the sub-graph's edges per user with the default test_split of 0.3.
g_train, g_test = train_test(sub_g, sub_users_ids)

  0%|          | 0/3307 [00:00<?, ?it/s]

Users without songs in testing 28
Train data: 2564908 Test data: 453301
Train tracks data: 252014 Test tracks data: 155849


In [13]:
# Collect users that are nodes of the test graph but have no edges there,
# report how many there are, and drop them from the test graph.
unused_users = {
    u for u in sub_users_ids.values()
    if u in g_test and not list(g_test.neighbors(u))
}

print(len(unused_users))
g_test.remove_nodes_from(unused_users)

0


In [14]:
#Check all ids
# Sanity check of the id space: count how often each node id of sub_g is
# referenced by the user map and by the artist -> track map.
# NOTE: despite its name, not_used[i] is the reference count of id i.
not_used = [0] * len(sub_g.nodes())

# Every user id contributes one reference.
for i in tqdm(sub_users_ids.values()):
    not_used[i] += 1

# Every (artist, title) entry contributes one reference to its track id.
for tracks in tqdm(sub_artist_ids.values()):
    for i in tracks.values():
        not_used[i] += 1

# Ids referenced exactly once are tallied; any other count is printed as
# "id: count". Counts above 1 arise when several titles map to one id
# (load_graph gives titles sharing a spotify_id a common id).
total = 0
for e, i in enumerate(not_used):
    if i == 1:
        total += 1
    else:
        print(f'{e}: {i}')

print(total)
print(len(not_used))

  0%|          | 0/3307 [00:00<?, ?it/s]

  0%|          | 0/28540 [00:00<?, ?it/s]

3336: 2
3337: 2
3340: 3
3345: 2
3360: 2
3366: 2
3373: 2
3377: 2
3380: 2
3396: 2
3407: 2
3415: 2
3444: 2
3511: 2
3550: 2
3577: 2
3578: 2
3586: 2
3589: 2
3594: 2
3597: 3
3660: 2
3721: 2
3749: 2
3774: 2
3778: 2
3780: 2
3786: 3
3798: 3
3809: 2
3835: 2
3841: 2
3855: 2
3864: 2
3884: 2
3889: 2
3915: 2
3916: 2
3917: 2
3927: 2
3940: 2
3941: 2
3945: 2
3952: 3
3962: 2
3981: 3
3984: 2
3992: 2
3997: 3
4014: 2
4017: 2
4029: 3
4040: 2
4048: 3
4053: 2
4054: 2
4058: 2
4070: 2
4079: 2
4087: 2
4119: 2
4120: 2
4132: 2
4140: 2
4141: 2
4153: 3
4158: 2
4183: 2
4194: 2
4201: 2
4208: 2
4224: 3
4251: 2
4264: 3
4269: 2
4270: 2
4274: 2
4275: 2
4281: 2
4291: 2
4293: 2
4315: 2
4326: 2
4342: 2
4359: 2
4366: 2
4380: 2
4382: 2
4463: 2
4464: 2
4467: 2
4479: 2
4490: 2
4505: 2
4506: 2
4541: 2
4589: 2
4590: 2
4618: 2
4638: 2
4664: 2
4673: 2
4675: 2
4679: 2
4680: 2
4695: 2
4759: 2
4778: 2
4784: 2
4785: 2
4799: 2
4843: 2
4844: 2
4845: 2
4855: 2
4862: 2
4863: 2
4900: 2
4907: 2
4908: 2
4913: 2
4933: 2
4951: 2
4968: 2
4969: 2


24662: 2
24668: 2
24674: 2
24677: 2
24683: 2
24743: 2
24745: 3
24842: 2
24844: 2
24846: 2
24898: 2
24984: 2
24990: 2
25043: 2
25054: 2
25172: 2
25173: 2
25174: 3
25175: 2
25176: 2
25192: 2
25227: 2
25249: 2
25258: 2
25330: 2
25340: 2
25389: 2
25390: 2
25391: 2
25392: 2
25413: 2
25432: 2
25433: 2
25457: 2
25504: 3
25525: 2
25571: 2
25584: 2
25598: 2
25610: 3
25630: 3
25631: 3
25656: 2
25657: 2
25691: 2
25741: 2
25744: 3
25766: 2
25842: 3
25843: 3
25869: 3
25871: 2
25872: 2
25873: 2
25906: 2
25911: 3
25994: 2
26036: 2
26045: 2
26101: 2
26119: 3
26229: 2
26289: 2
26307: 2
26358: 2
26426: 2
26460: 2
26461: 2
26462: 2
26463: 2
26476: 2
26540: 2
26639: 2
26675: 2
26680: 2
26682: 3
26698: 2
26835: 2
26895: 2
26896: 2
26898: 2
27163: 2
27233: 2
27285: 2
27320: 2
27344: 2
27390: 2
27423: 2
27579: 2
27583: 2
27597: 2
27598: 2
27613: 2
27616: 2
27617: 2
27618: 2
27666: 2
27678: 2
27710: 2
27780: 2
27791: 2
27793: 2
27822: 3
27841: 2
27850: 2
27941: 2
28009: 2
28109: 2
28169: 2
28353: 2
28380: 2
2

56652: 2
56667: 2
56687: 2
56710: 2
56717: 2
56742: 2
56763: 2
56766: 2
56784: 2
56795: 2
56796: 2
56803: 2
56807: 2
56821: 2
56825: 2
56843: 2
56865: 2
56874: 2
56893: 3
56894: 2
56913: 2
56932: 2
56958: 2
56978: 2
56988: 2
57001: 2
57011: 2
57026: 2
57049: 2
57053: 2
57054: 2
57068: 2
57075: 2
57077: 2
57079: 2
57080: 2
57082: 2
57084: 2
57088: 2
57090: 2
57092: 3
57093: 3
57094: 2
57096: 2
57098: 2
57100: 2
57106: 2
57108: 2
57120: 3
57136: 2
57153: 2
57198: 4
57207: 2
57244: 3
57268: 2
57280: 2
57285: 3
57292: 2
57294: 3
57336: 2
57393: 2
57400: 2
57411: 2
57412: 2
57416: 2
57423: 2
57426: 2
57427: 2
57461: 2
57469: 2
57470: 2
57474: 2
57476: 2
57483: 2
57499: 2
57530: 3
57537: 2
57539: 2
57542: 2
57544: 2
57545: 2
57587: 2
57615: 2
57664: 2
57686: 2
57756: 2
57810: 2
57869: 2
57907: 2
57921: 2
57954: 2
57966: 2
58001: 2
58011: 3
58039: 2
58044: 2
58056: 2
58058: 2
58062: 2
58072: 2
58075: 2
58085: 2
58087: 2
58114: 2
58118: 2
58127: 2
58136: 2
58151: 2
58167: 2
58176: 2
58232: 2
5

90710: 2
90724: 2
90727: 2
90739: 2
90773: 2
90787: 2
90818: 2
90819: 2
90820: 2
90821: 3
90822: 2
90824: 2
90827: 2
90831: 2
90854: 2
90872: 2
90873: 3
90875: 2
90889: 2
90893: 2
90929: 2
90933: 2
90947: 2
90958: 2
90982: 2
91046: 2
91047: 2
91099: 2
91102: 2
91104: 2
91176: 2
91187: 2
91230: 2
91234: 2
91273: 2
91275: 2
91276: 2
91277: 2
91285: 2
91287: 2
91298: 2
91347: 2
91348: 2
91385: 2
91388: 3
91425: 2
91493: 2
91494: 2
91527: 2
91560: 2
91561: 2
91576: 2
91599: 2
91600: 2
91647: 2
91665: 2
91670: 2
91708: 4
91751: 2
91762: 2
91763: 2
91764: 2
91765: 2
91767: 2
91780: 2
91796: 2
91955: 3
91963: 2
91977: 2
92046: 3
92097: 3
92174: 2
92183: 2
92196: 2
92203: 2
92214: 2
92216: 2
92250: 2
92304: 2
92306: 2
92310: 2
92314: 2
92315: 3
92317: 2
92348: 2
92349: 2
92354: 2
92433: 2
92435: 2
92457: 2
92491: 2
92503: 2
92506: 2
92510: 3
92566: 2
92614: 2
92641: 3
92780: 2
92917: 2
92936: 2
92951: 2
92970: 2
92971: 2
92988: 2
92991: 2
93013: 2
93051: 2
93064: 2
93082: 3
93088: 2
93091: 2
9

160193: 2
160207: 2
160220: 2
160259: 2
160320: 2
160448: 2
160672: 2
160862: 2
160914: 2
160967: 2
161021: 2
161109: 2
161211: 2
161213: 2
161385: 2
161408: 2
161452: 2
161454: 2
161531: 2
161609: 2
161700: 2
162060: 2
162107: 2
162118: 2
162123: 2
162153: 2
162229: 2
162354: 2
162355: 3
162417: 2
162518: 2
162586: 2
162600: 2
162623: 2
162626: 2
162735: 2
162754: 2
162789: 2
162848: 2
162855: 2
162903: 2
162904: 2
162997: 2
163026: 2
163160: 2
163236: 2
163334: 2
163492: 2
163642: 2
163703: 2
163775: 2
163845: 2
163878: 2
163899: 2
163963: 2
163993: 2
164000: 2
164278: 3
164341: 2
164437: 2
164513: 2
164692: 2
164714: 2
164919: 2
164931: 2
164982: 2
165022: 2
165133: 2
165181: 2
165295: 2
165338: 2
165350: 2
165450: 3
165554: 2
165562: 2
165581: 2
165610: 2
165631: 2
165644: 2
165822: 2
165897: 2
165917: 2
165929: 2
165948: 2
165966: 2
165983: 2
166022: 2
166083: 2
166100: 2
166217: 2
166309: 2
166378: 2
166396: 2
166519: 2
166624: 2
166636: 2
166654: 2
166656: 2
166666: 2
166705: 2


In [15]:
# Spot check: take the first neighbour of user 0 in the sub-graph, map it back
# to its id in the full graph, and from there to its Spotify id.
t = list(sub_g[0].keys())[0]
i_track_map = {v: k for k, v in track_map.items()}  # sub-graph id -> full-graph id
print(i_track_map[t])
ids_spotify = {v: k for k, v in spotify_ids.items()}  # full-graph id -> spotify id
print(ids_spotify[i_track_map[t]])

3309
46BMGjAnLVVR1gZcayw93j


In [16]:
# Round trip: the Spotify id found above should map back to the same graph id.
spotify_ids[ids_spotify[i_track_map[t]]]

3309

In [17]:
# Track ids of one artist in the full graph.
artist_ids['Black Kids']

{"I'm Not Gonna Teach Your Boyfriend How to Dance With You": 3309,
 'Hit The Heartbrakes': 3310,
 "I've Underestimated My Charm (Again)": 3311,
 'Partie Traumatic': 3312,
 "I'm Making Eyes at You": 3313,
 'Listen To Your Body Tonight': 3314,
 'Hurricane Jane': 3315,
 'Look at Me (When I Rock Wichoo)': 3316,
 'Love Me Already': 3317,
 'I Wanna Be Your Limousine': 3318,
 'My Christian Name': 3365,
 'Obligatory Drugs': 4209,
 'V-Card (Not Nuthin’)': 4650,
 'Natural Born Kissers': 4695,
 'In A Song': 4968,
 'If My Heart Is Broken': 5032,
 'All The Emotions': 5185,
 'Way Into Leather': 5186,
 'Hurricane Jane (The Cansecos Remix)': 74906,
 "I'm Making Eyes At You (Joy Electric Remix)": 123982,
 "I'm Not Gonna Teach Your Boyfriend How to Dance with You - The Twelves Remix": 123984,
 "I'm Not Gonna Teach Your Boyfriend How to Dance with You (The Twelves Remix)": 123984}

In [18]:
# The same artist's track ids after renumbering in the sub-graph.
sub_artist_ids['Black Kids']

{"I'm Not Gonna Teach Your Boyfriend How to Dance With You": 3307,
 'Hit The Heartbrakes': 3308,
 "I've Underestimated My Charm (Again)": 3309,
 'Partie Traumatic': 3310,
 "I'm Making Eyes at You": 3311,
 'Listen To Your Body Tonight': 3312,
 'Hurricane Jane': 3313,
 'Look at Me (When I Rock Wichoo)': 3314,
 'Love Me Already': 3315,
 'I Wanna Be Your Limousine': 3316,
 'My Christian Name': 3361,
 'Obligatory Drugs': 4107,
 'V-Card (Not Nuthin’)': 4498,
 'Natural Born Kissers': 4538,
 'In A Song': 4773,
 'If My Heart Is Broken': 4829,
 'All The Emotions': 4959,
 'Way Into Leather': 4960,
 'Hurricane Jane (The Cansecos Remix)': 54755,
 "I'm Making Eyes At You (Joy Electric Remix)": 87774,
 "I'm Not Gonna Teach Your Boyfriend How to Dance with You - The Twelves Remix": 87775,
 "I'm Not Gonna Teach Your Boyfriend How to Dance with You (The Twelves Remix)": 87775}

In [19]:
# Persist the sub-graph, both splits and the two id maps in a single pickle.
# NOTE: open() does not create the 'data/' directory; it must already exist.
with open('data/dataset.pickle', 'wb') as f:
    pickle.dump({'full': sub_g, 'train': g_train, 'test': g_test, 
                 'users': sub_users_ids, 'artist-tracks': sub_artist_ids}, f)

In [None]:
# NOTE(review): this cell was never executed. The sub-graph alone has more
# than 250k nodes, so drawing the full graph is likely impractical.
nx.draw(g)

In [None]:
# NOTE(review): this cell was never executed. The train split holds about
# 2.5M edges, so drawing it is likely impractical.
nx.draw(g_train)

In [None]:
# NOTE(review): this cell was never executed. The test split holds about
# 450k edges, so drawing it is likely impractical.
nx.draw(g_test)