In [1]:
import os, sys
import time
import json
import collections
from scipy import sparse
import numpy as np
import pandas as pd

In [2]:
# Root directory of the project. Defaults to the original author's location; set the
# MPD_PROJECT_PATH environment variable to run the notebook on another machine.
# os.path.join(..., '') guarantees a trailing separator, which the '+' concatenations
# below (and in later cells) rely on.
PROJECT_PATH = os.path.join(
    os.environ.get('MPD_PROJECT_PATH', '/Users/Sp0t/Desktop/6998-Adv-ML-Final-Proj/'), '')
# Directory whose files are parsed as JSON playlist slices.
INPUT_DATA_PATH = PROJECT_PATH + 'mpd.v1/data/'
# Parent directory of the processed train/validation/test CSVs.
PROC_DATA_PATH = PROJECT_PATH + 'mpd_proc/'
# Directory for intermediate artifacts (count JSONs, sparse matrices, raw CSVs).
OUTPUT_DATA_PATH = PROJECT_PATH + 'data_2/'

Load the raw data, count how often each track and each artist occurs, and record descriptive info for every track.

In [None]:
# Occurrence counts over every track entry of every playlist.
track_counts = collections.Counter()
artist_counts = collections.Counter()
# Spotify track id -> descriptive fields (the last occurrence seen wins).
track_info = {}

# sorted(): os.listdir returns entries in arbitrary order; a fixed order keeps the
# insertion order of the counters (and the ids derived from it) reproducible.
for fname in sorted(os.listdir(INPUT_DATA_PATH)):
    # Skip stray non-JSON entries (e.g. .DS_Store) that would make the JSON parser fail.
    if not fname.endswith('.json'):
        continue
    # Context manager closes the handle instead of leaking one per file.
    with open(os.path.join(INPUT_DATA_PATH, fname)) as f:
        data = json.load(f)

    for playlist in data['playlists']:
        for track in playlist['tracks']:
            # The track id is the last ':'-separated component of the URI.
            track_id = track['track_uri'].split(':')[-1]
            track_counts[track_id] += 1

            artist_name = track['artist_name']
            artist_counts[artist_name] += 1

            track_info[track_id] = {
                'track_name': track['track_name'],
                'artist_name': artist_name,
                'album_name': track['album_name'],
                'duration': track['duration_ms']
            }

if not os.path.exists(OUTPUT_DATA_PATH):
    os.makedirs(OUTPUT_DATA_PATH)

# Persist the results so later cells can reload them without re-parsing the raw data.
with open(OUTPUT_DATA_PATH + 'track_counts.json', 'w') as f:
    json.dump(dict(track_counts), f)

with open(OUTPUT_DATA_PATH + 'artist_counts.json', 'w') as f:
    json.dump(dict(artist_counts), f)

with open(OUTPUT_DATA_PATH + 'track_info_dict.json', 'w') as f:
    json.dump(track_info, f)

Load track count info. Filter out infrequent tracks (those occurring `track_threshold` times or fewer).

In [3]:
# Reload the persisted track counts; the context manager closes the file handle
# that the previous one-liner left open.
with open(OUTPUT_DATA_PATH + 'track_counts.json') as f:
    track_counts = collections.Counter(json.load(f))

In [4]:
# A track is kept only if it occurs strictly more than this many times.
track_threshold = 25
common_track_counts = collections.Counter(
    dict((track, count) for track, count in track_counts.items() if count > track_threshold)
)

In [5]:
# Number of distinct tracks that pass the frequency filter.
len(common_track_counts)

190897

Load artist count info. Filter out infrequent artists (those occurring `artist_threshold` times or fewer).

In [6]:
# Reload the persisted artist counts; the context manager closes the file handle
# that the previous one-liner left open.
with open(OUTPUT_DATA_PATH + 'artist_counts.json') as f:
    artist_counts = collections.Counter(json.load(f))

In [7]:
# An artist is kept only if it occurs strictly more than this many times.
artist_threshold = 40
common_artist_counts = collections.Counter(
    dict((artist, count) for artist, count in artist_counts.items() if count > artist_threshold)
)

In [8]:
# Number of distinct artists that pass the frequency filter.
len(common_artist_counts)

40588

Mapping of Spotify track ids to internal integer ids (from 0 to M-1).

In [9]:
valid_track_ids = common_track_counts.keys()
# Forward map: Spotify track id -> consecutive integer, in key iteration order.
track_id_map = {spotify_id: idx for idx, spotify_id in enumerate(valid_track_ids)}
# Reverse map: integer -> Spotify track id.
rev_track_id_map = {idx: spotify_id for spotify_id, idx in track_id_map.items()}

Mapping of Spotify artist names to internal integer ids (from 0 to M-1).

In [10]:
valid_artist_names = common_artist_counts.keys()
# Forward map: artist name -> consecutive integer, in key iteration order.
artist_id_map = {name: idx for idx, name in enumerate(valid_artist_names)}
# Reverse map: integer -> artist name.
rev_artist_id_map = {idx: name for name, idx in artist_id_map.items()}

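Parse the data again, keeping only the frequent tracks/artists and only the playlists that still contain more than `track_playlist_threshold` valid tracks (or more than `artist_playlist_threshold` valid artist entries).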

In [11]:
# A playlist enters the track matrix only if it has strictly more than this many
# valid (frequent) track entries.
track_playlist_threshold = 10
track_playlist_count = 0
# Matrix row index -> original playlist 'pid'.
track_playlist_id_map = {}
# Parallel coordinate lists: one (row, col) pair per playlist/track entry.
track_row_inds = []
track_col_inds = []
valid_track_id_set = set(valid_track_ids)

# Same bookkeeping for the playlist x artist matrix.
artist_playlist_threshold = 4
artist_playlist_count = 0
artist_playlist_id_map = {}
artist_row_inds = []
artist_col_inds = []
valid_artist_name_set = set(valid_artist_names)

start = time.time()

# sorted(): os.listdir order is arbitrary; a fixed order makes row indices reproducible.
for fname in sorted(os.listdir(INPUT_DATA_PATH)):
    # Skip stray non-JSON entries (e.g. .DS_Store) that would make the JSON parser fail.
    if not fname.endswith('.json'):
        continue
    # Context manager closes the handle instead of leaking one per file.
    with open(os.path.join(INPUT_DATA_PATH, fname)) as f:
        data = json.load(f)

    for playlist in data['playlists']:
        # Split each URI once, then keep only tracks that passed the frequency filter.
        valid_tracks = [track_id_map[tid]
                        for tid in (t['track_uri'].split(':')[-1] for t in playlist['tracks'])
                        if tid in valid_track_id_set]
        if len(valid_tracks) > track_playlist_threshold:
            track_playlist_id_map[track_playlist_count] = playlist['pid']
            track_row_inds.extend([track_playlist_count] * len(valid_tracks))
            track_col_inds.extend(valid_tracks)
            track_playlist_count += 1

        # One artist entry per track, so an artist can repeat within a playlist.
        valid_artists = [artist_id_map[track['artist_name']] for track in playlist['tracks']
                         if track['artist_name'] in valid_artist_name_set]
        if len(valid_artists) > artist_playlist_threshold:
            artist_playlist_id_map[artist_playlist_count] = playlist['pid']
            artist_row_inds.extend([artist_playlist_count] * len(valid_artists))
            artist_col_inds.extend(valid_artists)
            artist_playlist_count += 1

end = time.time()
print ("Time elapsed: {}s".format(round(end-start, 1)))

Time elapsed: 532.7s


In [12]:
# Stack the coordinate lists into an (n_entries, 2) array: one row per playlist/track pair.
track_data_thin = np.transpose(np.array([track_row_inds, track_col_inds]))
track_df = pd.DataFrame(track_data_thin, columns=['playlist_id', 'track_id'])

# Same layout for the playlist/artist pairs.
artist_data_thin = np.transpose(np.array([artist_row_inds, artist_col_inds]))
artist_df = pd.DataFrame(artist_data_thin, columns=['playlist_id', 'artist_id'])

In [13]:
# Internal track ids that exist in the id map but no longer occur in track_df
# (i.e. every playlist that contained them was dropped).
tracks_to_delete = set(rev_track_id_map).difference(track_df['track_id'].unique())
# Translate them back to Spotify ids; an empty list means nothing was lost.
bad_spotify_track_ids = [rev_track_id_map[track] for track in tracks_to_delete]
bad_spotify_track_ids

[]

In [14]:
# Internal artist ids that exist in the id map but no longer occur in artist_df.
artists_to_delete = set(rev_artist_id_map).difference(artist_df['artist_id'].unique())
# Translate them back to artist names; an empty list means nothing was lost.
bad_spotify_artist_names = [rev_artist_id_map[artist] for artist in artists_to_delete]
bad_spotify_artist_names

[]

In [15]:
# Re-index the ids that actually remain so they are consecutive (0..K-1) in order of
# first appearance in the frame.
track_remaining_ids = track_df['track_id'].unique()

new_valid_track_ids = [rev_track_id_map[track_id] for track_id in track_remaining_ids]
new_track_id_map = {spotify_id: new_id for new_id, spotify_id in enumerate(new_valid_track_ids)}
new_rev_track_id_map = {new_id: spotify_id for spotify_id, new_id in new_track_id_map.items()}

# old internal id -> new internal id. Derived directly from position rather than by
# zipping against another dict's key order, so it does not depend on dict ordering.
track_old_new_map = {old_id: new_id for new_id, old_id in enumerate(track_remaining_ids)}

artist_remaining_ids = artist_df['artist_id'].unique()

new_valid_artist_names = [rev_artist_id_map[artist_id] for artist_id in artist_remaining_ids]
new_artist_id_map = {name: new_id for new_id, name in enumerate(new_valid_artist_names)}
new_rev_artist_id_map = {new_id: name for name, new_id in new_artist_id_map.items()}

# old internal id -> new internal id, built the same order-independent way.
artist_old_new_map = {old_id: new_id for new_id, old_id in enumerate(artist_remaining_ids)}

In [16]:
# Rewrite the id columns with the compacted ids. This overwrites the columns in place,
# so the cell must not be executed twice on the same frames.
track_df['track_id'] = track_df['track_id'].apply(track_old_new_map.__getitem__)
artist_df['artist_id'] = artist_df['artist_id'].apply(artist_old_new_map.__getitem__)

In [17]:
# Build the playlist x track matrix in COO form with a 1 for every (playlist, track) entry.
# np.ones avoids materialising a Python list with one element per entry; dtype=int is
# the same default integer dtype a list of Python ints would be converted to.
track_row_inds = np.array(track_df['playlist_id'])
track_col_inds = np.array(track_df['track_id'])
track_data = sparse.coo_matrix(
    (np.ones(len(track_row_inds), dtype=int), (track_row_inds, track_col_inds)))

# Same construction for the playlist x artist matrix.
artist_row_inds = np.array(artist_df['playlist_id'])
artist_col_inds = np.array(artist_df['artist_id'])
artist_data = sparse.coo_matrix(
    (np.ones(len(artist_row_inds), dtype=int), (artist_row_inds, artist_col_inds)))

In [18]:
# Persist both sparse matrices so the later cells can start from these files.
sparse.save_npz(OUTPUT_DATA_PATH + 'track_raw_sparse_large.npz', track_data)
sparse.save_npz(OUTPUT_DATA_PATH + 'artist_raw_sparse_large.npz', artist_data)

In [19]:
# Also save the long-format frames as CSV. index=False is not passed, so the
# DataFrame index is written as an additional unnamed first column.
track_df.to_csv(OUTPUT_DATA_PATH + 'track_raw_df_large.csv')
artist_df.to_csv(OUTPUT_DATA_PATH + 'artist_raw_df_large.csv')

### Load from sparse .npz file

In [20]:
# Reload the sparse matrices written by the save_npz cell above.
track_data = sparse.load_npz(OUTPUT_DATA_PATH + 'track_raw_sparse_large.npz')
artist_data = sparse.load_npz(OUTPUT_DATA_PATH + 'artist_raw_sparse_large.npz')

In [21]:
# Rebuild the long-format frames from the matrices' coordinate arrays
# (.row holds the playlist index, .col the track/artist index of each stored entry).
track_df = pd.DataFrame({'playlist_id': track_data.row, 'track_id': track_data.col})
artist_df = pd.DataFrame({'playlist_id': artist_data.row, 'artist_id': artist_data.col})

The train/validation/test preprocessing below follows the VAE-CF reference notebook: https://github.com/dawenl/vae_cf/blob/master/VAE_ML20M_WWW2018.ipynb

In [22]:
def split_train_test_proportion(data, test_prop=0.2, min_items=5, seed=98765):
    """Hold out a random fraction of every playlist's rows.

    Rows are grouped by the 'playlist_id' column. For each playlist with at least
    ``min_items`` rows, ``int(test_prop * n_rows)`` rows are drawn without
    replacement for the held-out part; smaller playlists go entirely to the
    training part.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'playlist_id' column.
    test_prop : float
        Fraction of each eligible playlist to hold out.
    min_items : int
        Minimum number of rows a playlist needs before any are held out.
    seed : int
        Seed of the private random generator. The default reproduces the
        selections made by the earlier global ``np.random.seed(98765)`` version.

    Returns
    -------
    (data_tr, data_te) : tuple of pandas.DataFrame
        Training rows and held-out rows. Either may be an empty frame with the
        same columns as ``data`` (previously an empty held-out list raised a
        ValueError inside ``pd.concat``).
    """
    # Private generator: same stream as the legacy global seed, but the caller's
    # global NumPy random state is left untouched.
    rng = np.random.RandomState(seed)
    tr_list, te_list = [], []

    for i, (_, group) in enumerate(data.groupby('playlist_id')):
        n_items_u = len(group)

        if n_items_u >= min_items:
            # Boolean mask over the group's rows; True marks a held-out row.
            held_out = np.zeros(n_items_u, dtype='bool')
            picked = rng.choice(n_items_u, size=int(test_prop * n_items_u), replace=False)
            held_out[picked.astype('int64')] = True

            tr_list.append(group[np.logical_not(held_out)])
            te_list.append(group[held_out])
        else:
            tr_list.append(group)

        if i % 1000 == 0:
            print('{:d} playlists sampled'.format(i))
            sys.stdout.flush()

    # pd.concat refuses an empty list, so fall back to an empty slice of the input.
    empty = data.iloc[0:0]
    data_tr = pd.concat(tr_list) if tr_list else empty
    data_te = pd.concat(te_list) if te_list else empty

    return data_tr, data_te

In [23]:
# Fraction of matrix cells that hold an entry: number of rows in track_df divided by
# (n_playlists * n_tracks). Despite the variable name this is the fill ratio (density).
track_sparsity = 1. * track_df.shape[0] / (track_data.shape[0] * track_data.shape[1])

# Summary of the filtered playlist x track data.
print("""After filtering, there are {:d} playlist inclusion events from {:d} playlist and {:d} tracks \
(sparsity: {:.3f}%)""".format(
    track_df.shape[0],
    track_data.shape[0],
    track_data.shape[1],
    track_sparsity * 100))

After filtering, there are 58908656 playlist inclusion events from 919695 playlist and 190897 tracks (sparsity: 0.034%)


In [24]:
# Fraction of matrix cells that hold an entry: number of rows in artist_df divided by
# (n_playlists * n_artists). Despite the variable name this is the fill ratio (density).
artist_sparsity = 1. * artist_df.shape[0] / (artist_data.shape[0] * artist_data.shape[1])

# Summary of the filtered playlist x artist data.
print("""After filtering, there are {:d} playlist inclusion events from {:d} playlist and {:d} artists \
(sparsity: {:.3f}%)""".format(
    artist_df.shape[0],
    artist_data.shape[0],
    artist_data.shape[1],
    artist_sparsity * 100))

After filtering, there are 64944204 playlist inclusion events from 997671 playlist and 40588 artists (sparsity: 0.160%)


In [25]:
# Shuffle the playlist ids before cutting them into train/validation/test.
# A seeded private generator makes the split reproducible across kernel restarts;
# to resample the split, change the seed and re-run this cell.
split_rng = np.random.RandomState(98765)

track_unique_pid = pd.unique(track_df['playlist_id'])
track_inds_perm = split_rng.permutation(track_unique_pid.size)
track_unique_pid = track_unique_pid[track_inds_perm]

artist_unique_pid = pd.unique(artist_df['playlist_id'])
artist_inds_perm = split_rng.permutation(artist_unique_pid.size)
artist_unique_pid = artist_unique_pid[artist_inds_perm]

In [37]:
track_n_playlists = track_data.shape[0]
# Size of the validation set and of the test set (each).
track_n_heldout_playlists = 50000

# Cut the shuffled ids at two points: everything before the first cut is training,
# the middle chunk is validation and the final chunk is test.
track_tr_playlists, track_vd_playlists, track_te_playlists = np.split(
    track_unique_pid,
    [track_n_playlists - track_n_heldout_playlists * 2,
     track_n_playlists - track_n_heldout_playlists])

In [38]:
# Rows of track_df that belong to a training playlist.
track_train_playlists = track_df[track_df['playlist_id'].isin(track_tr_playlists)]

In [39]:
# Distinct track ids seen in the training playlists, in order of first appearance.
track_unique_tid = track_train_playlists['track_id'].unique()
len(track_unique_tid)
# This count has to match the total number of unique tracks (next cell);
# otherwise resample the training playlist ids.

190897

In [40]:
# Total number of distinct track ids in the full frame.
track_df['track_id'].nunique()

190897

In [41]:
artist_n_playlists = artist_data.shape[0]
# Size of the validation set and of the test set (each).
artist_n_heldout_playlists = 15000

# Cut the shuffled ids at two points: everything before the first cut is training,
# the middle chunk is validation and the final chunk is test.
artist_tr_playlists, artist_vd_playlists, artist_te_playlists = np.split(
    artist_unique_pid,
    [artist_n_playlists - artist_n_heldout_playlists * 2,
     artist_n_playlists - artist_n_heldout_playlists])

In [42]:
# Rows of artist_df that belong to a training playlist.
artist_train_playlists = artist_df[artist_df['playlist_id'].isin(artist_tr_playlists)]

In [43]:
# Distinct artist ids seen in the training playlists, in order of first appearance.
artist_unique_aid = artist_train_playlists['artist_id'].unique()
len(artist_unique_aid)
# This count has to match the total number of unique artists (next cell);
# otherwise resample the training playlist ids.

40588

In [44]:
# Total number of distinct artist ids in the full frame.
artist_df['artist_id'].nunique()

40588

In [45]:
# Position-based lookup tables: id -> index in the corresponding unique-id array.
track_track2id = {tid: i for i, tid in enumerate(track_unique_tid)}
track_playlist2id = {pid: i for i, pid in enumerate(track_unique_pid)}

In [46]:
# Position-based lookup tables: id -> index in the corresponding unique-id array.
artist_artist2id = {aid: i for i, aid in enumerate(artist_unique_aid)}
artist_playlist2id = {pid: i for i, pid in enumerate(artist_unique_pid)}

In [47]:
# Output directory for the processed splits.
proc_dir = os.path.join(PROC_DATA_PATH, 'data_2')

if not os.path.exists(proc_dir):
    os.makedirs(proc_dir)

# Write the training-set item ids, one per line; the line position corresponds to
# the index assigned in track_track2id / artist_artist2id.
with open(os.path.join(proc_dir, 'track_unique_tid.txt'), 'w') as f:
    for tid in track_unique_tid:
        f.write('%s\n' % tid)

with open(os.path.join(proc_dir, 'artist_unique_aid.txt'), 'w') as f:
    for aid in artist_unique_aid:
        f.write('%s\n' % aid)

In [48]:
# Validation rows: playlists from the validation chunk, restricted to tracks that
# also occur in the training playlists.
track_vad_playlists = track_df.loc[
    track_df['playlist_id'].isin(track_vd_playlists)
    & track_df['track_id'].isin(track_unique_tid)
]
# Split each validation playlist into a fold-in part and a held-out part.
track_vad_playlists_tr, track_vad_playlists_te = split_train_test_proportion(track_vad_playlists)

0 playlists sampled
1000 playlists sampled
2000 playlists sampled
3000 playlists sampled
4000 playlists sampled
5000 playlists sampled
6000 playlists sampled
7000 playlists sampled
8000 playlists sampled
9000 playlists sampled
10000 playlists sampled
11000 playlists sampled
12000 playlists sampled
13000 playlists sampled
14000 playlists sampled
15000 playlists sampled
16000 playlists sampled
17000 playlists sampled
18000 playlists sampled
19000 playlists sampled
20000 playlists sampled
21000 playlists sampled
22000 playlists sampled
23000 playlists sampled
24000 playlists sampled
25000 playlists sampled
26000 playlists sampled
27000 playlists sampled
28000 playlists sampled
29000 playlists sampled
30000 playlists sampled
31000 playlists sampled
32000 playlists sampled
33000 playlists sampled
34000 playlists sampled
35000 playlists sampled
36000 playlists sampled
37000 playlists sampled
38000 playlists sampled
39000 playlists sampled
40000 playlists sampled
41000 playlists sampled
42000

In [49]:
# Test rows: playlists from the test chunk, restricted to tracks that also occur
# in the training playlists.
track_test_playlists = track_df.loc[
    track_df['playlist_id'].isin(track_te_playlists)
    & track_df['track_id'].isin(track_unique_tid)
]
# Split each test playlist into a fold-in part and a held-out part.
track_test_playlists_tr, track_test_playlists_te = split_train_test_proportion(track_test_playlists)

0 playlists sampled
1000 playlists sampled
2000 playlists sampled
3000 playlists sampled
4000 playlists sampled
5000 playlists sampled
6000 playlists sampled
7000 playlists sampled
8000 playlists sampled
9000 playlists sampled
10000 playlists sampled
11000 playlists sampled
12000 playlists sampled
13000 playlists sampled
14000 playlists sampled
15000 playlists sampled
16000 playlists sampled
17000 playlists sampled
18000 playlists sampled
19000 playlists sampled
20000 playlists sampled
21000 playlists sampled
22000 playlists sampled
23000 playlists sampled
24000 playlists sampled
25000 playlists sampled
26000 playlists sampled
27000 playlists sampled
28000 playlists sampled
29000 playlists sampled
30000 playlists sampled
31000 playlists sampled
32000 playlists sampled
33000 playlists sampled
34000 playlists sampled
35000 playlists sampled
36000 playlists sampled
37000 playlists sampled
38000 playlists sampled
39000 playlists sampled
40000 playlists sampled
41000 playlists sampled
42000

In [50]:
# Validation rows: playlists from the validation chunk, restricted to artists that
# also occur in the training playlists.
artist_vad_playlists = artist_df.loc[
    artist_df['playlist_id'].isin(artist_vd_playlists)
    & artist_df['artist_id'].isin(artist_unique_aid)
]
# Split each validation playlist into a fold-in part and a held-out part.
artist_vad_playlists_tr, artist_vad_playlists_te = split_train_test_proportion(artist_vad_playlists)

0 playlists sampled
1000 playlists sampled
2000 playlists sampled
3000 playlists sampled
4000 playlists sampled
5000 playlists sampled
6000 playlists sampled
7000 playlists sampled
8000 playlists sampled
9000 playlists sampled
10000 playlists sampled
11000 playlists sampled
12000 playlists sampled
13000 playlists sampled
14000 playlists sampled


In [51]:
# Test rows: playlists from the test chunk, restricted to artists that also occur
# in the training playlists.
artist_test_playlists = artist_df.loc[
    artist_df['playlist_id'].isin(artist_te_playlists)
    & artist_df['artist_id'].isin(artist_unique_aid)
]
# Split each test playlist into a fold-in part and a held-out part.
artist_test_playlists_tr, artist_test_playlists_te = split_train_test_proportion(artist_test_playlists)

0 playlists sampled
1000 playlists sampled
2000 playlists sampled
3000 playlists sampled
4000 playlists sampled
5000 playlists sampled
6000 playlists sampled
7000 playlists sampled
8000 playlists sampled
9000 playlists sampled
10000 playlists sampled
11000 playlists sampled
12000 playlists sampled
13000 playlists sampled
14000 playlists sampled


In [52]:
def numerize(tp, playlist2id, item2id, item_key_str):
    """Translate a long-format frame into consecutive integer indices.

    Parameters
    ----------
    tp : pandas.DataFrame
        Frame with a 'playlist_id' column and an item column named ``item_key_str``.
    playlist2id : mapping
        Playlist id -> row index.
    item2id : mapping
        Item id -> column index.
    item_key_str : str
        Name of the item column in ``tp``.

    Returns
    -------
    pandas.DataFrame
        Columns 'pid' and 'iid', one row per row of ``tp``, with a fresh default index.

    Raises
    ------
    KeyError
        If an id in ``tp`` is missing from the corresponding mapping.
    """
    playlist_indices = [playlist2id[playlist] for playlist in tp['playlist_id']]
    item_indices = [item2id[item] for item in tp[item_key_str]]
    return pd.DataFrame({'pid': playlist_indices, 'iid': item_indices}, columns=['pid', 'iid'])

In [53]:
track_id_key_str = 'track_id'

# Step 1: convert every track split to (pid, iid) index pairs.
track_train_data = numerize(track_train_playlists, track_playlist2id, track_track2id, track_id_key_str)
track_vad_data_tr = numerize(track_vad_playlists_tr, track_playlist2id, track_track2id, track_id_key_str)
track_vad_data_te = numerize(track_vad_playlists_te, track_playlist2id, track_track2id, track_id_key_str)
track_test_data_tr = numerize(track_test_playlists_tr, track_playlist2id, track_track2id, track_id_key_str)
track_test_data_te = numerize(track_test_playlists_te, track_playlist2id, track_track2id, track_id_key_str)

# Step 2: write each split to its CSV file (without the frame index).
track_train_data.to_csv(os.path.join(proc_dir, 'track_train.csv'), index=False)
track_vad_data_tr.to_csv(os.path.join(proc_dir, 'track_validation_tr.csv'), index=False)
track_vad_data_te.to_csv(os.path.join(proc_dir, 'track_validation_te.csv'), index=False)
track_test_data_tr.to_csv(os.path.join(proc_dir, 'track_test_tr.csv'), index=False)
track_test_data_te.to_csv(os.path.join(proc_dir, 'track_test_te.csv'), index=False)

In [54]:
artist_id_key_str = 'artist_id'

# Step 1: convert every artist split to (pid, iid) index pairs.
artist_train_data = numerize(artist_train_playlists, artist_playlist2id, artist_artist2id, artist_id_key_str)
artist_vad_data_tr = numerize(artist_vad_playlists_tr, artist_playlist2id, artist_artist2id, artist_id_key_str)
artist_vad_data_te = numerize(artist_vad_playlists_te, artist_playlist2id, artist_artist2id, artist_id_key_str)
artist_test_data_tr = numerize(artist_test_playlists_tr, artist_playlist2id, artist_artist2id, artist_id_key_str)
artist_test_data_te = numerize(artist_test_playlists_te, artist_playlist2id, artist_artist2id, artist_id_key_str)

# Step 2: write each split to its CSV file (without the frame index).
artist_train_data.to_csv(os.path.join(proc_dir, 'artist_train.csv'), index=False)
artist_vad_data_tr.to_csv(os.path.join(proc_dir, 'artist_validation_tr.csv'), index=False)
artist_vad_data_te.to_csv(os.path.join(proc_dir, 'artist_validation_te.csv'), index=False)
artist_test_data_tr.to_csv(os.path.join(proc_dir, 'artist_test_tr.csv'), index=False)
artist_test_data_te.to_csv(os.path.join(proc_dir, 'artist_test_te.csv'), index=False)