In [1]:
import json
import time
import os
from scipy import sparse
import numpy as np
import collections
from tqdm import tqdm
import pandas as pd
import sys

Load the data, count how often each track occurs across all playlists, and record descriptive info for each track.

In [2]:
# Count how often every track occurs across all playlists and remember the
# descriptive metadata (track / artist / album name) of each track.
track_counts = collections.Counter()
track_info = {}

# NOTE(review): absolute, machine-specific path -- adjust for your environment.
data_path = '/Users/kwu/data/recsys2018/mpd.v1/data'

for fname in os.listdir(data_path):
    # The context manager closes each file as soon as it has been parsed,
    # instead of leaving the handle open until garbage collection.
    with open(os.path.join(data_path, fname)) as slice_file:
        data = json.load(slice_file)

    for playlist in data['playlists']:
        for track in playlist['tracks']:
            # The track id is whatever follows the last ':' of the URI.
            track_id = track['track_uri'].split(':')[-1]
            track_counts[track_id] += 1
            # Overwritten on every occurrence, so the last occurrence seen wins.
            track_info[track_id] = {
                'track_name': track['track_name'],
                'artist_name': track['artist_name'],
                'album_name': track['album_name'],
            }

# Persist both dictionaries as JSON in the current working directory.
with open('track_counts.json', 'w') as f:
    json.dump(dict(track_counts), f)

with open('track_info_dict.json', 'w') as f:
    json.dump(track_info, f)

Load the track counts and filter out infrequent tracks: only tracks that occur more than `track_threshold` times are kept.

In [75]:
# Load the per-track occurrence counts from JSON; the context manager makes
# sure the file handle is closed once the counts have been read.
# NOTE(review): presumably this is the 'track_counts.json' written by the
# counting cell, moved to ../data -- confirm.
with open('../data/track_counts.json') as f:
    track_counts = collections.Counter(json.load(f))

In [76]:
# Keep only tracks that occur strictly more than `track_threshold` times.
track_threshold = 100
common_track_counts = collections.Counter({
    track: count
    for track, count in track_counts.items()
    if count > track_threshold
})

In [77]:
# Number of tracks that pass the frequency filter.
len(common_track_counts)

69678

Map Spotify track ids to internal integer indices (from 0 to M-1, where M is the number of common tracks), and build the reverse mapping.

In [78]:
# Spotify ids of the tracks that passed the frequency filter.
valid_track_ids = common_track_counts.keys()
# Forward map: Spotify id -> internal index (0-based, in key order).
track_id_map = {spotify_id: idx for idx, spotify_id in enumerate(valid_track_ids)}
# Reverse map: internal index -> Spotify id.
rev_track_id_map = {idx: spotify_id for spotify_id, idx in track_id_map.items()}

In [None]:
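Parse the data again, keeping only the common tracks of each playlist and only those playlists that contain more than `playlist_threshold` common tracks.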

In [80]:
# Second pass over the raw data: collect (playlist row, track column)
# coordinates for every playlist with enough common tracks.
playlist_threshold = 50
playlist_count = 0

# Row index in the interaction data -> original playlist id ('pid').
playlist_id_map = {}

# One (row, col) pair per kept track occurrence.
# NOTE(review): a track repeated inside a playlist yields a repeated pair --
# confirm that downstream code expects this.
row_inds = []
col_inds = []

start = time.time()

# Set membership is O(1); testing against the keys view works too, but the
# explicit set makes the intent clear.
valid_track_id_set = set(valid_track_ids)

for fname in os.listdir(data_path):
    # Close each file right after parsing instead of leaking the handle.
    with open(os.path.join(data_path, fname)) as slice_file:
        data = json.load(slice_file)

    for playlist in data['playlists']:
        # Extract each track id once (the part after the last ':' of the URI),
        # then keep only the ids of common tracks, mapped to internal indices.
        playlist_track_ids = (track['track_uri'].split(':')[-1] for track in playlist['tracks'])
        valid_tracks = [track_id_map[track_id] for track_id in playlist_track_ids
                        if track_id in valid_track_id_set]

        # Strictly more than `playlist_threshold` common tracks are required.
        if len(valid_tracks) > playlist_threshold:
            playlist_id_map[playlist_count] = playlist['pid']
            row_inds.extend([playlist_count] * len(valid_tracks))
            col_inds.extend(valid_tracks)

            playlist_count += 1

end = time.time()
print ("Time elapsed: {}s".format(round(end-start, 1)))

Time elapsed: 412.9s


In [81]:
# Two-column array of (playlist row, track column) pairs, one pair per row.
data_thin = np.column_stack((row_inds, col_inds))
# Long-format frame; playlists play the role of "users".
df = pd.DataFrame(data_thin, columns=['user_id', 'track_id'])

In [82]:
# Internal track indices that do not occur in any kept playlist.
tracks_to_delete = set(rev_track_id_map).difference(df['track_id'].unique())
# Translate those indices back to Spotify ids for inspection.
bad_spotify_track_ids = [rev_track_id_map[idx] for idx in tracks_to_delete]
bad_spotify_track_ids

['2LQIXDbSokfmms0rlffgXq', '4EezDPRZTcFd5iFZwvAOL4', '6MGrXTz0TZaO6sbHZhOq61']

In [84]:
# Internal track indices that are still in use, in order of first appearance.
remaining_ids = df['track_id'].unique()

# Re-index the surviving tracks contiguously from 0.
new_valid_track_ids = [rev_track_id_map[old_id] for old_id in remaining_ids]
new_track_id_map = {spotify_id: new_id for new_id, spotify_id in enumerate(new_valid_track_ids)}
new_rev_track_id_map = {new_id: spotify_id for spotify_id, new_id in new_track_id_map.items()}

# Old internal index -> new internal index.
old_new_map = dict(zip(remaining_ids, new_rev_track_id_map))

In [86]:
# Replace every old internal track index by its new, contiguous index.
# This overwrites the column in place, so the cell should be run exactly once.
df['track_id'] = df['track_id'].map(lambda old_id: old_new_map[old_id])

In [91]:
# Pull the coordinate columns out of the frame as NumPy arrays.
row_inds, col_inds = (np.array(df[column]) for column in ('user_id', 'track_id'))
# Sparse matrix with a 1 stored for every (row, col) pair.
data = sparse.coo_matrix(([1] * len(row_inds), (row_inds, col_inds)))

In [92]:
# Build the sparse matrix from the coordinate arrays (a 1 for every pair)
# and write it to the current working directory.
coords = (row_inds, col_inds)
data = sparse.coo_matrix(([1] * len(row_inds), coords))
sparse.save_npz("raw_sparse_2.npz", data)

In [93]:
# Also save the long-format (user_id, track_id) frame as CSV in the current working directory.
df.to_csv('raw_df_2.csv')

### Load from sparse .npz file

In [2]:
# Reload the sparse matrix from disk.
# NOTE(review): presumably the 'raw_sparse_2.npz' saved by the preprocessing
# cells, moved to ../data -- confirm.
data = sparse.load_npz('../data/raw_sparse_2.npz')

In [3]:
# Rebuild the long-format frame from the matrix coordinates: one row per stored entry.
df = pd.DataFrame({'user_id': data.row, 'track_id': data.col})

https://github.com/dawenl/vae_cf/blob/master/VAE_ML20M_WWW2018.ipynb

In [4]:
def split_train_test_proportion(data, test_prop=0.2):
    """Split every user's rows into a training part and a held-out part.

    For each user (group of rows sharing a 'user_id') with at least 5 rows,
    ``int(test_prop * n_rows)`` rows are drawn at random without replacement
    and held out; the remaining rows go to the training part. Users with
    fewer than 5 rows contribute all of their rows to the training part.

    NOTE: a function with this same name is defined again in a later cell of
    this notebook; whichever definition was executed last is the one in effect.

    Parameters
    ----------
    data : pandas.DataFrame
        Interaction frame with a 'user_id' column.
    test_prop : float, default 0.2
        Fraction of each eligible user's rows to hold out.

    Returns
    -------
    (data_tr, data_te) : tuple of pandas.DataFrame
        Training rows and held-out rows. Either frame is empty (same columns
        as ``data``) when there is nothing to put in it, e.g. when no user
        has at least 5 rows.
    """
    tr_list, te_list = [], []

    # Fixed seed so repeated calls produce the same split.
    # Side effect: this reseeds NumPy's global random state.
    np.random.seed(98765)

    for i, (_, group) in enumerate(data.groupby('user_id')):
        n_items_u = len(group)

        if n_items_u >= 5:
            # Boolean mask over this user's rows; True marks held-out rows.
            idx = np.zeros(n_items_u, dtype='bool')
            idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True

            tr_list.append(group[np.logical_not(idx)])
            te_list.append(group[idx])
        else:
            tr_list.append(group)

        if i % 1000 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    # pd.concat raises ValueError on an empty list, so fall back to an empty
    # frame with the input's columns when a part received no rows.
    empty = data.iloc[0:0]
    data_tr = pd.concat(tr_list) if tr_list else empty
    data_te = pd.concat(te_list) if te_list else empty

    return data_tr, data_te

In [5]:
# Share of (playlist, track) cells that hold an interaction.
n_rows, n_cols = data.shape
sparsity = float(df.shape[0]) / (n_rows * n_cols)

print("After filtering, there are %d playlist inclusion events from %d playlist and %d tracks (sparsity: %.3f%%)" %
      (df.shape[0], n_rows, n_cols, sparsity * 100))

After filtering, there are 39106490 playlist inclusion events from 393740 playlist and 69675 tracks (sparsity: 0.143%)


In [11]:
unique_uid = pd.unique(df['user_id'])

# Seed before shuffling so that the train/validation/test user split is the
# same on every run (the same seed value is used by split_train_test_proportion).
np.random.seed(98765)
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]

# Count the users that are actually present: later cells use `n_users` to
# slice `unique_uid`, so it must equal that array's length.
n_users = unique_uid.size

In [12]:
# The last 2 * n_heldout_users shuffled users are held out:
# first half for validation, second half for testing.
n_heldout_users = 10000

vd_start = n_users - n_heldout_users * 2
te_start = n_users - n_heldout_users

tr_users = unique_uid[:vd_start]
vd_users = unique_uid[vd_start:te_start]
te_users = unique_uid[te_start:]

In [13]:
# All interactions that belong to training users.
train_plays = df[df['user_id'].isin(tr_users)]

In [16]:
unique_sid = pd.unique(train_plays['track_id'])

# THIS HAS TO MATCH THE ORIGINAL NUMBER OF UNIQUE TRACKS -- OTHERWISE RESAMPLE
# TRAINING USER IDS. Enforced here so a bad split fails loudly instead of
# silently producing a smaller track vocabulary.
assert len(unique_sid) == len(pd.unique(df['track_id'])), (
    "training split does not contain every track -- resample training user ids")
len(unique_sid)

69675

In [17]:
# Number of unique tracks in the full dataset, for comparison with the training split's count.
len(pd.unique(df['track_id']))

69675

In [18]:
# Track id -> position in `unique_sid`.
show2id = {sid: i for i, sid in enumerate(unique_sid)}
# User (playlist) id -> position in the shuffled `unique_uid`.
profile2id = {pid: i for i, pid in enumerate(unique_uid)}

In [20]:
# NOTE(review): absolute, machine-specific output location.
DATA_DIR = '/home/kjw2157/6998-Adv-ML-Final-Proj/mpd_proc'
pro_dir = os.path.join(DATA_DIR, 'data_small_2')

# Create the output directory on first use.
if not os.path.exists(pro_dir):
    os.makedirs(pro_dir)

# One track id per line, in `unique_sid` order.
with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w') as f:
    f.writelines('%s\n' % sid for sid in unique_sid)

In [21]:
def split_train_test_proportion(data, test_prop=0.2):
    """Split every user's rows into a training part and a held-out part.

    For each user (group of rows sharing a 'user_id') with at least 5 rows,
    ``int(test_prop * n_rows)`` rows are drawn at random without replacement
    and held out; the remaining rows go to the training part. Users with
    fewer than 5 rows contribute all of their rows to the training part.

    NOTE: a function with this same name is also defined in an earlier cell
    of this notebook; whichever definition was executed last is the one in
    effect.

    Parameters
    ----------
    data : pandas.DataFrame
        Interaction frame with a 'user_id' column.
    test_prop : float, default 0.2
        Fraction of each eligible user's rows to hold out.

    Returns
    -------
    (data_tr, data_te) : tuple of pandas.DataFrame
        Training rows and held-out rows. Either frame is empty (same columns
        as ``data``) when there is nothing to put in it, e.g. when no user
        has at least 5 rows.
    """
    tr_list, te_list = [], []

    # Fixed seed so repeated calls produce the same split.
    # Side effect: this reseeds NumPy's global random state.
    np.random.seed(98765)

    for i, (_, group) in enumerate(data.groupby('user_id')):
        n_items_u = len(group)

        if n_items_u >= 5:
            # Boolean mask over this user's rows; True marks held-out rows.
            idx = np.zeros(n_items_u, dtype='bool')
            idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True

            tr_list.append(group[np.logical_not(idx)])
            te_list.append(group[idx])
        else:
            tr_list.append(group)

        if i % 1000 == 0:
            print("%d users sampled" % i)
            sys.stdout.flush()

    # pd.concat raises ValueError on an empty list, so fall back to an empty
    # frame with the input's columns when a part received no rows.
    empty = data.iloc[0:0]
    data_tr = pd.concat(tr_list) if tr_list else empty
    data_te = pd.concat(te_list) if te_list else empty

    return data_tr, data_te

In [22]:
# Interactions of the validation users ...
vad_plays = df[df['user_id'].isin(vd_users)]
# ... restricted to tracks that occur in the training split.
vad_plays = vad_plays[vad_plays['track_id'].isin(unique_sid)]
# Split each validation user's rows into a training part and a held-out part.
vad_plays_tr, vad_plays_te = split_train_test_proportion(vad_plays)

0 users sampled
1000 users sampled
2000 users sampled
3000 users sampled
4000 users sampled
5000 users sampled
6000 users sampled
7000 users sampled
8000 users sampled
9000 users sampled


In [23]:
# Interactions of the test users ...
test_plays = df[df['user_id'].isin(te_users)]
# ... restricted to tracks that occur in the training split.
test_plays = test_plays[test_plays['track_id'].isin(unique_sid)]
# Split each test user's rows into a training part and a held-out part.
test_plays_tr, test_plays_te = split_train_test_proportion(test_plays)

0 users sampled
1000 users sampled
2000 users sampled
3000 users sampled
4000 users sampled
5000 users sampled
6000 users sampled
7000 users sampled
8000 users sampled
9000 users sampled


In [24]:
def numerize(tp):
    """Translate a (user_id, track_id) frame into contiguous integer ids.

    Looks every 'user_id' up in the module-level `profile2id` dict and every
    'track_id' in `show2id`; an id missing from either dict raises KeyError.

    Returns a new DataFrame with columns 'uid' and 'sid' (fresh default index).
    """
    uid = [profile2id[raw_uid] for raw_uid in tp['user_id']]
    sid = [show2id[raw_sid] for raw_sid in tp['track_id']]
    return pd.DataFrame(data={'uid': uid, 'sid': sid}, columns=['uid', 'sid'])

def _save_split(frame, csv_name):
    """Write one numerized split to `pro_dir` as CSV, without the index column."""
    frame.to_csv(os.path.join(pro_dir, csv_name), index=False)


# Numerize and save each split in turn.
train_data = numerize(train_plays)
_save_split(train_data, 'train.csv')

vad_data_tr = numerize(vad_plays_tr)
_save_split(vad_data_tr, 'validation_tr.csv')

vad_data_te = numerize(vad_plays_te)
_save_split(vad_data_te, 'validation_te.csv')

test_data_tr = numerize(test_plays_tr)
_save_split(test_data_tr, 'test_tr.csv')

test_data_te = numerize(test_plays_te)
_save_split(test_data_te, 'test_te.csv')