In [1]:
import glob
import json
import os
import sys

import numpy as np
import pandas as pd
import tables
"""
read.py
02-22-19
jack skrable
"""


'\nread.py\n02-22-19\njack skrable\n'

In [2]:
# Progress bar for cli
def progress(count, total, suffix=''):
    """Write a one-line text progress bar to stdout.

    The line ends with a carriage return instead of a newline, so the next
    write starts again at the beginning of the same line.

    Args:
        count: Number of items processed so far.
        total: Total number of items. A total of zero is drawn as an empty
            bar at 0.0% instead of raising ZeroDivisionError.
        suffix: Text written after the percentage.
    """
    bar_len = 60
    if total:
        filled_len = int(round(bar_len * count / float(total)))
        percents = round(100.0 * count / float(total), 1)
    else:
        # Nothing to process: avoid dividing by zero.
        filled_len = 0
        percents = 0.0
    # Keep the bar exactly bar_len characters wide even when count falls
    # outside [0, total]; the percentage is still reported unclamped.
    filled_len = max(0, min(bar_len, filled_len))
    bar = '#' * filled_len + '-' * (bar_len - filled_len)
    sys.stdout.write('[%s] %s%s %s\r' % (bar, percents, '%', suffix))
    sys.stdout.flush()
# Get list of all h5 files in basedir
def get_all_files(basedir, ext='.h5'):
    """Recursively list the files under basedir whose names match '*' + ext.

    Args:
        basedir: Directory to walk.
        ext: Suffix appended to '*' to form the glob pattern for each folder.

    Returns:
        List of absolute paths, in the order os.walk and glob yield them.
        Empty if basedir does not exist or nothing matches.
    """
    print('Getting list of all h5 files in',basedir)
    allfiles = []
    for root, _dirs, _files in os.walk(basedir):
        # Escape the directory part so glob metacharacters in folder names
        # ('[', ']', '*', '?') are matched literally; only '*' + ext is
        # meant to act as a pattern.
        pattern = os.path.join(glob.escape(root), '*' + ext)
        allfiles.extend(os.path.abspath(match) for match in glob.glob(pattern))
    return allfiles
# From a list of h5 files, extracts song metadata and creates a dataframe
def extract_song_data(files):
    """Read a list of HDF5 files into a single DataFrame, one row per file.

    For each file, every node under the root is visited:
      * an EArray node becomes one column holding the whole array;
      * each column of a Table node becomes a column named
        '<node>_<column>'. If a table has several rows, later rows
        overwrite earlier ones.
    Node names are the node path without the leading '/' and with the
    remaining '/' replaced by '_'.

    Args:
        files: List of paths to .h5 files.

    Returns:
        pandas.DataFrame with one row per file, without the three
        musicbrainz mbtags columns dropped at the end. Empty when files
        is empty.
    """
    # Get total h5 file count
    size = len(files)
    print(size, 'files found.')
    # One single-row frame per file; concatenated once at the end, because
    # growing a DataFrame inside the loop is quadratic and DataFrame.append
    # no longer exists in pandas 2.x.
    frames = []
    # Iter thru files
    for i, f in enumerate(files):
        # Update progress bar
        progress(i, size, 'of files processed')
        # Read-only store; the context manager closes it even if a node
        # fails to read.
        with pd.HDFStore(f, 'r') as s_hdf:
            # DF to hold single file info
            data = pd.DataFrame()
            # Walk nodes under root
            # NOTE(review): a scalar assigned before any array column exists
            # would land in a frame with no rows; presumably an EArray is
            # always visited first - confirm against the file layout.
            for item in s_hdf.root._f_walknodes():
                # Get name for column
                name = item._v_pathname[1:].replace('/','_')
                # Store arrays (wrapped in a list so the array is one cell)
                if type(item) is tables.earray.EArray:
                    data[name] = [np.array(item)]
                # Store tables
                elif type(item) is tables.table.Table:
                    # Get all columns
                    cols = item.coldescrs.keys()
                    for row in item:
                        for col in cols:
                            col_name = '_'.join([name,col])
                            try:
                                data[col_name] = row[col]
                            except Exception as e:
                                # Best effort: report and keep going.
                                print(e)
        frames.append(data)

    # Combine all per-file frames in one pass
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Dev set of columns
    # df = df[['metadata_songs_artist_id','metadata_songs_title','musicbrainz_songs_year','metadata_artist_terms','analysis_songs_analysis_sample_rate','metadata_songs_artist_location','analysis_sections_confidence','analysis_sections_start','analysis_segments_start','analysis_segments_timbre','analysis_segments_pitches','analysis_songs_tempo','analysis_bars_confidence','analysis_bars_start','analysis_beats_confidence','analysis_beats_start','analysis_songs_duration','analysis_songs_energy','analysis_songs_key','analysis_songs_key_confidence','analysis_songs_time_signature','analysis_songs_time_signature_confidence','metadata_similar_artists']]

    # Drop bad columns; errors='ignore' keeps this from raising when they
    # are absent (e.g. no files were read).
    df = df.drop(columns=['musicbrainz_artist_mbtags_count',
                          'musicbrainz_artist_mbtags',
                          'musicbrainz_songs_idx_artist_mbtags'],
                 errors='ignore')

    return df

In [3]:
def get_song_file_map(files):
    """Map each file's song id to its path and save the mapping as JSON.

    The song id is the 'song_id' field of the first row of the
    /metadata/songs table in each file. The mapping is written to
    './data/song-file-map.json' (sorted keys, 2-space indent).

    Args:
        files: List of paths to .h5 files.

    Returns:
        dict mapping song id to the file name reported by the store.
    """
    # Init empty map
    songmap = {}
    # Get total h5 file count
    size = len(files)
    print(size, 'files found.')
    # Iter thru files
    for i, f in enumerate(files):
        # Update progress bar
        progress(i, size, 'of files processed')
        # Open read-only: HDFStore defaults to append mode, which opens the
        # file for writing and creates it if it is missing. The context
        # manager closes the store even if the lookup raises.
        with pd.HDFStore(f, 'r') as s_hdf:
            song_id = s_hdf.root.metadata.songs[0]['song_id'].astype('U')
            songmap.update({song_id: s_hdf.filename})

    out_path = './data/song-file-map.json'
    # Make sure the target folder exists so the work above is not lost to
    # a FileNotFoundError at write time.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, 'w') as file:
        json.dump(songmap, file, sort_keys=True, indent=2)

    return songmap


def get_user_taste_data(filename):
    """Load user/song/count triplets from a tab-separated text file.

    Args:
        filename: Path to a header-less, tab-separated file with three
            columns, e.g. './TasteProfile/train_triplets_SAMPLE.txt'.

    Returns:
        pandas.DataFrame with columns 'user', 'song' and 'count'.
    """
    # names must be an ordered list of three labels; the path comes from the
    # caller instead of being hard-coded.
    tasteDF = pd.read_csv(filename, sep='\t', header=None,
                          names=['user', 'song', 'count'])

    return tasteDF


# Function to read all h5 files in a directory into a dataframe
def h5_to_df(basedir, limit=None, init=False):
    """Load the .h5 files found under basedir into one DataFrame.

    Args:
        basedir: Directory searched for '.h5' files.
        limit: If not None, only the first `limit` files found are read.
        init: If true, also build the song-id-to-file map from the same
            files via get_song_file_map.

    Returns:
        The DataFrame produced by extract_song_data.
    """
    h5_paths = get_all_files(basedir, '.h5')
    if limit is not None:
        # Keep only the leading slice of the discovered files
        h5_paths = h5_paths[:limit]

    songs = extract_song_data(h5_paths)

    if init:
        get_song_file_map(h5_paths)

    return songs

In [4]:
# Directory (relative path) that is walked for .h5 files
data_dir = "MillionSongSubset"
# get_all_files(data_dir)
# Read every .h5 file under data_dir into one DataFrame
# (default arguments: no file limit, no song-file map written)
df = h5_to_df(data_dir)
# Show the resulting column names
df.columns

Getting list of all h5 files in MillionSongSubset
10000 files found.
[########----------------------------------------------------] 12.5% of files processed

In [None]:
# Display the loaded song frame
df

Unnamed: 0,analysis_bars_confidence,analysis_bars_start,analysis_beats_confidence,analysis_beats_start,analysis_sections_confidence,analysis_sections_start,analysis_segments_confidence,analysis_segments_loudness_max,analysis_segments_loudness_max_time,analysis_segments_loudness_start,...,metadata_songs_genre,metadata_songs_idx_artist_terms,metadata_songs_idx_similar_artists,metadata_songs_release,metadata_songs_release_7digitalid,metadata_songs_song_hotttnesss,metadata_songs_song_id,metadata_songs_title,metadata_songs_track_7digitalid,musicbrainz_songs_year
0,"[0.643, 0.746, 0.722, 0.095, 0.091, 0.362, 0.4...","[0.58521, 2.94247, 5.14371, 7.74554, 10.36149,...","[0.834, 0.851, 0.65, 0.635, 0.532, 0.753, 0.62...","[0.58521, 1.19196, 1.78893, 2.37813, 2.94247, ...","[1.0, 1.0, 0.218, 0.133, 0.384, 0.326, 0.373, ...","[0.0, 7.74554, 36.44331, 43.61667, 75.17954, 9...","[0.0, 1.0, 0.483, 0.137, 0.42, 1.0, 0.257, 1.0...","[-60.0, -31.646, -34.565, -38.407, -34.696, -2...","[0.0, 0.10929, 0.11044, 0.0844, 0.05898, 0.073...","[-60.0, -60.0, -40.84, -40.401, -38.456, -39.6...",...,b'',0,0,b'Fear Itself',300848,0.602120,b'SOMZWCG12A8C13C480',"b""I Didn't Mean To""",3401791,0
1,"[0.007, 0.259, 0.172, 0.404, 0.011, 0.016, 0.0...","[0.71054, 2.71502, 4.70861, 6.69288, 8.66941, ...","[1.0, 0.945, 0.714, 0.973, 0.818, 0.974, 0.878...","[0.20627, 0.71054, 1.21836, 1.71841, 2.21729, ...","[1.0, 0.451, 0.27, 0.397, 0.225, 0.426, 0.459,...","[0.0, 8.1777, 19.52952, 38.84063, 50.22563, 70...","[0.0, 1.0, 0.93, 0.643, 0.761, 0.21, 1.0, 0.71...","[-60.0, -14.269, -10.165, -18.098, -19.136, -1...","[0.0, 0.05811, 0.03982, 0.04186, 0.03568, 0.03...","[-60.0, -60.0, -23.521, -25.16, -27.133, -24.2...",...,b'',0,0,b'Dimensions',300822,,b'SOCIWDW12A8C13D406',b'Soul Deep',3400270,1969
2,"[0.98, 0.399, 0.185, 0.27, 0.422, 0.0, 0.445, ...","[0.73152, 1.39732, 2.04852, 2.68691, 3.315, 3....","[0.98, 0.399, 0.185, 0.27, 0.422, 0.0, 0.445, ...","[0.73152, 1.39732, 2.04852, 2.68691, 3.315, 3....","[1.0, 0.121, 0.214, 0.198, 0.66, 0.468, 0.591,...","[0.0, 37.88678, 49.43939, 68.63657, 98.89331, ...","[0.0, 1.0, 0.106, 0.048, 0.282, 0.69, 0.308, 0...","[-59.895, -11.914, -10.344, -9.678, -9.22, -8....","[0.27572, 0.1589, 0.0515, 0.0741, 0.09185, 0.0...","[-60.0, -59.9, -12.744, -12.003, -12.991, -15....",...,b'',0,0,b'Las Numero 1 De La Sonora Santanera',514953,,b'SOXVLOJ12AB0189215',b'Amor De Cabaret',5703798,0
3,"[0.017, 0.05, 0.014, 0.008, 0.114, 0.019, 0.08...","[1.30621, 3.29887, 5.30252, 7.32327, 9.33775, ...","[0.809, 0.616, 0.789, 0.66, 0.439, 0.758, 0.60...","[0.81002, 1.30621, 1.80617, 2.2996, 2.80049, 3...","[1.0, 0.086, 0.153, 0.146, 0.088, 0.217, 0.372...","[0.0, 20.38681, 27.94943, 55.12454, 67.71832, ...","[1.0, 1.0, 0.919, 0.591, 0.841, 0.174, 0.753, ...","[-18.682, -9.55, -9.709, -8.633, -7.434, -11.7...","[0.34385, 0.07741, 0.04658, 0.07981, 0.04477, ...","[-60.0, -27.665, -21.241, -15.222, -18.915, -1...",...,b'',0,0,b'Friend Or Foe',287650,,b'SONHOTT12A8C13493C',b'Something Girls',3226795,1982
4,"[0.175, 0.409, 0.639, 0.067, 0.016, 0.066, 0.0...","[1.06368, 2.91491, 4.76729, 6.61852, 8.46978, ...","[0.883, 0.738, 0.484, 0.609, 0.625, 0.719, 0.4...","[0.13576, 0.59914, 1.06368, 1.52591, 1.99045, ...","[1.0, 0.768, 0.611, 0.388, 0.52, 0.42, 0.499, ...","[0.0, 8.00636, 23.26694, 67.22425, 74.15257, 1...","[0.0, 1.0, 0.359, 1.0, 0.963, 0.544, 1.0, 0.75...","[-59.813, -7.713, -16.13, -2.512, -8.088, -8.7...","[0.06094, 0.06433, 0.02255, 0.02018, 0.02463, ...","[-60.0, -59.828, -19.551, -32.609, -21.899, -2...",...,b'',0,0,b'Muertos Vivos',611336,0.604501,b'SOFSOCN12A8C143F5D',b'Face the Ashes',6795666,2007
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,"[0.46, 0.081, 0.024, 0.009, 0.036, 0.047, 0.09...","[0.87991, 2.58253, 4.03501, 5.51685, 7.17707, ...","[0.719, 0.432, 0.437, 0.364, 0.399, 0.437, 0.3...","[0.44279, 0.87991, 1.31384, 1.7435, 2.16781, 2...","[1.0, 1.0, 0.674, 0.671, 0.466, 0.577, 0.484, ...","[0.0, 11.74785, 41.53344, 70.22572, 96.58756, ...","[0.0, 0.958, 0.549, 0.515, 0.193, 0.435, 0.008...","[-60.0, -47.262, -40.317, -32.809, -30.848, -2...","[0.0, 0.15505, 0.12436, 0.52408, 0.07835, 0.04...","[-60.0, -60.0, -47.304, -41.67, -33.841, -31.6...",...,b'',0,0,b'Sin / Pecado',691752,0.594080,b'SOLXXPY12A67ADABA0',b'The Hanged Man',7677054,1998
9996,"[0.103, 0.217, 0.346, 0.191, 0.093, 0.063, 0.0...","[1.63576, 3.96124, 6.36732, 8.73604, 11.07133,...","[0.936, 0.0, 0.604, 0.623, 0.579, 0.0, 0.314, ...","[0.07692, 0.87108, 1.63576, 2.4083, 3.18477, 3...","[1.0, 0.096, 0.078, 0.32, 0.304, 0.393, 0.18, ...","[0.0, 27.15271, 58.59412, 71.84861, 87.16708, ...","[0.0, 1.0, 0.643, 0.14, 0.152, 0.098, 0.237, 0...","[-60.0, -30.954, -24.75, -20.887, -18.227, -15...","[0.0, 0.11027, 0.26957, 0.31172, 0.21855, 0.21...","[-60.0, -60.0, -39.197, -25.201, -21.459, -18....",...,b'',0,0,b'Collection',41649,0.334707,b'SOAYONI12A6D4F85C8',b'The Wonderful World Of The Young',442366,1998
9997,"[0.003, 0.002, 0.116, 0.123, 0.032, 0.011, 0.0...","[0.78745, 2.81695, 4.84925, 6.88146, 8.91176, ...","[1.0, 0.928, 0.305, 0.955, 0.788, 0.879, 0.73,...","[0.27924, 0.78745, 1.29562, 1.8038, 2.31229, 2...","[1.0, 0.237, 0.373, 0.615, 0.323, 0.505, 0.353...","[0.0, 7.89684, 40.90774, 83.57051, 102.36691, ...","[0.0, 1.0, 0.969, 1.0, 1.0, 1.0, 0.919, 1.0, 1...","[-60.0, -20.26, -26.046, -18.698, -12.525, -20...","[0.0, 0.10728, 0.02996, 0.02279, 0.02831, 0.03...","[-60.0, -60.0, -40.975, -43.789, -40.884, -42....",...,b'',0,0,b'Reality',346402,,b'SOJZLAJ12AB017E8A2',b'Sentimental Man',3884209,0
9998,"[0.542, 0.429, 0.057, 0.017, 0.074, 0.047, 0.0...","[0.28192, 2.0533, 3.62096, 5.37773, 7.09486, 8...","[0.574, 0.0, 0.658, 0.307, 0.0, 0.126, 0.321, ...","[0.28192, 0.73103, 1.18674, 1.64136, 2.0533, 2...","[1.0, 1.0, 0.936, 0.945, 0.489, 0.513, 0.262, ...","[0.0, 11.38994, 26.28686, 36.10941, 112.26613,...","[1.0, 1.0, 0.23, 0.439, 0.287, 0.15, 0.124, 0....","[-29.516, -17.029, -29.995, -28.098, -30.057, ...","[0.16335, 0.092, 0.02269, 0.26405, 0.02937, 0....","[-60.0, -31.891, -34.322, -34.373, -32.889, -3...",...,b'',0,0,b'Nouveau Zydeco',86259,0.000000,b'SORZSCJ12A8C132446',b'Zydeco In D-Minor',904098,0


In [None]:
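# One-off cache step: uncomment to pickle the loaded frame to msd.p,
# the file the following cell reads back.
# import pickle
# pickle.dump(df, open('msd.p','wb'))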

In [4]:
import pickle

# Reload the cached song frame from msd.p.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load an msd.p that you produced yourself.
# The context manager closes the file handle once the frame is loaded.
with open('msd.p','rb') as pickle_file:
    df = pickle.load(pickle_file)

In [7]:
# Columns whose values are bytes and get decoded to text later on
bytes_metadata = [
    'metadata_songs_song_id',
    'metadata_songs_title',
    'metadata_songs_artist_id',
    'metadata_songs_artist_name',
]

# Integer-valued metadata columns (also used as the sort key)
int_metadata = [
    'musicbrainz_songs_year',
]

# Audio analysis columns carried through to the export
audio_features = [
    'analysis_segments_timbre',
    'analysis_segments_pitches',
    'analysis_segments_loudness_max',
    'analysis_segments_confidence',
    'analysis_songs_loudness',
]

In [16]:
# Take an explicit copy of the selected columns so the assignments below
# modify df2 itself rather than a slice of df (the source of the
# SettingWithCopyWarning).
df2 = df[bytes_metadata + int_metadata + audio_features].copy()

# Decode the bytes-valued metadata columns to text
for c in bytes_metadata:
    df2[c] = [x.decode('utf-8') for x in df2[c]]

# Newest songs first; rebinding instead of inplace=True keeps the cell
# re-runnable without mutating a shared object.
df2 = df2.sort_values(int_metadata, ascending=False)

# Index each row by "<title> by <artist>" and keep only the first row for
# each label so the index is unique.
df2.index = df2['metadata_songs_title']+' by '+df2['metadata_songs_artist_name']
df2 = df2[~df2.index.duplicated(keep='first')]
df2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,metadata_songs_song_id,metadata_songs_title,metadata_songs_artist_id,metadata_songs_artist_name,musicbrainz_songs_year,analysis_segments_timbre,analysis_segments_pitches,analysis_segments_loudness_max,analysis_segments_confidence,analysis_songs_loudness
The Real Trick by Slackbaba,SONCJLY12AB0187193,The Real Trick,ARI28VJ1187FB4EAD7,Slackbaba,2010,"[[2.776, 120.203, 12.042, -88.433, 63.195, -44...","[[0.669, 0.662, 0.398, 0.377, 0.621, 1.0, 0.55...","[-49.353, -36.033, -31.933, -23.733, -19.038, ...","[0.621, 1.0, 0.204, 0.793, 0.393, 0.52, 0.323,...",-6.726
Qué Desilusión by Mägo de Oz,SOVTEUT12AB0185D22,Qué Desilusión,AR7ZI7Z1187B98BF55,Mägo de Oz,2010,"[[11.041, 74.538, 123.898, -275.931, 25.525, -...","[[0.042, 0.049, 0.021, 0.032, 0.027, 0.02, 0.2...","[-19.978, -20.527, -25.291, -25.926, -19.24, -...","[1.0, 0.561, 0.451, 0.408, 0.848, 0.064, 0.336...",-4.638
The Death Of Romance by Zeromancer,SOZEFPO12AB0184AEC,The Death Of Romance,ARQDWBG1187B9891D7,Zeromancer,2010,"[[49.651, 65.595, 56.047, -45.908, -26.652, 99...","[[0.062, 0.058, 0.139, 0.262, 1.0, 0.417, 0.06...","[-4.979, -5.398, -5.205, -6.002, -6.45, -5.487...","[1.0, 0.129, 0.077, 0.2, 0.221, 0.319, 0.362, ...",-6.236
Kui rebeneb taevas by Metsatöll,SOZOSQN12AB01872A1,Kui rebeneb taevas,ARYCQBH1187B9B1BE4,Metsatöll,2010,"[[21.41, -8.668, -8.791, 96.955, -36.823, 27.9...","[[1.0, 0.998, 0.38, 0.11, 0.244, 0.185, 0.192,...","[-31.792, -43.976, -37.132, -36.185, -35.806, ...","[0.0, 0.181, 1.0, 1.0, 0.079, 0.068, 0.891, 0....",-4.045
Death Or Jail by Sick Of It All,SOBKGSR12AB0184A57,Death Or Jail,ARHL8SP1187B98F8A3,Sick Of It All,2010,"[[0.0, 171.13, 9.469, -28.48, 57.491, -50.067,...","[[0.919, 1.0, 0.396, 0.389, 0.445, 0.438, 0.50...","[-60.0, -23.613, -24.468, -24.776, -25.365, -2...","[0.0, 1.0, 0.792, 0.878, 0.807, 0.87, 0.911, 0...",-2.388
...,...,...,...,...,...,...,...,...,...,...
Dyna by Kanda Bongo Man,SOVYJHX12AB017F591,Dyna,ARMRQF21187FB54D89,Kanda Bongo Man,0,"[[9.425, 28.095, -71.075, 0.873, 27.348, 20.87...","[[0.357, 0.91, 1.0, 0.993, 0.516, 0.391, 0.77,...","[-44.098, -48.073, -47.604, -45.971, -38.291, ...","[1.0, 0.618, 0.474, 0.865, 0.941, 1.0, 0.307, ...",-12.602
Medley: Father_ I Adore You/More Precious Than Silver by Cadet,SOFBBQY12A6D4F67B6,Medley: Father_ I Adore You/More Precious Than...,AR1ZNJO1187B993E46,Cadet,0,"[[24.293, -46.846, -36.426, -42.942, -11.621, ...","[[0.347, 0.698, 0.461, 0.435, 0.876, 0.875, 0....","[-28.891, -28.587, -14.936, -13.429, -15.977, ...","[1.0, 0.525, 0.998, 0.098, 0.579, 0.342, 0.095...",-5.105
Widescreen by Pinch,SODMUKC12AB017E2F3,Widescreen,AROJ4RQ1187FB50102,Pinch,0,"[[0.0, 171.13, 9.469, -28.48, 57.491, -50.067,...","[[0.219, 0.294, 0.629, 0.426, 0.16, 0.352, 0.3...","[-60.0, -38.628, -42.471, -42.095, -41.393, -3...","[0.0, 1.0, 0.333, 0.737, 0.291, 1.0, 0.045, 0....",-10.411
Si Tu Me Dices Ven by Isabel Pantoja,SOHQXIF12A8C13FEAD,Si Tu Me Dices Ven,ARUK0W21187B995456,Isabel Pantoja,0,"[[16.836, 115.39, 189.97, -332.652, 95.916, 45...","[[0.029, 0.004, 0.048, 0.003, 0.004, 0.095, 0....","[-22.627, -16.951, -20.847, -22.618, -15.406, ...","[1.0, 0.107, 0.728, 0.371, 0.772, 0.44, 0.194,...",-6.451


In [17]:
# Write a five-row sample of the frame to msd5.json, keyed by index label
df2.head(5).to_json('msd5.json', orient='index')

In [18]:
# Write the whole frame to msd.json, keyed by index label
df2.to_json('msd.json',orient='index')

In [19]:
# Print the names of the columns in df2
print(df2.columns.values)

['metadata_songs_song_id' 'metadata_songs_title'
 'metadata_songs_artist_id' 'metadata_songs_artist_name'
 'musicbrainz_songs_year' 'analysis_segments_timbre'
 'analysis_segments_pitches' 'analysis_segments_loudness_max'
 'analysis_segments_confidence' 'analysis_songs_loudness']
