In [112]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import h5py
import hdf5_getters

# Reading the Subset CSV

In [2]:
# Load the song subset table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='SongCSV.csv')

In [3]:
# Preview only the first rows; rendering the whole frame bloats the notebook
# and hides the narrative.
df.head(10)

Unnamed: 0,SongNumber,SongID,AlbumID,AlbumName,ArtistID,ArtistLatitude,ArtistLocation,ArtistLongitude,ArtistName,Danceability,Duration,KeySignature,KeySignatureConfidence,Tempo,TimeSignature,TimeSignatureConfidence,Title,Year
0,1,b'SOGSMXL12A81C23D88',275907,b'Je Sais Que La Terre Est Plate (Deluxe)',b'AREJXK41187B9A4ACC',46.71067,b'France',1.71819,b'Rapha\xc3\xabl',0.0,148.74077,0,0.591,124.059,4,0.372,b'Je Sais Que La Terre Est Plate',2008
1,2,b'SOMBCOW12AAF3B229F',382807,b'Comme Vous',b'AR2XRFQ1187FB417FE',,b'',,b'Julie Zenatti',0.0,252.99546,1,0.429,80.084,4,0.533,b'On Efface',2004
2,3,b'SOEYIHF12AB017B5F4',490659,b'Watkins Ale - Music of the English Renaissa...,b'ARODOO01187FB44F4A',,b'',,b'The Baltimore Consort',0.0,78.02730,3,0.000,54.874,4,0.000,b'Howells Delight',0
3,4,b'SODJYEC12A8C13D757',116616,"bDon't Worry Lady""""",b'ARJGW911187FB586CA',,b'',,b'I Hate Sally',0.0,163.63057,7,0.380,77.150,3,0.369,b'Martha Served',2007
4,5,b'SOGSOUE12A58A76443',767122,b'Easy Listening: Cartoon Songs',b'AR9HQ6Y1187FB3C2CB',,b'',,b'Orlando Pops Orchestra',0.0,199.99302,10,0.551,120.382,4,1.000,b'Zip-A-Dee-Doo-Dah (Song of the South)',0
5,6,b'SOVVDCO12AB0187AF7',503347,b'X Communication : Trilogy II',b'ARDPTGD1187B9AD361',36.87652,b'Sikeston MO',-89.58828,b'Brand X',0.0,279.35302,9,0.519,99.024,4,1.000,b'Liquid Time (composition by John Goodsall)',0
6,7,b'SOKSZVC12A8C142004',239673,b'Karelian Isthmus',b'ARV8T9T1187B99F3F4',,b'',,b'Amorphis',0.0,255.03302,9,0.447,175.673,4,0.000,b'Misery Path (From the Privilege of Evil)',0
7,8,b'SORWTIF12A6D4FAA41',124304,b'Nobilt\xc3\xa0 di strada',b'ARJ5BEW1187FB52361',,b'',,b'Inoki',0.0,259.31710,1,0.194,87.999,4,0.954,b'Nuovi Re pt. I I (feat. Tek money - Lady Tam...,0
8,9,b'SOZQSGL12AF72A9145',181162,b'Milking The Sacred Cow',b'AR050VJ1187B9B13A7',,b'',,b'Dead Kennedys',0.0,216.84200,10,0.325,92.897,4,0.879,b'Halloween',1982
9,10,b'SOKRHNY12AB01837DB',337773,b'Novas Vos Trago',b'AR8KUS11187B98C991',,b'',,b'Brigada Victor Jara',0.0,312.99873,4,0.669,86.981,5,0.688,b'Parto em terras distantes',1998


In [4]:
# Dimensions of the loaded table as a (rows, columns) tuple.
df.shape

(10000, 18)

In [5]:
# Column labels available in the loaded table.
df.columns

Index(['SongNumber', 'SongID', 'AlbumID', 'AlbumName', 'ArtistID',
       'ArtistLatitude', 'ArtistLocation', 'ArtistLongitude', 'ArtistName',
       'Danceability', 'Duration', 'KeySignature', 'KeySignatureConfidence',
       'Tempo', 'TimeSignature', 'TimeSignatureConfidence', 'Title', 'Year'],
      dtype='object')

# Reading the MSD Summary HDF5 File 
This is a little different from just reading in the CSV. We read the HDF5 file, then use the associated keys/groups to navigate to the metadata information. We load the metadata as a collection of numpy void objects: a one-dimensional array with 1,000,000 elements, where each element is a numpy void object that holds a sort of "list" of all the attributes of one song, each of which can be accessed by name. I demonstrate a sample call below and also convert the collection of numpy void objects into a regular numpy array object for future use in ML algorithms. 

Before running this, you need to download the MSD summary HDF5 file from here: https://labrosa.ee.columbia.edu/millionsong/pages/getting-dataset

In [8]:
# Open the summary file read-only just long enough to list its top-level keys.
# The context manager closes the handle when the block exits, so this cell
# does not leave an open file behind for a later cell to clean up.
with h5py.File('msd_summary_file.h5', 'r') as file:
    top_level_keys = list(file.keys())  # getting the keys for the hdf5 object

top_level_keys

['analysis', 'metadata', 'musicbrainz']

In [147]:
# Open the summary file read-only. The context manager guarantees the handle
# is released even if one of the reads below fails (previously `h5file` was
# never closed).
with h5py.File('msd_summary_file.h5', 'r') as h5file:
    # see which keys are in the file
    print(list(h5file.keys()))

    # note: each key has a "songs" dataset inside.

    # getting the dataset handle from each group
    metadata = h5file['metadata']['songs']
    musicbrainz = h5file['musicbrainz']['songs']
    analysis = h5file['analysis']['songs']

    # Read each dataset fully into memory as a numpy array (so that they're
    # viewable). `dataset[()]` replaces the `.value` attribute, which was
    # deprecated and then removed in h5py 3.0.
    meta = metadata[()]  # This array contains all the metadata values for each of the million songs
    anal = analysis[()]
    mb = musicbrainz[()]

# NOTE: `metadata`, `musicbrainz` and `analysis` are handles into the now-closed
# file; use the in-memory arrays `meta`, `mb` and `anal` from here on.

# Release the handle that the earlier key-listing cell bound to `file`
# (calling close() on an already-closed h5py file does nothing).
file.close()

['analysis', 'metadata', 'musicbrainz']


In [148]:
# a sample of what each element of the three numpy arrays are. 
print(meta[0])
print(mb[0])
print(anal[0])

(b'', 4069, 0.6498221, 0.39403189, b'ARYZTJS1187B98C555', nan, b'', nan, b'357ff05d-848a-44cf-b608-cb34b5701ae5', b'Faster Pussy cat', 44895, b'', 0, 0, b'Monster Ballads X-Mas', 633681, 0.54289874, b'SOQMMHC12AB0180CB8', b'Silent Night', 7032331)
(0, 2003)
(22050, b'aee9820911781c734e7694c5432990ca', 0., 252.05506, 2.049, 0., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0.777, -4.829, 0, 0.688, 236.635, 87.002, 4, 0.94, b'TRMMMYQ128F932D901')


In [174]:
# The dtype lists every field that can be requested by name from a record.
print(meta.dtype)

# Indexing the structured array by field name returns that field for every
# song as a numpy ndarray; such arrays can be stitched together column-wise
# to get a true array.
name = meta['title']
hot = meta['song_hotttnesss']  # song_hotttnesss for all 1 million songs

print(hot.shape, type(hot), sep='\n')

[('analyzer_version', 'S32'), ('artist_7digitalid', '<i4'), ('artist_familiarity', '<f8'), ('artist_hotttnesss', '<f8'), ('artist_id', 'S32'), ('artist_latitude', '<f8'), ('artist_location', 'S1024'), ('artist_longitude', '<f8'), ('artist_mbid', 'S40'), ('artist_name', 'S1024'), ('artist_playmeid', '<i4'), ('genre', 'S1024'), ('idx_artist_terms', '<i4'), ('idx_similar_artists', '<i4'), ('release', 'S1024'), ('release_7digitalid', '<i4'), ('song_hotttnesss', '<f8'), ('song_id', 'S32'), ('title', 'S1024'), ('track_7digitalid', '<i4')]
(1000000,)
<class 'numpy.ndarray'>


In [170]:
# Place the title and hotttnesss arrays side by side as the two columns of a
# single 2-D array.
# NOTE(review): both columns share one dtype, so the float hotttnesss values
# end up as byte strings (see the '|S1024' dtype in the output) -- confirm
# this is what downstream code expects.
new_df = np.stack((name, hot), axis=1)
new_df

array([[b'Silent Night', b'0.5428987432910862'],
       [b'Tanssi vaan', b'0.2998774882739778'],
       [b'No One Could Ever', b'0.6178709693948196'],
       ...,
       [b'Novemba', b'nan'],
       [b'Faraday', b'nan'],
       [b'Fernweh feat. Sektion Kuchik\xc3\xa4schtli',
        b'0.4879499884328053']], dtype='|S1024')

# Analyzing the MSD Subset and Plotting the Results

In [None]:
# Scatter plot of release year against tempo for the subset.
# Create the figure and axes explicitly and hand the axes to pandas:
# DataFrame.plot.scatter opens its own figure when no `ax` is given, which
# previously left the figure from plt.figure() empty.
fig, ax = plt.subplots()
df.plot.scatter(x = 'Year', y = "Tempo", ax = ax)
ax.set_xlim(1910, 2015)  # keeps rows with Year == 0 out of view
ax.set_title("Year vs Tempo")
ax.set_xlabel("Year")
ax.set_ylabel("Tempo (bpm)")
# plt.savefig('year_vs_tempo.png')
plt.show()

In [None]:
# Distinct years present in the subset, in ascending order, excluding 0.
# Removing 0 by value (rather than deleting whatever happens to be first in
# the set's iteration order) keeps every real year, and sorting gives the
# line plot a well-defined x-axis order.
years = sorted(set(df.Year) - {0})

In [None]:
# Average tempo of the songs in each year, in the same order as `years`.
mean_tempo = [df.loc[df.Year == year, 'Tempo'].mean() for year in years]

In [None]:
# Line plot of the per-year mean tempo, saved to disk and then displayed.
fig, ax = plt.subplots()
ax.plot(years, mean_tempo, color='k', linestyle='-')
# ax.set_xlim(1910, 2015)
ax.set(title="Year vs Mean Tempo", ylabel="Mean Tempo (bpm)")
plt.savefig('year_vs_meantempo.png')
plt.show()

In [None]:
# Histogram of the Year column, leaving out rows whose Year is 0; the figure
# is saved to disk and then displayed.
fig, ax = plt.subplots()
df.loc[df.Year != 0, 'Year'].plot.hist(ax = ax, legend = None)
ax.set_title("Year")
plt.savefig('year_hist.png')
plt.show()