## Importing Modules

In [1]:
import pandas as pd
import h5py
%matplotlib inline

## Getting the Data

In [2]:
import glob
import os

In [3]:
# Root directory of the per-track HDF5 files, relative to this notebook.
file_path = os.path.join(os.path.pardir, "data", "MillionSongSubset", "data")
# Recursively collect every .h5 file under the root. The pattern is built with
# os.path.join so separators stay consistent on every platform, and the result
# is sorted because glob returns matches in file-system-dependent order --
# without sorting, `files[0]` could be a different track on another machine.
files = sorted(glob.glob(os.path.join(file_path, "**", "*.h5"), recursive=True))

In [4]:
# Peek at the first ten matched paths to confirm the glob found the track files.
files[:10]

['..\\data\\MillionSongSubset\\data\\A\\A\\A\\TRAAAAW128F429D538.h5',
 '..\\data\\MillionSongSubset\\data\\A\\A\\A\\TRAAABD128F429CF47.h5',
 '..\\data\\MillionSongSubset\\data\\A\\A\\A\\TRAAADZ128F9348C2E.h5',
 '..\\data\\MillionSongSubset\\data\\A\\A\\A\\TRAAAEF128F4273421.h5',
 '..\\data\\MillionSongSubset\\data\\A\\A\\A\\TRAAAFD128F92F423A.h5',
 '..\\data\\MillionSongSubset\\data\\A\\A\\A\\TRAAAMO128F1481E7F.h5',
 '..\\data\\MillionSongSubset\\data\\A\\A\\A\\TRAAAMQ128F1460CD3.h5',
 '..\\data\\MillionSongSubset\\data\\A\\A\\A\\TRAAAPK128E0786D96.h5',
 '..\\data\\MillionSongSubset\\data\\A\\A\\A\\TRAAARJ128F9320760.h5',
 '..\\data\\MillionSongSubset\\data\\A\\A\\A\\TRAAAVG12903CFA543.h5']

In [5]:
# List the top-level groups stored in the first track file.
with h5py.File(files[0], "r") as h5_file:
    top_level_groups = list(h5_file.keys())
    print(top_level_groups)

['analysis', 'metadata', 'musicbrainz']


In [6]:
# Examine each dataset.
# Open read-only: HDFStore defaults to append mode ("a"), which opens the file
# for writing. mode="r" guarantees that inspecting the raw track file can never
# modify it, and also works when the data directory is not writable.
store = pd.HDFStore(files[0], mode="r")

In [7]:
# Display the store's repr, which lists the tables it contains.
store

<class 'pandas.io.pytables.HDFStore'>
File path: ..\data\MillionSongSubset\data\A\A\A\TRAAAAW128F429D538.h5
/analysis/songs               frame_table [0.0.0] (typ->generic,nrows->1,ncols->31,indexers->[index],dc->[analysis_sample_rate,audio_md5,danceability,duration,end_of_fade_in,energy,idx_bars_confidence,idx_bars_start,idx_beats_confidence,idx_beats_start,idx_sections_confidence,idx_sections_start,idx_segments_confidence,idx_segments_loudness_max,idx_segments_loudness_max_time,idx_segments_loudness_start,idx_segments_pitches,idx_segments_start,idx_segments_timbre,idx_tatums_confidence,idx_tatums_start,key,key_confidence,loudness,mode,mode_confidence,start_of_fade_out,tempo,time_signature,time_signature_confidence,track_id])
/metadata/songs               frame_table [0.0.0] (typ->generic,nrows->1,ncols->20,indexers->[index],dc->[analyzer_version,artist_7digitalid,artist_familiarity,artist_hotttnesss,artist_id,artist_latitude,artist_location,artist_longitude,artist_mbid,artist_name,ar

In [8]:
# Load the per-track analysis table from the open store into a DataFrame.
df = store.select(key="/analysis/songs")

In [9]:
# Summarise the analysis table: column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1 entries, 0 to 0
Data columns (total 31 columns):
analysis_sample_rate              1 non-null int32
audio_md5                         1 non-null object
danceability                      1 non-null float64
duration                          1 non-null float64
end_of_fade_in                    1 non-null float64
energy                            1 non-null float64
idx_bars_confidence               1 non-null int32
idx_bars_start                    1 non-null int32
idx_beats_confidence              1 non-null int32
idx_beats_start                   1 non-null int32
idx_sections_confidence           1 non-null int32
idx_sections_start                1 non-null int32
idx_segments_confidence           1 non-null int32
idx_segments_loudness_max         1 non-null int32
idx_segments_loudness_max_time    1 non-null int32
idx_segments_loudness_start       1 non-null int32
idx_segments_pitches              1 non-null int32
idx_segments_start     

We'll first strip out the audio file's MD5 hash, since that's not required for our application.

In [10]:
# Drop the MD5 hash column. The axis is passed by keyword because the
# positional form (`df.drop('audio_md5', 1)`) was deprecated and later removed
# from pandas; `axis=1` works on both old and new releases.
df = df.drop("audio_md5", axis=1)

In [11]:
# Re-check the schema to confirm `audio_md5` is gone.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1 entries, 0 to 0
Data columns (total 30 columns):
analysis_sample_rate              1 non-null int32
danceability                      1 non-null float64
duration                          1 non-null float64
end_of_fade_in                    1 non-null float64
energy                            1 non-null float64
idx_bars_confidence               1 non-null int32
idx_bars_start                    1 non-null int32
idx_beats_confidence              1 non-null int32
idx_beats_start                   1 non-null int32
idx_sections_confidence           1 non-null int32
idx_sections_start                1 non-null int32
idx_segments_confidence           1 non-null int32
idx_segments_loudness_max         1 non-null int32
idx_segments_loudness_max_time    1 non-null int32
idx_segments_loudness_start       1 non-null int32
idx_segments_pitches              1 non-null int32
idx_segments_start                1 non-null int32
idx_segments_timbre     

## Working on the Summary File

We'll now also look at the summary HDF5 file, `subset_msd_summary_file.h5`.

In [12]:
# Release the handle on the previously opened file before re-binding `store`;
# otherwise that HDF5 file stays open for the life of the kernel.
store.close()

# Build the path with os.path.join (as done for the per-track files) so it is
# portable, and open read-only so the summary file cannot be modified.
summary_path = os.path.join(
    os.path.pardir,
    "data",
    "MillionSongSubset",
    "AdditionalFiles",
    "subset_msd_summary_file.h5",
)
store = pd.HDFStore(summary_path, mode="r")
df = store.select("/analysis/songs")
# Show only the first rows rather than rendering the whole frame.
df.head(10)

Unnamed: 0,analysis_sample_rate,audio_md5,danceability,duration,end_of_fade_in,energy,idx_bars_confidence,idx_bars_start,idx_beats_confidence,idx_beats_start,...,key,key_confidence,loudness,mode,mode_confidence,start_of_fade_out,tempo,time_signature,time_signature_confidence,track_id
0,22050,a600d65cf157a306be60f26ecbf218f4,0,280.21506,0.238,0,0,0,0,0,...,5,0.555,-3.306,1,0.500,275.528,173.205,5,0.120,TRACCVZ128F4291A8A
1,22050,c64d75b0588e5ab552ee94548b50a4fa,0,191.68608,0.000,0,0,0,0,0,...,0,0.501,-10.764,1,0.710,184.128,150.955,4,0.600,TRACCMH128F428E4CD
2,22050,0cadd310865701bb93ced1cd78e8910a,0,222.92853,0.000,0,0,0,0,0,...,1,0.329,-9.035,1,0.407,216.300,93.056,4,1.000,TRACCSW128F148C7C3
3,22050,14be4fc1170152c445b3be7b8d18dfec,0,278.38649,0.496,0,0,0,0,0,...,7,0.313,-23.095,1,0.387,278.386,127.113,1,0.446,TRACCXJ128F428F0CF
4,22050,1def5d8298e8cb29a188a5a7c0e9429a,0,89.15546,4.847,0,0,0,0,0,...,2,0.000,-20.359,1,0.000,79.203,90.660,3,0.524,TRACCVS12903D014F8
5,22050,79ed013fa65b4fc3424dd1ef0ab76dd5,0,255.73832,0.156,0,0,0,0,0,...,9,0.556,-5.724,1,0.455,252.012,101.167,1,1.000,TRACCKS128F42B77AE
6,22050,6f8cc33a8ed925e2077a876de3a11977,0,239.59465,0.403,0,0,0,0,0,...,2,0.167,-10.653,1,0.372,231.805,173.841,3,0.302,TRACCQM12903CACC1E
7,22050,ee8450e2e32b8adc9623d95ba6633b2f,0,156.96934,0.322,0,0,0,0,0,...,4,0.772,-20.816,0,0.524,142.286,127.547,1,0.168,TRACCUS128F92E1FEB
8,22050,60d56ccbced3db74f86bddc21bb4c92f,0,197.19791,0.276,0,0,0,0,0,...,4,0.665,-29.750,1,0.582,187.582,127.782,4,0.226,TRACCJA128F149A144
9,22050,19a1d23af6018cdfb32e8e751932d662,0,262.58240,2.328,0,0,0,0,0,...,9,0.317,-5.644,1,0.357,257.155,90.013,5,0.386,TRACCGB12903CD1B90


In [13]:
# Number of per-track .h5 files matched by the glob; compare this with the
# row count of the summary table shown by `df.info()` below.
len(files)

10000

In [14]:
# Schema and row count of the summary file's analysis table.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 31 columns):
analysis_sample_rate              10000 non-null int32
audio_md5                         10000 non-null object
danceability                      10000 non-null float64
duration                          10000 non-null float64
end_of_fade_in                    10000 non-null float64
energy                            10000 non-null float64
idx_bars_confidence               10000 non-null int32
idx_bars_start                    10000 non-null int32
idx_beats_confidence              10000 non-null int32
idx_beats_start                   10000 non-null int32
idx_sections_confidence           10000 non-null int32
idx_sections_start                10000 non-null int32
idx_segments_confidence           10000 non-null int32
idx_segments_loudness_max         10000 non-null int32
idx_segments_loudness_max_time    10000 non-null int32
idx_segments_loudness_start       10000 non-null int32
idx

The summary table has 10,000 rows, the same as the number of individual H5 files found above. Thus, the individual H5 files are all collected into one H5 file.

## Getting the metadata

In [15]:
# Load the song metadata table from the open store into a DataFrame.
df = store.select(key="/metadata/songs")

In [16]:
# Preview the first rows only; rendering the whole metadata frame is
# unnecessary for a quick look and bloats the saved notebook.
df.head(10)

Unnamed: 0,analyzer_version,artist_7digitalid,artist_familiarity,artist_hotttnesss,artist_id,artist_latitude,artist_location,artist_longitude,artist_mbid,artist_name,artist_playmeid,genre,idx_artist_terms,idx_similar_artists,release,release_7digitalid,song_hotttnesss,song_id,title,track_7digitalid
0,,29785,0.780462,0.574275,ARMQHX71187B9890D3,,"Atlanta, GA",,bc5e2ad6-0a4a-4d90-b911-e9a7e6861727,Mastodon,-1,,0,0,Call of the Mastodon,223563,0.597641,SOVLGJY12A8C13FBED,Deep Sea Creature,2442524
1,,167867,0.561590,0.420570,AR2PT4M1187FB55B1A,,,,d54ea4a6-0b9c-4e47-bed0-289ae9ff4037,Los Chichos,1880,,0,0,Adelante,221677,,SOGDQZK12A8C13F37C,No Quieras Marcharte,2423472
2,,7725,0.687687,0.406686,ARDI88R1187B98DAB2,,,,fd87374e-ffde-4d36-89a8-8a073f795666,Foxy Brown,-1,,0,0,Ill Na Na,47304,0.588922,SODMVJR12A6D4F985D,If I...,507029
3,,2799,0.391741,0.291265,ARUKJUP12086C14589,,,,8a5f2736-bcde-4a2e-8d50-72631d66a7ef,Ramón Vargas;Vladimir Jurowski,20111,,0,0,Massenet: Werther,295123,,SOIWBDR12A8C13A4AC,Werther - Lyric Drama in four Acts/Act I/Alors...,3343102
4,,74269,0.593416,0.395710,ARZEWUR1187FB53DC8,50.45663,"Belgica -- Namur, Namur/Ghent, East Flanders",4.87137,0be59867-0da4-4e45-9b64-728cdf25487c,Enthroned,55656,,0,0,Pentagrammaton,785362,,SOHCCIA12AC907577F,Ad Te Clamamus Exsvles Mortva Liberi,8688607
5,,49956,0.815923,0.555138,ARUZRFN1187B98AC05,,"Cleveland, OH",,2fa45bbb-0efb-4950-9d40-94bf23cbfec1,Bone Thugs-N-Harmony,5412,,0,0,BTNHRESURRECTION,310248,0.474055,SOBOAQC12A8C13E3E9,Murder One,3510188
6,,25694,0.543937,0.430300,ARHBWOZ1187FB3FD53,,ITALY,,e6ff2839-5ccb-451b-b07e-f485bc143118,Utopia,6721,,0,0,Utopia,576541,,SOKVLHX12AB0187B39,On My Feet Again,6389516
7,,588,0.929030,0.750427,ARR3ONV1187B9A2F59,54.31407,UK,-2.23001,fd857293-5ab8-40de-b29e-55a69d4e4d0f,Muse,-1,,0,0,Plug In Baby,521383,0.652836,SOMMSMW12A8C13FCCC,Bedroom Acoustics,5764770
8,,11301,0.645192,0.471224,AR3THYK1187B999F1F,,,,286ec4c2-b5ca-4f85-b331-280a6d73dd14,João Gilberto,-1,,0,0,Joao Voz E Violato,55934,0.631601,SODPNJR12A6D4FA52D,Segredo,581259
9,,19967,0.495819,0.342765,ARFELOH1187B991F95,,,,65b785d9-499f-48e6-9063-3a1fd1bd488d,Niraj Chag,-1,,0,0,The Lost Souls Bonus EP,722229,,SOFFLLP12AB018ED52,Sajana (Ft Faheem Mazhar),8005714


In [17]:
# Schema of the metadata table; the non-null counts show which columns
# have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 20 columns):
analyzer_version       10000 non-null object
artist_7digitalid      10000 non-null int32
artist_familiarity     9996 non-null float64
artist_hotttnesss      10000 non-null float64
artist_id              10000 non-null object
artist_latitude        3742 non-null float64
artist_location        10000 non-null object
artist_longitude       3742 non-null float64
artist_mbid            10000 non-null object
artist_name            10000 non-null object
artist_playmeid        10000 non-null int32
genre                  10000 non-null object
idx_artist_terms       10000 non-null int32
idx_similar_artists    10000 non-null int32
release                10000 non-null object
release_7digitalid     10000 non-null int32
song_hotttnesss        5648 non-null float64
song_id                10000 non-null object
title                  10000 non-null object
track_7digitalid       10000 non-null int

**Single data object**

In [18]:
# Show the first six metadata rows.
df.head(6)

Unnamed: 0,analyzer_version,artist_7digitalid,artist_familiarity,artist_hotttnesss,artist_id,artist_latitude,artist_location,artist_longitude,artist_mbid,artist_name,artist_playmeid,genre,idx_artist_terms,idx_similar_artists,release,release_7digitalid,song_hotttnesss,song_id,title,track_7digitalid
0,,29785,0.780462,0.574275,ARMQHX71187B9890D3,,"Atlanta, GA",,bc5e2ad6-0a4a-4d90-b911-e9a7e6861727,Mastodon,-1,,0,0,Call of the Mastodon,223563,0.597641,SOVLGJY12A8C13FBED,Deep Sea Creature,2442524
1,,167867,0.56159,0.42057,AR2PT4M1187FB55B1A,,,,d54ea4a6-0b9c-4e47-bed0-289ae9ff4037,Los Chichos,1880,,0,0,Adelante,221677,,SOGDQZK12A8C13F37C,No Quieras Marcharte,2423472
2,,7725,0.687687,0.406686,ARDI88R1187B98DAB2,,,,fd87374e-ffde-4d36-89a8-8a073f795666,Foxy Brown,-1,,0,0,Ill Na Na,47304,0.588922,SODMVJR12A6D4F985D,If I...,507029
3,,2799,0.391741,0.291265,ARUKJUP12086C14589,,,,8a5f2736-bcde-4a2e-8d50-72631d66a7ef,Ramón Vargas;Vladimir Jurowski,20111,,0,0,Massenet: Werther,295123,,SOIWBDR12A8C13A4AC,Werther - Lyric Drama in four Acts/Act I/Alors...,3343102
4,,74269,0.593416,0.39571,ARZEWUR1187FB53DC8,50.45663,"Belgica -- Namur, Namur/Ghent, East Flanders",4.87137,0be59867-0da4-4e45-9b64-728cdf25487c,Enthroned,55656,,0,0,Pentagrammaton,785362,,SOHCCIA12AC907577F,Ad Te Clamamus Exsvles Mortva Liberi,8688607
5,,49956,0.815923,0.555138,ARUZRFN1187B98AC05,,"Cleveland, OH",,2fa45bbb-0efb-4950-9d40-94bf23cbfec1,Bone Thugs-N-Harmony,5412,,0,0,BTNHRESURRECTION,310248,0.474055,SOBOAQC12A8C13E3E9,Murder One,3510188
