In [68]:
import os
import glob
import h5py
import numpy as np
import csv

dataset_dir = "../../MillionSongSubset"
csv_path = "../../MillionSongSubsetCSV.csv"

# Derive the CSV header from one reference track: the field names of its
# 'metadata/songs' record followed by those of its 'analysis/songs' record.
schema_file = "../../MillionSongSubset/A/A/A/TRAAAAW128F429D538.h5"
with h5py.File(schema_file, 'r') as schema:
    data_columns = [
        field_name
        for group_name in ('metadata', 'analysis')
        # Each 'songs' table is read at row 0; only its dtype is needed here.
        for field_name in schema[group_name]['songs'][0].dtype.fields
    ]
print('DATA COLUMNS', data_columns)

def _to_csv_value(value):
    """Return *value* in a form that csv.writer renders cleanly.

    String fields read through h5py arrive as bytes objects. csv.writer
    stringifies non-string values with str(), which would put the literal
    text b'...' into the file, so bytes are decoded to text here. Every
    other value is returned unchanged.
    """
    if isinstance(value, bytes):
        # errors='replace' keeps one malformed string from aborting the export.
        return value.decode('utf-8', errors='replace')
    return value


def _iter_h5_files(root_dir, depth=3):
    """Yield the .h5 files located exactly *depth* directory levels below *root_dir*.

    Directories and files are visited in sorted order so the resulting CSV
    has the same row order on every run. Non-directory entries at the
    intermediate levels are ignored.
    """
    if depth == 0:
        yield from sorted(glob.glob(os.path.join(root_dir, "*.h5")))
        return
    for entry in sorted(os.listdir(root_dir)):
        child = os.path.join(root_dir, entry)
        if os.path.isdir(child):
            yield from _iter_h5_files(child, depth - 1)


def _read_song_row(h5_path, columns):
    """Read the first song record of *h5_path* as a list aligned with *columns*.

    Values are looked up by field name rather than by position, so a file
    whose field order or field set differs from the header cannot shift
    values into the wrong column. Columns the file does not provide are
    returned as None, which csv.writer writes as an empty cell.
    """
    values = {}
    with h5py.File(h5_path, 'r') as f:
        # Field names are assumed to be unique across the two groups; if a
        # name occurred in both, the 'analysis' value would win.
        for group_name in ('metadata', 'analysis'):
            record = f[group_name]['songs'][0]
            for field in record.dtype.names:
                values[field] = _to_csv_value(record[field])
    return [values.get(column) for column in columns]


# Write one CSV row per track. The encoding is explicit because the decoded
# text fields may contain non-ASCII characters and the platform default
# encoding is not guaranteed to handle them.
with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')

    # Header row with the column names
    writer.writerow(data_columns)

    for h5_file in _iter_h5_files(dataset_dir):
        writer.writerow(_read_song_row(h5_file, data_columns))


print("Finished!")

DATA COLUMNS ['analyzer_version', 'artist_7digitalid', 'artist_familiarity', 'artist_hotttnesss', 'artist_id', 'artist_latitude', 'artist_location', 'artist_longitude', 'artist_mbid', 'artist_name', 'artist_playmeid', 'genre', 'idx_artist_terms', 'idx_similar_artists', 'release', 'release_7digitalid', 'song_hotttnesss', 'song_id', 'title', 'track_7digitalid', 'analysis_sample_rate', 'audio_md5', 'danceability', 'duration', 'end_of_fade_in', 'energy', 'idx_bars_confidence', 'idx_bars_start', 'idx_beats_confidence', 'idx_beats_start', 'idx_sections_confidence', 'idx_sections_start', 'idx_segments_confidence', 'idx_segments_loudness_max', 'idx_segments_loudness_max_time', 'idx_segments_loudness_start', 'idx_segments_pitches', 'idx_segments_start', 'idx_segments_timbre', 'idx_tatums_confidence', 'idx_tatums_start', 'key', 'key_confidence', 'loudness', 'mode', 'mode_confidence', 'start_of_fade_out', 'tempo', 'time_signature', 'time_signature_confidence', 'track_id']
Finished!


In [73]:
# Inspect the 'similar_artists' dataset stored in the reference track's
# 'metadata' group.
with h5py.File(schema_file, 'r') as schema:
    # Materialise the dataset while the file is still open.
    similar_artists = list(schema['metadata']['similar_artists'])
print(similar_artists)

[b'ARV4KO21187FB38008', b'ARWHM281187FB3D381', b'ARJGOG11187B98D89F', b'AR9ODB41187FB459B2', b'ARXM6VQ1187FB5B1E0', b'ARNWZ1N1187B9B71BA', b'ARDWYZZ11F4C8413FA', b'ARTP3H51187B98FB75', b'ARWCDXN12454A4D1E8', b'ARJ54S61187B9ACD39', b'AR5PF241187B989C1D', b'ARR7MLL1187B99B636', b'ARLMHFV1187B9A3833', b'ARPRERY1187B99E2DC', b'AR34BCQ1187B9A68E4', b'ARFWBUC11F4C8413DA', b'ARPWGMN1187FB560E3', b'ARVCIVW12454A4D1E7', b'ARG89HY1187FB3CA15', b'AR9IGU51187FB40D6B', b'ARNNOYR11F4C845127', b'ARZMFNT11F4C8413DD', b'ARPR9W71187FB3723A', b'AR5VBGP1187B98EB43', b'ARFHDOI1187FB57230', b'ARBSQPF11F4C8413E0', b'AROYGID11F4C8413DB', b'ARDXUGZ11F4C84452F', b'ARMW4I01187B98AEF8', b'AR7AYQG1187B994B3F', b'ARHVZEM11F4C841FF9', b'ARP9H0U1187FB3FEA7', b'ARVSIGU11F4C8413E6', b'AROWKNS1187FB59ED5', b'ARUSTLW11F4C8413DE', b'ARSKPDX11F4C83D2A9', b'ARB4D891187B9954F7', b'ARRIWD31187B9A9B4A', b'ARNAAQH11F4C8413E1', b'ARVRVYO11F4C8413DF', b'ARNIZ5P1187B989AF5', b'AR6YR7H1187FB40D23', b'ARVAOCN12454A4CD5D', b'ARQN3MO1