##Exploration of Million Song Subset and generation of useful data frames

In [1]:
import os
import glob
import hdf5_getters
import time
import pickle

import pandas as pd
import numpy as np
from scipy import stats

In [2]:
def pickle_stuff(filename, data):
    ''' save file '''
    with open(filename, 'w') as picklefile:
        pickle.dump(data, picklefile)

def unpickle(filename):
    ''' open file '''
    with open(filename, 'r') as picklefile:
        old_data = pickle.load(picklefile)
    return old_data

###Check layout of summary hdf5 file

In [3]:
h5 = hdf5_getters.open_h5_file_read('./MillionSongSubset/AdditionalFiles/subset_msd_summary_file.h5')

In [8]:
h5.root

/ (RootGroup) 'H5 Song File'
  children := ['musicbrainz' (Group), 'analysis' (Group), 'metadata' (Group)]

In [9]:
h5.root.musicbrainz

/musicbrainz (Group) 'data about the song coming from MusicBrainz'
  children := ['songs' (Table)]

In [10]:
h5.root.musicbrainz.songs

/musicbrainz/songs (Table(10000,), shuffle, zlib(1)) 'table of data coming from MusicBrainz'
  description := {
  "idx_artist_mbtags": Int32Col(shape=(), dflt=0, pos=0),
  "year": Int32Col(shape=(), dflt=0, pos=1)}
  byteorder := 'little'
  chunkshape := (1024,)

In [11]:
h5.root.metadata

/metadata (Group) 'metadata about the song'
  children := ['songs' (Table)]

In [12]:
h5.root.metadata.songs

/metadata/songs (Table(10000,), shuffle, zlib(1)) 'table of metadata for one song'
  description := {
  "analyzer_version": StringCol(itemsize=32, shape=(), dflt='', pos=0),
  "artist_7digitalid": Int32Col(shape=(), dflt=0, pos=1),
  "artist_familiarity": Float64Col(shape=(), dflt=0.0, pos=2),
  "artist_hotttnesss": Float64Col(shape=(), dflt=0.0, pos=3),
  "artist_id": StringCol(itemsize=32, shape=(), dflt='', pos=4),
  "artist_latitude": Float64Col(shape=(), dflt=0.0, pos=5),
  "artist_location": StringCol(itemsize=1024, shape=(), dflt='', pos=6),
  "artist_longitude": Float64Col(shape=(), dflt=0.0, pos=7),
  "artist_mbid": StringCol(itemsize=40, shape=(), dflt='', pos=8),
  "artist_name": StringCol(itemsize=1024, shape=(), dflt='', pos=9),
  "artist_playmeid": Int32Col(shape=(), dflt=0, pos=10),
  "genre": StringCol(itemsize=1024, shape=(), dflt='', pos=11),
  "idx_artist_terms": Int32Col(shape=(), dflt=0, pos=12),
  "idx_similar_artists": Int32Col(shape=(), dflt=0, pos=13),
  "relea

In [13]:
h5.root.analysis

/analysis (Group) 'Echo Nest analysis of the song'
  children := ['songs' (Table)]

In [14]:
h5.root.analysis.songs

/analysis/songs (Table(10000,), shuffle, zlib(1)) 'table of Echo Nest analysis for one song'
  description := {
  "analysis_sample_rate": Int32Col(shape=(), dflt=0, pos=0),
  "audio_md5": StringCol(itemsize=32, shape=(), dflt='', pos=1),
  "danceability": Float64Col(shape=(), dflt=0.0, pos=2),
  "duration": Float64Col(shape=(), dflt=0.0, pos=3),
  "end_of_fade_in": Float64Col(shape=(), dflt=0.0, pos=4),
  "energy": Float64Col(shape=(), dflt=0.0, pos=5),
  "idx_bars_confidence": Int32Col(shape=(), dflt=0, pos=6),
  "idx_bars_start": Int32Col(shape=(), dflt=0, pos=7),
  "idx_beats_confidence": Int32Col(shape=(), dflt=0, pos=8),
  "idx_beats_start": Int32Col(shape=(), dflt=0, pos=9),
  "idx_sections_confidence": Int32Col(shape=(), dflt=0, pos=10),
  "idx_sections_start": Int32Col(shape=(), dflt=0, pos=11),
  "idx_segments_confidence": Int32Col(shape=(), dflt=0, pos=12),
  "idx_segments_loudness_max": Int32Col(shape=(), dflt=0, pos=13),
  "idx_segments_loudness_max_time": Int32Col(shape=()

In [15]:
#seg_loud_max = [(row['track_id'], row['idx_sections_start']) for row in h5.root.analysis.songs.where('(tempo>238) & (tempo<240)')]
#print seg_loud_max

In [7]:
tids = [(row['track_id'], row['tempo'], row['loudness']) for row in h5.root.analysis.songs.where('tempo>240')]
#'(tempo>245) & (tempo<250)')]
tids

[('TRAIHYT128F9350C8D', 240.263, -6.566),
 ('TRAHHSV128F42374E3', 244.366, -12.785),
 ('TRAHGSZ128F930C9A6', 243.195, -23.366),
 ('TRAXASE128F9325369', 240.064, -8.298),
 ('TRAXKGF128F92D68E9', 241.52, -15.734),
 ('TRARFWJ128F9339D70', 243.981, -13.538),
 ('TRAAUIH128F4254C9D', 241.877, -4.779),
 ('TRAFJKG128F9332800', 246.5, -11.906),
 ('TRAFYGI12903CE9B9A', 242.393, -9.606),
 ('TRADDCV128F423BAB0', 240.761, -5.575),
 ('TRADZCV128F4294DFD', 240.314, -15.655),
 ('TRAPZAW128F930FEEF', 240.983, -13.915),
 ('TRAWINR128F93378C6', 241.242, -20.243),
 ('TRAZFZS128F4272A31', 246.593, -22.28),
 ('TRAGYJM128EF3466D2', 253.357, -3.868),
 ('TRAKISP128F4284A37', 243.994, -11.315),
 ('TRAKSPS128F933B3D9', 241.892, -8.328),
 ('TRBCHFG128F9353D72', 240.5, -12.369),
 ('TRBHZRN128F92EFF91', 258.677, -14.165),
 ('TRBBQFV128F4252EFF', 241.818, -15.967),
 ('TRBFXLH12903CBAFAD', 248.079, -16.386),
 ('TRBFATS128F9319AEB', 242.7, -7.117),
 ('TRBEFQY128F932EE3F', 244.268, -23.465),
 ('TRBGYHC12903D0626A', 262

###Get title and other metadata (hotttness, etc) features

In [17]:
tids_titles = []

def get_items(t_id, titles, metas, sh5):
    title = hdf5_getters.get_title(sh5)
    artist = hdf5_getters.get_artist_name(sh5)
    artist_id = hdf5_getters.get_artist_id(sh5)
    song_id = hdf5_getters.get_song_id(sh5)
    titles.append((t_id, title, artist_id, artist, song_id))
    song_hottt = hdf5_getters.get_song_hotttnesss(sh5)
    artist_hottt = hdf5_getters.get_artist_hotttnesss(sh5)
    artist_fam = hdf5_getters.get_artist_familiarity(sh5)
    
    metavals = [t_id, song_hottt, artist_hottt, artist_fam]
    for m in metavals[1:]:
        if type(m) != float:
            m = 0.0
    metas.append(metavals)
                    
def get_matching_titles(track_ids=None, basedir='./MillionSongSubset/data/',ext='.h5', verbose=0):
    t1 = time.time()
    titles = []
    metas = []
    count = 0
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root,'*'+ext))
        for f in files:
            if track_ids and len(titles) == len(track_ids):
                break
            sh5 = hdf5_getters.open_h5_file_read(f)
            if track_ids:
                for t_id in track_ids:
                    if hdf5_getters.get_track_id(sh5) == t_id:
                        get_items(t_id, titles, metas, sh5)
            else:
                t_id = hdf5_getters.get_track_id(sh5)
                get_items(t_id, titles, metas, sh5)
            if verbose > 1:
                if count >= 1000:
                    print "1000!"
                    print "elapsed time: ", time.time()-t1
                    print titles[-1]
                    count = 0
                else:
                    count += 1
            sh5.close()
    t2 = time.time()
    if verbose > 0:
        print "total time: ", (t2-t1)
    return titles, metas

In [18]:
tids_only = [t[0] for t in tids]
test_titles, test_metas = get_matching_titles(track_ids=tids_only, verbose=1)
print test_titles
print test_metas

total time:  69.8689739704
[('TRBGYHC12903D0626A', 'Endless Light', 'AR0JEOI1187B98B918', 'Erik Berglund', 'SODWYEY12AC468C30F'), ('TRBHZRN128F92EFF91', 'Kapitel 4', 'ARIEIJW1187B98F6DD', 'Oliver Kalkofe', 'SOQFMJN12AB0181D3D'), ('TRBFXLH12903CBAFAD', "Two of a Kind_ Workin' on a Full House", 'ARBMHBC11F4C8403A2', 'Hushabye Baby', 'SOOMCCM12AB01896EA'), ('TRAZFZS128F4272A31', 'Severe Severing', 'ARSL3N21187B98DFC5', 'Klaus Badelt', 'SOABYBF12A8C138A09'), ('TRAGYJM128EF3466D2', 'Been There All The Time', 'AR9YWMS1187FB43A34', 'Dinosaur Jr.', 'SOZRIFJ12A67ADA4BE'), ('TRAFJKG128F9332800', 'Ego is the drug/3am', 'ARCRSUM1187FB52AB0', 'The Frequency', 'SONNXDI12AB0186816')]
[['TRBGYHC12903D0626A', 0.0, 0.33619115568493391, 0.37703408512839853], ['TRBHZRN128F92EFF91', 0.21508031850922793, 0.40619874783410698, 0.45248343829809812], ['TRBFXLH12903CBAFAD', nan, 0.36551400326265299, 0.35342955022197026], ['TRAZFZS128F4272A31', 0.32773668317784083, 0.46934037425924607, 0.68800334883697312], ['TRA

In [19]:
titles, metas = get_matching_titles(verbose=2)
#print titles[0]
#print metas

1000!
elapsed time:  14.493544817
('TRBBLTV12903CC3C00', 'Marcel', 'ARW45YR1187FB3DE95', 'Pierre Perret', 'SOJKSPB12A67AD93AE')
1000!
elapsed time:  28.5904939175
('TRBDFWV128F93027EE', 'Red is the Rose', 'AR5J8N51187FB54ED9', 'Joe Heaney', 'SOSLPVV12A8C1351F1')
1000!
elapsed time:  43.0146207809
('TRAZGIF128F934522F', '3 Deuces', 'ARGEKDX1187FB3A5BF', 'Marcus Miller', 'SOTOZVC12AB01870D1')
1000!
elapsed time:  57.4039077759
('TRAWLVX128F934E63C', 'sample processing failed (part one: "why don\'t you fuckin\' remember anything ?")', 'ARA8X3M1187B9A13CA', 'Aphasia', 'SOTOVKU12AB018AC33')
1000!
elapsed time:  71.7407538891
('TRATEMP128E0788535', 'Abscondence', 'ARMPIH61187FB44227', 'Sun Yan-Zi', 'SOKPWFH12A6702056E')
1000!
elapsed time:  86.4123408794
('TRABGWT128F9334E43', 'Ven Morena', 'ARRVUWA1187B9A0FE2', 'Orquesta Arag\xc3\xb3n', 'SOACHGQ12AB018590E')
1000!
elapsed time:  101.224594831
('TRARRWA128F42A0195', 'Martha Served', 'ARJGW911187FB586CA', 'I Hate Sally', 'SODJYEC12A8C13D757')

###Generate useful data frames
for title and metadata features

In [20]:
title_heads = ["track_id", "title", "artist_id", "artist", "song_id"]
titledf = pd.DataFrame(titles, columns=title_heads)
#titledf.index.name = "index"
titledf.head()

Unnamed: 0,track_id,title,artist_id,artist,song_id
0,TRBGPHG12903CE6CC3,The Law Gonna Step On You (1931),ARFXRHR1187B98FF09,Bo Carter,SORUUEV12A58A7B9FA
1,TRBGPYK128F42796E1,Des Vôtres,ARQDTOS12086C11443,Fredericks_ Goldman_ Jones,SOEZDOH12A8AE4787A
2,TRBGPJP128E078ED20,Crazy,AR12F2S1187FB56EEF,Aerosmith,SOOOWIC12A6701C7E5
3,TRBGPXH128F428C912,Chiove,AR7LIU31187B98EF11,Rita Chiarelli,SOLSWBA12A8C141B9A
4,TRBGPSV12903CA9C25,Rude Bwoy Love (feat. Dj Fly_ Dj Traxx_ T.Will...,ARPGCHN1187B9A2831,Nicky B_ Naëlle,SOHNJQL12AB018CC5C


In [24]:
titledf[titledf["song_id"] == "SOEBCBI12AF72A154F"]

Unnamed: 0,track_id,title,artist_id,artist,song_id
5141,TRALZBV128F4250E60,The Captain,AR7DBMA1187FB516B1,The Knife,SOEBCBI12AF72A154F


In [42]:
meta_heads = ["track_id", "song_hottt", "artist_hottt", "artist_fam"]
metadf = pd.DataFrame(metas, columns=meta_heads)
metadf.index.name = "index"
metadf.head()
#metadf2 = metadf.set_index("track_id")
#metadf2.head()

Unnamed: 0_level_0,track_id,song_hottt,artist_hottt,artist_fam
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,TRBGPHG12903CE6CC3,0.252446,0.35747,0.469672
1,TRBGPYK128F42796E1,,0.305101,0.38272
2,TRBGPJP128E078ED20,,0.610731,0.872537
3,TRBGPXH128F428C912,,0.28805,0.404823
4,TRBGPSV12903CA9C25,,0.4127,0.595409


In [5]:
metadf = unpickle("./pkls/metas_df.pkl")

In [17]:
print len(metadf[metadf["song_hottt"] <=0.4].index)#head()

3118


In [16]:
print len(metadf[metadf["artist_hottt"] <=0.4].index)#head()

5834


In [15]:
print len(metadf[metadf["artist_fam"] <=0.4].index)#head()

1257


In [11]:
len(metadf.index)

10000

###Save titles and metas data frames

In [44]:
pickle_stuff("titles_df3.pkl", titledf)

In [45]:
pickle_stuff("metas_df3.pkl", metadf)

In [46]:
with open("titles_csv3.csv", 'w') as twf:
    twf.write(titledf.to_csv())

In [47]:
with open("metas_csv3.csv", 'w') as mwf:
    mwf.write(metadf.to_csv())

###Now for the actual track data/features

Check attributes of example hdf5 file

In [3]:
h52 = hdf5_getters.open_h5_file_read('./MillionSongSubset/data/A/F/J/TRAFJKG128F9332800.h5')#TRAGYJM128EF3466D2.h5')

In [4]:
h52.root.analysis

/analysis (Group) 'Echo Nest analysis of the song'
  children := ['beats_start' (EArray), 'segments_pitches' (EArray), 'segments_confidence' (EArray), 'sections_start' (EArray), 'tatums_confidence' (EArray), 'segments_timbre' (EArray), 'segments_loudness_start' (EArray), 'sections_confidence' (EArray), 'bars_confidence' (EArray), 'segments_loudness_max_time' (EArray), 'bars_start' (EArray), 'tatums_start' (EArray), 'beats_confidence' (EArray), 'segments_start' (EArray), 'segments_loudness_max' (EArray), 'songs' (Table)]

Check what the rhythm attributes look like

In [7]:
print hdf5_getters.get_beats_start(h52).shape

(4291,)


In [9]:
print hdf5_getters.get_tatums_start(h52).shape

(4291,)


In [26]:
def get_test_features(track_ids=None, basedir='./MillionSongSubset/data/', ext='.h5', verbose=0):
    t1 = time.time()
    count = 0
    features = []
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root,'*'+ext))
        for f in files:
            if track_ids and len(features) >= len(track_ids):
                break
            sh5 = hdf5_getters.open_h5_file_read(f)
            if track_ids:
                for t_id in track_ids:
                    if hdf5_getters.get_track_id(sh5) == t_id:
                        duration = hdf5_getters.get_duration(sh5)
                        tatums = float(hdf5_getters.get_tatums_start(sh5).shape[0])
                        #print tatums, tatums/duration
                        beats = float(hdf5_getters.get_beats_start(sh5).shape[0])
                        #print beats, beats/duration
                        segments = float(hdf5_getters.get_segments_start(sh5).shape[0])
                        #print segments, segmprint sections, sections/durationents/duration
                        sections = float(hdf5_getters.get_sections_start(sh5).shape[0])
                        print t_id
                        print tatums/beats
                        print beats/segments
                        print segments/sections
                        print sections, sections/duration
            sh5.close()
    if verbose > 0:
        print "total time: ", (time.time()-t1)

In [29]:
tids_only1 = [t[0] for t in tids][:2]
#print tids_only1
test_features = get_test_features(track_ids=tids_only1, verbose=1)
#print test_features[0]

TRAHHSV128F42374E3
1.0
1.425
52.3076923077
13.0 0.0529647896597
TRAIHYT128F9350C8D
2.0
0.650406504065
118.08
25.0 0.0520890522772
total time:  82.6958019733


#### Extract desired features from each track file

In [37]:
def get_pitch_features(sh5):
    seg_pitches = hdf5_getters.get_segments_pitches(sh5)
    pitch_means = np.mean(seg_pitches, axis=0).tolist()
    pitch_medians = np.median(seg_pitches, axis=0).tolist()
    pitch_modes = stats.mode(seg_pitches)[0].tolist()[0]
    pitch_features = [pitch_means, pitch_medians, pitch_modes]
    for p in pitch_features:
        if len(p) != 12:
            p = [0,0,0,0,0,0,0,0,0,0,0,0]
    return pitch_means, pitch_medians, pitch_modes

def get_timbre_features(sh5):
    seg_timbres = hdf5_getters.get_segments_timbre(sh5)
    timbre_means = np.mean(seg_timbres, axis=0).tolist()
    timbre_medians = np.median(seg_timbres, axis=0).tolist()
    timbre_modes = stats.mode(seg_timbres)[0].tolist()[0]
    timbre_features = [timbre_means, timbre_medians, timbre_modes]
    for t in timbre_features:
        if len(t) != 12:
            t = [0,0,0,0,0,0,0,0,0,0,0,0]
    return timbre_means, timbre_medians, timbre_modes

def get_loudness_features(sh5):
    loud_maxs = hdf5_getters.get_segments_loudness_max(sh5)
    loud_mean = np.mean(loud_maxs)
    loud_median = np.median(loud_maxs)
    loud_mode = stats.mode(loud_maxs)[0][0]#.tolist()
    return loud_mean, loud_median, loud_mode

def get_density_features(sh5):
    duration = hdf5_getters.get_duration(sh5)
    tatums = float(hdf5_getters.get_tatums_start(sh5).shape[0])
    beats = float(hdf5_getters.get_beats_start(sh5).shape[0])
    segments = float(hdf5_getters.get_segments_start(sh5).shape[0])
    sections = float(hdf5_getters.get_sections_start(sh5).shape[0])
    if beats > 0:
        tatum_density = tatums/beats
    else:
        tatum_density = 0.0
    if segments > 0:
        beat_density = beats/segments
    else:
        beat_density = 0.0
    if sections > 0:
        segment_density = segments/sections
    else:
        segment_density = 0.0
    if duration > 0:
        section_density = sections/duration
    else:
        section_density = 0.0
    return tatum_density, beat_density, segment_density, section_density
    
def get_all_features(t_id, features, sh5):
    pitch_means, pitch_medians, pitch_modes = get_pitch_features(sh5)
    pitches = pitch_means + pitch_medians + pitch_modes
    loud_mean, loud_median, loud_mode = get_loudness_features(sh5)
    louds = [loud_mean, loud_median, loud_mode]
    timbre_means, timbre_medians, timbre_modes = get_timbre_features(sh5)
    timbres = timbre_means + timbre_medians + timbre_modes
    duration = hdf5_getters.get_duration(sh5)
    tempo = hdf5_getters.get_tempo(sh5)
    tatum_density, beat_density, segment_density, section_density = get_density_features(sh5)
    loudness = hdf5_getters.get_loudness(sh5)
    mode = hdf5_getters.get_mode(sh5)
    key = hdf5_getters.get_key(sh5)
    time_sig = hdf5_getters.get_time_signature(sh5)
    fade_in = hdf5_getters.get_end_of_fade_in(sh5)
    features.append([t_id, duration, fade_in, tempo, tatum_density, beat_density, segment_density, section_density,
                     loudness, mode, key, time_sig]+pitches+louds+timbres)
                        
def get_track_features(track_ids=None, basedir='./MillionSongSubset/data/', ext='.h5', verbose=0):
    t1 = time.time()
    t2 = t1
    count = 0
    features = []
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root,'*'+ext))
        for f in files:
            if track_ids and len(features) >= len(track_ids):
                break
            sh5 = hdf5_getters.open_h5_file_read(f)
            if track_ids:
                for t_id in track_ids:
                    if hdf5_getters.get_track_id(sh5) == t_id:
                        get_all_features(t_id, features, sh5)
            else:
                t_id = hdf5_getters.get_track_id(sh5)
                get_all_features(t_id, features, sh5)
            if verbose > 1:
                if count >= 1000:
                    print "1000!"
                    print "elapsed time: ", time.time()-t1
                    print "round time: ", time.time()-t2
                    print features[-1]
                    count = 0
                    t2 = time.time()
                else:
                    count += 1
            sh5.close()
    if verbose > 0:
        print "total time: ", (time.time()-t1)
    return features

In [32]:
tids_only0 = [t[0] for t in tids]
print tids_only0
print "track_id, duration, fade_in, tempo, tatum_density, beat_density, segment_density, section_density, " \
        + "loudness, mode, key, time_sig, pitches(3x12), louds(3), timbres(3x12)"
test_features = get_track_features(track_ids=tids_only0, verbose=1)
print test_features[0]

['TRAIHYT128F9350C8D', 'TRAHHSV128F42374E3', 'TRAHGSZ128F930C9A6', 'TRAXASE128F9325369', 'TRAXKGF128F92D68E9', 'TRARFWJ128F9339D70', 'TRAAUIH128F4254C9D', 'TRAFJKG128F9332800', 'TRAFYGI12903CE9B9A', 'TRADDCV128F423BAB0', 'TRADZCV128F4294DFD', 'TRAPZAW128F930FEEF', 'TRAWINR128F93378C6', 'TRAZFZS128F4272A31', 'TRAGYJM128EF3466D2', 'TRAKISP128F4284A37', 'TRAKSPS128F933B3D9', 'TRBCHFG128F9353D72', 'TRBHZRN128F92EFF91', 'TRBBQFV128F4252EFF', 'TRBFXLH12903CBAFAD', 'TRBFATS128F9319AEB', 'TRBEFQY128F932EE3F', 'TRBGYHC12903D0626A']
track_id, duration, fade_in, tempo, tatum_density, beat_density,         segment_density, section_density, loudness, mode, key, time_sig, pitches(3x12), louds(3), timbres(3x12)
total time:  125.714949131
['TRBGYHC12903D0626A', 1815.2224000000001, 0.23100000000000001, 262.82799999999997, 1.0, 1.7372134038800706, 58.15384615384615, 0.042969941314078094, -11.534000000000001, 1, 0, 7, 0.4068697089947092, 0.11680842151675491, 0.33634766313932957, 0.10223082010582031, 0.42

In [34]:
print test_features[0][50]

-15.739


In [38]:
print "track_id, duration, fade_in, tempo, tatum_density, beat_density, segment_density, section_density, " \
      + "loudness, mode, key, time_sig, pitches(3x12), louds(3), timbres(3x12)"
features = get_track_features(verbose=2)
#total time:  6930.79542994
#total time:  7129.08057904
#replace energy and danceability with rhythm densities

track_id, duration, fade_in, tempo, tatum_density, beat_density, segment_density, section_density, loudness, mode, key, time_sig, pitches(3x12), louds(3), timbres(3x12)
1000!
elapsed time:  710.096243858
round time:  710.096272945
['TRBBLTV12903CC3C00', 277.21098000000001, 0.16, 111.682, 2.0, 0.4145748987854251, 88.21428571428571, 0.050503050059561135, -10.416, 0, 10, 3, 0.39947611336032424, 0.4047765182186235, 0.2864178137651819, 0.2525433198380567, 0.26561376518218627, 0.49350202429149825, 0.29562995951417037, 0.24776599190283408, 0.24167449392712553, 0.2719910931174084, 0.3793757085020242, 0.24711659919028323, 0.277, 0.271, 0.197, 0.16, 0.179, 0.382, 0.21, 0.137, 0.134, 0.164, 0.247, 0.162, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, -12.322485020242915, -11.496, -13.042, 44.017012145748986, -15.268599999999989, 20.567357894736837, -5.011744129554655, -28.93324858299595, 5.606742510121475, -42.81682267206477, -8.117199190283388, 7.390285020242924, 0.7160591093117409,

####Tests of pitch results and shape, as well as how to get the correct stats

In [106]:
#a = np.array([[1, 2], [3, 4], [5, 6]])
#print np.mean(a, axis=0)
#print np.mean(a, axis=1)

In [107]:
#print seg_pitches.shape
#print type(seg_pitches)
#seg_pitch_mean = np.mean(seg_pitches, axis=0)
#print seg_pitch_mean.shape, "\n", seg_pitch_mean.tolist()
#print seg_pitches
#print
#print np.median(seg_pitches, axis=0).tolist()
#print
#print stats.mode(seg_pitches)[0].tolist()[0]

###Generate track features dataframe

####Create header labels

In [39]:
p_mean_titles = ["pitch_mean"+str(n) for n in range(1,13)]
p_median_titles = ["pitch_median"+str(n) for n in range(1,13)]
p_mode_titles = ["pitch_mode"+str(n) for n in range(1,13)]
print p_mean_titles

t_mean_titles = ["timbre_mean"+str(n) for n in range(1,13)]
t_median_titles = ["timbre_median"+str(n) for n in range(1,13)]
t_mode_titles = ["timbre_mode"+str(n) for n in range(1,13)]

['pitch_mean1', 'pitch_mean2', 'pitch_mean3', 'pitch_mean4', 'pitch_mean5', 'pitch_mean6', 'pitch_mean7', 'pitch_mean8', 'pitch_mean9', 'pitch_mean10', 'pitch_mean11', 'pitch_mean12']


In [40]:
heads = ["track_id", "duration", "fade_in", "tempo", "tatum_density", "beat_density", "segment_density",
         "section_density", "loudness", "mode", "key", "time_sig"] \
         + p_mean_titles + p_median_titles + p_mode_titles + \
         ["loud_mean", "loud_median", "loud_mode"] + t_mean_titles + t_median_titles + t_mode_titles
print len(heads), "\n", heads

87 
['track_id', 'duration', 'fade_in', 'tempo', 'tatum_density', 'beat_density', 'segment_density', 'section_density', 'loudness', 'mode', 'key', 'time_sig', 'pitch_mean1', 'pitch_mean2', 'pitch_mean3', 'pitch_mean4', 'pitch_mean5', 'pitch_mean6', 'pitch_mean7', 'pitch_mean8', 'pitch_mean9', 'pitch_mean10', 'pitch_mean11', 'pitch_mean12', 'pitch_median1', 'pitch_median2', 'pitch_median3', 'pitch_median4', 'pitch_median5', 'pitch_median6', 'pitch_median7', 'pitch_median8', 'pitch_median9', 'pitch_median10', 'pitch_median11', 'pitch_median12', 'pitch_mode1', 'pitch_mode2', 'pitch_mode3', 'pitch_mode4', 'pitch_mode5', 'pitch_mode6', 'pitch_mode7', 'pitch_mode8', 'pitch_mode9', 'pitch_mode10', 'pitch_mode11', 'pitch_mode12', 'loud_mean', 'loud_median', 'loud_mode', 'timbre_mean1', 'timbre_mean2', 'timbre_mean3', 'timbre_mean4', 'timbre_mean5', 'timbre_mean6', 'timbre_mean7', 'timbre_mean8', 'timbre_mean9', 'timbre_mean10', 'timbre_mean11', 'timbre_mean12', 'timbre_median1', 'timbre_median

####Initial dataframe

In [41]:
df = pd.DataFrame(features, columns=heads)

In [42]:
df.head(6)

Unnamed: 0,track_id,duration,fade_in,tempo,tatum_density,beat_density,segment_density,section_density,loudness,mode,...,timbre_mode3,timbre_mode4,timbre_mode5,timbre_mode6,timbre_mode7,timbre_mode8,timbre_mode9,timbre_mode10,timbre_mode11,timbre_mode12
0,TRBGPHG12903CE6CC3,162.24608,0.162,116.083,2.003279,0.44919,75.444444,0.055471,-15.925,1,...,50.255,43.133,-14.848,-6.216,-36.211,-14.181,-10.398,-15.291,-38.765,-4.891
1,TRBGPYK128F42796E1,344.21506,8.388,153.939,2.0,0.791704,100.818182,0.031957,-9.985,1,...,2.374,-15.719,-6.225,-40.024,-61.058,-13.232,-26.859,-6.637,-6.181,-7.26
2,TRBGPJP128E078ED20,318.61506,0.287,232.709,1.0,1.468565,64.846154,0.040802,-4.43,0,...,40.848,-14.816,-23.065,-30.342,11.848,-38.658,4.058,-2.595,-12.936,-10.203
3,TRBGPXH128F428C912,228.38812,0.316,99.381,2.002646,0.610662,47.615385,0.056921,-16.107,1,...,-191.506,-280.943,-44.835,-68.281,-15.739,12.542,-36.845,-13.414,1.47,16.671
4,TRBGPSV12903CA9C25,251.55873,0.723,173.974,1.998596,0.596815,108.454545,0.043727,-5.794,1,...,-38.144,20.98,-45.595,-36.786,-29.561,-39.134,-12.461,-15.608,2.605,-22.469
5,TRBGPAX12903CEC037,188.26404,0.0,78.498,2.0,0.446655,61.444444,0.047805,-18.335,1,...,116.21,-160.697,21.198,-74.912,21.562,7.834,-24.402,-7.54,6.403,-14.735


###Split into segment data frames

In [43]:
track_ids = df["track_id"]
track_ids.head(6)

0    TRBGPHG12903CE6CC3
1    TRBGPYK128F42796E1
2    TRBGPJP128E078ED20
3    TRBGPXH128F428C912
4    TRBGPSV12903CA9C25
5    TRBGPAX12903CEC037
Name: track_id, dtype: object

#### Handle categorical variables
(get dummies where necessary)

In [44]:
modes = df["mode"]
modes.head(6)

0    1
1    1
2    0
3    1
4    1
5    1
Name: mode, dtype: int64

In [45]:
keys = pd.get_dummies(df["key"], prefix="key")
keys.head(6)

Unnamed: 0,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11
0,0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,1


In [46]:
times = pd.get_dummies(df["time_sig"], prefix="time_sig") #"ts-1", "ts1", "ts3", "ts4", "ts5", "ts6", "ts7"
times.head(6)

Unnamed: 0,time_sig_0,time_sig_1,time_sig_3,time_sig_4,time_sig_5,time_sig_7
0,0,0,1,0,0,0
1,0,0,0,1,0,0
2,0,0,1,0,0,0
3,0,1,0,0,0,0
4,0,0,0,1,0,0
5,0,1,0,0,0,0


In [47]:
categoricals = pd.concat([modes, keys,times], axis=1)
categoricals.head(6)

Unnamed: 0,mode,key_0,key_1,key_2,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,time_sig_0,time_sig_1,time_sig_3,time_sig_4,time_sig_5,time_sig_7
0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
3,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
4,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
5,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0


####Generate z scores for non categorical variables

In [76]:
firsts = df.iloc[:,1:9]
firsts.head(6)

Unnamed: 0,duration,fade_in,tempo,tatum_density,beat_density,segment_density,section_density,loudness
0,162.24608,0.162,116.083,2.003279,0.44919,75.444444,0.055471,-15.925
1,344.21506,8.388,153.939,2.0,0.791704,100.818182,0.031957,-9.985
2,318.61506,0.287,232.709,1.0,1.468565,64.846154,0.040802,-4.43
3,228.38812,0.316,99.381,2.002646,0.610662,47.615385,0.056921,-16.107
4,251.55873,0.723,173.974,1.998596,0.596815,108.454545,0.043727,-5.794
5,188.26404,0.0,78.498,2.0,0.446655,61.444444,0.047805,-18.335


In [80]:
firsts_zscore = pd.DataFrame()
for f in firsts:
    f_zscore = f #+ '_zscore'
    if firsts[f].std(ddof=0) == 0:
        firsts_zscore[f_zscore] = 0.0
    else:
        firsts_zscore[f_zscore] = (firsts[f] - firsts[f].mean())/firsts[f].std(ddof=0)
    for val in firsts_zscore[f_zscore]:
        #In [7]: d.sales[d.sales==24] = 100
        if val > 15:
            firsts_zscore[f_zscore][firsts_zscore[f_zscore] > 15] = 15.0
        elif val < -15:
            firsts_zscore[f_zscore][firsts_zscore[f_zscore] < -15] = 15.0
firsts_zscore.fillna(0)
firsts_zscore.head(6)

Unnamed: 0,duration,fade_in,tempo,tatum_density,beat_density,segment_density,section_density,loudness
0,-0.668187,-0.319412,-0.194199,-0.341662,-0.464571,-0.32573,1.088936,-1.007373
1,0.926188,4.084562,0.881785,-0.34608,0.578074,0.36889,-1.047679,0.092725
2,0.701886,-0.25249,3.120673,-1.693426,2.638498,-0.615863,-0.244006,1.12152
3,-0.088664,-0.236965,-0.668922,-0.342516,0.026966,-1.087565,1.22063,-1.04108
4,0.114352,-0.019068,1.451242,-0.347972,-0.015188,0.57794,0.02184,0.868905
5,-0.440223,-0.406142,-1.262482,-0.34608,-0.472289,-0.708987,0.392367,-1.45371


In [81]:
lasts = df.iloc[:,12:]
lasts.head(6)

Unnamed: 0,pitch_mean1,pitch_mean2,pitch_mean3,pitch_mean4,pitch_mean5,pitch_mean6,pitch_mean7,pitch_mean8,pitch_mean9,pitch_mean10,...,timbre_mode3,timbre_mode4,timbre_mode5,timbre_mode6,timbre_mode7,timbre_mode8,timbre_mode9,timbre_mode10,timbre_mode11,timbre_mode12
0,0.270331,0.143707,0.168981,0.35051,0.170888,0.253457,0.123119,0.263781,0.54509,0.19732,...,50.255,43.133,-14.848,-6.216,-36.211,-14.181,-10.398,-15.291,-38.765,-4.891
1,0.486095,0.450403,0.501069,0.269405,0.295941,0.442808,0.360309,0.695634,0.332228,0.358093,...,2.374,-15.719,-6.225,-40.024,-61.058,-13.232,-26.859,-6.637,-6.181,-7.26
2,0.229911,0.437368,0.476496,0.22983,0.406257,0.246439,0.399648,0.254604,0.264917,0.442389,...,40.848,-14.816,-23.065,-30.342,11.848,-38.658,4.058,-2.595,-12.936,-10.203
3,0.224934,0.27801,0.155682,0.37725,0.147202,0.104323,0.126932,0.262422,0.402643,0.184982,...,-191.506,-280.943,-44.835,-68.281,-15.739,12.542,-36.845,-13.414,1.47,16.671
4,0.544393,0.430185,0.427274,0.131469,0.262028,0.130499,0.268519,0.374562,0.251579,0.465624,...,-38.144,20.98,-45.595,-36.786,-29.561,-39.134,-12.461,-15.608,2.605,-22.469
5,0.324989,0.205213,0.193684,0.276089,0.345495,0.360568,0.272523,0.084389,0.191877,0.226774,...,116.21,-160.697,21.198,-74.912,21.562,7.834,-24.402,-7.54,6.403,-14.735


In [82]:
lasts_zscore = pd.DataFrame()
for l in lasts:
    l_zscore = l #+ '_zscore'
    if lasts[l].std(ddof=0) == 0:
        lasts_zscore[l_zscore] = 0.0
    else:
        lasts_zscore[l_zscore] = (lasts[l] - lasts[l].mean())/lasts[l].std(ddof=0)
    for val in lasts_zscore[l_zscore]:
        if val > 15:
            lasts_zscore[l_zscore][lasts_zscore[l_zscore] > 15] = 15.0
        elif val < -15:
            lasts_zscore[l_zscore][lasts_zscore[l_zscore] < -15] = -15.0
            
lasts_zscore.fillna(0)
lasts_zscore.head(6)

Unnamed: 0,pitch_mean1,pitch_mean2,pitch_mean3,pitch_mean4,pitch_mean5,pitch_mean6,pitch_mean7,pitch_mean8,pitch_mean9,pitch_mean10,...,timbre_mode3,timbre_mode4,timbre_mode5,timbre_mode6,timbre_mode7,timbre_mode8,timbre_mode9,timbre_mode10,timbre_mode11,timbre_mode12
0,-1.143198,-1.879897,-1.660159,0.457382,-1.495596,-0.616698,-1.845234,-0.797447,1.935592,-1.328494,...,1.259628,1.189218,0.437165,0.945188,-0.284978,0.281554,0.185953,-0.112983,-1.104735,0.363289
1,0.319989,0.124739,1.238081,-0.282134,-0.445299,1.042941,0.295974,2.856606,0.086929,0.028991,...,0.536204,0.386584,0.661103,-0.21227,-1.06644,0.317366,-0.484924,0.358257,0.436883,0.239837
2,-1.417306,0.039537,1.023627,-0.642976,0.481223,-0.678206,0.651097,-0.875095,-0.497655,0.740744,...,1.1175,0.3989,0.22377,0.119205,1.226524,-0.642127,0.775115,0.578357,0.11729,0.086472
3,-1.451059,-1.002063,-1.776224,0.701205,-1.694531,-1.923831,-1.810814,-0.808945,0.698469,-1.432664,...,-2.393088,-3.230586,-0.341594,-1.179682,0.358886,1.289991,-0.891908,-0.010774,0.798868,1.486924
4,0.715337,-0.007409,0.594053,-1.539842,-0.730126,-1.694406,-0.532655,0.139908,-0.613491,0.936924,...,-0.075974,0.887092,-0.361331,-0.101413,-0.075829,-0.66009,0.101875,-0.130245,0.852567,-0.552731
5,-0.772539,-1.477877,-1.44457,-0.221191,-0.029104,0.322118,-0.496511,-2.31534,-1.131993,-1.079796,...,2.256128,-1.590651,1.373275,-1.406702,1.532039,1.112327,-0.384787,0.309085,1.032259,-0.149699


###Re join segment data frames
Now with dummies and z-scored features

In [83]:
final_df = pd.concat([track_ids, firsts_zscore, categoricals, lasts_zscore], axis=1)
final_df.head(6)

Unnamed: 0,track_id,duration,fade_in,tempo,tatum_density,beat_density,segment_density,section_density,loudness,mode,...,timbre_mode3,timbre_mode4,timbre_mode5,timbre_mode6,timbre_mode7,timbre_mode8,timbre_mode9,timbre_mode10,timbre_mode11,timbre_mode12
0,TRBGPHG12903CE6CC3,-0.668187,-0.319412,-0.194199,-0.341662,-0.464571,-0.32573,1.088936,-1.007373,1,...,1.259628,1.189218,0.437165,0.945188,-0.284978,0.281554,0.185953,-0.112983,-1.104735,0.363289
1,TRBGPYK128F42796E1,0.926188,4.084562,0.881785,-0.34608,0.578074,0.36889,-1.047679,0.092725,1,...,0.536204,0.386584,0.661103,-0.21227,-1.06644,0.317366,-0.484924,0.358257,0.436883,0.239837
2,TRBGPJP128E078ED20,0.701886,-0.25249,3.120673,-1.693426,2.638498,-0.615863,-0.244006,1.12152,0,...,1.1175,0.3989,0.22377,0.119205,1.226524,-0.642127,0.775115,0.578357,0.11729,0.086472
3,TRBGPXH128F428C912,-0.088664,-0.236965,-0.668922,-0.342516,0.026966,-1.087565,1.22063,-1.04108,1,...,-2.393088,-3.230586,-0.341594,-1.179682,0.358886,1.289991,-0.891908,-0.010774,0.798868,1.486924
4,TRBGPSV12903CA9C25,0.114352,-0.019068,1.451242,-0.347972,-0.015188,0.57794,0.02184,0.868905,1,...,-0.075974,0.887092,-0.361331,-0.101413,-0.075829,-0.66009,0.101875,-0.130245,0.852567,-0.552731
5,TRBGPAX12903CEC037,-0.440223,-0.406142,-1.262482,-0.34608,-0.472289,-0.708987,0.392367,-1.45371,1,...,2.256128,-1.590651,1.373275,-1.406702,1.532039,1.112327,-0.384787,0.309085,1.032259,-0.149699


### Save data frame

In [84]:
basedir = "./pkls/"

In [85]:
pickle_stuff(basedir+"zscore_df.pkl", final_df)

In [86]:
pickle_stuff(basedir+"df.pkl", df)

In [87]:
with open(basedir+"zscore_df_csv.csv", 'w') as zwf:
    zwf.write(final_df.to_csv())

In [88]:
with open(basedir+"df_csv.csv", 'w') as wf:
    wf.write(df.to_csv())

###Quick look at the ends of the data

In [89]:
print "duration", final_df['duration'].max(), final_df['duration'].min()
print "fade_in", final_df['fade_in'].max(), final_df['fade_in'].min()
print "tempo", final_df['tempo'].max(), final_df['tempo'].min()
print "tatums", final_df['tatum_density'].max(), final_df['tatum_density'].min()
print "beats", final_df['beat_density'].max(), final_df['beat_density'].min()
print "segments", final_df['segment_density'].max(), final_df['segment_density'].min()
print "sections", final_df['section_density'].max(), final_df['section_density'].min()
print "loudness", final_df['loudness'].max(), final_df['loudness'].min()


duration 13.8546846791 -2.08060387031
fade_in 15.0 -0.406142244727
tempo 3.97674823851 -3.49363791569
tatums 7.74961282674 -3.04077256445
beats 15.0 -1.83194598768
segments 8.91503392667 -2.39106154579
sections 6.78462296373 -3.95138660286
loudness 2.04678788423 -7.62240809715


In [90]:
print final_df.iloc[:,41:].max()
print final_df.iloc[:,41:].min()

pitch_median2       3.044243
pitch_median3       5.158474
pitch_median4       6.471354
pitch_median5       5.116924
pitch_median6       5.858537
pitch_median7       5.885733
pitch_median8       5.098400
pitch_median9       5.861246
pitch_median10      5.026105
pitch_median11      6.526543
pitch_median12      5.616900
pitch_mode1         0.226147
pitch_mode2         0.234061
pitch_mode3         0.380304
pitch_mode4         0.656484
pitch_mode5         0.414412
pitch_mode6         0.501410
pitch_mode7         0.472442
pitch_mode8         0.371791
pitch_mode9         0.524449
pitch_mode10        0.364106
pitch_mode11        0.554039
pitch_mode12        0.376484
loud_mean           1.880684
loud_median         1.895146
loud_mode           1.612068
timbre_mean1        2.081075
timbre_mean2        4.039996
timbre_mean3        7.824561
timbre_mean4        9.177122
                     ...    
timbre_mean7        7.663099
timbre_mean8       11.249824
timbre_mean9        4.907108
timbre_mean10 

In [91]:
print final_df[final_df['duration']> 8][["track_id", "duration"]]

                track_id   duration
155   TRBGBIZ12903CB26BA  13.854685
288   TRBGYHC12903D0626A  13.814860
510   TRBHDSW128F42ABC12   9.133126
726   TRBCVOS128F92F037C  12.016781
1006  TRBBLNE128F4256D3F   8.455414
1409  TRBFCJQ128F427086F   9.006327
2763  TRAXMJB128F427089F   9.129922
2823  TRAXIQA128F422C8C7   8.462051
3849  TRAHAMC128F425B346   8.620894
4622  TRANLLL128F42268D9  10.130813
4710  TRANFLR128F931CF30  12.689229
5326  TRALKGV128F932CADE   8.896922
7212  TRAFLYL128F932DB93  11.913327
7431  TRAEGRR128F422285F  10.180022
7739  TRADNOD128F4262F3D  11.221882


In [92]:
print final_df[final_df['fade_in']> 10][["track_id", 'fade_in']]

                track_id    fade_in
267   TRBGAUG128F428378E  11.260695
451   TRBHIDP128F147DE7D  15.000000
1445  TRBFIBL128F423AC68  10.309874
2528  TRAPVGB128F42725AF  15.000000
3536  TRAGAVB128F932585E  11.353850
4457  TRAMOUF128F9353A1B  12.357673
5856  TRAVFSW128F145C1D2  15.000000
7019  TRARFWJ128F9339D70  15.000000
8935  TRAACQT128F9331780  14.676372
9253  TRAYCJB128F424AB43  15.000000
9495  TRAUNWX12903D01764  10.480657


In [93]:
print final_df[final_df['beat_density']> 8][["track_id", "beat_density"]]

                track_id  beat_density
848   TRBCOIT128F933E906      8.749894
4440  TRAMSIH128F42990AB     12.886391
4441  TRAMSBQ128F4238B2D     15.000000
5502  TRACLXN128F931A74C     11.140565


In [94]:
print final_df[final_df['timbre_median4']> 6][["track_id", 'timbre_median4']]

                track_id  timbre_median4
481   TRBHFZP12903CE88C3        7.192954
2647  TRAPADM128F92CD44E        7.328467
4637  TRANCRD12903CDE242        6.448273
5497  TRACLPE128F4287687       11.336674
5655  TRACYDQ128F932C763        6.891586
5769  TRAVNKY12903CCC5B3        6.015094
9423  TRAUPNY12903C9DE65        6.280354


In [95]:
print final_df[final_df['timbre_median8']> 6][["track_id", 'timbre_median8']]

                track_id  timbre_median8
5655  TRACYDQ128F932C763       13.866641
9171  TRAYGKQ128F932F520        8.410282
9406  TRAYQAN128F146247B        6.766265
