## Evolution of the Informational Complexity of Contemporary Western Music

In [1]:
import hdf5_getters as h
import os
import glob
import sqlite3
import math
import numpy as np
import random as rnd
import scipy

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

### Data Extraction

In [3]:
#create song class; thin version to save memory
class Song:
    """Plain record of one song, filled in by calling the ``h.get_*`` accessors on ``hfile``.

    Every metadata/analysis attribute holds whatever the matching accessor
    returns. The codeword lists and entropy fields only get their empty/zero
    starting values here; nothing in this class computes them.
    """
    def __init__(self,hfile):
        """Read all fields from ``hfile``.

        hfile -- object handed unchanged to each ``h.get_*`` accessor
                 (an h5 song file, per the original comment).

        ``terms`` falls back to None when ``h.get_artist_terms`` raises an
        ordinary exception. KeyboardInterrupt/SystemExit are deliberately not
        swallowed, so a long extraction run can still be interrupted.
        Errors raised by any other accessor propagate to the caller.
        """
        #identification and basic metadata
        self.song=h.get_song_id(hfile)
        self.artist=h.get_artist_id(hfile)
        self.aname=h.get_artist_name(hfile)
        self.title=h.get_title(hfile)
        self.duration=h.get_duration(hfile)
        self.key=h.get_key(hfile)
        self.loudness=h.get_loudness(hfile)
        self.tempo=h.get_tempo(hfile)
        self.timeSig=h.get_time_signature(hfile)
        #per-segment / per-section / per-beat / per-bar analysis data
        self.segments=h.get_segments_start(hfile)
        self.pitches=h.get_segments_pitches(hfile)
        self.timbres=h.get_segments_timbre(hfile)
        self.volumes=h.get_segments_loudness_max(hfile)
        self.sections=h.get_sections_start(hfile)
        self.beats=h.get_beats_start(hfile)
        self.bars=h.get_bars_start(hfile)
        self.year=h.get_year(hfile)
        #artist terms can fail to load for some songs; keep the song and mark terms as missing
        try: self.terms=h.get_artist_terms(hfile)
        except Exception: self.terms=None
        self.term_weights=h.get_artist_terms_weight(hfile)
        self.tags=h.get_artist_mbtags(hfile)
        self.energy=h.get_energy(hfile)
        self.mode=h.get_mode(hfile)
        #codeword arrays
        self.pcodes=[] #pitch
        self.lcodes=[] #loudness
        self.tcodes=[] #timbre
        self.rcodes=[] #rhythm
        #entropy measures
        self.zipEntropy=0
        self.totalEntropies=[] #indexed zip, shannon, conditional, average
        self.pentropy=0
        self.pconEntropy=0
        self.pzipEntropy=0
        self.lentropy=0
        self.lconEntropy=0
        self.lzipEntropy=0
        self.tentropy=0
        self.tconEntropy=0
        self.tzipEntropy=0
        self.rentropy=0
        self.rconEntropy=0
        self.rzipEntropy=0
        #hotness measures
        self.afam=h.get_artist_familiarity(hfile)
        self.ahot=h.get_artist_hotttnesss(hfile)
        self.shot=h.get_song_hotttnesss(hfile)
        self.simartists=h.get_similar_artists(hfile)

In [5]:
#calculate entropy based on scipy
def entropy(array):
    """Return the Shannon entropy, in bits, of the empirical distribution of ``array``.

    array -- iterable of hashable items (the codewords); each distinct item
             is one outcome and its number of occurrences is its weight.

    The counts are passed to ``scipy.stats.entropy`` with ``base=2``, which
    normalises them to probabilities. Callers in this notebook only pass
    non-empty sequences.
    """
    #import the subpackage explicitly: a bare `import scipy` does not
    #guarantee that `scipy.stats` is loaded on every SciPy version
    import scipy.stats
    counts={}
    for element in array:
        counts[element]=counts.get(element,0)+1
    return scipy.stats.entropy(list(counts.values()),base=2)

In [6]:
#conditional entropy function, can be used with multiple priors (lookback argument)
def conditionalEntropy(array,lookback=1): #look back one position by default
    """Estimate the conditional entropy, in bits, of an element given its predecessors.

    array    -- sequence of codewords; elements are compared by their ``str()``
                form, so e.g. 1 and '1' count as the same symbol.
    lookback -- how many immediately preceding elements form the context X
                (default 1). With 0 the context is empty and the result is
                the plain entropy of the sequence.

    Returns the sum over observed (context, element) pairs of
    -p(context) * p(element|context) * log2 p(element|context), where the
    probabilities are relative frequencies over the ``len(array)-lookback``
    positions that have a full context. Returns 0 when the sequence is too
    short to contain any such position.

    Raises ValueError for a negative ``lookback``.
    """
    if lookback<0: raise ValueError("lookback must be non-negative")
    symbols=[str(element) for element in array]
    total=len(symbols)
    priors={} #context tuple -> how many times it is a prior to some y
    joints={} #(context tuple, y) -> how many times they occur together
    #tuple keys keep element boundaries intact even when a symbol itself contains a comma
    for i in range(lookback,total):
        prior=tuple(symbols[i-lookback:i]) #do not count current y; may be one or multiple elements
        priors[prior]=priors.get(prior,0)+1
        joint=(prior,symbols[i])
        joints[joint]=joints.get(joint,0)+1
    conEntropy=0
    for (prior,_current),count in joints.items(): #look at Y|X
        px=float(priors[prior])/(total-lookback)
        pcon=float(count)/priors[prior]
        conEntropy-=pcon*math.log(pcon,2)*px #subtracting keeps the result a non-negative zero
    return conEntropy

In [9]:
#get song pitch codewords; uses binary thresholds of 0.5
def calculatePitchEntropy(hfile):
    """Return (entropy, conditional entropy) of the song's pitch codewords.

    Each vector returned by ``h.get_segments_pitches(hfile)`` becomes a
    string with one character per element: '1' where the element is above
    0.5, '0' otherwise. When there are no vectors, a message is printed and
    None is returned.
    """
    codewords = [
        "".join('1' if level > .5 else '0' for level in vector)  # string form for later comparisons
        for vector in h.get_segments_pitches(hfile)
    ]
    if not codewords:
        print("No pitch data for %s, %s, %s" % (h.get_song_id(hfile), h.get_title(hfile), h.get_artist_name(hfile)))
        return None
    return (entropy(codewords), conditionalEntropy(codewords))

In [17]:
#get song loudness codewords; rounds to nearest decibel of the maximum loudness in the segment
def calculateLoudnessEntropy(hfile):
    """Return (entropy, conditional entropy) of the song's loudness codewords.

    Each value from ``h.get_segments_loudness_max(hfile)`` is rounded to the
    nearest integer to form its codeword. When there are no values, a
    message is printed and None is returned.
    """
    levels = [int(round(peak)) for peak in h.get_segments_loudness_max(hfile)]
    if not levels:
        print("No loudness data for %s, %s, %s" % (h.get_song_id(hfile), h.get_title(hfile), h.get_artist_name(hfile)))
        return None
    return (entropy(levels), conditionalEntropy(levels))

In [10]:
#hard-coded timbre thresholds, one (lower, upper) pair per element of the timbre vector
#(minus the loudness component), based on the 10,000 song subset
thresholds=[
    (-18.013999999999999, 36.572000000000003),
    (-15.398999999999999, 30.408999999999999),
    (-12.234, 17.236999999999998),
    (-18.550999999999998, 11.664999999999999),
    (-23.155999999999999, 0.99399999999999999),
    (-14.17, 9.3079999999999998),
    (-10.507999999999999, 6.9219999999999997),
    (-5.6420000000000003, 12.021000000000001),
    (-4.6890000000000001, 9.907),
    (-6.1059999999999999, 6.7590000000000003),
    (-4.8019999999999996, 10.147),
]
print(len(thresholds))

11


In [11]:
#create ternary timbre codewords (based on 11 dimensional vectors); depends on thresholds set above
def calculateTimbreEntropy(hfile):
    """Return (entropy, conditional entropy) of the song's ternary timbre codewords.

    For each vector from ``h.get_segments_timbre(hfile)``, elements 1..11
    (element 0 is skipped) are compared with the matching pair in the
    module-level ``thresholds``: below the first bound gives '0', below the
    second gives '1', anything else gives '2'. The 11 digits are joined into
    one string per vector. Returns None, silently, when there are no vectors.
    """
    codewords = []
    for vector in h.get_segments_timbre(hfile):
        digits = []
        for k in range(11):
            level = vector[k + 1]
            if level < thresholds[k][0]:
                digits.append('0')
            elif level < thresholds[k][1]:
                digits.append('1')
            else:
                digits.append('2')
        codewords.append("".join(digits))  # string representation
    if not codewords:
        return None
    return (entropy(codewords), conditionalEntropy(codewords))

In [13]:
#calculate rhythm based on beats in section (based on an average 16th note in the piece)
#note that this does not explicitly account for key signature changes; however, changes will be reflected in the resulting complexity score
def calculateRhythmEntropy(hfile):
    """Return (entropy, conditional entropy) of the song's rhythm codewords.

    The average gap between consecutive beat starts
    (``h.get_beats_start``) is divided by 4 to get the "sixteenth" unit.
    Each gap between consecutive segment starts (``h.get_segments_start``)
    is then expressed in those units, rounded to the nearest quarter unit,
    and used as a codeword.

    Returns None after printing a message when there is no usable beat data
    (no beat pairs, or an average gap of zero) or when there are fewer than
    two segments.
    """
    beats=h.get_beats_start(hfile)
    #gaps between consecutive beats; a pair is skipped when either beat time is 0
    beatGaps=[beats[i]-beats[i-1] for i in range(1,len(beats)) if beats[i]!=0 and beats[i-1]!=0]
    average=sum(beatGaps)/len(beatGaps) if beatGaps else 0 #average beat
    if not average: #nothing to average, or a zero gap that would divide by zero below
        print("No beat data for %s, %s, %s" % (h.get_song_id(hfile),h.get_title(hfile),h.get_artist_name(hfile)) )
        return #ignore song with bad beat data
    sixteenth=average/4 #sixteenth note
    #calculate rhythm in terms of number of sixteenths based on average beat
    segments=h.get_segments_start(hfile)
    rcodes=[round((segments[i]-segments[i-1])/sixteenth*4)/4 for i in range(1,len(segments))] #can distinguish 64th notes
    if rcodes: return (entropy(rcodes),conditionalEntropy(rcodes))
    else: print("No rhythm data for %s, %s, %s" % (h.get_song_id(hfile),h.get_title(hfile),h.get_artist_name(hfile)) )

In [14]:
#shortcut: hard-coded genre durations (genre name -> duration), precomputed from a sample (root folder A) so they need not be recomputed here
gd={'folk rock': 241.29038304516129, 'power metal': 283.04389999999989, 'dance': 282.20036205882349, 'speedcore': 202.9655696551724, 'folktronica': 240.07977857142859, 'ost': 279.14404000000002, 'swamp pop': 139.45424, 'heavy metal': 272.11441694323156, 'meditation': 333.67464899999993, 'classic rock': 259.45216449612417, 'alternative rock': 234.7075970609319, 'ballet': 180.4142425, 'turkish': 328.22811999999999, 'dirty rap': 171.29837666666666, 'glam rock': 243.54058911111107, 'heartland rock': 246.38380666666666, 'alternative hip hop': 234.69271376811594, 'early music': 259.82566875000003, 'dancehall': 230.86107212624583, 'rap metal': 210.60235861111116, 'marimba': 285.45696333333336, 'piano rock': 242.82873463414634, 'bass music': 229.75342390625008, 'speed metal': 246.00778445714275, 'turnablism': 209.11515551724139, 'power violence': 212.2117925, 'neo classical metal': 359.63600777777782, 'crust punk': 196.83593285714286, 'juke joint blues': 206.11873, 'jazz blues': 201.43410416666663, 'noise pop': 225.73860060975599, 'british blues': 233.86615000000003, 'hardcore techno': 366.25913750000001, 'dark pop': 175.00689, 'black metal': 271.88357215517232, 'post rock': 271.30317706766908, 'nortec': 173.34811999999999, 'christian metal': 239.1326078125, 'riot grrrl': 173.47873124999998, 'illbient': 254.22515537037037, 'alternative country': 219.9948291666667, 'japanese': 237.5614516666667, 'hard trance': 316.93604431325286, 'rock steady': 193.21318144144149, 'folk': 211.13184142857139, 'intelligent dance music': 302.7905998, 'garage rock': 186.4666986521739, 'indie rock': 213.39217971428556, 'new jack swing': 258.60083709677417, 'orchestra': 244.22936319587646, 'math-core': 171.94476166666664, 'funeral doom': 285.58321999999998, 'nu-soul': 245.49832000000001, 'celtic punk': 196.50210545454544, 'christian': 254.57587000000001, 'mbalax': 310.26706714285717, 'oldies': 192.80498900000001, 'groove metal': 240.43531636363633, 'melodic metalcore': 205.48274999999998, 'surf 
music': 180.10077134615383, 'ragtime': 190.56894406249992, 'german rap': 216.39791, 'boogaloo': 271.08961666666664, 'metalcore': 259.38566875000004, 'screamo': 196.19628156250002, 'shock rock': 200.17741058823532, 'chant': 329.96526, 'glam metal': 230.82700545454543, 'patriotic': 189.12421142857144, 'gangster rap': 231.4795718430033, 'mbaqanga': 209.86730249999999, 'sxsw 2007': 298.84035999999998, 'protopunk': 585.79546500000004, 'trip hop': 262.74151887290157, 'jazz funk': 276.85300069958845, 'irish folk': 212.52107410256409, 'world beat': 291.30651607594933, 'africa': 350.82403499999998, 'hardcore metal': 192.27264454545454, 'melodic black metal': 255.22893500000001, 'hard house': 295.70874252100828, 'europop': 231.84212198757771, 'disco house': 270.05179125000001, 'freakbeat': 204.10469050000003, 'east coast hip hop': 218.50729999999999, 'uk garage': 275.7544779130435, 'symphony': 288.02131499999996, 'jive': 190.00335749999999, 'ballad': 221.11909778625952, 'dubtronica': 303.51627999999999, 'straight edge': 151.38142982456139, 'relax': 266.41369333333336, 'freestyle': 250.80889180952389, 'newgrass': 116.4273, 'nintendocore': 171.5587275, 'instrumental': 311.82321999999999, 'peace punk': 168.737506, 'chicago blues': 230.02677899999995, 'world music': 259.29481705882364, 'glam': 172.25968333333333, 'karaoke': 196.02613571428569, 'grindcore': 162.85272214876039, 'vaudeville': 165.63404565217391, 'highlife': 1083.9375075, 'soul': 161.22207799999998, 'progressive bluegrass': 230.44947734177217, 'dark ambient': 362.10964081081067, 'skiffle': 162.85995500000001, 'louisiana blues': 186.33499384615382, 'indie': 208.20341608695654, 'memphis blues': 205.19138499999997, 'frenchcore': 246.93505636363636, 'gabba': 281.05931647058827, 'batucada': 188.89098000000001, 'jump blues': 200.18712569230775, 'flamenco': 245.43309372781081, 'string': 355.69587399999995, 'swedish': 187.47165333333331, 'cantonese pop': 234.39457673913046, 'san francisco bay area': 126.87627999999999, 
'dark wave': 260.77942520270273, 'poetry': 33.410159999999998, 'hardcore punk': 162.15300672131153, 'musica': 226.37669, 'free improvisation': 372.6122575892856, 'neue deutsche welle': 308.28101600000002, 'ambient black metal': 784.79628333333346, 'bebop': 319.66305355932201, 'funny': 177.60027333333335, 'alternative pop rock': 294.08607999999998, 'deathgrind': 135.31963777777779, 'soundtrack': 199.56697934210533, 'comedy rock': 197.07383187499997, 'salsa': 259.85319263681589, 'jungle music': 304.36578273333345, 'canto nuevo': 168.20199500000001, 'neoclassical': 258.25210461538461, 'alternative metal': 247.9597046124031, 'visual kei': 255.99641199999999, 'tarantella': 200.22811999999999, 'gothic metal': 304.19198133333333, 'greek': 193.36164714285718, 'tex mex': 196.70312411764704, 'indie pop': 213.26940684210544, 'bossa nova': 219.11029537037044, 'minimal techno': 415.7905633333333, 'new rave': 253.62855058823533, 'brazilian pop music': 218.14579616666666, 'ragga jungle': 231.94076999999999, 'space age pop': 187.23904095238092, 'progressive metal': 308.83025468085106, 'arabesque': 330.30671857142852, 'urban folk': 201.75383281250001, 'instrumental rock': 268.92517181818181, 'klezmer': 259.01505718749996, 'detroit rap': 92.159549999999996, 'happy hardcore': 235.73443191304349, 'german pop': 208.24121792899419, 'stoner rock': 235.2446385714286, 'christmas music': 147.27790999999999, 'fado': 210.29745000000003, 'slack key guitar': 267.38893250000001, 'texas': 216.737505, 'funk rock': 238.40954265306124, 'celtic metal': 164.27056999999999, 'chamber music': 286.61342374999992, 'modern laika': 216.794974, 'space rock': 295.64023101010105, 'brutal death metal': 269.57342416666671, 'quiet storm': 283.34618818181821, 'sonata': 258.55954600000001, 'country rock': 215.81957045588263, 'teen pop': 235.50443438016521, 'comedy': 219.78934265625, 'miami bass': 234.31791250000003, 'post-grunge': 237.80490044827587, 'opera': 253.22134717647069, 'cowpunk': 167.72308666666666, 
'slide': 428.48608000000002, 'rock': 217.94592350649333, 'acoustic': 248.77668666666668, 'quartet': 160.29995500000001, 'classic country': 173.45885499999997, 'soul music': 233.70839000000001, 'england': 165.66811999999999, 'j pop': 234.13195762711865, 'house': 455.82135428571428, 'neo soul': 237.84855492385782, 'g funk': 227.44397857142854, 'dubstep': 292.41648720430118, 'new orleans blues': 248.12449866666665, 'bulerias': 233.80417, 'blues': 238.44052299019611, 'hard rock': 236.41382689873407, 'electronica': 282.08812230769234, 'death core': 216.90368872340426, 'jazz metal': 353.54076999999995, 'future jazz': 297.69931354114721, 'new weird america': 227.490455, 'sadcore': 236.71195882352941, 'dutch': 192.16934499999999, 'delta blues': 215.88255528571437, 'ska punk': 186.87888228813563, 'bass': 153.80852999999999, 'drone doom metal': 602.53994999999998, 'techno': 346.32510573459706, 'california': 199.35562719999999, 'dance-punk': 198.36630000000002, 'pop rock': 229.22365429616116, 'power electronics': 336.73322724137932, 'chamber pop': 225.73564470000002, 'technical death metal': 312.36888888888888, 'soul blues': 272.63035537037041, 'punk': 172.2295546, 'stride': 183.34351818181818, 'swing': 170.93179250000003, 'old school hip hop': 375.73324444444444, 'nu metal': 208.39613257575758, 'industrial': 307.77424000000002, 'americana': 218.85777833333336, 'neo-progressive': 346.10662985294118, 'christian hip hop': 194.65750249999999, 'ethnic': 202.40063499999999, 'congo': 387.43138249999998, 'colombia': 227.452934, 'spanish': 340.53179, 'goregrind': 134.91079928571429, 'piano blues': 204.09270364705876, 'tribal house': 354.24395864864869, 'folk punk': 187.63259344827588, 'funk metal': 274.72972676470584, 'chanson': 204.0194761422413, 'world fusion': 224.93995000000001, 'mandarin pop': 230.62158749999998, 'grime': 225.44894656565654, 'tango': 246.00379100000001, 'roots rock': 218.91598059210531, 'piedmont blues': 172.52869789473689, 'sea shanties': 324.67546500000003, 
'acid jazz': 267.8896145833333, 'twee pop': 219.76345744186048, 'urban': 198.58240000000001, 'reggaeton': 225.97955275362321, 'vocal jazz': 217.38691425531914, 'electro-funk': 285.68335666666667, 'folk-pop': 224.96331158823529, 'nu jazz': 294.24934250000001, 'canterbury scene': 297.47111000000001, 'christian rock': 257.93084685393251, 'italian rap': 263.02649000000002, 'industrial rock': 255.87183555555555, 'death metal': 236.86075480392159, 'bel canto': 226.17293599999999, 'vocal': 165.21423999999996, 'eurodance': 280.17400761904759, 'sweden': 265.47329111111111, 'yodel': 160.16673, 'marrabenta': 321.12280999999996, 'lullaby': 180.58062869565217, 'suomi rock': 214.32770916666666, 'merseybeat': 156.85440440000002, 'humppa': 207.69478416666666, 'rai': 256.68339428571431, 'rap': 228.30617600000008, 'paisley underground': 254.61505499999998, 'blue-eyed soul': 222.8671945652174, 'hip pop': 166.86539666666667, 'cool jazz': 283.74986914634144, 'ragamuffin': 255.84281285714286, 'trance': 401.82729629370635, 'oriental metal': 189.56145333333333, 'rap rock': 237.10779075000011, 'art rock': 264.76504160173147, 'christian hardcore': 226.33097624999999, 'new beat': 264.21706557692312, 'eurobeat': 247.00689249999996, 'frevo': 259.57832166666662, 'nardcore': 169.25995499999999, 'cello rock': 220.94322, 'electropop': 229.64967794117649, 'germany': 228.08335666666667, 'psychedelic': 274.06757166666665, 'celtic fusion': 229.48899285714285, 'southern soul': 217.70248130952379, 'singer': 199.14403999999999, 'electro rock': 223.10018714285715, 'argentine rock': 252.57750999999999, 'breakcore': 260.79570000000012, 'queercore': 170.63138333333333, 'metal': 268.88444500000003, 'hyphy': 227.36696727272724, 'dirty south rap': 228.55130439759037, 'cumbia': 214.74399813953494, 'los angeles': 232.87517817567561, 'gothic rock': 234.8363438461538, 'italian pop': 214.37603400000003, 'latin jazz': 250.35559303664934, 'indian classical': 546.63023058823546, 'murga': 193.69750499999998, 'german': 
276.52308333333332, 'western swing': 171.36434882352944, 'jam band': 344.28406701492531, 'no wave': 322.24306346153844, 'radio': 140.61669000000001, 'dance rock': 254.34602247422674, 'deathrock': 211.34135428571426, 'minnesota': 161.89342499999998, 'rock argentina': 267.81995000000001, 'hardcore rap': 233.09669000000002, 'brass band': 227.95448600000003, 'deep house': 352.14391123529441, 'piano': 149.30893499999999, 'kraut rock': 256.92993093023256, 'punk blues': 215.4283111764706, 'rebetika': 164.77995000000001, 'gagaku': 54.32771000000001, 'waltz': 450.36101600000001, 'dj': 237.41342500000002, 'beat': 201.60542400000006, 'jazz': 259.74612017647041, 'gnawa': 325.655055, 'singer-songwriter': 219.60175529411759, 'breakbeat': 296.81821717948708, 'melodic hardcore': 171.93467301204817, 'futurepop': 276.55698142857148, 'progressive': 356.64934333333332, 'dance music': 320.70920666666666, 'emo': 214.22204553956831, 'disco': 265.40559974842779, 'electric blues': 268.9846952631579, 'hardstyle': 302.8906705084745, 'hip hop': 229.60960853806225, 'southern gospel': 215.18278400000008, 'ohio': 217.86077, 'gospel': 274.03198630434798, 'musette': 206.79464812499998, 'bachata': 244.41661545454545, 'outsider music': 183.48363000000001, 'dutch jazz': 373.44607999999999, 'chanson francaise': 193.75721428571433, 'traditional pop': 184.52852999999999, 'new wave': 233.42210892537312, 'stoner metal': 272.85860789473696, 'brazil': 235.66322, 'chicago soul': 197.20526125000004, 'motown': 219.11510234782614, 'soul jazz': 307.16095653409099, 'thrash core': 193.97279692307694, 'r&b': 227.5449655384615, 'sludge metal': 247.69841222222223, 'east coast blues': 158.07693399999999, 'alternative rap': 200.25424000000001, 'latin': 264.51964440000006, 'chill-out': 270.74908235795453, 'pop folk': 208.11402411764701, 'zouk': 294.30013638888892, 'avantgarde metal': 210.95138333333333, 'liedermacher': 334.83710000000002, 'guitar': 220.78648499999997, 'pagode': 260.64934, 'french': 316.6907066666667, 
'martial industrial': 417.14893000000001, 'memphis soul': 180.03415850000005, 'country': 216.45525634146347, 'mariachi': 192.51834976744186, 'southern hip hop': 197.61587333333333, 'west coast rap': 258.50729999999999, 'memphis rap': 242.024035, 'experimental': 204.69506000000001, 'electronic music': 288.96607666666665, 'bongo': 163.98757333333336, 'pop punk': 190.64442179310345, 'chiptune': 218.83383249999997, 'cuddlecore': 253.98812000000001, 'pop': 228.02351531914888, 'beach music': 169.83726199999998, 'ranchera': 175.79460928571427, 'sexy': 220.99546000000001, 'rave': 297.43236300000001, 'melodic death metal': 262.71301499999998, 'noise rock': 242.93063964705891, 'jazz rap': 215.13683000000003, 'dub': 289.80029206521749, 'rapcore': 217.28337586206897, 'polka': 157.12027222222221, 'tropical': 285.04771, 'island music': 247.03150846153852, 'indietronica': 240.79453999999998, 'melodic trance': 442.21475693877574, 'skate punk': 151.91045017857144, 'chicago house': 336.27620818181816, 'christian punk': 184.94373578947366, 'charanga': 249.79149043478262, 'french pop': 208.29841823529409, 'soft rock': 233.02002706827292, 'motown and soul': 280.85505499999999, 'hardcore': 166.71301499999998, 'funk': 242.27896366071431, 'qawwali': 660.46938666666676, 'video game music': 211.20999599999999, 'pinoy rock': 256.31562799999995, 'northern soul': 179.84839451612899, 'serialism': 492.56183199999998, 'merengue': 278.86276372093016, 'canadian': 161.98485500000001, 'male vocalist': 224.6556805882353, 'kirtan': 323.60444000000001, 'schranz': 346.34142200000002, 'bolero': 197.867877254902, 'underground rap': 205.74866111111112, 'mexico': 186.00770875000001, 'industrial dance': 303.44770968750004, 'bluegrass': 196.61442903846148, 'mondiovision': 246.477915, 'ambient': 514.59611999999993, 'boy band': 200.50596454545456, 'doom metal': 365.13851607142851, 'space synth': 441.18158500000004, 'alternative dance': 266.81203274647879, 'christian music': 203.07173428571429, 'italy': 
304.32607999999999, 'slow core': 288.70267541666664, 'jug band': 215.09416636363633, 'rock opera': 216.25859333333332, 'psychedelic pop': 359.77533, 'synthpop': 263.4884405263158, 'rhythm noise': 255.22767032258062, 'electro': 282.49929926056348, 'lovers rock': 245.41995435294123, 'nu breaks': 413.54925999999995, 'manouche': 185.85206333333335, 'finish': 201.41669000000002, 'kizomba': 266.20596142857141, 'digital hardcore': 190.87845749999997, 'rockabilly': 164.22239719999996, 'freak folk': 253.38492636363637, 'celtic': 244.80909115384623, 'poland': 237.19138333333333, 'punk pop': 144.50892999999999, 'hip house': 272.59155418604644, 'ccm': 244.98817494186048, 'psychobilly': 180.59006418803418, 'swiss': 226.91219749999999, 'soukous': 324.03285300000005, 'southern rock': 242.36771503401363, 'parody': 187.68852572916663, 'madchester': 214.64335499999996, 'tech house': 354.14623134814792, 'jazz fusion': 334.60671166666668, 'choro': 199.69261, 'chamber jazz': 271.13626190476191, 'chinese music': 241.78301962264146, 'neofolk': 258.98621323232328, 'baroque music': 207.75137999999998, 'power pop': 211.55604117647061, 'alternative': 240.17171818181816, 'cabaret': 210.58555336283192, 'nederpop': 216.23744714285712, 'british pop': 233.60882007434955, 'bhangra': 274.99943905660388, 'samba': 208.50571876923084, 'experimental pop': 207.59900166666668, 'musical theater': 170.82730000000001, 'easy listening': 213.18071791366896, 'ska': 198.32095035087724, 'new romantic': 266.46951767857144, "rock 'n roll": 198.20698642857147, 'experimental rock': 270.76678658878478, 'concerto': 286.05548578947366, 'all-female': 164.86765142857143, 'dark cabaret': 186.59219999999999, 'fusion': 225.4624, 'big band': 172.80825333333334, 'desi': 319.3551916666666, 'electro-industrial': 277.54403666666673, 'hard bop': 354.8384454893615, 'world': 229.68474090909092, 'banda': 182.47007719999999, 'symphonic metal': 332.9731829411765, 'trip rock': 303.55546500000003, 'calypso': 234.9686702941176, 
'brazilian jazz': 253.57015909090904, 'electronic': 309.76370249999997, 'glitch': 281.75852817204304, 'free jazz': 423.16479455357148, 'latin pop': 236.24784040000006, 'chorus': 162.37669199999999, 'garage': 332.61669000000001, 'psychedelic rock': 242.7085678740157, 'classic female blues': 189.25668888888887, 'classical': 308.51522658536578, 'blues-rock': 250.24694395532256, 'female vocalist': 226.18809804878055, 'scottish': 225.77587499999998, 'honky tonk': 171.90831016042787, 'big beat': 266.76355238410605, 'italian disco': 249.08025355704697, 'sympho black metal': 300.21330976744184, 'zeuhl': 510.26675666666665, 'progressive rock': 287.97442884615384, 'vallenato': 267.02833130434783, 'crossover thrash': 160.22884333333332, 'swamp blues': 253.19399400000003, 'vocal house': 334.18077125000008, 'modern rock': 248.64349251336893, 'choral music': 208.66567000000001, 'free music': 236.14648333333332, 'shoegaze': 247.18828428571442, 'contemporary classical music': 178.55301750000001, 'afrobeat': 313.49259283018876, 'stand-up comedy': 162.09038800000005, 'gypsy jazz': 209.84732558823535, 'outlaw country': 196.2344558823529, 'spanish pop': 214.17750666666666, 'viking metal': 273.16278787878792, 'minimal': 462.12888777777778, 'doo-wop': 172.16683656976741, 'gaita': 237.91908095238094, 'requiem': 250.52036375, 'new orleans jazz': 304.0887975, 'symphonic rock': 276.4709234951456, 'kwaito': 264.01913999999999, 'progressive trance': 411.73024784313719, 'downtempo': 293.53578514970064, 'folk metal': 291.99628117647057, 'oi': 176.68382877005351, 'cowboy': 248.99872999999999, 'broken beat': 280.67251083333332, 'melbourne': 265.52444500000001, 'string quartet': 335.67861357142857, 'exotica': 165.40253999999999, 'anti-folk': 203.51457190476197, 'ghetto tech': 241.28785818181814, 'political': 180.29669000000001, 'smooth jazz': 265.80830467796602, 'beatboxing': 239.88200000000001, 'british invasion': 203.4000365, 'united states': 140.43383, 'thrash metal': 176.69179, 'mambo': 
177.31624190476191, 'progressive house': 350.97560411764709, 'swamp rock': 256.51329000000004, 'math rock': 226.0962026262626, 'madagascar': 224.16933999999998, 'adult contemporary': 240.97914, 'soca': 264.93913874999998, 'funky house': 267.74884555555559, 'lambada': 212.11383000000001, 'buddamat': 279.32688999999999, 'brazilian': 218.90566999999999, 'industrial metal': 281.17493800000005, 'goa': 333.29913875, 'grunge': 243.87094394230766, 'pop rap': 236.70119804635766, 'space music': 241.07056666666668, 'boogie-woogie': 229.05080368421056, 'electroclash': 229.60243800000003, 'game': 180.3184588888889, 'drum and bass': 342.23410333333339, 'psychedelic trance': 437.33268136363625, 'horror punk': 172.19499809523811, 'jrock': 260.48514571428575, 'harmonica blues': 247.57042096774191, 'skacore': 298.76199624999998, 'rumba': 228.98594857142857, 'texas blues': 232.94945409090909, 'tejano': 210.13551944881891, 'roots reggae': 233.29176468468478, 'shibuya-kei': 360.07137999999998, '8-bit': 266.65315249999998, 'irish': 201.18681200000003, 'estonia': 435.06893000000002, 'classic': 462.76395400000001, 'cover': 240.22158999999999, 'lo-fi': 96.652609999999996, '2 tone': 198.31682833333332, 'hardcore hip hop': 271.50757333333337, 'portugese': 228.10077199999995, 'zouklove': 269.87056999999999, 'filk': 113.56502714285715, 'religious music': 325.83568365853654, 'speed garage': 173.43170999999998, 'instrumental pop': 244.16341657407403, 'country music': 224.22485250000003, 'guitarist': 253.85750999999999, 'brill building pop': 179.74555705882361, 'dream pop': 246.03559028846146, 'new age': 252.93202571428574, 'club': 349.21525465517249, 'synth punk': 223.89179249999995, 'electro hip hop': 293.96417000000002, 'jangle pop': 209.29875470588235, 'baile funk': 143.82974999999999, 'reggae': 231.48974550632906, 'country gospel': 196.15254946428573, 'celtic rock': 254.21312795454548, 'crunk': 198.85233555555558, 'tech metal': 304.40444500000001, 'post-hardcore': 224.57131626865669, 
'massachusetts': 184.02958700000002, 'underground hip hop': 249.18158499999998, 'song writer': 216.06412666666665, 'national socialist black metal': 379.81995499999999, 'remix': 218.68362500000001, 'close harmony': 189.94062181818182, 'chinese rock': 196.44036, 'mento': 224.15628000000001, 'country blues': 189.33429386861317, 'new prog': 297.03791000000001, 'comedy rap': 283.04673199999996, 'detroit techno': 328.05448117647057, 'dance pop': 257.37778266666663, 'zydeco': 220.21544880000008, 'bubblegum pop': 202.34186, 'show tunes': 203.50264441176478, 'blackened death metal': 262.17895777777778}

In [15]:
#shortcut to populate the per-genre duration spread (gv) from a sample (root folder A); selectSong uses gd[genre] +/- gv[genre]*std as its length thresholds
gv={'folk rock': 113.95439009839326, 'power metal': 92.8263398393315, 'dance': 118.67287598986474, 'speedcore': 97.70867652464301, 'folktronica': 54.42910984360048, 'ost': 0.0, 'swamp pop': 34.645704315761854, 'heavy metal': 110.65735254620664, 'meditation': 228.64854619496157, 'classic rock': 134.41313155278334, 'alternative rock': 112.84839579145847, 'ballet': 42.0882690463699, 'turkish': 0.0, 'dirty rap': 81.35467717376777, 'glam rock': 104.95373239230096, 'heartland rock': 75.27241598180787, 'alternative hip hop': 84.32952532910836, 'early music': 184.98248016389138, 'dancehall': 169.03375867515123, 'rap metal': 66.08652969539287, 'marimba': 95.00105784615113, 'piano rock': 55.34686054811072, 'bass music': 89.82654255304968, 'speed metal': 116.4649315302803, 'turnablism': 142.7354739000985, 'power violence': 126.08882365195524, 'neo classical metal': 112.04304967085967, 'crust punk': 107.84326275432994, 'juke joint blues': 28.538780000000003, 'jazz blues': 49.85750940755272, 'noise pop': 89.5916573471023, 'british blues': 64.64530183289278, 'hardcore techno': 193.28821208696812, 'dark pop': 49.90693999999999, 'black metal': 122.2104438573278, 'post rock': 130.62163672774415, 'nortec': 0.0, 'christian metal': 55.99270895351843, 'riot grrrl': 57.360710250372264, 'illbient': 126.41458975536341, 'alternative country': 51.52399739090099, 'japanese': 68.49664991832847, 'hard trance': 126.45484667439878, 'rock steady': 57.6086667045755, 'folk': 108.29781873007367, 'intelligent dance music': 123.07012766321387, 'garage rock': 78.11561950180666, 'indie rock': 78.52010564019315, 'new jack swing': 48.62965737380737, 'orchestra': 229.80917217607453, 'math-core': 87.40907795570462, 'funeral doom': 33.78939, 'nu-soul': 11.89056712593081, 'celtic punk': 40.03155070191996, 'christian': 64.22204, 'mbalax': 54.26147784940376, 'oldies': 72.42033084267887, 'groove metal': 84.02064149537253, 'melodic metalcore': 47.85275704221697, 'surf music': 65.59249492377714, 'ragtime': 
68.00748204070813, 'german rap': 0.0, 'boogaloo': 134.66863394789976, 'metalcore': 67.7047352179025, 'screamo': 69.3356214365777, 'shock rock': 72.16505400244274, 'chant': 123.70285999999999, 'glam metal': 37.664245135643846, 'patriotic': 164.48648183131706, 'gangster rap': 71.83153168579294, 'mbaqanga': 70.89256464689663, 'sxsw 2007': 101.64244999999998, 'protopunk': 236.46522330314755, 'trip hop': 104.12679613957675, 'jazz funk': 127.4751808010732, 'irish folk': 73.48365737239733, 'world beat': 138.39444825996102, 'africa': 12.460405000000009, 'hardcore metal': 57.6696375871281, 'melodic black metal': 78.197555, 'hard house': 109.11211561160677, 'europop': 64.77831355761408, 'disco house': 76.32570191065628, 'freakbeat': 87.94741319857954, 'east coast hip hop': 15.889874108039686, 'uk garage': 106.95226813051384, 'symphony': 162.16353363260598, 'jive': 39.43881968066765, 'ballad': 70.4888525543505, 'dubtronica': 0.0, 'straight edge': 89.74103275447389, 'relax': 78.05803746374716, 'freestyle': 90.5775031754843, 'newgrass': 0.0, 'nintendocore': 80.60888659298223, 'instrumental': 0.0, 'peace punk': 76.85499914405922, 'chicago blues': 77.96679321724304, 'world music': 96.51829598025122, 'glam': 40.2998205874024, 'karaoke': 61.660551428775385, 'grindcore': 120.01361130272083, 'vaudeville': 73.10345323577955, 'highlife': 722.869629482183, 'soul': 73.2397895881696, 'progressive bluegrass': 129.7378915416972, 'dark ambient': 121.32687481837284, 'skiffle': 62.310985001837224, 'louisiana blues': 57.62127029791869, 'indie': 87.87027798844547, 'memphis blues': 64.67862985367559, 'frenchcore': 69.49211890787974, 'gabba': 215.32457244574272, 'batucada': 0.0, 'jump blues': 69.93813216875854, 'flamenco': 81.38105016384418, 'string': 212.92692302943146, 'swedish': 10.777324357310995, 'cantonese pop': 55.92624089800679, 'san francisco bay area': 0.0, 'dark wave': 89.96210443774692, 'poetry': 0.0, 'hardcore punk': 102.52160828558017, 'musica': 0.0, 'free improvisation': 
381.31970783375033, 'neue deutsche welle': 85.38892802478705, 'ambient black metal': 402.62334527322406, 'bebop': 221.8474979278884, 'funny': 126.90547832337079, 'alternative pop rock': 0.0, 'deathgrind': 62.51667411917591, 'soundtrack': 148.04970665996186, 'comedy rock': 112.28922920397204, 'salsa': 71.73593190284275, 'jungle music': 131.21645545746097, 'canto nuevo': 16.666125000000008, 'neoclassical': 46.85058956842873, 'alternative metal': 104.97926016260631, 'visual kei': 120.31095996496695, 'tarantella': 0.0, 'gothic metal': 127.30534508455371, 'greek': 45.74429749348069, 'tex mex': 47.275097100963904, 'indie pop': 101.68450566419946, 'bossa nova': 77.94758149585421, 'minimal techno': 50.05244406582782, 'new rave': 98.81481277144704, 'brazilian pop music': 64.65098814291049, 'ragga jungle': 0.0, 'space age pop': 54.72668410404901, 'progressive metal': 134.9294454781952, 'arabesque': 62.10983713544336, 'urban folk': 45.821682916381626, 'instrumental rock': 186.80587213293083, 'klezmer': 120.51437243129939, 'detroit rap': 0.0, 'happy hardcore': 103.32139193969958, 'german pop': 59.37016802453245, 'stoner rock': 94.8410248098497, 'christmas music': 0.0, 'fado': 56.1671800264254, 'slack key guitar': 113.09527867086813, 'texas': 17.86775499999999, 'funk rock': 92.34886929680626, 'celtic metal': 111.05958999999999, 'chamber music': 133.01458988327872, 'modern laika': 39.893350721659665, 'space rock': 239.51179122879768, 'brutal death metal': 207.5565225188028, 'quiet storm': 81.99433426244113, 'sonata': 70.43851886681139, 'country rock': 87.7449152644931, 'teen pop': 56.28684414245064, 'comedy': 192.01079038539262, 'miami bass': 47.230068955899476, 'post-grunge': 76.76644606987954, 'opera': 182.30724916331806, 'cowpunk': 29.2596647382494, 'slide': 0.0, 'rock': 82.6135376765678, 'acoustic': 39.47863876351812, 'quartet': 109.962445, 'classic country': 30.121033224860884, 'soul music': 11.316967558054875, 'england': 0.0, 'j pop': 73.46762906645756, 'house': 
44.408307938826646, 'neo soul': 85.54972665876403, 'g funk': 37.58388829302579, 'dubstep': 77.19656155795612, 'new orleans blues': 92.6070973412221, 'bulerias': 11.820543609752757, 'blues': 73.9106650194068, 'hard rock': 81.46531806555345, 'electronica': 123.08661269921515, 'death core': 81.82969606697799, 'jazz metal': 46.94203999999999, 'future jazz': 135.2696018053122, 'new weird america': 65.61720528043374, 'sadcore': 140.6335634952445, 'dutch': 29.897144999999995, 'delta blues': 103.71308923536519, 'ska punk': 85.46045403759749, 'bass': 0.0, 'drone doom metal': 0.0, 'techno': 151.03643885689834, 'california': 77.93924023434704, 'dance-punk': 60.9189128293324, 'pop rock': 77.342007362306, 'power electronics': 148.23031874781432, 'chamber pop': 72.56996847430459, 'technical death metal': 144.1426926440334, 'soul blues': 88.3392994327214, 'punk': 57.574543181133485, 'stride': 14.100629133607553, 'swing': 32.067062898278024, 'old school hip hop': 167.20954085306604, 'nu metal': 68.43536336429106, 'industrial': 0.0, 'americana': 17.45741677759954, 'neo-progressive': 153.96250583921395, 'christian hip hop': 57.575146419595136, 'ethnic': 66.53787889486328, 'congo': 86.13257599424226, 'colombia': 50.16361074820694, 'spanish': 0.0, 'goregrind': 75.0571954382175, 'piano blues': 81.77895336050034, 'tribal house': 143.23086851768858, 'folk punk': 75.91801534414424, 'funk metal': 183.68714683775383, 'chanson': 69.77774290510051, 'world fusion': 0.0, 'mandarin pop': 32.39275506917046, 'grime': 85.18795616318303, 'tango': 108.96721950961577, 'roots rock': 88.20108624108838, 'piedmont blues': 19.84254402652346, 'sea shanties': 58.070205000000016, 'acid jazz': 99.4470843323502, 'twee pop': 72.17790067757211, 'urban': 0.0, 'reggaeton': 62.04251470662271, 'vocal jazz': 76.9719843955982, 'electro-funk': 86.04488469744705, 'folk-pop': 78.85567902957365, 'nu jazz': 108.8411069882361, 'canterbury scene': 163.68956842453457, 'christian rock': 93.32933126882241, 'italian rap': 0.0, 
'industrial rock': 79.40215289447119, 'death metal': 82.04064384181694, 'bel canto': 93.80460085344015, 'vocal': 19.337489351808962, 'eurodance': 88.54732983976514, 'sweden': 27.925306186056975, 'yodel': 10.036606320979217, 'marrabenta': 22.28245000000001, 'lullaby': 41.01034408006322, 'suomi rock': 65.5700841044947, 'merseybeat': 83.88627151351025, 'humppa': 29.963731622979495, 'rai': 91.17820196260286, 'rap': 87.28021411988267, 'paisley underground': 131.7768020125447, 'blue-eyed soul': 54.266433536969195, 'hip pop': 98.15716919991031, 'cool jazz': 218.34354901425334, 'ragamuffin': 28.43005625828247, 'trance': 130.82122717232386, 'oriental metal': 9.159147260803758, 'rap rock': 68.63151033674951, 'art rock': 153.62050115475668, 'christian hardcore': 97.58032502550383, 'new beat': 128.34584505814044, 'eurobeat': 55.60721597415364, 'frevo': 47.740347290632975, 'nardcore': 59.82938832130799, 'cello rock': 55.74531, 'electropop': 62.476065879047155, 'germany': 119.50675653841064, 'psychedelic': 110.7615961505412, 'celtic fusion': 61.52179207116142, 'southern soul': 59.88862170982696, 'singer': 6.622039999999998, 'electro rock': 64.41783085688878, 'argentine rock': 0.0, 'breakcore': 192.05719943147608, 'queercore': 38.960078218482685, 'metal': 55.64999271768798, 'hyphy': 53.00774496510094, 'dirty south rap': 78.3279996517217, 'cumbia': 69.30597575918111, 'los angeles': 76.8053203123532, 'gothic rock': 72.22370029153092, 'italian pop': 27.211462852721162, 'latin jazz': 112.05429075169693, 'indian classical': 359.0689203648659, 'murga': 29.481301619187793, 'german': 75.21656646584174, 'western swing': 35.85664264244404, 'jam band': 221.12958232970635, 'no wave': 319.92561831731626, 'radio': 0.0, 'dance rock': 88.69318201655186, 'deathrock': 65.71896004674674, 'minnesota': 49.74998845977647, 'rock argentina': 16.0, 'hardcore rap': 22.456165844844254, 'brass band': 63.46882707315132, 'deep house': 117.15329514620852, 'piano': 47.61574323172825, 'kraut rock': 
119.90460331118061, 'punk blues': 134.64504063638037, 'rebetika': 0.0, 'gagaku': 38.346351477711025, 'waltz': 414.1349852646937, 'dj': 3.1738749999999953, 'beat': 97.36992489402562, 'jazz': 126.89223157308408, 'gnawa': 82.13960066936122, 'singer-songwriter': 63.84016727246694, 'breakbeat': 122.31044231691729, 'melodic hardcore': 87.89186531489452, 'futurepop': 64.91522252804003, 'progressive': 98.20003000381426, 'dance music': 75.99960078714946, 'emo': 71.46194848542204, 'disco': 91.52787383292174, 'electric blues': 114.70631701905289, 'hardstyle': 130.9001911752585, 'hip hop': 115.6026841008109, 'southern gospel': 69.08458614184525, 'ohio': 0.0, 'gospel': 66.47241377022051, 'musette': 110.84279804889617, 'bachata': 46.02274843624269, 'outsider music': 0.0, 'dutch jazz': 0.0, 'chanson francaise': 49.781749069144915, 'traditional pop': 15.882449999999992, 'new wave': 75.6774885428614, 'stoner metal': 101.80957602806973, 'brazil': 16.365710000000007, 'chicago soul': 62.226856551798974, 'motown': 81.03362874486216, 'soul jazz': 130.08229508428835, 'thrash core': 115.9185855239933, 'r&b': 81.40407879101546, 'sludge metal': 97.33913760445328, 'east coast blues': 15.711114753802292, 'alternative rap': 0.0, 'latin': 114.7809397051674, 'chill-out': 123.6996316491575, 'pop folk': 73.30339662334278, 'zouk': 115.51489076658501, 'avantgarde metal': 97.17055942159722, 'liedermacher': 0.0, 'guitar': 65.66457997959904, 'pagode': 0.0, 'french': 33.49927651974626, 'martial industrial': 0.0, 'memphis soul': 56.87783273038489, 'country': 48.45965693042385, 'mariachi': 49.52344740185863, 'southern hip hop': 47.79853121110266, 'west coast rap': 0.0, 'memphis rap': 67.88918138579831, 'experimental': 125.19728917497514, 'electronic music': 91.01367295944067, 'bongo': 18.6572610606279, 'pop punk': 76.29043261527649, 'chiptune': 23.27236342404052, 'cuddlecore': 0.0, 'pop': 57.24775428851943, 'beach music': 17.68239729594197, 'ranchera': 36.014592444486226, 'sexy': 0.0, 'rave': 
140.09955421162638, 'melodic death metal': 50.70137798760409, 'noise rock': 142.19518728642592, 'jazz rap': 52.167271258359705, 'dub': 121.69641916110051, 'rapcore': 52.21319152297761, 'polka': 46.71134425313323, 'tropical': 0.0, 'island music': 83.79629380596475, 'indietronica': 69.95028058414982, 'melodic trance': 105.99744491081495, 'skate punk': 61.875143587520576, 'chicago house': 118.9457550670983, 'christian punk': 45.40001280274434, 'charanga': 72.87091740341036, 'french pop': 63.33099554330912, 'soft rock': 92.5890706073165, 'motown and soul': 40.711834999999994, 'hardcore': 147.21036322371177, 'funk': 92.84364711193567, 'qawwali': 485.03477092222687, 'video game music': 51.074709262370796, 'pinoy rock': 44.588481190722966, 'northern soul': 39.876137411083484, 'serialism': 404.71894410668693, 'merengue': 130.9514984927092, 'canadian': 15.960814999999997, 'male vocalist': 52.57006468939584, 'kirtan': 0.0, 'schranz': 30.97501845314472, 'bolero': 41.100227619555, 'underground rap': 58.211645849552866, 'mexico': 34.89838893920622, 'industrial dance': 133.92305809762513, 'bluegrass': 60.18718756863324, 'mondiovision': 28.32979499999999, 'ambient': 325.16860940833396, 'boy band': 45.084004178986575, 'doom metal': 232.89073054127599, 'space synth': 118.36081500000003, 'alternative dance': 91.90084745136558, 'christian music': 54.14723496812632, 'italy': 0.0, 'slow core': 205.14509285584205, 'jug band': 55.01380316510035, 'rock opera': 54.7705922061157, 'psychedelic pop': 136.6304093786082, 'synthpop': 63.09028526540379, 'rhythm noise': 81.20256415560905, 'electro': 103.03830974065141, 'lovers rock': 110.12841737313502, 'nu breaks': 70.73058201486879, 'manouche': 25.092977040245522, 'finish': 33.716734237795784, 'kizomba': 75.3278240101145, 'digital hardcore': 76.82581552998988, 'rockabilly': 49.43550474225356, 'freak folk': 65.89973999135341, 'celtic': 99.17568097234596, 'poland': 80.7734389119665, 'punk pop': 0.0, 'hip house': 135.6903304883117, 'ccm': 
71.92635379974958, 'psychobilly': 68.27737782303821, 'swiss': 23.537099292026596, 'soukous': 105.8451273908565, 'southern rock': 61.80196253034925, 'parody': 89.98385743371871, 'madchester': 27.71410980285959, 'tech house': 126.18648383592694, 'jazz fusion': 178.30156867279507, 'choro': 53.746939999999995, 'chamber jazz': 97.29696302653994, 'chinese music': 98.88196290996783, 'neofolk': 104.7958226768167, 'baroque music': 45.76653, 'power pop': 75.63885965079221, 'alternative': 161.24085987373266, 'cabaret': 90.41690034958667, 'nederpop': 51.702253965636935, 'british pop': 84.24737421371577, 'bhangra': 107.14060875090021, 'samba': 77.07685583341947, 'experimental pop': 66.41768973750001, 'musical theater': 30.210610000000003, 'easy listening': 125.15982143489775, 'ska': 57.52051939244288, 'new romantic': 58.87155432270537, "rock 'n roll": 103.60371358043993, 'experimental rock': 217.01190221329688, 'concerto': 234.7239450963316, 'all-female': 52.029973256249804, 'dark cabaret': 107.98661830127357, 'fusion': 0.0, 'big band': 23.290152543757387, 'desi': 55.46972248510516, 'electro-industrial': 92.79572900743182, 'hard bop': 160.55466866562986, 'world': 63.01511728430127, 'banda': 56.09631113759124, 'symphonic metal': 110.16515167370524, 'trip rock': 75.54448282878256, 'calypso': 87.79604508494937, 'brazilian jazz': 130.07664791938132, 'electronic': 130.87231347324104, 'glitch': 159.79214399832347, 'free jazz': 370.48120951940155, 'latin pop': 60.36939653785454, 'chorus': 20.041233159125113, 'garage': 209.5569324763781, 'psychedelic rock': 167.52454647210166, 'classic female blues': 38.89076647995066, 'classical': 273.0737522572043, 'blues-rock': 103.48140950518246, 'female vocalist': 61.051311584187424, 'scottish': 15.281635000000009, 'honky tonk': 48.970623196429926, 'big beat': 165.34893836085288, 'italian disco': 118.51055093119835, 'sympho black metal': 126.57471057054576, 'zeuhl': 137.07501865612235, 'progressive rock': 114.16650763565643, 'vallenato': 
33.06825011593228, 'crossover thrash': 102.7048418426563, 'swamp blues': 72.89472755469427, 'vocal house': 139.88295831075862, 'modern rock': 142.68342777664927, 'choral music': 0.0, 'free music': 39.362857427908544, 'shoegaze': 143.80749621236845, 'contemporary classical music': 64.19098300515438, 'afrobeat': 122.66186934113178, 'stand-up comedy': 131.88829571289577, 'gypsy jazz': 66.95544459229515, 'outlaw country': 54.18226110328498, 'spanish pop': 7.509374553655511, 'viking metal': 132.92520588362953, 'minimal': 89.21955718860508, 'doo-wop': 56.70070377765614, 'gaita': 70.50859315755214, 'requiem': 60.90938912012744, 'new orleans jazz': 197.28981639020873, 'symphonic rock': 139.14535545753967, 'kwaito': 69.95591999999999, 'progressive trance': 135.35951631839296, 'downtempo': 141.30758264288147, 'folk metal': 126.35082318019127, 'oi': 66.81776342297135, 'cowboy': 0.0, 'broken beat': 108.80167114142776, 'melbourne': 70.97498388362956, 'string quartet': 202.9835261126113, 'exotica': 40.93820473654611, 'anti-folk': 58.70121536660812, 'ghetto tech': 102.85872167204782, 'political': 0.0, 'smooth jazz': 94.17881854708504, 'beatboxing': 0.0, 'british invasion': 59.45343885744344, 'united states': 0.0, 'thrash metal': 0.0, 'mambo': 45.921883323225536, 'progressive house': 135.88600147998824, 'swamp rock': 55.456919905999705, 'math rock': 112.47271705085709, 'madagascar': 30.184489999999997, 'adult contemporary': 0.0, 'soca': 86.41656604742059, 'funky house': 119.9225998242068, 'lambada': 7.471019999999996, 'buddamat': 0.0, 'brazilian': 0.0, 'industrial metal': 124.41624000569071, 'goa': 103.94544781254785, 'grunge': 117.86366869064415, 'pop rap': 76.33743136675857, 'space music': 94.98893863335469, 'boogie-woogie': 64.09270988824204, 'electroclash': 86.72770232566654, 'game': 114.90914423867154, 'drum and bass': 66.08080431048934, 'psychedelic trance': 107.55656947575777, 'horror punk': 65.00980009231169, 'jrock': 23.052105581673533, 'harmonica blues': 
99.42714122311482, 'skacore': 337.76809243952687, 'rumba': 78.82279008616433, 'texas blues': 130.4977439476901, 'tejano': 48.61801485855947, 'roots reggae': 90.56572438567929, 'shibuya-kei': 0.0, '8-bit': 163.4252662120674, 'irish': 116.99833357888194, 'estonia': 74.03101999999998, 'classic': 141.56641886728218, 'cover': 0.0, 'lo-fi': 0.0, '2 tone': 34.61838968403423, 'hardcore hip hop': 38.8304189624941, 'portugese': 13.45417970860713, 'zouklove': 0.0, 'filk': 65.39534717474865, 'religious music': 222.54502003769414, 'speed garage': 59.741180995392114, 'instrumental pop': 148.65520728170713, 'country music': 103.52098125188893, 'guitarist': 0.0, 'brill building pop': 65.47734453865581, 'dream pop': 95.68513522473309, 'new age': 90.94771592428114, 'club': 196.2183644788199, 'synth punk': 59.613027475318, 'electro hip hop': 26.910471526743382, 'jangle pop': 52.401543975995374, 'baile funk': 0.0, 'reggae': 68.82106467081393, 'country gospel': 54.95802047358526, 'celtic rock': 84.00952163050819, 'crunk': 81.94109190243736, 'tech metal': 44.382045000000005, 'post-hardcore': 89.8052918343288, 'massachusetts': 68.40930464536532, 'underground hip hop': 38.55673499999999, 'song writer': 106.45697146272332, 'national socialist black metal': 181.81224500000002, 'remix': 13.492244999999997, 'close harmony': 103.47693761926142, 'chinese rock': 0.0, 'mento': 0.0, 'country blues': 50.68917671405578, 'new prog': 0.0, 'comedy rap': 21.66967461218041, 'detroit techno': 168.00334992839683, 'dance pop': 91.38823052308786, 'zydeco': 65.15342493782715, 'bubblegum pop': 69.87125795292606, 'show tunes': 80.22379587396037, 'blackened death metal': 76.47965604037495}

In [18]:
#cut songs that do not appear to be actual songs; criteria may be length or keywords
#length should be relative to genre since some genres are much shorter/longer than others
keywords=['interview','commentary','introduction']#,'intro']
billboard=False #set billboard to True and populate bb_hots to extract Hot 100 songs over entire dataset
stop_chars=['"',"'",",",".","?","!","(",")"," "]

def _normalizeName(name):
    """Lower-case name, remove every stop_chars character, then map '&' to 'and' and 'featuring' to 'ft'."""
    name=name.lower().strip()
    for c in stop_chars:
        name=name.replace(c,"")
    return name.replace('&','and').replace('featuring','ft')

def selectSong(hfile,std):
    """Decide whether the song stored in hfile should be kept.

    hfile -- open song file; every field is read through the hdf5_getters module (h)
    std   -- multiplier applied to gv[genre] to place the duration thresholds around gd[genre]

    Returns 1 to keep the song and 0 to drop it. The reason for each rejection
    (and for long songs / unseen genres, which are kept) is printed.
    When billboard is True only songs whose normalized (title, artist) pair
    appears in bb_hots, in either order, are kept.
    """
    title=h.get_title(hfile)

    if billboard:
        songname=_normalizeName(title)
        artistname=_normalizeName(h.get_artist_name(hfile))
        for tup in bb_hots: #check if song and artist names are reversed
            if (songname==tup[0] and artistname==tup[1]) or (songname==tup[1] and artistname==tup[0]):
                print(h.get_song_id(hfile),title,h.get_artist_name(hfile))
                return 1
            #elif songname==tup[0]:
            #    print("Mismatch in artist name",h.get_song_id(hfile),h.get_title(hfile),h.get_artist_name(hfile),tup[1])
        return 0

    lowered=title.lower()
    for keyword in keywords:
        if keyword in lowered:
            print("Bad Keyword", h.get_song_id(hfile),title,h.get_artist_name(hfile))
            return 0
    try: genre=h.get_artist_terms(hfile)[0]
    except Exception: #no usable first artist term; not a bare except so an interrupt still stops a long run
        print("Bad Genre", h.get_song_id(hfile),title,h.get_artist_name(hfile))
        return 0 #bad genre
    duration=h.get_duration(hfile)
    #missing duration must be tested BEFORE the numeric comparisons: comparing '' with a number
    #raises on Python 3, which used to be reported as "New Genre" and the song was kept
    if duration=='':
        print("No Duration", h.get_song_id(hfile),title,h.get_artist_name(hfile),duration, genre)
        return 0
    try:
        mean,spread=gd[genre],gv[genre]
    except KeyError: #not all genres will have been seen in A or be in gd/gv
        print("New Genre", h.get_song_id(hfile),title,h.get_artist_name(hfile),duration, genre)
        return 1
    if duration<(mean-spread*std): #set threshold for minimum length
        print("Short Duration", h.get_song_id(hfile),title,h.get_artist_name(hfile),duration, genre)
        return 0
    #can songs be too long to be characteristic?
    if duration>(mean+spread*std): #set threshold for maximum length
        print("Long Duration", h.get_song_id(hfile),title,h.get_artist_name(hfile),duration, genre)
        return 1 #long songs are only reported, not dropped
    return 1

In [None]:
#Run to populate songs, set basedir
#NOTE: it's necessary to look through all files to get segment data
#this takes a lot of memory so may be useful to run a portion of the alphabet at once
basedir='music_complexity/data/' #run on entire MSD; 1/26 takes ~15 minutes and 6% of memory
#filename=basedir+'TRAAABD128F429CF47.h5'
basedir='music_complexity/MillionSongSubset/data' #run over 10,000 song subset (1% of dataset); overrides the assignment above
#songs=set([])
fileDict={}
genres={} #genre hash table instead of song object set
bad_genres=0 #songs whose terms could not be used as a key of genres
for root, dirs, files in os.walk(basedir):
    files = glob.glob(os.path.join(root,'*.h5'))
    print(root)
    #try: #pass over directories previously looked at
    #    if root[22]<'N': continue
    #except: pass
    for f in files:
        hfile = h.open_h5_file_read(f)
        try: #the finally below closes the file even when a calculation raises
            numSongs=h.get_num_songs(hfile)
            if numSongs>1: print("%d songs in file" % numSongs)
            if selectSong(hfile,3): #try with three standard deviation minimum length
            #if True: #select all
                pe=calculatePitchEntropy(hfile)
                le=calculateLoudnessEntropy(hfile)
                te=calculateTimbreEntropy(hfile)
                rhe=calculateRhythmEntropy(hfile) #not named "re" so it can never shadow the regex module
                song=Song(hfile)
                #a falsy result leaves the values Song.__init__ set
                if pe: song.pentropy, song.pconEntropy = pe[0], pe[1]
                if le: song.lentropy, song.lconEntropy = le[0], le[1]
                if te: song.tentropy, song.tconEntropy = te[0], te[1]
                if rhe: song.rentropy, song.rconEntropy = rhe[0], rhe[1]
                #if song.year>0: songs.add(song)
                try:
                    genres.setdefault(song.terms,[]).append(song)
                except TypeError: #song.terms is unhashable, so it cannot key the table
                    bad_genres+=1
            #create dictionary of filename-song object for quick reference
            #fileDict[f]=song
        finally:
            hfile.close()
            del hfile
print(bad_genres, "bad songs")
#print(len(songs))
#print(song.pentropy, song.pconEntropy, song.lentropy, song.lconEntropy)
#print(song.tentropy, song.tconEntropy, song.rentropy, song.rconEntropy)

In [3]:
#fast extract to only retrieve certain features
#update: one letter in dataset (1/26) takes ~20 minutes and 4% memory
def fastExtract(basedir,features,startLetter="A",endLetter="a"): #note, lowercase > uppercase, all directories in dataset uppercase
    """Read selected features from every *.h5 file under basedir.

    basedir     -- directory to walk
    features    -- getter suffixes; "title" is read with h.get_title(hfile)
    startLetter -- top-level directories whose first character sorts below this are skipped
    endLetter   -- the walk stops at the first top-level directory whose first
                   character is >= this, so the dataset can be parsed in chunks

    Returns {song id: {feature: value}}.
    """
    songs={}
    for root, dirs, files in os.walk(basedir):
        dirs.sort() #walk alphabetically so the early exit below cannot skip wanted letters
        print(root)
        #first character of the top-level directory, taken relative to basedir
        #(a fixed index such as root[22] is only right for one particular basedir length)
        rel=os.path.relpath(root,basedir)
        if rel!=os.curdir: #basedir itself carries no letter; just scan it
            letter=rel[0]
            if letter < startLetter:
                del dirs[:] #nothing below an excluded letter is wanted, do not descend
                continue
            if letter >= endLetter: #early exit to parse dataset in chunks
                return songs
        for f in glob.glob(os.path.join(root,'*.h5')):
            hfile = h.open_h5_file_read(f)
            try:
                song=h.get_song_id(hfile)
                songs[song]={}
                for feature in features:
                    #extract only what we want; attribute lookup instead of eval on a built string
                    songs[song][feature]=getattr(h,"get_"+feature)(hfile)
            finally:
                hfile.close() #close even if a getter raises
    return songs

In [None]:
#example: read five features, starting at top-level directory "S" and stopping before "z"
songs=fastExtract(
    basedir='music_complexity/data/',
    features=["artist_name","title","key","artist_terms","mode"],
    startLetter="S",
    endLetter="z",
)

### Data Filtering

In [27]:
#additional selection criteria when reading from file
keywords=[
    'interview',
    'commentar',
    'introduction',
    'discuss',
    'conference',
    'intro',
]
stop_chars=[
    '"', "'", ",", ".", "?", "!", "(", ")", " ",
]
def additionalSelect(song):
    """Return 0 if the lower-cased song.title contains any entry of keywords, otherwise 1."""
    flagged=any(keyword in song.title.lower() for keyword in keywords)
    return 0 if flagged else 1

In [7]:
#do an additional selection across all songs
bad_keywords=set() #songs rejected by additionalSelect
total=0
qualifiers=0
for genre, members in genres.items():
    for song in members:
        total+=1
        if not additionalSelect(song):
            bad_keywords.add(song)
            continue
        qualifiers+=1
print(total, qualifiers, len(bad_keywords))
SONG_LENGTH=total #number of songs seen by this pass

993214 988754 4460


In [8]:
#find duplicates based on artistname and songname
stop_chars=['"',"'",",",".","?","!","(",")"," "]
songs={} #normalized artistname+songname -> list of Song objects sharing that key
for genre in genres:
    #print(genre)
    for song in genres[genre]:
        #title and artist are normalized separately and only then concatenated
        songname=song.title.lower()
        artistname=song.aname.lower()
        for c in stop_chars: #drop punctuation and spaces
            songname=songname.replace(c,"")
            artistname=artistname.replace(c,"")
        songname=songname.replace('&','and').replace('featuring','ft')
        artistname=artistname.replace('&','and').replace('featuring','ft')
        songs.setdefault(artistname+songname,[]).append(song)
#function-call form like every other print in this notebook; the bare print statement is a SyntaxError on Python 3
print(len(songs))

913691


In [116]:
#write duplicates to file
#written under music_complexity/ because the cells that read this file back open "music_complexity/my_duplicates.txt"
with open("music_complexity/my_duplicates.txt",'w') as f:
    for key in songs:
        if len(songs[key])>1: #only keys shared by more than one song
            f.write('%'+key+'\n') #'%' marks a key line; the song ids for that key follow, one per line
            for song in songs[key]: f.write(song.song+'\n')

In [8]:
#check to see if our duplicates are part of official duplicate set
#NOTE: IDs do not match, these begin with S, not T; must check artistname+songname keys
stop_chars=['"',"'",",",".","?","!","(",")"," ","-"] #add hyphen stop character
duplicate_keys=set([]) #MSD duplicates
my_duplicates=set([]) #my duplicates
#paths use forward slashes like the other cells: "\d" and "\m" are not valid string escapes
#and a backslash is a path separator on Windows only
with open("music_complexity/duplicates.txt") as f:
    for line in f:
        if line[0]=='%': #key line; every other line is ignored here
            index=line.find(" ") #filter out numeric counter
            line=line[index:].strip().lower()
            for c in stop_chars:
                line=line.replace(c,"")
            line=line.replace('&','and').replace('featuring','ft')
            duplicate_keys.add(line)
print(len(duplicate_keys),"MSD duplicate keys")
with open("music_complexity/my_duplicates.txt") as f2:
    for line in f2:
        if line[0]=='%':
            my_duplicates.add(line.strip().replace("%","")) #drop the '%' marker, keep the key text
print(len(my_duplicates),"my duplicates")
#compare the two key sets
union=duplicate_keys.union(my_duplicates)
intersection=duplicate_keys.intersection(my_duplicates)
MSD_only=duplicate_keys.difference(my_duplicates)
mine_only=my_duplicates.difference(duplicate_keys)
print("%d alltogether, %d in both, %d only in MSD, %d only in my set" % (len(union),len(intersection),len(MSD_only),len(mine_only)))

53305 MSD duplicate keys
54856 my duplicates
60558 alltogether, 47603 in both, 5702 only in MSD, 7253 only in my set


In [12]:
#check length of actual duplicate songs, not artistname+songname keys
MSD_total=my_total=0 #lines that are not key lines
MSD_keys=my_keys=0 #key lines
with open("music_complexity/duplicates.txt") as f:
    for line in f:
        if line[0] in ('%','#'): #in the MSD file both '%' and '#' lines are counted as keys
            MSD_keys+=1
            continue
        MSD_total+=1
with open("music_complexity/my_duplicates.txt") as f:
    for line in f:
        if line[0]!='%':
            my_total+=1
        else:
            my_keys+=1
print(MSD_total,my_total,MSD_keys,my_keys,my_total-my_keys)

131661 134379 53477 54856 79523


In [9]:
#save dataset of duplicate songs
duplicates=set()
for key in songs:
    #the first song stored under a key is kept; every later one is recorded as a duplicate
    duplicates.update(songs[key][1:])
print(len(duplicates))

79523


In [11]:
#remove references to (my) duplicates and songs with bad keywords, depends on sets being populated above
#NOTE: first filter out all duplicates, then worry about multiple versions!
new_genres={}
total=0 #songs examined
new_total=0 #songs kept
print(len(duplicates),"duplicates")
print(len(bad_keywords),"bad keywords")
print(len(duplicates.intersection(bad_keywords)),"intersection of duplicates and bad keywords")

for genre in genres:
    survivors=[]
    for song in genres[genre]:
        total+=1
        if song in duplicates or song in bad_keywords:
            continue #delete duplicates and keyword hits
        survivors.append(song)
        new_total+=1
    new_genres[genre]=survivors
print(len(duplicates),len(bad_keywords))
genres=new_genres #reduction to 909514 songs
print(total,new_total)

79523 duplicates
4460 bad keywords
283 intersection of duplicates and bad keywords
79523 4460
913691 909514


In [42]:
#check for duplicate ids
songids={} #song id -> every Song object carrying that id
for genre in genres:
    for song in genres[genre]:
        if song.song not in songids:
            songids[song.song]=[]
        songids[song.song].append(song)
print(len(songids))

909419


In [13]:
#NOTE: duplicates (of song id) are often different versions of the same song, filter them out
#note the curious cases of SOLSLVG12A8C144092, SOBHHUS12A58A78589, SOSOWBY12A6D4F4D45, SOWIZVD12AAFF44372, SODHZES12A813557D6, SOGCDKF12A8C140740
duplicate_versions=set() #set of songs where multiple songs have the same id
print(SONG_LENGTH-len(songids))
for key in songids:
    #keep first version, collect every later one
    for song in songids[key][1:]:
        #print(song.song,song.aname,song.title,song.duration)
        duplicate_versions.add(song)
print(len(duplicate_versions))

83795
95


In [14]:
#remove references to song ids with multiple versions
new_genres={}
total=0 #songs examined
new_total=0 #songs kept
print(len(duplicate_versions),"duplicate versions")
for genre in genres:
    survivors=[]
    for song in genres[genre]:
        total+=1
        if song in duplicate_versions:
            continue #delete duplicates
        survivors.append(song)
        new_total+=1
    new_genres[genre]=survivors
genres=new_genres
print(total,new_total) #reduction to 909419

95 duplicate versions
909514 909419


In [87]:
#check only for duplicates that have multiple songids (because our songids are unique)
#uses official MSD duplicate list
#NOTE(review): the membership test below treats songs as keyed by song id, while the
#duplicate-detection cell builds songs keyed by artistname+songname - confirm which songs is in scope
title_songs={} #key line of the MSD list -> set of our song ids listed under it
with open("music_complexity/duplicates.txt") as fread:
    for line in fread:
        entry=line.strip()
        if line[0] in ('%','#'): #a key line opens a new group
            title=entry
            title_songs[title]=set()
            continue
        songid=song_tracks[entry] #song_tracks maps the id on this line to a song id
        if songid in songs: #it's in our filtered set
            title_songs[title].add(songid)

multi_titles={name for name, ids in title_songs.items() if len(ids)>1}
print(len(title_songs),len(multi_titles))

53477 3367


In [93]:
#to us, these tracks are equivalent so keep the first one (6890 total duplicates so 3367 should be kept, 3523 removed)
#the ids of each group are sorted once so that "first" is the same id on every run;
#indexing list(set) depended on the set's arbitrary iteration order and rebuilt the list per element
removeids=[]
for title in multi_titles:
    ordered=sorted(title_songs[title]) #every group in multi_titles has at least two ids
    skipped=ordered[0] #the id that is kept for this group
    removeids.extend(ordered[1:])
print(len(removeids))

3523


In [95]:
#perform the filter, should leave 909419-3523=905,896
#removeids is a list; testing membership against it for every song scans the whole list each time,
#so build a set once and test against that instead
removeid_set=set(removeids)
new_songs={song for song in allsongs if song.song not in removeid_set}
print(len(new_songs))

905896


In [100]:
#write new filtering to file for easier retrieval later
#each song becomes a 16-line record: id, artist name, title, duration, tempo, year, terms,
#totalEntropies, then the eight pitch/loudness/timbre/rhythm entropy values
total=0
#utf8 is given explicitly because the notebook reads these files back with encoding="utf8";
#the with-block also closes the file, so no separate close() is needed
with open("music_complexity/allsongs_filtered_new.txt~",'w',encoding="utf8") as fileInput:
    for song in new_songs:
        total+=1
        fileInput.write(song.song+'\n')
        fileInput.write(song.aname+'\n')
        fileInput.write(song.title+'\n')
        fileInput.write(str(song.duration)+'\n')
        fileInput.write(str(song.tempo)+'\n')
        fileInput.write(str(song.year)+'\n')
        #terms may be missing or not a string; write an empty line so the record keeps 16 lines
        try: fileInput.write(song.terms+'\n')
        except (TypeError,AttributeError): fileInput.write('\n')
        for value in (song.totalEntropies,
                      song.pentropy,song.pconEntropy,
                      song.lentropy,song.lconEntropy,
                      song.tentropy,song.tconEntropy,
                      song.rentropy,song.rconEntropy):
            fileInput.write(str(value)+'\n')
print(total, total*16) #total, number of lines

905896 14494336


#### Billboard Hot 100

In [15]:
#billboard Hot 100 data from EvolutionPopUSA_MainData
#from Mauch, Matthias (2015): Main Dataset for "Evolution of Popular Music: USA 1960–2010". figshare. Dataset.
#bb_hots collects normalised (song name, artist name) pairs
bb_hots=set([])
stop_chars=['"',"'",",",".","?","!","(",")"," "]
with open('music_complexity/EvolutionPopUSA_MainData.csv') as fileInput:
    f=fileInput.readline() #header
    for line in fileInput:
        #NOTE(review): plain comma split; a quoted field containing a comma would shift the columns
        columns=line.split(",")
        names=[]
        for raw in (columns[3],columns[1]): #column 3 is used as song name, column 1 as artist name
            text=raw.lower().strip()
            for c in stop_chars:
                text=text.replace(c,"")
            names.append(text.replace('&','and').replace('featuring','ft'))
        songname,artistname=names
        bb_hots.add((songname,artistname))
print(len(bb_hots))

17009


In [16]:
#build bb_overlap from new_songs; depends on bb_hots and stop_chars defined above
#each song's title and artist name are normalised the same way as the spreadsheet entries,
#then looked up in bb_hots directly instead of comparing against every tuple in turn
bb_overlap=set([])
i=0
for song in new_songs:
    if i%100000==0: print(i) #progress counter
    i+=1
    songname=song.title.lower()
    artistname=song.aname.lower()
    for c in stop_chars:
        songname=songname.replace(c,"")
        artistname=artistname.replace(c,"")
    songname=songname.replace('&','and').replace('featuring','ft')
    artistname=artistname.replace('&','and').replace('featuring','ft')
    #second lookup checks if song and artist names are reversed
    if (songname,artistname) in bb_hots or (artistname,songname) in bb_hots:
        bb_overlap.add(song)
print(len(bb_overlap))

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
6661


In [18]:
#WRITE bb song data to single file for easier retrieval later
#same 16-line record layout as the filtered song file: id, artist name, title, duration, tempo,
#year, terms, totalEntropies, then the eight pitch/loudness/timbre/rhythm entropy values
total=0
#the with-block closes the file even if a write fails; utf8 matches how the notebook reads these files
with open('~allsongs_bb_filtered_mauch_data.txt~','w',encoding="utf8") as fileInput:
    for song in bb_overlap:
        total+=1
        fileInput.write(song.song+'\n')
        fileInput.write(song.aname+'\n')
        fileInput.write(song.title+'\n')
        fileInput.write(str(song.duration)+'\n')
        fileInput.write(str(song.tempo)+'\n')
        fileInput.write(str(song.year)+'\n')
        #terms may be missing or not a string; write an empty line so the record keeps 16 lines
        try: fileInput.write(song.terms+'\n')
        except (TypeError,AttributeError): fileInput.write('\n')
        for value in (song.totalEntropies,
                      song.pentropy,song.pconEntropy,
                      song.lentropy,song.lconEntropy,
                      song.tentropy,song.tconEntropy,
                      song.rentropy,song.rconEntropy):
            fileInput.write(str(value)+'\n')
print(total, total*16)

6661
6661 106576


#### Alternative Classes

In [4]:
#song class for reading from data file; contains just metadata necessary for analysis
class SongData:
    """Thin song record built from one 16-field entry of the aggregate text file.

    array is a sequence of 16 strings in this order: song id, artist name, title,
    duration, tempo, year, terms, totalEntropies (printed list), pentropy,
    pconEntropy, lentropy, lconEntropy, tentropy, tconEntropy, rentropy, rconEntropy.
    Raises ValueError when a numeric field cannot be converted.
    """
    def __init__(self,array):
        #initialization from aggregate text file
        self.song=array[0]
        self.aname=array[1]
        self.title=array[2]
        self.duration=float(array[3])
        self.tempo=float(array[4])
        self.year=int(array[5])
        self.terms=array[6]
        #indexed zip, shannon, conditional, average
        #read from the array parameter (the original looked up a global named arr here);
        #a field that does not convert to floats, such as "[]" or "0", falls back to the placeholder
        try:
            self.totalEntropies=[float(l) for l in array[7].strip("[").strip("]").split(",")]
        except (ValueError,IndexError):
            self.totalEntropies=["","","",""]
        self.pentropy=float(array[8])
        self.pconEntropy=float(array[9])
        self.lentropy=float(array[10])
        self.lconEntropy=float(array[11])
        self.tentropy=float(array[12])
        self.tconEntropy=float(array[13])
        self.rentropy=float(array[14])
        self.rconEntropy=float(array[15])

In [11]:
#get song pitch codewords
def calculatePE(pitches):
    """Binarise pitch vectors into codewords and measure their entropy.

    Each vector in pitches becomes a string with '1' for every element above .5
    and '0' otherwise (strings make later comparisons simple).
    Returns (pcodes, entropy(pcodes), conditionalEntropy(pcodes)), or None when
    pitches yields no vectors.
    """
    pcodes=["".join('1' if element>.5 else '0' for element in pitch) for pitch in pitches]
    if not pcodes: return None
    return (pcodes,entropy(pcodes),conditionalEntropy(pcodes))

In [12]:
#get song loudness codewords; rounds to nearest decibel
def calculateLE(volumes):
    """Turn loudness values into integer codewords and measure their entropy.

    Every value in volumes is rounded to the nearest whole number.
    Returns (lcodes, entropy(lcodes), conditionalEntropy(lcodes)), or None when
    volumes is empty.
    """
    #alternative resolution: round(volume*5)/5 rounds to the nearest fifth of a decibel
    lcodes=[int(round(volume)) for volume in volumes]
    if not lcodes: return None
    return (lcodes,entropy(lcodes),conditionalEntropy(lcodes))

In [13]:
#create ternary timbre codewords (based on 11 dimensional vectors); depends on thresholds set above
def calculateTE(timbres):
    """Build ternary timbre codewords and measure their entropy.

    For each timbre vector, elements 1..11 are compared with the matching pair in
    the module-level thresholds: below the first bound gives '0', below the second
    gives '1', anything else gives '2'. Element 0 is not used.
    Returns (tcodes, entropy(tcodes), conditionalEntropy(tcodes)), or None when
    timbres is empty.
    """
    tcodes=[]
    for timbre in timbres:
        digits=[] #one ternary digit per dimension, joined into a string codeword
        for dim in range(1,12):
            value=timbre[dim]
            bounds=thresholds[dim-1]
            if value<bounds[0]: digits.append('0')
            elif value<bounds[1]: digits.append('1')
            else: digits.append('2')
        tcodes.append("".join(digits))
    if not tcodes: return None
    return (tcodes,entropy(tcodes),conditionalEntropy(tcodes))

In [14]:
#calculate rhythm based on beats in section
def calculateRE(segments,beats):
    """Quantise segment durations against the average beat and measure their entropy.

    segments and beats are sequences of start times. The average gap between
    consecutive beats is taken as one beat, a quarter of it as a sixteenth note,
    and each segment duration is expressed in sixteenths rounded to the nearest
    quarter (so 64th notes can be distinguished).
    Returns (rcodes, entropy(rcodes), conditionalEntropy(rcodes)), or None when
    there is no usable beat data or fewer than two segments.
    """
    #gaps between consecutive beats; a gap is skipped when either of its beat times is 0
    segDiffs=[]
    for i in range(1,len(beats)):
        if beats[i]==0 or beats[i-1]==0: continue
        segDiffs.append(beats[i]-beats[i-1])
    #ignore song with bad beat data (explicit check instead of catching the division by zero)
    if not segDiffs: return None
    average=0
    for diff in segDiffs: average+=diff
    average=average/len(segDiffs) #average beat
    sixteenth=average/4 #sixteenth note
    #a zero average beat cannot serve as a unit; treat it as bad beat data too
    if sixteenth==0: return None
    #calculate rhythm in terms of number of sixteenths based on average beat
    rcodes=[]
    for i in range(1,len(segments)):
        duration=segments[i]-segments[i-1]
        rcodes.append(round(duration/sixteenth*4)/4) #can distinguish 64th notes
    if rcodes: return (rcodes,entropy(rcodes),conditionalEntropy(rcodes))
    return None

In [219]:
#write codeword
def toString(arr):
    """Serialise the elements of arr as text, each followed by a comma."""
    return "".join(str(el)+',' for el in arr)

In [220]:
def toStringRecursive(arr):
    """Serialise a sequence of sequences: each inner one via toString, then one more comma."""
    return "".join(toString(el)+',' for el in arr)

In [16]:
#recover codeword
def fromString(line):
    """Parse a comma-terminated line into a list.

    The text after the last comma is discarded; every other piece becomes a float
    when it converts and stays a string when it does not.
    """
    def _convert(code):
        try: return float(code)
        except ValueError: return code
    pieces=line.strip().split(",")
    return [_convert(code) for code in pieces[:-1]]

In [17]:
def fromStringRecursive(line):
    """Parse a line of comma-terminated groups separated by a double comma into a list of lists.

    The text after the last double comma is discarded; each remaining group is
    handed to fromString with its trailing comma restored.
    """
    groups=line.strip().split(",,")[:-1]
    #add comma to be thrown away in the fromString function
    return [list(fromString(code+',')) for code in groups]

In [36]:
#WRITE song data to single file for easier retrieval later
#alternative to include more metadata than above (done for root folder A)
#loudness, the raw segment/pitch/timbre/volume/section/beat/bar arrays and totalEntropies are not written here
total=0
#the with-block closes the file even if a write fails part-way through
with open('~msdAentropies.txt','w',encoding="utf8") as fileInput:
    for genre in genres:
        for song in genres[genre]:
            total+=1
            fileInput.write(song.song+'\n')
            fileInput.write(song.artist+'\n')
            fileInput.write(song.aname+'\n')
            fileInput.write(song.title+'\n')
            fileInput.write(str(song.duration)+'\n')
            fileInput.write(str(song.key)+'\n')
            fileInput.write(str(song.tempo)+'\n')
            fileInput.write(str(song.timeSig)+'\n')
            fileInput.write(str(song.year)+'\n')
            #terms may be missing or None; write an empty line so the record keeps its length
            try: fileInput.write(toString(song.terms)+'\n')
            except (TypeError,AttributeError): fileInput.write('\n')
            fileInput.write(toString(song.term_weights)+'\n')
            fileInput.write(toString(song.tags)+'\n')
            fileInput.write(str(song.energy)+'\n')
            fileInput.write(str(song.mode)+'\n')
            #codewords, entropy and conditional entropy for pitch, loudness, timbre and rhythm
            #each triplet is assembled before anything is written, so a missing attribute
            #gives exactly three empty lines instead of a partial triplet plus three more
            for codes,ent,con in (('pcodes','pentropy','pconEntropy'),
                                  ('lcodes','lentropy','lconEntropy'),
                                  ('tcodes','tentropy','tconEntropy'),
                                  ('rcodes','rentropy','rconEntropy')):
                try:
                    triplet=str(getattr(song,codes))+'\n'+str(getattr(song,ent))+'\n'+str(getattr(song,con))+'\n'
                except AttributeError:
                    triplet='\n\n\n'
                fileInput.write(triplet)
            fileInput.write(str(song.afam)+'\n')
            fileInput.write(str(song.ahot)+'\n')
            fileInput.write(str(song.shot)+'\n')
            fileInput.write(toString(song.simartists)+'\n')
#NOTE(review): the loop above writes 30 lines per song, not 26 - confirm which layout is intended
print(total, total*26)

39100 1016600


In [15]:
#alternate version of SongData
class FullSongData:
    """Song record with full metadata, built from one 26-field entry of an aggregate text file.

    array is a sequence of 26 strings. Scalar fields are converted with int/float,
    list fields with fromString, and the pitch and timbre matrices with
    fromStringRecursive. Codewords and entropies are then computed from the parsed
    segments, pitches, timbres, volumes and beats.
    """
    def __init__(self,array):
        #initialization from aggregate text file
        self.song=array[0]
        self.artist=array[1]
        self.aname=array[2]
        self.title=array[3]
        self.duration=float(array[4])
        self.key=int(array[5])
        self.loudness=float(array[6])
        self.tempo=float(array[7])
        self.timeSig=int(array[8])
        self.segments=fromString(array[9])
        self.pitches=fromStringRecursive(array[10])
        self.timbres=fromStringRecursive(array[11])
        self.volumes=fromString(array[12])
        self.sections=fromString(array[13])
        self.beats=fromString(array[14])
        self.bars=fromString(array[15])
        self.year=int(array[16])
        self.terms=fromString(array[17])
        self.term_weights=fromString(array[18])
        self.tags=fromString(array[19])
        self.energy=float(array[20])
        self.mode=int(array[21])
        self.afam=float(array[22])
        self.ahot=float(array[23])
        self.shot=float(array[24])
        self.simartists=fromString(array[25])
        #entropy calculations; the arrays parsed above are reused rather than parsing the same text a second time
        pitch_stats=calculatePE(self.pitches)
        loudness_stats=calculateLE(self.volumes)
        timbre_stats=calculateTE(self.timbres)
        rhythm_stats=calculateRE(self.segments,self.beats)
        #a calculation that returns None leaves its three attributes unset
        if pitch_stats: self.pcodes, self.pentropy, self.pconEntropy = pitch_stats[0], pitch_stats[1], pitch_stats[2]
        if loudness_stats: self.lcodes, self.lentropy, self.lconEntropy = loudness_stats[0], loudness_stats[1], loudness_stats[2]
        if timbre_stats: self.tcodes, self.tentropy, self.tconEntropy = timbre_stats[0], timbre_stats[1], timbre_stats[2]
        if rhythm_stats: self.rcodes, self.rentropy, self.rconEntropy = rhythm_stats[0], rhythm_stats[1], rhythm_stats[2]
        self.totalEntropies=["","","",""] #placeholder

#### Data Retrieval

In [18]:
#READ song data from file for easier retrieval, set genres dictionary
#for msd part A, takes ~40% memory if including codewords; ~10% memory for all filtered
#records are read until the end of the file instead of for a fixed count, so the number of
#songs does not have to be known in advance (905896 filtered songs, 6661 bb, 39100 msdA)
total=0
genres={}
with open('music_complexity/allsongs_filtered_new.txt', encoding="utf8") as f:
    while True:
        first=f.readline()
        if not first: break #readline returns '' only at the end of the file
        arr=[first.strip()]
        for k in range(15): #16 lines per record for thin version, 26 for full version
            arr.append(f.readline().strip())
        #song=FullSongData(arr)
        try: song=SongData(arr)
        except (ValueError,IndexError):
            print(arr[:4]) #show the start of the record that failed to parse, then stop reading
            break
        #songs are grouped by their terms string
        genres.setdefault(song.terms,set([])).add(song)
        total+=1
print(total)
SONG_LENGTH=total

['', '', '', '']
905896


In [19]:
#READ billboard hot 100 song data from file for easier retrieval, set genres_bb dictionary
#records are read until the end of the file instead of for a fixed count (6661 Mauch)
total=0
genres_bb={}
with open('allsongs_bb_filtered_mauch_data.txt', encoding="utf8") as f:
    while True:
        first=f.readline()
        if not first: break #readline returns '' only at the end of the file
        arr=[first.strip()]
        for k in range(15): #16 lines per record
            arr.append(f.readline().strip())
        try: song=SongData(arr)
        except (ValueError,IndexError):
            print(arr) #show the record that failed to parse, then stop reading
            break
        #songs are grouped by their terms string
        genres_bb.setdefault(song.terms,[]).append(song)
        total+=1
print(total)
print(len(genres_bb))

6661
224


In [20]:
#check that billboard songs are in filtered MSD set
#SET bb and allsongs sets; should be completely intersecting if comparing ids
#flatten both genre dictionaries into sets of song objects
bb={song for songs_in_genre in genres_bb.values() for song in songs_in_genre}
allsongs={song for songs_in_genre in genres.values() for song in songs_in_genre}
print(len(allsongs),len(bb))
#the objects differ between the two sets, so the comparison is made on song ids
all_ids={song.song for song in allsongs}
bb_ids={song.song for song in bb}
bb_intersection=all_ids.intersection(bb_ids)
print(len(bb_intersection),"intersection")

905896 6661
6661 intersection
