# Comparing Year Prediction Using Complex Audio Features vs Song Lyrics

## Special Notes
1. We can write the paper in here and then hide the code when we knit it to PDF using the following command:
    (jupyter nbconvert --to pdf --template hidecode Example.ipynb
2. Bag of Words data is stored in a SQLite database that we will have to figure out how to use over spring break


In [1]:
import h5py
import numpy as np
import pandas as pd
import tables
#from sklearn.model_selection import train_test_split
#from sklearn.naive_bayes import GaussianNB

In [2]:
#################### THIS CREATES THE PANDAS DATAFRAME FROM THE DATAFILES WE HAVE ###########################
import os
import sys
import time
import glob
import scipy.io as sio
import pandas as pd
import hdf5_getters
from pandas import read_hdf

def get_all_files(basedir,ext='.h5') :
    """
    From a root directory, go through all subdirectories
    and find all files with the given extension.
    Return all absolute paths in a list.
    """
    allfiles = []
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root,'*'+ext))
        for f in files :
            allfiles.append( os.path.abspath(f) )
    return allfiles

def transfer(h5path,matpath=None,force=False):
    """
    Transfer an HDF5 song file (.h5) to a matfile (.mat)
    If there are more than one song in the HDF5 file, each
    field name gets a number happened: 1, 2, 3, ...., numfiles
    PARAM
        h5path  - path to the HDF5 song file
        matpath - path to the new matfile, same as HDF5 path
                  with a different extension by default
        force   - if True and matfile exists, overwrite
    RETURN
        True if the file was transfered, False if there was
        a problem.
        Could also raise an IOException
    NOTE
        All the data has to be loaded in memory! be careful
        if one file contains tons of songs!
    """
    # sanity checks
    if not os.path.isfile(h5path):
        print ('path to HF5 files does not exist:',h5path)
        return False
    if not os.path.splitext(h5path)[1] == '.h5':
        print ('expecting a .h5 extension for file:',h5path)
        return False
    # check matfile
    if matpath is None:
        matpath = os.path.splitext(h5path)[0] + '.mat'
    if os.path.exists(matpath):
        if not force: 
            print('matfile',matpath,'already exists (delete or force):')
            return False
    # get all getters! we assume that all we need is in hdf5_getters.py
    # further assume that they have the form get_blablabla and that's the
    # only thing that has that form
    getters = list(filter(lambda x: x[:4] == 'get_', hdf5_getters.__dict__.keys()))
    getters.remove("get_num_songs") # special case
    # open h5 file
    h5 = hdf5_getters.open_h5_file_read(h5path)
    # transfer
    nSongs = hdf5_getters.get_num_songs(h5)
    matdata = {'transfer_note':'transferred on '+time.ctime()+' from file: '+h5path}
    try:
        # iterate over songs
        for songidx in range(nSongs):
            # iterate over getter
            for getter in getters:
                gettername = getter[4:]
                if nSongs > 1:
                    gettername += str(songidx+1)
                data = hdf5_getters.__getattribute__(getter)(h5,songidx)
                matdata[gettername] = data
    except MemoryError:
        print('asdfasdfasdfasdf')
        raise
    finally:
        # close h5
        h5.close()
    # create
    return matdata



h5s = get_all_files('MillionSongSubset/data/')

df = pd.DataFrame()
for file in h5s:
    xd = transfer(file)
    df = df.append(pd.Series(xd), ignore_index=True)

df.head(10)

Unnamed: 0,analysis_sample_rate,artist_7digitalid,artist_familiarity,artist_hotttnesss,artist_id,artist_latitude,artist_location,artist_longitude,artist_mbid,artist_mbtags,...,tatums_confidence,tatums_start,tempo,time_signature,time_signature_confidence,title,track_7digitalid,track_id,transfer_note,year
0,22050.0,16971.0,0.55746,0.386152,b'AREJXK41187B9A4ACC',46.71067,b'France',1.71819,b'c43bb0d6-94d7-410f-80fb-e5a243b18d23',[],...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.04257, 0.28357, 0.52458, 0.76558, 1.00659, ...",124.059,4.0,0.372,b'Je Sais Que La Terre Est Plate',3073568.0,b'TRARRZU128F4253CA2',transferred on Fri Mar 1 19:59:51 2019 from f...,2008.0
1,22050.0,92108.0,0.626958,0.43486,b'AR2XRFQ1187FB417FE',,b'',,b'a69cd724-2f57-4ed0-bfed-ba20401eb84c',[],...,"[0.365, 0.321, 0.29, 0.264, 0.238, 0.217, 0.20...","[0.59161, 0.84042, 1.08799, 1.33555, 1.58436, ...",80.084,4.0,0.533,b'On Efface',4249244.0,b'TRARRJL128F92DED0E',transferred on Fri Mar 1 19:59:51 2019 from f...,2004.0
2,22050.0,1701.0,0.425724,0.0,b'ARODOO01187FB44F4A',,b'',,b'60bd8a1c-c093-4849-8f28-08101ca059b1',[],...,"[0.307, 0.305, 0.291, 0.284, 0.281, 0.28, 0.27...","[1.11537, 1.39701, 1.67864, 1.95344, 2.23097, ...",54.874,4.0,0.0,b'Howells Delight',5436063.0,b'TRARRUZ128F9307C57',transferred on Fri Mar 1 19:59:52 2019 from f...,0.0
3,22050.0,92184.0,0.611495,0.33452,b'ARJGW911187FB586CA',,b'',,b'44b5b950-2ae2-403a-8c67-82d8fc72033d',[],...,"[0.731, 0.628, 0.526, 0.359, 0.287, 0.246, 0.2...","[0.11929, 0.39309, 0.65603, 0.91083, 1.17222, ...",77.15,3.0,0.369,b'Martha Served',1199928.0,b'TRARRWA128F42A0195',transferred on Fri Mar 1 19:59:52 2019 from f...,2007.0
4,22050.0,278655.0,0.367255,0.311616,b'AR9HQ6Y1187FB3C2CB',,b'',,b'0e6524bd-6641-46a6-bce5-96f06c19aa46',[],...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.24248, 0.49266, 0.74283, 0.99301, 1.24318, ...",120.382,4.0,1.0,b'Zip-A-Dee-Doo-Dah (Song of the South)',8493899.0,b'TRARRPG12903CD1DE9',transferred on Fri Mar 1 19:59:52 2019 from f...,0.0
5,22050.0,16313.0,0.601306,0.363676,b'ARDPTGD1187B9AD361',36.87652,"b'Sikeston, MO'",-89.58828,b'097c86bc-b2b6-4791-833b-39b5385fe3e6',"[b'fusion', b'jazz fusion', b'classic pop and ...",...,"[0.237, 0.235, 0.234, 0.233, 0.226, 0.218, 0.2...","[0.11365, 0.26495, 0.41625, 0.56755, 0.71126, ...",99.024,4.0,1.0,b'Liquid Time (composition by John Goodsall)',5570526.0,b'TRARRER128F9328521',transferred on Fri Mar 1 19:59:52 2019 from f...,0.0
6,22050.0,21896.0,0.709011,0.553566,b'ARV8T9T1187B99F3F4',,b'',,b'efaefde1-e09b-4d49-9d8e-b1304d2ece8d',[b'finnish'],...,"[0.161, 0.154, 0.134, 0.117, 0.103, 0.094, 0.0...","[0.50326, 0.84168, 1.1801, 1.51853, 1.86368, 2...",175.673,4.0,0.0,b'Misery Path (From the Privilege of Evil)',2999402.0,b'TRARRYC128F428CCDA',transferred on Fri Mar 1 19:59:52 2019 from f...,0.0
7,22050.0,98670.0,0.548022,0.440135,b'ARJ5BEW1187FB52361',,b'',,b'de885e5b-284d-4dac-954c-48c7d7e2ebe5',[],...,"[0.351, 0.325, 0.44, 0.372, 0.364, 0.315, 0.30...","[0.09174, 0.41036, 0.73235, 1.09641, 1.45872, ...",87.999,4.0,0.954,b'Nuovi Re pt. I I (feat. Tek money - Lady Tam...,1283021.0,b'TRARROY128F42281F7',transferred on Fri Mar 1 19:59:52 2019 from f...,0.0
8,22050.0,94403.0,0.737038,0.539245,b'AR050VJ1187B9B13A7',,b'',,b'37c78aeb-d196-42b5-b991-6afb4fc9bc2e',"[b'punk', b'california', b'san francisco', b'r...",...,"[0.767, 0.742, 0.721, 0.704, 0.685, 0.672, 0.6...","[0.06784, 0.23214, 0.39241, 0.5559, 0.7194, 0....",92.897,4.0,0.879,b'Halloween',1959132.0,b'TRARREF128F422FD96',transferred on Fri Mar 1 19:59:52 2019 from f...,1982.0
9,22050.0,263016.0,0.435915,0.358149,b'AR8KUS11187B98C991',,b'',,b'050ce7ea-0935-430f-bcec-b83e702298eb',[],...,"[0.578, 0.485, 0.418, 0.349, 0.293, 0.212, 0.1...","[0.39517, 0.73361, 1.07887, 1.41731, 1.75062, ...",86.981,5.0,0.688,b'Parto em terras distantes',3779273.0,b'TRARRVB128F92F47CA',transferred on Fri Mar 1 19:59:52 2019 from f...,1998.0


In [4]:
############ HERE WE WILL CLEAN THE DATA ########################

# All songs have the same analysis_sample_rate so I remove it
newdf = df.drop("analysis_sample_rate",axis=1)

# Remove rows that have year==0 (no year) because we cannot use these data points
newdf = newdf[newdf.year != 0]

newdf.shape

#!pip install ipython-cache
#import cache_magic
#%cache DF = newdf

%cache magic is now registered in ipython
loading cached value for variable 'DF'. Time since pickling  7:28:50.702728


In [5]:
workingData = DF

In [6]:
workingData = workingData.drop(["artist_familiarity","artist_location","artist_mbtags", "artist_mbtags_count", "title"], axis=1)

workingData = workingData.drop(["artist_7digitalid", "artist_id", "artist_mbid", "artist_name", "artist_playmeid", "artist_terms", "artist_terms_freq"], axis=1)

workingData = workingData.drop(["artist_terms_weight", "audio_md5", "transfer_note", "similar_artists"], axis=1)

workingData = workingData.drop(["track_id", "track_7digitalid","song_id","release","release_7digitalid"], axis=1)

workingData.head(5)


Unnamed: 0,artist_hotttnesss,artist_latitude,artist_longitude,bars_confidence,bars_start,beats_confidence,beats_start,danceability,duration,end_of_fade_in,...,segments_start,segments_timbre,song_hotttnesss,start_of_fade_out,tatums_confidence,tatums_start,tempo,time_signature,time_signature_confidence,year
0,0.386152,46.71067,1.71819,"[0.179, 0.373, 0.127, 0.015, 0.012, 0.119, 0.0...","[0.52458, 2.4444, 4.3658, 6.30006, 8.22086, 10...","[0.886, 0.725, 0.748, 0.721, 0.784, 0.417, 0.4...","[0.04257, 0.52458, 1.00659, 1.48632, 1.96605, ...",0.0,148.74077,0.192,...,"[0.0, 0.19188, 0.60499, 0.84939, 1.10018, 1.36...","[[0.0, 171.13, 9.469, -28.48, 57.491, -50.067,...",0.547953,141.607,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.04257, 0.28357, 0.52458, 0.76558, 1.00659, ...",124.059,4.0,0.372,2008.0
1,0.43486,,,"[0.716, 0.306, 0.015, 0.572, 0.529, 0.353, 0.2...","[8.69527, 12.01758, 14.81363, 20.09003, 22.693...","[0.773, 0.247, 0.402, 0.115, 0.092, 0.436, 0.5...","[0.59161, 1.33555, 2.07949, 2.82093, 3.56238, ...",0.0,252.99546,0.514,...,"[0.0, 0.51379, 0.89138, 1.30912, 1.65297, 2.05...","[[0.008, 170.986, 9.126, -28.685, 57.183, -50....",0.475638,241.424,"[0.365, 0.321, 0.29, 0.264, 0.238, 0.217, 0.20...","[0.59161, 0.84042, 1.08799, 1.33555, 1.58436, ...",80.084,4.0,0.533,2004.0
3,0.33452,,,"[0.095, 0.147, 0.157, 0.191, 0.115, 0.222, 0.2...","[0.39309, 2.75109, 5.13649, 7.52978, 9.95582, ...","[0.862, 0.509, 0.011, 0.39, 0.479, 0.422, 0.45...","[0.39309, 1.17222, 1.95902, 2.75109, 3.55236, ...",0.0,163.63057,0.0,...,"[0.0, 0.07306, 0.36862, 0.64739, 0.79238, 1.04...","[[14.832, -56.33, 69.981, -43.028, 138.861, 58...",,158.511,"[0.731, 0.628, 0.526, 0.359, 0.287, 0.246, 0.2...","[0.11929, 0.39309, 0.65603, 0.91083, 1.17222, ...",77.15,3.0,0.369,2007.0
8,0.539245,,,"[0.016, 0.555, 0.028, 0.082, 0.024, 0.082, 0.0...","[1.04773, 3.659, 6.31974, 8.97532, 11.61875, 1...","[0.379, 0.907, 0.342, 0.812, 0.49, 0.83, 0.203...","[0.39241, 1.04773, 1.69575, 2.34571, 3.00003, ...",0.0,216.842,0.0,...,"[0.0, 0.19837, 0.49914, 0.85873, 1.01193, 1.17...","[[23.928, -119.362, 33.96, -122.038, 8.715, 18...",0.788388,213.02,"[0.767, 0.742, 0.721, 0.704, 0.685, 0.672, 0.6...","[0.06784, 0.23214, 0.39241, 0.5559, 0.7194, 0....",92.897,4.0,0.879,1982.0
9,0.358149,,,"[0.274, 0.1, 0.029, 0.045, 0.083, 0.061, 0.081...","[2.09247, 5.53315, 9.00626, 12.52768, 16.08701...","[0.335, 0.35, 0.427, 0.214, 0.333, 0.0, 0.592,...","[0.73361, 1.41731, 2.09247, 2.78129, 3.47181, ...",0.0,312.99873,0.0,...,"[0.0, 0.38163, 1.00712, 1.6937, 2.02327, 2.417...","[[28.357, -44.341, 35.351, -159.74, -9.866, 11...",,296.316,"[0.578, 0.485, 0.418, 0.349, 0.293, 0.212, 0.1...","[0.39517, 0.73361, 1.07887, 1.41731, 1.75062, ...",86.981,5.0,0.688,1998.0


In [7]:
######### REMOVING CONFIDENCE MEASURES ####################
workingData = workingData.drop(["bars_confidence", "beats_confidence","key_confidence","mode_confidence","sections_confidence", "segments_confidence", "tatums_confidence", "time_signature_confidence"], axis=1)
workingData.head(5)

Unnamed: 0,artist_hotttnesss,artist_latitude,artist_longitude,bars_start,beats_start,danceability,duration,end_of_fade_in,energy,key,...,segments_loudness_start,segments_pitches,segments_start,segments_timbre,song_hotttnesss,start_of_fade_out,tatums_start,tempo,time_signature,year
0,0.386152,46.71067,1.71819,"[0.52458, 2.4444, 4.3658, 6.30006, 8.22086, 10...","[0.04257, 0.52458, 1.00659, 1.48632, 1.96605, ...",0.0,148.74077,0.192,0.0,0.0,...,"[-60.0, -60.0, -55.39, -46.295, -48.97, -39.43...","[[0.726, 0.112, 0.02, 0.019, 0.01, 0.038, 0.03...","[0.0, 0.19188, 0.60499, 0.84939, 1.10018, 1.36...","[[0.0, 171.13, 9.469, -28.48, 57.491, -50.067,...",0.547953,141.607,"[0.04257, 0.28357, 0.52458, 0.76558, 1.00659, ...",124.059,4.0,2008.0
1,0.43486,,,"[8.69527, 12.01758, 14.81363, 20.09003, 22.693...","[0.59161, 1.33555, 2.07949, 2.82093, 3.56238, ...",0.0,252.99546,0.514,0.0,1.0,...,"[-60.0, -59.869, -37.121, -39.899, -38.511, -3...","[[0.943, 1.0, 0.91, 0.782, 0.726, 0.806, 0.314...","[0.0, 0.51379, 0.89138, 1.30912, 1.65297, 2.05...","[[0.008, 170.986, 9.126, -28.685, 57.183, -50....",0.475638,241.424,"[0.59161, 0.84042, 1.08799, 1.33555, 1.58436, ...",80.084,4.0,2004.0
3,0.33452,,,"[0.39309, 2.75109, 5.13649, 7.52978, 9.95582, ...","[0.39309, 1.17222, 1.95902, 2.75109, 3.55236, ...",0.0,163.63057,0.0,0.0,7.0,...,"[-60.0, -41.859, -13.428, -14.469, -13.079, -1...","[[0.835, 0.639, 0.492, 0.399, 0.326, 0.697, 0....","[0.0, 0.07306, 0.36862, 0.64739, 0.79238, 1.04...","[[14.832, -56.33, 69.981, -43.028, 138.861, 58...",,158.511,"[0.11929, 0.39309, 0.65603, 0.91083, 1.17222, ...",77.15,3.0,2007.0
8,0.539245,,,"[1.04773, 3.659, 6.31974, 8.97532, 11.61875, 1...","[0.39241, 1.04773, 1.69575, 2.34571, 3.00003, ...",0.0,216.842,0.0,0.0,10.0,...,"[-60.0, -30.852, -36.376, -35.201, -34.052, -3...","[[0.146, 0.203, 0.219, 0.209, 0.172, 0.123, 0....","[0.0, 0.19837, 0.49914, 0.85873, 1.01193, 1.17...","[[23.928, -119.362, 33.96, -122.038, 8.715, 18...",0.788388,213.02,"[0.06784, 0.23214, 0.39241, 0.5559, 0.7194, 0....",92.897,4.0,1982.0
9,0.358149,,,"[2.09247, 5.53315, 9.00626, 12.52768, 16.08701...","[0.73361, 1.41731, 2.09247, 2.78129, 3.47181, ...",0.0,312.99873,0.0,0.0,4.0,...,"[-60.0, -22.769, -22.695, -22.331, -19.033, -1...","[[0.016, 0.01, 0.015, 0.122, 1.0, 0.081, 0.038...","[0.0, 0.38163, 1.00712, 1.6937, 2.02327, 2.417...","[[28.357, -44.341, 35.351, -159.74, -9.866, 11...",,296.316,"[0.39517, 0.73361, 1.07887, 1.41731, 1.75062, ...",86.981,5.0,1998.0


In [19]:
############### bars_start, beats_start, sections_start, segments_loudness_max, 
############### segments_loudness_max_time, segments_loudness_max_start, segments_start, tatums_start
############### these features are all in array form


############### segments_pitches, segments_timbre 
############### are in 2d array form


############ Calculate weighted average for segments_timbre and segments_pitches

#workingData.loc[2,"segments_timbre"]
addingNewTimber =  pd.DataFrame(columns=['Tim1','Tim2','Tim3','Tim4','Tim5','Tim6','Tim7',
                                          'Tim8','Tim9','Tim10','Tim11','Tim12'])
addingNewPitch =  pd.DataFrame(columns=['Pitch1','Pitch2','Pitch3','Pitch4','Pitch5','Pitch6','Pitch7',
                                          'Pitch8','Pitch9','Pitch10','Pitch11','Pitch12'])

for s in range(workingData.shape[0]):
    valLengthsT = []
    valLengthsP = []
    storeT = workingData["segments_start"].iloc[s]
    storeP = workingData["segments_pitches"].iloc[s]
    for k in range(len(storeT)-1):
        valLengthsT.append(storeT[k+1]-storeT[k+1])
        valLengthsP.append(storeP[k+1]-storeP[k+1])
    valLengthsT.append(workingData["duration"].iloc[s]-storeT[-1])
    valLengthsP.append(workingData["duration"].iloc[s]-storeP[-1])
    myWeightedMeanT = np.repeat(0,12).astype(float)
    myWeightedMeanP = np.repeat(0,12).astype(float)
    for q in range(workingData["segments_timbre"].iloc[s].shape[0]):
        myWeightedMeanT += valLengthsT[q]*(workingData["segments_timbre"].iloc[s])[q]
        myWeightedMeanP += valLengthsP[q]*(workingData["segments_timbre"].iloc[s])[q]
    myWeightedMeanT = myWeightedMeanT/workingData["duration"].iloc[s]
    myWeightedMeanP = myWeightedMeanP/workingData["duration"].iloc[s]
    
    addingNewTimber.loc[s] = myWeightedMeanT
    addingNewPitch.loc[s] = myWeightedMeanP

#workingData.append(addingNewVecInfo)
mergeTim = pd.concat([workingData, addingNewTimber], axis=1, sort=False)
finalWorkingDF = pd.concat([mergeTim, addingNewPitch], axis=1, sort=False)
finalWorkingDF.head(5)

Unnamed: 0,artist_hotttnesss,artist_latitude,artist_longitude,bars_start,beats_start,danceability,duration,end_of_fade_in,energy,key,...,Tim3,Tim4,Tim5,Tim6,Tim7,Tim8,Tim9,Tim10,Tim11,Tim12
0,0.386152,46.71067,1.71819,"[0.52458, 2.4444, 4.3658, 6.30006, 8.22086, 10...","[0.04257, 0.52458, 1.00659, 1.48632, 1.96605, ...",0.0,148.74077,0.192,0.0,0.0,...,-5.3035,12.154634,-0.56791,0.109776,-3.014063,4.467465,0.445815,-1.269353,-5.34227,0.687577
1,0.43486,,,"[8.69527, 12.01758, 14.81363, 20.09003, 22.693...","[0.59161, 1.33555, 2.07949, 2.82093, 3.56238, ...",0.0,252.99546,0.514,0.0,1.0,...,-0.705043,1.432895,0.568409,-0.711969,-0.984139,1.804178,-0.326835,-0.286615,-0.527685,-0.334984
2,,,,,,,,,,,...,-4.174516,12.725661,-0.000647,-2.803362,-2.043108,3.685373,0.507239,-1.813248,-1.371595,-0.339817
3,0.33452,,,"[0.39309, 2.75109, 5.13649, 7.52978, 9.95582, ...","[0.39309, 1.17222, 1.95902, 2.75109, 3.55236, ...",0.0,163.63057,0.0,0.0,7.0,...,-1.583068,5.749732,-0.404468,-0.02115,-0.765626,1.984318,0.280834,-0.575246,-1.81987,0.269324
4,,,,,,,,,,,...,-0.033163,0.063919,0.21066,-0.17027,0.012275,-0.145145,-0.044378,0.033454,-0.065845,0.016333


In [None]:
finalWorkingDF = finalWorkingDF.drop(["artist_latitude", "artist_longitude", "segments_timbre", "segments_start"], axis=1)
finalWorkingDF = finalWorkingDF.drop(["bars_start", "beats_start"], axis=1)




In [116]:
from sklearn import neighbors
from sklearn.model_selection import train_test_split

# Separatee data into training and testing set
y = workingData['year']
x = workingData.drop('year', axis = 1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [None]:
###################### KNN CLASSIFIER ########################
n_neighbors = 15

nnClassifier = neighbors.KNeighborsClassifier(n_neighbors)
nnClassifier.fit(x_train, y_train)

# Get training and test predictions
classifier_trainMSE = np.mean((nnClassifier.predict(x_train) - y_train)**2)
classifier_testMSE = np.mean((nnClassifier.predict(x_test) - y_test)**2)

print("training MSE", classifier_trainMSE)
print("testing MSE", classifier_testMSE)

In [None]:
###################### KNN REGRESSION ########################
n_neighbors = 15

nnRegressor = neighbors.KNeighborsRegressor(n_neighbors)
nnRegressor.fit(x_train, y_train)

# Get training and test predictions
regressor_trainMSE = np.mean((nnRegressor.predict(x_train) - y_train)**2)
regressor_testMSE = np.mean((nnRegressor.predict(x_test) - y_test)**2)

print("training MSE", regressor_trainMSE)
print("testing MSE", regressor_testMSE)