# Comparing Year Prediction Using Complex Audio Features vs Song Lyrics

## Special Notes
1. We can write the paper in here and then hide the code when we knit it to PDF using the following command:
    (jupyter nbconvert --to pdf --template hidecode Example.ipynb
2. Bag of Words data is stored in a SQLite database that we will have to figure out how to use over spring break


In [1]:
import h5py
import numpy as np
import pandas as pd
import tables
#from sklearn.model_selection import train_test_split
#from sklearn.naive_bayes import GaussianNB

In [2]:
#################### THIS CREATES THE PANDAS DATAFRAME FROM THE DATAFILES WE HAVE ###########################
import os
import sys
import time
import glob
import scipy.io as sio
import pandas as pd
import hdf5_getters
from pandas import read_hdf

def get_all_files(basedir,ext='.h5') :
    """
    From a root directory, go through all subdirectories
    and find all files with the given extension.
    Return all absolute paths in a list.
    """
    allfiles = []
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root,'*'+ext))
        for f in files :
            allfiles.append( os.path.abspath(f) )
    return allfiles

def transfer(h5path,matpath=None,force=False):
    """
    Transfer an HDF5 song file (.h5) to a matfile (.mat)
    If there are more than one song in the HDF5 file, each
    field name gets a number happened: 1, 2, 3, ...., numfiles
    PARAM
        h5path  - path to the HDF5 song file
        matpath - path to the new matfile, same as HDF5 path
                  with a different extension by default
        force   - if True and matfile exists, overwrite
    RETURN
        True if the file was transfered, False if there was
        a problem.
        Could also raise an IOException
    NOTE
        All the data has to be loaded in memory! be careful
        if one file contains tons of songs!
    """
    # sanity checks
    if not os.path.isfile(h5path):
        print ('path to HF5 files does not exist:',h5path)
        return False
    if not os.path.splitext(h5path)[1] == '.h5':
        print ('expecting a .h5 extension for file:',h5path)
        return False
    # check matfile
    if matpath is None:
        matpath = os.path.splitext(h5path)[0] + '.mat'
    if os.path.exists(matpath):
        if not force: 
            print('matfile',matpath,'already exists (delete or force):')
            return False
    # get all getters! we assume that all we need is in hdf5_getters.py
    # further assume that they have the form get_blablabla and that's the
    # only thing that has that form
    getters = list(filter(lambda x: x[:4] == 'get_', hdf5_getters.__dict__.keys()))
    getters.remove("get_num_songs") # special case
    # open h5 file
    h5 = hdf5_getters.open_h5_file_read(h5path)
    # transfer
    nSongs = hdf5_getters.get_num_songs(h5)
    matdata = {'transfer_note':'transferred on '+time.ctime()+' from file: '+h5path}
    try:
        # iterate over songs
        for songidx in range(nSongs):
            # iterate over getter
            for getter in getters:
                gettername = getter[4:]
                if nSongs > 1:
                    gettername += str(songidx+1)
                data = hdf5_getters.__getattribute__(getter)(h5,songidx)
                matdata[gettername] = data
    except MemoryError:
        print('asdfasdfasdfasdf')
        raise
    finally:
        # close h5
        h5.close()
    # create
    return matdata



h5s = get_all_files('MillionSongSubset/data/')

df = pd.DataFrame()
for file in h5s:
    xd = transfer(file)
    df = df.append(pd.Series(xd), ignore_index=True)

df.head(10)

Unnamed: 0,analysis_sample_rate,artist_7digitalid,artist_familiarity,artist_hotttnesss,artist_id,artist_latitude,artist_location,artist_longitude,artist_mbid,artist_mbtags,...,tatums_confidence,tatums_start,tempo,time_signature,time_signature_confidence,title,track_7digitalid,track_id,transfer_note,year
0,22050.0,165270.0,0.581794,0.401998,b'ARD7TVE1187B99BFB1',,b'California - LA',,b'e77e51a5-4761-45b3-9847-2051f811e366',[],...,"[0.779, 0.734, 0.674, 0.637, 0.597, 0.532, 0.4...","[0.28519, 0.58521, 0.89422, 1.19196, 1.49119, ...",92.198,4.0,0.778,"b""I Didn't Mean To""",3401791.0,b'TRAAAAW128F429D538',transferred on Wed Mar 6 23:35:05 2019 from f...,0.0
1,22050.0,1998.0,0.63063,0.4175,b'ARMJAGH1187FB546F3',35.14968,"b'Memphis, TN'",-90.04892,b'1c78ab62-db33-4433-8d0b-7c8dcf1849c2',[b'classic pop and rock'],...,"[0.969, 0.929, 0.897, 0.871, 0.856, 0.845, 0.8...","[0.20627, 0.45904, 0.71054, 0.96834, 1.21836, ...",121.274,4.0,0.384,b'Soul Deep',3400270.0,b'TRAAABD128F429CF47',transferred on Wed Mar 6 23:35:05 2019 from f...,1969.0
2,22050.0,290021.0,0.487357,0.343428,b'ARKRRTF1187B9984DA',,b'',,b'7a273984-edd9-4451-9c4d-39b38f05ebcd',[],...,"[0.482, 0.676, 0.627, 0.549, 0.279, 0.264, 0.2...","[0.42132, 0.73152, 1.06609, 1.39732, 1.72854, ...",100.07,1.0,0.0,b'Amor De Cabaret',5703798.0,b'TRAAADZ128F9348C2E',transferred on Wed Mar 6 23:35:05 2019 from f...,0.0
3,22050.0,19072.0,0.630382,0.454231,b'AR7G5I41187FB4CE6C',,"b'London, England'",,b'e188a520-9cb7-4f73-a3d7-2f70c6538e92',"[b'uk', b'british', b'english']",...,"[0.601, 0.556, 0.523, 0.49, 0.466, 0.44, 0.428...","[0.56254, 0.81002, 1.05749, 1.30621, 1.55494, ...",119.293,4.0,0.0,b'Something Girls',3226795.0,b'TRAAAEF128F4273421',transferred on Wed Mar 6 23:35:05 2019 from f...,1982.0
4,22050.0,30973.0,0.651046,0.401724,b'ARXR32B1187FB57099',,b'',,b'c6903a2e-063c-4f91-a284-17b8f421be7b',[],...,"[1.0, 0.98, 0.932, 0.87, 0.82, 0.793, 0.768, 0...","[0.13576, 0.36918, 0.59914, 0.83141, 1.06368, ...",129.738,4.0,0.562,b'Face the Ashes',6795666.0,b'TRAAAFD128F92F423A',transferred on Wed Mar 6 23:35:06 2019 from f...,2007.0
5,22050.0,432935.0,0.535293,0.385471,b'ARKFYS91187B98E58F',,b'',,b'79c403f9-5467-4f23-8426-9ca3fc60a115',[],...,"[0.136, 0.127, 0.113, 0.112, 0.104, 0.09, 0.07...","[0.53929, 0.74856, 0.95987, 1.17118, 1.38249, ...",147.782,3.0,0.454,b'The Moon And I (Ordinary Day Album Version)',444964.0,b'TRAAAMO128F1481E7F',transferred on Wed Mar 6 23:35:06 2019 from f...,0.0
6,22050.0,17970.0,0.556496,0.261941,b'ARD0S291187B9B7BF5',,b'Ohio',,b'56503d6d-094e-4c28-ae3d-04cc748ade5b',[],...,"[0.467, 0.474, 0.528, 0.541, 0.507, 0.482, 0.3...","[0.05611, 0.27253, 0.48785, 0.70535, 0.92722, ...",111.787,1.0,0.0,b'Keepin It Real (Skit)',276593.0,b'TRAAAMQ128F1460CD3',transferred on Wed Mar 6 23:35:06 2019 from f...,0.0
7,22050.0,21128.0,0.801136,0.605507,b'AR10USD1187B99F3F1',,"b'Burlington, Ontario, Canada'",,b'd89de379-665d-425c-b2e9-41b95d1edb36',[],...,"[0.292, 0.284, 0.282, 0.274, 0.27, 0.237, 0.21...","[0.36129, 0.65428, 0.94433, 1.24174, 1.53768, ...",101.43,3.0,0.408,b'Drop of Rain',90004.0,b'TRAAAPK128E0786D96',transferred on Wed Mar 6 23:35:06 2019 from f...,0.0
8,22050.0,276891.0,0.426668,0.332276,b'AR8ZCNI1187B9A069B',,b'',,b'19d232b9-b4d7-4dc8-b259-bf65efb655b1',[],...,"[0.121, 0.124, 0.126, 0.128, 0.13, 0.131, 0.13...","[1.22595, 1.39961, 1.57241, 1.74174, 1.91886, ...",86.643,4.0,0.487,b'Pink World',3996579.0,b'TRAAARJ128F9320760',transferred on Wed Mar 6 23:35:06 2019 from f...,1984.0
9,22050.0,242273.0,0.550514,0.422706,b'ARNTLGG11E2835DDB9',,b'',,b'4d96f7d0-2f0e-4e92-ba70-a405f96f8cec',[],...,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.09933, 0.36057, 0.62445, 0.88967, 1.15423, ...",114.041,4.0,0.878,b'Insatiable (Instrumental Version)',7684249.0,b'TRAAAVG12903CFA543',transferred on Wed Mar 6 23:35:06 2019 from f...,0.0


In [3]:
############ HERE WE WILL CLEAN THE DATA ########################

# All songs have the same analysis_sample_rate so I remove it
newdf = df.drop("analysis_sample_rate",axis=1)

# Remove rows that have year==0 (no year) because we cannot use these data points
newdf = newdf[newdf.year != 0]

print(newdf.shape)
# !pip install ipython-cache
# import cache_magic
# %cache DF = newdf

(4680, 54)


In [4]:
workingData = df

In [5]:
workingData = workingData.drop(["artist_familiarity","artist_location","artist_mbtags", "artist_mbtags_count", "title"], axis=1)

workingData = workingData.drop(["artist_7digitalid", "artist_id", "artist_mbid", "artist_name", "artist_playmeid", "artist_terms", "artist_terms_freq"], axis=1)

workingData = workingData.drop(["artist_terms_weight", "audio_md5", "transfer_note", "similar_artists"], axis=1)

workingData = workingData.drop(["track_id", "track_7digitalid","song_id","release","release_7digitalid"], axis=1)

workingData.head(5)


Unnamed: 0,analysis_sample_rate,artist_hotttnesss,artist_latitude,artist_longitude,bars_confidence,bars_start,beats_confidence,beats_start,danceability,duration,...,segments_start,segments_timbre,song_hotttnesss,start_of_fade_out,tatums_confidence,tatums_start,tempo,time_signature,time_signature_confidence,year
0,22050.0,0.401998,,,"[0.643, 0.746, 0.722, 0.095, 0.091, 0.362, 0.4...","[0.58521, 2.94247, 5.14371, 7.74554, 10.36149,...","[0.834, 0.851, 0.65, 0.635, 0.532, 0.753, 0.62...","[0.58521, 1.19196, 1.78893, 2.37813, 2.94247, ...",0.0,218.93179,...,"[0.0, 0.24671, 0.47116, 0.80376, 0.89551, 1.12...","[[0.0, 171.13, 9.469, -28.48, 57.491, -50.067,...",0.60212,218.932,"[0.779, 0.734, 0.674, 0.637, 0.597, 0.532, 0.4...","[0.28519, 0.58521, 0.89422, 1.19196, 1.49119, ...",92.198,4.0,0.778,0.0
1,22050.0,0.4175,35.14968,-90.04892,"[0.007, 0.259, 0.172, 0.404, 0.011, 0.016, 0.0...","[0.71054, 2.71502, 4.70861, 6.69288, 8.66941, ...","[1.0, 0.945, 0.714, 0.973, 0.818, 0.974, 0.878...","[0.20627, 0.71054, 1.21836, 1.71841, 2.21729, ...",0.0,148.03546,...,"[0.0, 0.14803, 0.68104, 0.95492, 1.19878, 1.45...","[[0.0, 171.13, 9.469, -28.48, 57.491, -50.067,...",,137.915,"[0.969, 0.929, 0.897, 0.871, 0.856, 0.845, 0.8...","[0.20627, 0.45904, 0.71054, 0.96834, 1.21836, ...",121.274,4.0,0.384,1969.0
2,22050.0,0.343428,,,"[0.98, 0.399, 0.185, 0.27, 0.422, 0.0, 0.445, ...","[0.73152, 1.39732, 2.04852, 2.68691, 3.315, 3....","[0.98, 0.399, 0.185, 0.27, 0.422, 0.0, 0.445, ...","[0.73152, 1.39732, 2.04852, 2.68691, 3.315, 3....",0.0,177.47546,...,"[0.0, 0.28154, 0.48395, 0.6937, 0.97859, 1.361...","[[0.0, 171.124, 9.459, -28.489, 57.482, -50.06...",,172.304,"[0.482, 0.676, 0.627, 0.549, 0.279, 0.264, 0.2...","[0.42132, 0.73152, 1.06609, 1.39732, 1.72854, ...",100.07,1.0,0.0,0.0
3,22050.0,0.454231,,,"[0.017, 0.05, 0.014, 0.008, 0.114, 0.019, 0.08...","[1.30621, 3.29887, 5.30252, 7.32327, 9.33775, ...","[0.809, 0.616, 0.789, 0.66, 0.439, 0.758, 0.60...","[0.81002, 1.30621, 1.80617, 2.2996, 2.80049, 3...",0.0,233.40363,...,"[0.0, 0.70517, 1.03052, 1.21052, 1.52404, 1.72...","[[24.937, 37.465, 177.22, -216.443, 56.3, 202....",,217.124,"[0.601, 0.556, 0.523, 0.49, 0.466, 0.44, 0.428...","[0.56254, 0.81002, 1.05749, 1.30621, 1.55494, ...",119.293,4.0,0.0,1982.0
4,22050.0,0.401724,,,"[0.175, 0.409, 0.639, 0.067, 0.016, 0.066, 0.0...","[1.06368, 2.91491, 4.76729, 6.61852, 8.46978, ...","[0.883, 0.738, 0.484, 0.609, 0.625, 0.719, 0.4...","[0.13576, 0.59914, 1.06368, 1.52591, 1.99045, ...",0.0,209.60608,...,"[0.0, 0.06603, 0.24395, 0.57034, 0.92567, 1.26...","[[0.089, 169.621, 5.435, -30.061, 54.144, -50....",0.604501,198.699,"[1.0, 0.98, 0.932, 0.87, 0.82, 0.793, 0.768, 0...","[0.13576, 0.36918, 0.59914, 0.83141, 1.06368, ...",129.738,4.0,0.562,2007.0


In [6]:
######### REMOVING CONFIDENCE MEASURES ####################
workingData = workingData.drop(["bars_confidence", "beats_confidence","key_confidence","mode_confidence","sections_confidence", "segments_confidence", "tatums_confidence", "time_signature_confidence"], axis=1)
workingData.head(5)



Unnamed: 0,analysis_sample_rate,artist_hotttnesss,artist_latitude,artist_longitude,bars_start,beats_start,danceability,duration,end_of_fade_in,energy,...,segments_loudness_start,segments_pitches,segments_start,segments_timbre,song_hotttnesss,start_of_fade_out,tatums_start,tempo,time_signature,year
0,22050.0,0.401998,,,"[0.58521, 2.94247, 5.14371, 7.74554, 10.36149,...","[0.58521, 1.19196, 1.78893, 2.37813, 2.94247, ...",0.0,218.93179,0.247,0.0,...,"[-60.0, -60.0, -40.84, -40.401, -38.456, -39.6...","[[0.946, 0.684, 0.679, 0.941, 0.744, 0.633, 0....","[0.0, 0.24671, 0.47116, 0.80376, 0.89551, 1.12...","[[0.0, 171.13, 9.469, -28.48, 57.491, -50.067,...",0.60212,218.932,"[0.28519, 0.58521, 0.89422, 1.19196, 1.49119, ...",92.198,4.0,0.0
1,22050.0,0.4175,35.14968,-90.04892,"[0.71054, 2.71502, 4.70861, 6.69288, 8.66941, ...","[0.20627, 0.71054, 1.21836, 1.71841, 2.21729, ...",0.0,148.03546,0.148,0.0,...,"[-60.0, -60.0, -23.521, -25.16, -27.133, -24.2...","[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...","[0.0, 0.14803, 0.68104, 0.95492, 1.19878, 1.45...","[[0.0, 171.13, 9.469, -28.48, 57.491, -50.067,...",,137.915,"[0.20627, 0.45904, 0.71054, 0.96834, 1.21836, ...",121.274,4.0,1969.0
2,22050.0,0.343428,,,"[0.73152, 1.39732, 2.04852, 2.68691, 3.315, 3....","[0.73152, 1.39732, 2.04852, 2.68691, 3.315, 3....",0.0,177.47546,0.282,0.0,...,"[-60.0, -59.9, -12.744, -12.003, -12.991, -15....","[[1.0, 0.911, 0.18, 0.334, 0.327, 0.344, 0.302...","[0.0, 0.28154, 0.48395, 0.6937, 0.97859, 1.361...","[[0.0, 171.124, 9.459, -28.489, 57.482, -50.06...",,172.304,"[0.42132, 0.73152, 1.06609, 1.39732, 1.72854, ...",100.07,1.0,0.0
3,22050.0,0.454231,,,"[1.30621, 3.29887, 5.30252, 7.32327, 9.33775, ...","[0.81002, 1.30621, 1.80617, 2.2996, 2.80049, 3...",0.0,233.40363,0.0,0.0,...,"[-60.0, -27.665, -21.241, -15.222, -18.915, -1...","[[0.651, 0.592, 0.647, 0.494, 0.683, 0.919, 1....","[0.0, 0.70517, 1.03052, 1.21052, 1.52404, 1.72...","[[24.937, 37.465, 177.22, -216.443, 56.3, 202....",,217.124,"[0.56254, 0.81002, 1.05749, 1.30621, 1.55494, ...",119.293,4.0,1982.0
4,22050.0,0.401724,,,"[1.06368, 2.91491, 4.76729, 6.61852, 8.46978, ...","[0.13576, 0.59914, 1.06368, 1.52591, 1.99045, ...",0.0,209.60608,0.066,0.0,...,"[-60.0, -59.828, -19.551, -32.609, -21.899, -2...","[[1.0, 0.529, 0.407, 0.423, 0.524, 0.509, 0.65...","[0.0, 0.06603, 0.24395, 0.57034, 0.92567, 1.26...","[[0.089, 169.621, 5.435, -30.061, 54.144, -50....",0.604501,198.699,"[0.13576, 0.36918, 0.59914, 0.83141, 1.06368, ...",129.738,4.0,2007.0


In [8]:
############### bars_start, beats_start, sections_start, segments_loudness_max, 
############### segments_loudness_max_time, segments_loudness_max_start, segments_start, tatums_start
############### these features are all in array form


############### segments_pitches, segments_timbre 
############### are in 2d array form


############ Calculate weighted average for segments_timbre and segments_pitches

#workingData.loc[2,"segments_timbre"]
# workingData.to_csv("WorkingData.csv", ",")
# import pandas as pd

# workingData = pd.read_csv("WorkingData.csv", " ")
# print("done")
# sworkingData["segments_start"].iloc[s].split(" ")
# workingData["segments_pitches"].iloc[s].split(" ")

addingNewTimber =  pd.DataFrame(columns=['Tim1','Tim2','Tim3','Tim4','Tim5','Tim6','Tim7',
                                          'Tim8','Tim9','Tim10','Tim11','Tim12'])
addingNewPitch =  pd.DataFrame(columns=['Pitch1','Pitch2','Pitch3','Pitch4','Pitch5','Pitch6','Pitch7',
                                          'Pitch8','Pitch9','Pitch10','Pitch11','Pitch12'])

for s in range(workingData.shape[0]):
    valLengthsT = []
    valLengthsP = []
    storeT = workingData["segments_start"].iloc[s]
    storeP = workingData["segments_pitches"].iloc[s]
    for k in range(len(storeT)-1):
        valLengthsT.append(storeT[k+1]-storeT[k+1])
        valLengthsP.append(storeP[k+1]-storeP[k+1])
    valLengthsT.append(workingData["duration"].iloc[s]-storeT[-1])
    valLengthsP.append(workingData["duration"].iloc[s]-storeP[-1])
    myWeightedMeanT = np.repeat(0,12).astype(float)
    myWeightedMeanP = np.repeat(0,12).astype(float)
    for q in range(workingData["segments_timbre"].iloc[s].shape[0]):
        myWeightedMeanT += valLengthsT[q]*(workingData["segments_timbre"].iloc[s])[q]
        myWeightedMeanP += valLengthsP[q]*(workingData["segments_timbre"].iloc[s])[q]
    myWeightedMeanT = myWeightedMeanT/workingData["duration"].iloc[s]
    myWeightedMeanP = myWeightedMeanP/workingData["duration"].iloc[s]
    
    addingNewTimber.loc[s] = myWeightedMeanT
    addingNewPitch.loc[s] = myWeightedMeanP

#workingData.append(addingNewVecInfo)
mergeTim = pd.concat([workingData, addingNewTimber], axis=1, sort=False)
finalWorkingDF = pd.concat([mergeTim, addingNewPitch], axis=1, sort=False)
finalWorkingDF.to_csv("FinalDF.csv", ",")
finalWorkingDF.head(5)

Unnamed: 0,analysis_sample_rate,artist_hotttnesss,artist_latitude,artist_longitude,bars_start,beats_start,danceability,duration,end_of_fade_in,energy,...,Pitch3,Pitch4,Pitch5,Pitch6,Pitch7,Pitch8,Pitch9,Pitch10,Pitch11,Pitch12
0,22050.0,0.401998,,,"[0.58521, 2.94247, 5.14371, 7.74554, 10.36149,...","[0.58521, 1.19196, 1.78893, 2.37813, 2.94247, ...",0.0,218.93179,0.247,0.0,...,-79.841478,8.863259,-53.759702,-26.742869,-5.537102,-92.185639,22.108005,19.302711,-21.781842,16.926603
1,22050.0,0.4175,35.14968,-90.04892,"[0.71054, 2.71502, 4.70861, 6.69288, 8.66941, ...","[0.20627, 0.71054, 1.21836, 1.71841, 2.21729, ...",0.0,148.03546,0.148,0.0,...,-6.873309,20.289421,43.542628,-50.036787,4.912887,45.033462,-27.173885,-6.312071,4.703293,-6.4233
2,22050.0,0.343428,,,"[0.73152, 1.39732, 2.04852, 2.68691, 3.315, 3....","[0.73152, 1.39732, 2.04852, 2.68691, 3.315, 3....",0.0,177.47546,0.282,0.0,...,-99.419082,347.182291,-5.632113,-10.669601,-73.693238,95.38863,-8.168712,-43.15566,-137.42379,5.076842
3,22050.0,0.454231,,,"[1.30621, 3.29887, 5.30252, 7.32327, 9.33775, ...","[0.81002, 1.30621, 1.80617, 2.2996, 2.80049, 3...",0.0,233.40363,0.0,0.0,...,51.774223,11.929518,9.978057,-13.800586,13.658217,20.780797,-73.369905,-11.486943,-22.934629,-34.280749
4,22050.0,0.401724,,,"[1.06368, 2.91491, 4.76729, 6.61852, 8.46978, ...","[0.13576, 0.59914, 1.06368, 1.52591, 1.99045, ...",0.0,209.60608,0.066,0.0,...,43.006366,169.472477,15.26681,-45.114801,-50.116896,86.399747,-17.554204,-32.974927,-35.801193,-5.641859


In [9]:
finalWorkingDF = finalWorkingDF.drop(["artist_latitude", "artist_longitude", "segments_timbre", "segments_start"], axis=1)
finalWorkingDF = finalWorkingDF.drop(["bars_start", "beats_start"], axis=1)

In [20]:
list(finalWorkingDF.columns.values)

['analysis_sample_rate',
 'artist_hotttnesss',
 'danceability',
 'duration',
 'end_of_fade_in',
 'energy',
 'key',
 'loudness',
 'mode',
 'sections_start',
 'segments_loudness_max',
 'segments_loudness_max_time',
 'segments_loudness_start',
 'segments_pitches',
 'song_hotttnesss',
 'start_of_fade_out',
 'tatums_start',
 'tempo',
 'time_signature',
 'year',
 'Tim1',
 'Tim2',
 'Tim3',
 'Tim4',
 'Tim5',
 'Tim6',
 'Tim7',
 'Tim8',
 'Tim9',
 'Tim10',
 'Tim11',
 'Tim12',
 'Pitch1',
 'Pitch2',
 'Pitch3',
 'Pitch4',
 'Pitch5',
 'Pitch6',
 'Pitch7',
 'Pitch8',
 'Pitch9',
 'Pitch10',
 'Pitch11',
 'Pitch12']

In [35]:
finalWorkingDF = finalWorkingDF.drop(['sections_start','segments_pitches'], axis=1)
list(finalWorkingDF.columns.values)

['analysis_sample_rate',
 'artist_hotttnesss',
 'danceability',
 'duration',
 'end_of_fade_in',
 'energy',
 'key',
 'loudness',
 'mode',
 'segments_loudness_max',
 'segments_loudness_max_time',
 'segments_loudness_start',
 'song_hotttnesss',
 'start_of_fade_out',
 'tatums_start',
 'tempo',
 'time_signature',
 'year',
 'Tim1',
 'Tim2',
 'Tim3',
 'Tim4',
 'Tim5',
 'Tim6',
 'Tim7',
 'Tim8',
 'Tim9',
 'Tim10',
 'Tim11',
 'Tim12',
 'Pitch1',
 'Pitch2',
 'Pitch3',
 'Pitch4',
 'Pitch5',
 'Pitch6',
 'Pitch7',
 'Pitch8',
 'Pitch9',
 'Pitch10',
 'Pitch11',
 'Pitch12']

In [36]:
finalWorkingDF = finalWorkingDF.drop(['segments_loudness_max', 'segments_loudness_max_time', 'segments_loudness_start'], axis=1)

In [39]:
finalWorkingDF = finalWorkingDF.drop('analysis_sample_rate', axis=1)
finalWorkingDF.iloc[:,5:20]

Unnamed: 0,key,loudness,mode,song_hotttnesss,start_of_fade_out,tatums_start,tempo,time_signature,year,Tim1,Tim2,Tim3,Tim4,Tim5,Tim6
0,1.0,-11.197,0.0,0.602120,218.932,"[0.28519, 0.58521, 0.89422, 1.19196, 1.49119, ...",92.198,4.0,0.0,0.017144,0.055785,-0.036810,0.004086,-0.024784,-0.012329
1,6.0,-9.843,0.0,,137.915,"[0.20627, 0.45904, 0.71054, 0.96834, 1.21836, ...",121.274,4.0,1969.0,0.043516,1.729722,-0.091416,0.270088,0.579956,-0.665265
2,8.0,-9.689,1.0,,172.304,"[0.42132, 0.73152, 1.06609, 1.39732, 1.72854, ...",100.070,1.0,0.0,0.872901,0.622345,-2.336168,8.166264,-0.132376,-0.250627
3,0.0,-9.013,1.0,,217.124,"[0.56254, 0.81002, 1.05749, 1.30621, 1.55494, ...",119.293,4.0,1982.0,0.009640,0.662275,0.195186,0.044835,0.037489,-0.051844
4,2.0,-4.501,1.0,0.604501,198.699,"[0.13576, 0.36918, 0.59914, 0.83141, 1.06368, ...",129.738,4.0,2007.0,0.303340,2.170500,1.028924,4.050972,0.365201,-1.078785
5,5.0,-9.323,1.0,,254.270,"[0.53929, 0.74856, 0.95987, 1.17118, 1.38249, ...",147.782,3.0,0.0,0.561244,-2.345180,-2.586280,3.284071,0.482798,-1.211293
6,1.0,-17.302,1.0,,114.782,"[0.05611, 0.27253, 0.48785, 0.70535, 0.92722, ...",111.787,1.0,0.0,0.036026,0.065173,-0.200837,0.094206,-0.045941,-0.038854
7,4.0,-11.642,0.0,,181.023,"[0.36129, 0.65428, 0.94433, 1.24174, 1.53768, ...",101.430,3.0,0.0,0.302180,1.033880,-2.051625,2.713645,0.460419,-1.759239
8,4.0,-13.496,1.0,0.265861,258.990,"[1.22595, 1.39961, 1.57241, 1.74174, 1.91886, ...",86.643,4.0,1984.0,0.164036,0.634866,-0.192904,1.399303,0.118149,-0.634268
9,7.0,-6.697,0.0,,261.747,"[0.09933, 0.36057, 0.62445, 0.88967, 1.15423, ...",114.041,4.0,0.0,0.311712,0.453333,-1.205353,3.737511,-0.070557,-1.015891


In [41]:
tatums = finalWorkingDF['tatums_start']
diffLists = []
for tatum in tatums:
    diff = []
    old = 0.0
    for t in tatum:
        diff.append(t - old)
        old = t
    diffLists.append(np.mean(np.array(diff)))
    
finalWorkingDF['tatums_mean'] = diffLists

In [44]:
finalWorkingDF = finalWorkingDF.drop('tatums_start', axis=1)

Unnamed: 0,Tim7,Tim8,Tim9,Tim10,Tim11,Tim12,Pitch1,Pitch2,Pitch3,Pitch4,Pitch5,Pitch6,Pitch7,Pitch8,Pitch9,Pitch10,Pitch11,Pitch12,tatums_mean
0,-0.002554,-0.042519,0.010206,0.008911,-0.010066,0.007803,37.046004,120.477179,-79.841478,8.863259,-53.759702,-26.742869,-5.537102,-92.185639,22.108005,19.302711,-21.781842,16.926603,0.317613
1,0.065280,0.598581,-0.361809,-0.084183,0.062525,-0.085574,3.267913,129.780495,-6.873309,20.289421,43.542628,-50.036787,4.912887,45.033462,-27.173885,-6.312071,4.703293,-6.423300,0.246827
2,-1.730388,2.247878,-0.192745,-1.016610,-3.231945,0.119260,37.028750,26.466263,-99.419082,347.182291,-5.632113,-10.669601,-73.693238,95.388630,-8.168712,-43.155660,-137.423790,5.076842,0.298136
3,0.051303,0.078274,-0.275968,-0.043142,-0.086142,-0.128774,2.565910,176.003189,51.774223,11.929518,9.978057,-13.800586,13.658217,20.780797,-73.369905,-11.486943,-22.934629,-34.280749,0.251819
4,-1.198045,2.067701,-0.419762,-0.790450,-0.856176,-0.134791,12.685033,90.716096,43.006366,169.472477,15.266810,-45.114801,-50.116896,86.399747,-17.554204,-32.974927,-35.801193,-5.641859,0.230686
5,-1.398228,4.199186,-0.650176,-0.770442,-1.206507,0.434490,17.807354,-74.465271,-82.120785,104.280884,15.328938,-38.320318,-44.395356,133.276254,-20.644843,-24.457141,-38.309567,13.796382,0.204759
6,-0.126813,0.053011,-0.105488,-0.008007,-0.066531,0.033393,19.137802,34.616215,-106.620963,49.825890,-24.219144,-20.531045,-67.235354,28.165474,-55.977323,-4.249987,-35.275960,17.723879,0.263663
7,-0.394802,2.438936,0.305314,-0.863374,-0.122689,0.166735,12.338615,42.224327,-83.790580,110.826984,18.798255,-71.849968,-16.124405,99.605592,12.469079,-35.076984,-5.010207,6.808240,0.295114
8,-0.111470,1.431148,-0.210620,-0.212372,-0.480700,-0.390963,11.516335,44.571061,-13.543896,98.193915,8.274477,-44.530553,-7.826246,100.470633,-14.772940,-14.856733,-33.741867,-27.441826,0.174624
9,-0.737640,1.642799,0.046523,-0.551117,-0.608106,0.023756,24.597920,35.765957,-95.126286,294.957135,-5.566429,-80.168251,-58.208761,129.611920,3.666707,-43.335713,-47.944570,1.874367,0.263051


In [10]:
from sklearn import neighbors
from sklearn.model_selection import train_test_split

# Separatee data into training and testing set
y = finalWorkingDF['year']
x = finalWorkingDF.drop('year', axis = 1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [25]:
type(x_train)

pandas.core.frame.DataFrame

In [24]:
print(str(x_train.shape))
print(str(x_test.shape))
# print(x_train.shape)
# print(x_train.shape)
# print(x_train.shape)

(8000, 25)
(2000, 25)


In [18]:
x_train.to_pickle("XTrain")
y_train.to_pickle("YTrain")
x_test.to_pickle("XTest")
y_test.to_pickle("YTest")

In [27]:
x_trainNumPy = x_train.to_numpy()
x_testNumPy = x_test.to_numpy()
y_trainNumPy = y_train.to_numpy()
y_testNumPy = y_test.to_numpy()

AttributeError: 'DataFrame' object has no attribute 'to_numpy'

In [12]:
###################### KNN CLASSIFIER ########################
n_neighbors = 15

nnClassifier = neighbors.KNeighborsClassifier(n_neighbors)
nnClassifier.fit(x_train, y_train)

# Get training and test predictions
classifier_trainMSE = np.mean((nnClassifier.predict(x_train) - y_train)**2)
classifier_testMSE = np.mean((nnClassifier.predict(x_test) - y_test)**2)

print("training MSE", classifier_trainMSE)
print("testing MSE", classifier_testMSE)

ValueError: setting an array element with a sequence.

In [None]:
###################### KNN REGRESSION ########################
n_neighbors = 15

nnRegressor = neighbors.KNeighborsRegressor(n_neighbors)
nnRegressor.fit(x_train, y_train)

# Get training and test predictions
regressor_trainMSE = np.mean((nnRegressor.predict(x_train) - y_train)**2)
regressor_testMSE = np.mean((nnRegressor.predict(x_test) - y_test)**2)

print("training MSE", regressor_trainMSE)
print("testing MSE", regressor_testMSE)

In [14]:
!pip install keras



In [15]:
###################### NEURAL NETWORK ########################
from keras.models import Sequential
from keras.layers import Dense

# create model
model = Sequential()
model.add(Dense(12, input_dim=8, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the model
model.fit(x_train, y_train, epochs=150, batch_size=10)

# evaluate the model
train_scores = model.evaluate(x_train, y_train)
print("\n%s: %.2f%%" % (model.metrics_names[1], train_scores[1]*100))
test_scores = model.evaluate(x_test, y_test)
print("\n%s: %.2f%%" % (model.metrics_names[1], test_scores[1]*100))

ModuleNotFoundError: No module named 'keras'

In [None]:
# from keras.models import Sequential

In [None]:
# !pip install keras

In [None]:
# from keras.models import Sequential


In [None]:
# !python --version

In [16]:
!python --version

Python 3.6.8 :: Anaconda, Inc.


In [None]:
# !keras --version

In [None]:
# import keras

In [None]:
# !pip install keras

In [19]:
list(workingData.columns.values)

['analysis_sample_rate',
 'artist_hotttnesss',
 'artist_latitude',
 'artist_longitude',
 'bars_start',
 'beats_start',
 'danceability',
 'duration',
 'end_of_fade_in',
 'energy',
 'key',
 'loudness',
 'mode',
 'sections_start',
 'segments_loudness_max',
 'segments_loudness_max_time',
 'segments_loudness_start',
 'segments_pitches',
 'segments_start',
 'segments_timbre',
 'song_hotttnesss',
 'start_of_fade_out',
 'tatums_start',
 'tempo',
 'time_signature',
 'year']