# Comparing Year Prediction Using Complex Audio Features vs Song Lyrics

## Special Notes
1. We can write the paper in this notebook and hide the code when we knit it to PDF, using the following command:
    `jupyter nbconvert --to pdf --template hidecode Example.ipynb`
2. Bag of Words data is stored in a SQLite database that we will have to figure out how to use over spring break


In [1]:
import h5py
import numpy as np
import pandas as pd
import tables
#from sklearn.model_selection import train_test_split
#from sklearn.naive_bayes import GaussianNB

In [8]:
#################### THIS CREATES THE PANDAS DATAFRAME FROM THE DATAFILES WE HAVE ###########################
import os
import sys
import time
import glob
import scipy.io as sio
import pandas as pd
import hdf5_getters
from pandas import read_hdf

def get_all_files(basedir, ext='.h5'):
    """
    From a root directory, go through all subdirectories
    and find all files whose name ends with the given extension.
    PARAM
        basedir - root directory to walk recursively
        ext     - filename suffix to look for ('.h5' by default)
    RETURN
        list of absolute paths, one per match; the list is empty
        when nothing matches or basedir does not exist
    """
    allfiles = []
    for root, _dirs, _files in os.walk(basedir):
        # Escape the directory part: without this, a folder name that
        # contains glob metacharacters ('[', ']', '*', '?') is read as a
        # wildcard and the files inside it are silently skipped.
        pattern = os.path.join(glob.escape(root), '*' + ext)
        allfiles.extend(os.path.abspath(f) for f in glob.glob(pattern))
    return allfiles

def transfer(h5path,matpath=None,force=False):
    """
    Read every field of an HDF5 song file (.h5) into a dictionary.
    One entry is stored per hdf5_getters.get_* function (get_num_songs
    excluded), keyed by the getter name without its 'get_' prefix.
    If there is more than one song in the HDF5 file, each
    field name gets a number appended: 1, 2, 3, ...., numfiles
    No matfile is written by this function; the matfile checks below
    only decide whether the song file is processed at all.
    PARAM
        h5path  - path to the HDF5 song file
        matpath - path of the companion matfile, same as HDF5 path
                  with a '.mat' extension by default
        force   - if True, process the file even if matpath exists
    RETURN
        dict mapping field name to value, plus a 'transfer_note'
        entry, or False if one of the sanity checks failed (a
        message is printed in that case).
    RAISE
        MemoryError is re-raised after a message is printed; any
        other error coming from hdf5_getters propagates unchanged.
        The HDF5 file is closed in every case.
    NOTE
        All the data has to be loaded in memory! be careful
        if one file contains tons of songs!
    """
    # sanity checks
    if not os.path.isfile(h5path):
        print ('path to HF5 files does not exist:',h5path)
        return False
    if os.path.splitext(h5path)[1] != '.h5':
        print ('expecting a .h5 extension for file:',h5path)
        return False
    # check matfile
    if matpath is None:
        matpath = os.path.splitext(h5path)[0] + '.mat'
    if os.path.exists(matpath) and not force:
        print('matfile',matpath,'already exists (delete or force):')
        return False
    # get all getters! we assume that all we need is in hdf5_getters.py
    # further assume that they have the form get_blablabla and that's the
    # only thing that has that form. get_num_songs is a special case: it
    # gives the song count, not a per-song field.
    getters = [name for name in vars(hdf5_getters)
               if name.startswith('get_') and name != 'get_num_songs']
    matdata = {'transfer_note':'transferred on '+time.ctime()+' from file: '+h5path}
    # open h5 file
    h5 = hdf5_getters.open_h5_file_read(h5path)
    try:
        # counted inside the try so the file is closed even if this fails
        nSongs = hdf5_getters.get_num_songs(h5)
        # field names only get a numeric suffix when the file holds several songs
        multi = nSongs > 1
        # iterate over songs
        for songidx in range(nSongs):
            suffix = str(songidx+1) if multi else ''
            # iterate over getters
            for getter in getters:
                matdata[getter[4:] + suffix] = getattr(hdf5_getters, getter)(h5,songidx)
    except MemoryError:
        print('ran out of memory while reading:',h5path)
        raise
    finally:
        # close h5
        h5.close()
    return matdata



h5s = get_all_files('MillionSongSubset/data/')

# Gather one dict per song file first and build the frame in a single step:
# DataFrame.append copied the whole frame on every iteration and no longer
# exists in pandas 2.0.
records = []
for h5_file in h5s:
    xd = transfer(h5_file)
    # transfer() returns False when a sanity check rejects the file;
    # such a result is not a song record and must not become a row.
    if xd is not False:
        records.append(xd)

df = pd.DataFrame(records)
# Keep the columns in alphabetical order, the layout of the recorded output.
df = df.reindex(columns=sorted(df.columns))

df.head(10)

Unnamed: 0,analysis_sample_rate,artist_7digitalid,artist_familiarity,artist_hotttnesss,artist_id,artist_latitude,artist_location,artist_longitude,artist_mbid,artist_mbtags,...,tatums_confidence,tatums_start,tempo,time_signature,time_signature_confidence,title,track_7digitalid,track_id,transfer_note,year
0,22050.0,16971.0,0.55746,0.386152,b'AREJXK41187B9A4ACC',46.71067,b'France',1.71819,b'c43bb0d6-94d7-410f-80fb-e5a243b18d23',[],...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.04257, 0.28357, 0.52458, 0.76558, 1.00659, ...",124.059,4.0,0.372,b'Je Sais Que La Terre Est Plate',3073568.0,b'TRARRZU128F4253CA2',transferred on Thu Feb 28 15:38:21 2019 from f...,2008.0
1,22050.0,92108.0,0.626958,0.43486,b'AR2XRFQ1187FB417FE',,b'',,b'a69cd724-2f57-4ed0-bfed-ba20401eb84c',[],...,"[0.365, 0.321, 0.29, 0.264, 0.238, 0.217, 0.20...","[0.59161, 0.84042, 1.08799, 1.33555, 1.58436, ...",80.084,4.0,0.533,b'On Efface',4249244.0,b'TRARRJL128F92DED0E',transferred on Thu Feb 28 15:38:21 2019 from f...,2004.0
2,22050.0,1701.0,0.425724,0.0,b'ARODOO01187FB44F4A',,b'',,b'60bd8a1c-c093-4849-8f28-08101ca059b1',[],...,"[0.307, 0.305, 0.291, 0.284, 0.281, 0.28, 0.27...","[1.11537, 1.39701, 1.67864, 1.95344, 2.23097, ...",54.874,4.0,0.0,b'Howells Delight',5436063.0,b'TRARRUZ128F9307C57',transferred on Thu Feb 28 15:38:21 2019 from f...,0.0
3,22050.0,92184.0,0.611495,0.33452,b'ARJGW911187FB586CA',,b'',,b'44b5b950-2ae2-403a-8c67-82d8fc72033d',[],...,"[0.731, 0.628, 0.526, 0.359, 0.287, 0.246, 0.2...","[0.11929, 0.39309, 0.65603, 0.91083, 1.17222, ...",77.15,3.0,0.369,b'Martha Served',1199928.0,b'TRARRWA128F42A0195',transferred on Thu Feb 28 15:38:21 2019 from f...,2007.0
4,22050.0,278655.0,0.367255,0.311616,b'AR9HQ6Y1187FB3C2CB',,b'',,b'0e6524bd-6641-46a6-bce5-96f06c19aa46',[],...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.24248, 0.49266, 0.74283, 0.99301, 1.24318, ...",120.382,4.0,1.0,b'Zip-A-Dee-Doo-Dah (Song of the South)',8493899.0,b'TRARRPG12903CD1DE9',transferred on Thu Feb 28 15:38:21 2019 from f...,0.0
5,22050.0,16313.0,0.601306,0.363676,b'ARDPTGD1187B9AD361',36.87652,"b'Sikeston, MO'",-89.58828,b'097c86bc-b2b6-4791-833b-39b5385fe3e6',"[b'fusion', b'jazz fusion', b'classic pop and ...",...,"[0.237, 0.235, 0.234, 0.233, 0.226, 0.218, 0.2...","[0.11365, 0.26495, 0.41625, 0.56755, 0.71126, ...",99.024,4.0,1.0,b'Liquid Time (composition by John Goodsall)',5570526.0,b'TRARRER128F9328521',transferred on Thu Feb 28 15:38:21 2019 from f...,0.0
6,22050.0,21896.0,0.709011,0.553566,b'ARV8T9T1187B99F3F4',,b'',,b'efaefde1-e09b-4d49-9d8e-b1304d2ece8d',[b'finnish'],...,"[0.161, 0.154, 0.134, 0.117, 0.103, 0.094, 0.0...","[0.50326, 0.84168, 1.1801, 1.51853, 1.86368, 2...",175.673,4.0,0.0,b'Misery Path (From the Privilege of Evil)',2999402.0,b'TRARRYC128F428CCDA',transferred on Thu Feb 28 15:38:21 2019 from f...,0.0
7,22050.0,98670.0,0.548022,0.440135,b'ARJ5BEW1187FB52361',,b'',,b'de885e5b-284d-4dac-954c-48c7d7e2ebe5',[],...,"[0.351, 0.325, 0.44, 0.372, 0.364, 0.315, 0.30...","[0.09174, 0.41036, 0.73235, 1.09641, 1.45872, ...",87.999,4.0,0.954,b'Nuovi Re pt. I I (feat. Tek money - Lady Tam...,1283021.0,b'TRARROY128F42281F7',transferred on Thu Feb 28 15:38:21 2019 from f...,0.0
8,22050.0,94403.0,0.737038,0.539245,b'AR050VJ1187B9B13A7',,b'',,b'37c78aeb-d196-42b5-b991-6afb4fc9bc2e',"[b'punk', b'california', b'san francisco', b'r...",...,"[0.767, 0.742, 0.721, 0.704, 0.685, 0.672, 0.6...","[0.06784, 0.23214, 0.39241, 0.5559, 0.7194, 0....",92.897,4.0,0.879,b'Halloween',1959132.0,b'TRARREF128F422FD96',transferred on Thu Feb 28 15:38:22 2019 from f...,1982.0
9,22050.0,263016.0,0.435915,0.358149,b'AR8KUS11187B98C991',,b'',,b'050ce7ea-0935-430f-bcec-b83e702298eb',[],...,"[0.578, 0.485, 0.418, 0.349, 0.293, 0.212, 0.1...","[0.39517, 0.73361, 1.07887, 1.41731, 1.75062, ...",86.981,5.0,0.688,b'Parto em terras distantes',3779273.0,b'TRARRVB128F92F47CA',transferred on Thu Feb 28 15:38:22 2019 from f...,1998.0


In [15]:
############ HERE WE WILL CLEAN THE DATA ########################

# Rows whose year is 0 carry no release year, so they are useless as
# prediction targets; analysis_sample_rate is identical for every song,
# so the column adds no information.
has_year = df["year"] != 0
newdf = df[has_year].drop("analysis_sample_rate", axis=1)

newdf.shape


(4680, 54)