# Data Processing / Feature Extraction

In [124]:
import os

import librosa
import numpy as np
import pandas as pd
import requests
import sklearn
from sklearn.preprocessing import StandardScaler

In [125]:
# One metadata table per genre, read in the order rock, jazz, classical, punk, hip hop.
rdf, jdf, cdf, pdf, hdf = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/rock_df.csv',
        'data/jazz_df.csv',
        'data/classical_df.csv',
        'data/punk_df.csv',
        'data/hiphop_df.csv',
    )
)

Each of these datasets includes a link to Spotify's 30-second preview of every song.

In [126]:
# Peek at the first five preview links of the rock table.
rdf['Preview'].head(n=5)

0    https://p.scdn.co/mp3-preview/c5742cc09643dc07...
1    https://p.scdn.co/mp3-preview/af0c42e6dacc0b8b...
2    https://p.scdn.co/mp3-preview/90e41778392f27b6...
3    https://p.scdn.co/mp3-preview/7cc3982631523940...
4    https://p.scdn.co/mp3-preview/b1dd1977653f3668...
Name: Preview, dtype: object

To work with these previews we first need to download them, so we wrote a function that fetches each file and saves it in a folder named after its genre.

In [127]:
def download_and_save(songid, url, genre, timeout=30):
    """Download a song preview and store it as ``data/mp3s/<genre>/<songid>.mp3``.

    A file that already exists on disk is left untouched, so the function can
    be called again for the same song without re-downloading it.

    Parameters
    ----------
    songid : str
        Identifier used as the file name (the ``.mp3`` extension is appended).
    url : str
        Address of the MP3 preview to fetch.
    genre : str
        Name of the sub-folder under ``data/mp3s`` that receives the file.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status; no file is written then.
    """
    filename = songid + '.mp3'
    outfile = os.path.join('data', 'mp3s', genre, filename)

    if os.path.exists(outfile):
        return

    print("downloading {}: {}".format(filename, url))
    # The second positional argument of requests.get is `params`; passing the
    # output path there would append it to the URL as a query string.
    r = requests.get(url, timeout=timeout)
    # Fail loudly instead of saving an error page under an .mp3 name, which
    # the existence check above would then treat as a finished download.
    r.raise_for_status()

    # Create the genre folder on first use so open() cannot fail on a missing directory.
    os.makedirs(os.path.dirname(outfile), exist_ok=True)
    with open(outfile, 'wb') as f:
        f.write(r.content)

So, for each genre, we acquired each of the sample mp3 files.

In [128]:
# Fetch each rock preview into data/mp3s/rock; files already on disk are skipped.
for song_id, preview_url in zip(rdf['ID'], rdf['Preview']):
    download_and_save(song_id, preview_url, 'rock')

In [129]:
# Fetch each classical preview into data/mp3s/classical; files already on disk are skipped.
for song_id, preview_url in zip(cdf['ID'], cdf['Preview']):
    download_and_save(song_id, preview_url, 'classical')

In [130]:
# Fetch each hip hop preview into data/mp3s/hiphop; files already on disk are skipped.
for song_id, preview_url in zip(hdf['ID'], hdf['Preview']):
    download_and_save(song_id, preview_url, 'hiphop')

In [131]:
# Fetch each punk preview into data/mp3s/punk; files already on disk are skipped.
for song_id, preview_url in zip(pdf['ID'], pdf['Preview']):
    download_and_save(song_id, preview_url, 'punk')

In [132]:
# Fetch each jazz preview into data/mp3s/jazz; files already on disk are skipped.
for song_id, preview_url in zip(jdf['ID'], jdf['Preview']):
    download_and_save(song_id, preview_url, 'jazz')

Now that we have the files, we can load them with librosa.

In [133]:
# Decode every rock preview, collecting the samples and the sampling rate in parallel lists.
rys = []
rsrs = []
for song_id in rdf['ID']:
    mp3_path = os.path.join('data', 'mp3s', 'rock', song_id + '.mp3')
    samples, rate = librosa.load(mp3_path)
    rys.append(samples)
    rsrs.append(rate)



In [134]:
# Decode every jazz preview, collecting the samples and the sampling rate in parallel lists.
jys = []
jsrs = []
for song_id in jdf['ID']:
    mp3_path = os.path.join('data', 'mp3s', 'jazz', song_id + '.mp3')
    samples, rate = librosa.load(mp3_path)
    jys.append(samples)
    jsrs.append(rate)

In [135]:
# Decode every hip hop preview, collecting the samples and the sampling rate in parallel lists.
hys = []
hsrs = []
for song_id in hdf['ID']:
    mp3_path = os.path.join('data', 'mp3s', 'hiphop', song_id + '.mp3')
    samples, rate = librosa.load(mp3_path)
    hys.append(samples)
    hsrs.append(rate)

In [136]:
# Decode every classical preview, collecting the samples and the sampling rate in parallel lists.
cys = []
csrs = []
for song_id in cdf['ID']:
    mp3_path = os.path.join('data', 'mp3s', 'classical', song_id + '.mp3')
    samples, rate = librosa.load(mp3_path)
    cys.append(samples)
    csrs.append(rate)

In [137]:
# Decode every punk preview, collecting the samples and the sampling rate in parallel lists.
pys = []
psrs = []
for song_id in pdf['ID']:
    mp3_path = os.path.join('data', 'mp3s', 'punk', song_id + '.mp3')
    samples, rate = librosa.load(mp3_path)
    pys.append(samples)
    psrs.append(rate)

Unfortunately, while all previews were sampled at the same rate, they vary in length. Nearly all are around 30 seconds long, except for 2 of the classical previews, and even among the 30-second previews some have 661500 samples while others have 661501.

In [138]:
# Distinct raw sample counts among the rock previews. The lengths are measured
# on the untrimmed signals (as in the other genre checks), because slicing to
# 661500 first would cap every count and hide longer clips.
pd.Series(rys).apply(len).unique()

array([661500], dtype=int64)

In [139]:
# Distinct sample counts among the jazz previews.
pd.Series(jys).map(len).unique()

array([661500, 661501], dtype=int64)

In [140]:
# Distinct sample counts among the classical previews.
pd.Series(cys).map(len).unique()

array([ 661500,  661501,  816608, 1175107], dtype=int64)

In [141]:
# Distinct sample counts among the hip hop previews.
pd.Series(hys).map(len).unique()

array([661500, 661501], dtype=int64)

In [142]:
# Distinct sample counts among the punk previews.
pd.Series(pys).map(len).unique()

array([661501, 661500], dtype=int64)

In order to make better use of these later on, the previews will be trimmed to the initial 661500 samples. For reference, the sampling rate will be kept as well.

In [143]:
# Attach the audio to each genre table: the signal cut to its first 661500
# samples, plus the sampling rate it was loaded at.
for frame, signals, rates in (
    (rdf, rys, rsrs),
    (cdf, cys, csrs),
    (hdf, hys, hsrs),
    (pdf, pys, psrs),
    (jdf, jys, jsrs),
):
    frame['Time Series'] = pd.Series(signals).apply(lambda signal: signal[:661500])
    frame['Sampling Rate'] = pd.Series(rates)

In [144]:
# Show the first five rock rows, now including the audio columns.
rdf.head(n=5)

Unnamed: 0.1,Unnamed: 0,ID,Duration,Explicit,Popularity,Danceability,Energy,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature,Preview,Time Series,Sampling Rate
0,1,62nQ8UZVqR2RMvkJHkcO2o,318226,False,72,0.285,0.846,2,-6.472,1,0.0438,0.0404,0.0,0.182,0.287,108.808,4,https://p.scdn.co/mp3-preview/c5742cc09643dc07...,"[-0.08670287, -0.23489232, -0.26327035, -0.243...",22050
1,2,59WN2psjkt1tyaxjspN8fp,313573,True,78,0.466,0.833,7,-4.215,1,0.304,0.0266,0.0,0.0327,0.661,88.785,4,https://p.scdn.co/mp3-preview/af0c42e6dacc0b8b...,"[-0.17826098, -0.44856685, -0.40792686, -0.389...",22050
2,3,3d9DChrdc6BOeFsbrZ3Is0,264306,False,81,0.559,0.345,4,-13.496,1,0.0459,0.0576,0.000105,0.141,0.458,84.581,4,https://p.scdn.co/mp3-preview/90e41778392f27b6...,"[0.004214895, 0.0075499583, -0.005760105, 0.00...",22050
3,6,5jafMI8FLibnjkYTZ33m0c,257480,False,73,0.418,0.383,4,-11.782,1,0.0257,0.0718,0.0177,0.0896,0.352,87.773,4,https://p.scdn.co/mp3-preview/7cc3982631523940...,"[0.0023394097, -0.058240935, -0.08615264, -0.0...",22050
4,9,4PtZE0h5oyPhCtPjg3NeYQ,255573,False,64,0.527,0.838,3,-6.013,1,0.0323,0.0206,0.000662,0.07,0.721,117.454,4,https://p.scdn.co/mp3-preview/b1dd1977653f3668...,"[0.0030142576, 0.09424075, 0.3024647, 0.384066...",22050


Now, we may also want to look at Mel-frequency spectrograms for these songs. Thus, for each time series, we computed the mel spectrograms with a logarithmic magnitude scale.

In [145]:
# Mel spectrogram of every rock preview, converted from power to decibels
# and stored in a new 'Log Mel' column.
rspecs = [
    librosa.power_to_db(librosa.feature.melspectrogram(y=signal, sr=rate))
    for signal, rate in zip(rdf['Time Series'], rdf['Sampling Rate'])
]
rdf['Log Mel'] = rspecs

In [146]:
# Mel spectrogram of every classical preview, converted from power to decibels
# and stored in a new 'Log Mel' column.
cspecs = [
    librosa.power_to_db(librosa.feature.melspectrogram(y=signal, sr=rate))
    for signal, rate in zip(cdf['Time Series'], cdf['Sampling Rate'])
]
cdf['Log Mel'] = cspecs

In [147]:
# Mel spectrogram of every hip hop preview, converted from power to decibels
# and stored in a new 'Log Mel' column.
hspecs = [
    librosa.power_to_db(librosa.feature.melspectrogram(y=signal, sr=rate))
    for signal, rate in zip(hdf['Time Series'], hdf['Sampling Rate'])
]
hdf['Log Mel'] = hspecs

In [148]:
# Mel spectrogram of every punk preview, converted from power to decibels
# and stored in a new 'Log Mel' column.
pspecs = [
    librosa.power_to_db(librosa.feature.melspectrogram(y=signal, sr=rate))
    for signal, rate in zip(pdf['Time Series'], pdf['Sampling Rate'])
]
pdf['Log Mel'] = pspecs

In [149]:
# Mel spectrogram of every jazz preview, converted from power to decibels
# and stored in a new 'Log Mel' column.
jspecs = [
    librosa.power_to_db(librosa.feature.melspectrogram(y=signal, sr=rate))
    for signal, rate in zip(jdf['Time Series'], jdf['Sampling Rate'])
]
jdf['Log Mel'] = jspecs

In [150]:
# Show the first five rock rows again, now with the 'Log Mel' column.
rdf.head(n=5)

Unnamed: 0.1,Unnamed: 0,ID,Duration,Explicit,Popularity,Danceability,Energy,Key,Loudness,Mode,...,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Time Signature,Preview,Time Series,Sampling Rate,Log Mel
0,1,62nQ8UZVqR2RMvkJHkcO2o,318226,False,72,0.285,0.846,2,-6.472,1,...,0.0404,0.0,0.182,0.287,108.808,4,https://p.scdn.co/mp3-preview/c5742cc09643dc07...,"[-0.08670287, -0.23489232, -0.26327035, -0.243...",22050,"[[16.351944, 12.501945, 7.521196, 3.8267663, 0..."
1,2,59WN2psjkt1tyaxjspN8fp,313573,True,78,0.466,0.833,7,-4.215,1,...,0.0266,0.0,0.0327,0.661,88.785,4,https://p.scdn.co/mp3-preview/af0c42e6dacc0b8b...,"[-0.17826098, -0.44856685, -0.40792686, -0.389...",22050,"[[16.86778, 12.591039, 15.987344, 19.949896, 1..."
2,3,3d9DChrdc6BOeFsbrZ3Is0,264306,False,81,0.559,0.345,4,-13.496,1,...,0.0576,0.000105,0.141,0.458,84.581,4,https://p.scdn.co/mp3-preview/90e41778392f27b6...,"[0.004214895, 0.0075499583, -0.005760105, 0.00...",22050,"[[6.358962, 0.6880936, -4.500276, -3.0977912, ..."
3,6,5jafMI8FLibnjkYTZ33m0c,257480,False,73,0.418,0.383,4,-11.782,1,...,0.0718,0.0177,0.0896,0.352,87.773,4,https://p.scdn.co/mp3-preview/7cc3982631523940...,"[0.0023394097, -0.058240935, -0.08615264, -0.0...",22050,"[[5.6829166, 0.66769683, -9.947701, -6.980886,..."
4,9,4PtZE0h5oyPhCtPjg3NeYQ,255573,False,64,0.527,0.838,3,-6.013,1,...,0.0206,0.000662,0.07,0.721,117.454,4,https://p.scdn.co/mp3-preview/b1dd1977653f3668...,"[0.0030142576, 0.09424075, 0.3024647, 0.384066...",22050,"[[8.955857, 8.933042, 11.030407, 10.565489, 7...."


In [151]:
# The class is imported by name in the imports cell: on older scikit-learn
# releases a bare `import sklearn` does not load the `preprocessing` submodule,
# so `sklearn.preprocessing` can raise AttributeError on a fresh kernel.
# This single scaler is re-fitted for every song in the MFCC cells.
scaler = StandardScaler()

In [152]:
# Twelve MFCCs per frame for every rock preview, standardised song by song.
rmfccs = []
for y, sr in zip(rdf['Time Series'], rdf['Sampling Rate']):
    # The signal is passed by keyword, matching the melspectrogram calls;
    # current librosa releases no longer accept it positionally.
    # The result is transposed before scaling, so the scaler standardises
    # each column of the transposed matrix.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=12).T
    # Re-fit on this song only, then scale it in the same step.
    rmfccs.append(scaler.fit_transform(mfcc))

In [153]:
# Twelve MFCCs per frame for every classical preview, standardised song by song.
cmfccs = []
for y, sr in zip(cdf['Time Series'], cdf['Sampling Rate']):
    # The signal is passed by keyword, matching the melspectrogram calls;
    # current librosa releases no longer accept it positionally.
    # The result is transposed before scaling, so the scaler standardises
    # each column of the transposed matrix.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=12).T
    # Re-fit on this song only, then scale it in the same step.
    cmfccs.append(scaler.fit_transform(mfcc))

In [154]:
# Twelve MFCCs per frame for every hip hop preview, standardised song by song.
hmfccs = []
for y, sr in zip(hdf['Time Series'], hdf['Sampling Rate']):
    # The signal is passed by keyword, matching the melspectrogram calls;
    # current librosa releases no longer accept it positionally.
    # The result is transposed before scaling, so the scaler standardises
    # each column of the transposed matrix.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=12).T
    # Re-fit on this song only, then scale it in the same step.
    hmfccs.append(scaler.fit_transform(mfcc))

In [155]:
# Twelve MFCCs per frame for every jazz preview, standardised song by song.
jmfccs = []
for y, sr in zip(jdf['Time Series'], jdf['Sampling Rate']):
    # The signal is passed by keyword, matching the melspectrogram calls;
    # current librosa releases no longer accept it positionally.
    # The result is transposed before scaling, so the scaler standardises
    # each column of the transposed matrix.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=12).T
    # Re-fit on this song only, then scale it in the same step.
    jmfccs.append(scaler.fit_transform(mfcc))

In [156]:
# Twelve MFCCs per frame for every punk preview, standardised song by song.
pmfccs = []
for y, sr in zip(pdf['Time Series'], pdf['Sampling Rate']):
    # The signal is passed by keyword, matching the melspectrogram calls;
    # current librosa releases no longer accept it positionally.
    # The result is transposed before scaling, so the scaler standardises
    # each column of the transposed matrix.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=12).T
    # Re-fit on this song only, then scale it in the same step.
    pmfccs.append(scaler.fit_transform(mfcc))

In [157]:
# Store each genre's list of scaled MFCC matrices in an 'MFCC' column of its table.
for frame, coefficients in (
    (rdf, rmfccs),
    (cdf, cmfccs),
    (hdf, hmfccs),
    (jdf, jmfccs),
    (pdf, pmfccs),
):
    frame['MFCC'] = pd.Series(coefficients)

In [160]:
# Label every row with its genre. Assigning a scalar broadcasts it to all rows,
# so no throw-away list is built and the result does not depend on the frame's
# index lining up with a freshly created Series.
rdf['Genre'] = "Rock"
cdf['Genre'] = "Classical"
hdf['Genre'] = "Hip Hop"
jdf['Genre'] = "Jazz"
pdf['Genre'] = "Punk"

With all MFCCs and Genres added, we finally combined the dataframes,

In [162]:
# Stack the five genre tables into one frame with a fresh row index, then
# discard the 'Unnamed: 0' column.
# NOTE(review): the rock table preview also shows an 'Unnamed: 0.1' column,
# which is kept here - confirm that is intended.
genre_frames = [rdf, cdf, hdf, jdf, pdf]
df = pd.concat(genre_frames, ignore_index=True).drop(columns='Unnamed: 0')

and saved it as a pickle.

In [164]:
# Persist the combined frame, array-valued columns included, to 'combined.pkl'.
pd.to_pickle(df, 'combined.pkl')