In [3]:
import pandas as pd
import numpy as np
import spotipy
import spotipy.util as util
import os 
from spotipy.oauth2 import SpotifyClientCredentials
pd.set_option('display.max_rows', None)
from IPython.display import Audio
import librosa
import sklearn

# Creating the Dataset

#### Spotify API Authentication

In [4]:
client_id = '##############'
client_secret = '####################'
credentials = SpotifyClientCredentials(client_id = client_id, client_secret = client_secret)
token = credentials.get_access_token()
spotify = spotipy.Spotify(client_credentials_manager=credentials, auth=token)

  after removing the cwd from sys.path.


#### Get most popular 2 hip-hop songs per year using search

In [5]:
songs = []
for year in range(1980, 2021):
    q_string = 'genre:hip-hop year:{input}'.format(input = year)
    search = spotify.search(q = q_string, type = 'track', limit = 2)
    songs.append(search['tracks']['items'][0])
    songs.append(search['tracks']['items'][1])

#### Take JSON and format into a table

In [6]:
df = pd.DataFrame()
for song in songs:
    year = song['album']['release_date'][0:4]
    title = song['name']
    artist = song['artists'][0]['name']
    album_name = song['album']['name']
    song_id = song['id']
    df = df.append({'title':title, 'artist':artist, 'album':album_name, 'id':song_id, 'year':year}, ignore_index = True)
df.head()

Unnamed: 0,album,artist,id,title,year
0,Zapp,Zapp,1LiwqqaafXkNQuWGW3HeId,More Bounce to the Ounce,1980
1,Kurtis Blow,Kurtis Blow,5TvcggVu3s2P1fMAS8BupX,The Breaks,1980
2,8th Wonder,The Sugarhill Gang,4UCkX8nrBlpxjrrEqtb46a,Apache,1981
3,Apache / Rapper's Delight [Digital 45],The Sugarhill Gang,2rSosmvUzlCiucKgpmbS0f,Apache - 45 Version,1981
4,Planet Rock,Afrika Bambaataa,3uy0jtkM8QYVTsBazkli1x,Planet Rock,1982


#### Replace duplicates and songs that aren't hip hop with 3rd place in each search.

In [7]:
#0, 3, 6, 7, 10, 32, 38, 64, 65, 72
# df[0] = {'album':'The Sugarhill Gang', 'artist':'The Sugarhill Gang', 'id':'1FnxNtKli7rcB9IU4J8MEg', 'title':'Rapper\'s Delight', 'year':'1980'}
def replace_row(row, album, artist, id, title, year):
    ser = df.loc[row]
    ser['album'] = album
    ser['artist'] = artist
    ser['id'] = id
    ser['title'] = title
    ser['year'] = year
    
replace_row(0, 'The Sugarhill Gang', 'The Sugarhill Gang', '1FnxNtKli7rcB9IU4J8MEg', 'Rapper\'s Delight', 1980)
replace_row(3, '8th Wonder', 'The Sugarhill Gang', '6m0wOLtrzaNwMofVVJTJqs', '8th Wonder', 1981)
replace_row(6, 'Money Dollar Bill Y\'all', 'Jimmy Spicer', '3Bu69QHyCAyBy8TQ8N1lIA', 'Money Dollar Bill Y\'all', 1983)
replace_row(7, 'Sucker D.J.\'s (I Will Survive)', 'Dimples D.', '3GdWz9M6RrwipY72BIdhMA', 'Sucker D.J.\'s (I Will Survive)', 1983)
replace_row(10, 'King of Rock', 'Run-D.M.C.', '7112WRQXlBGe4Os43yw8gV', 'King of Rock', 1985)
replace_row(32, 'It Was Written', 'Nas', '5PQmSHzWnlgG4EBuIqjac2', 'If I Ruled the World (Imagine That)', 1996)
replace_row(38, 'Things Fall Apart', 'The Roots', '4pdLZsxq0y5oJDb6Cxlokw', 'You Got Me', 1999)
replace_row(52, 'Blow the Whistle', 'Too $hort', '2lMg3lCMOGistaWBNGjuT3', 'Blow the Whistle', 2006)
replace_row(64, 'Dreams and Nightmares', 'Meek Mill', '42zd6DYQ4o4SECmTITrM1U', 'Dreams and Nightmares', 2012)
replace_row(65, 'good kid, m.A.A.d city', 'Kendrick Lamar', '2HbKqm4o0w5wEeEFXm2sD4', 'Money Trees', 2012)
replace_row(72, 'EVOL', 'Low Life', '7EiZI6JVHllARrX9PUvAdX', 'Low Life', 2016)

#### And now, add in some songs that either cover influential artists that were originally omitted (2Pac, JAY-Z, Kanye West), as well as a few more interesting or influential songs we'd like to look at.

In [8]:
df = df.append({'album':'My Beautiful Dark Twisted Fantasy', 'artist':'Kanye West', 'id':'22L7bfCiAkJo5xGSQgmiIO', 'title':'All of the Lights', 'year':2010}, ignore_index = True)
df = df.append({'album':'All Eyez On Me', 'artist':'2Pac', 'id':'20fBuVybkHgjF6vNhSMROD', 'title':'Ambitionz Az a Ridah', 'year':1996}, ignore_index = True)
df = df.append({'album':'Greatest Hits', 'artist':'2Pac', 'id':'3ia3dJETSOllPsv3LJkE35', 'title':'California Love', 'year':1998}, ignore_index = True)
df = df.append({'album':'Reasonable Doubt', 'artist':'JAY-Z', 'id':'3nDYsXggRQxf7PCNUjR4rz', 'title':'Dead Presidents II', 'year':1996}, ignore_index = True)
df = df.append({'album':'The Blueprint 3', 'artist':'JAY-Z', 'id':'2igwFfvr1OAGX9SKDCPBwO', 'title':'Empire State of Mind', 'year':2009}, ignore_index = True)
df = df.append({'album':'Fear of a Black Planet', 'artist':'Public Enemy', 'id':'1yo16b3u0lptm6Cs7lx4AD', 'title':'Fight the Power', 'year':1990}, ignore_index = True)
df = df.append({'album':'Late Registration', 'artist':'Kanye West', 'id':'1PS1QMdUqOal0ai3Gt7sDQ', 'title':'Gold Digger', 'year':2005}, ignore_index = True)
df = df.append({'album':'Lil Pump', 'artist':'Lil Pump', 'id':'43ZyHQITOjhciSUUNPVRHc', 'title':'Gucci Gang', 'year':2017}, ignore_index = True)
df = df.append({'album':'DAMN.', 'artist':'Kendrick Lamar', 'id':'7KXjTSCq5nL1LoYtL7XAwS', 'title':'HUMBLE.', 'year':2017}, ignore_index = True)
df = df.append({'album':'Watch the Throne', 'artist':'JAY-Z', 'id':'4Li2WHPkuyCdtmokzW2007', 'title':'Ni**as in Paris', 'year':2011}, ignore_index = True)
df = df.append({'album':'The Chronic', 'artist':'Dr. Dre', 'id':'4YtoipFgf4k0AfD17ZfD5X', 'title':'Nuthin\' But a G Thang', 'year':1992}, ignore_index = True)
df = df.append({'album':'7 EP', 'artist':'Lil Nas X', 'id':'2YpeDb67231RjR0MgVLzsG', 'title':'Old Town Road', 'year':2019}, ignore_index = True)
df = df.append({'album':'ASTROWORLD', 'artist':'Travis Scott', 'id':'2xLMifQCjDGFmkHkpNLD9h', 'title':'SICKO MODE', 'year':2018}, ignore_index = True)
df = df.append({'album':'Mecca and the Soul Brother', 'artist':'Pete Rock', 'id':'2Mb3zpobD0CvJGWv6NpsPy', 'title':'They Reminisce Over You (T.R.O.Y)', 'year':1992}, ignore_index = True)
df.tail()

Unnamed: 0,album,artist,id,title,year
91,Watch the Throne,JAY-Z,4Li2WHPkuyCdtmokzW2007,Ni**as in Paris,2011
92,The Chronic,Dr. Dre,4YtoipFgf4k0AfD17ZfD5X,Nuthin' But a G Thang,1992
93,7 EP,Lil Nas X,2YpeDb67231RjR0MgVLzsG,Old Town Road,2019
94,ASTROWORLD,Travis Scott,2xLMifQCjDGFmkHkpNLD9h,SICKO MODE,2018
95,Mecca and the Soul Brother,Pete Rock,2Mb3zpobD0CvJGWv6NpsPy,They Reminisce Over You (T.R.O.Y),1992


In [9]:
df

Unnamed: 0,album,artist,id,title,year
0,The Sugarhill Gang,The Sugarhill Gang,1FnxNtKli7rcB9IU4J8MEg,Rapper's Delight,1980
1,Kurtis Blow,Kurtis Blow,5TvcggVu3s2P1fMAS8BupX,The Breaks,1980
2,8th Wonder,The Sugarhill Gang,4UCkX8nrBlpxjrrEqtb46a,Apache,1981
3,8th Wonder,The Sugarhill Gang,6m0wOLtrzaNwMofVVJTJqs,8th Wonder,1981
4,Planet Rock,Afrika Bambaataa,3uy0jtkM8QYVTsBazkli1x,Planet Rock,1982
5,The Message,Grandmaster Flash & The Furious Five,5DuTNKFEjJIySAyJH1yNDU,The Message (feat. Melle Mel & Duke Bootee),1982
6,Money Dollar Bill Y'all,Jimmy Spicer,3Bu69QHyCAyBy8TQ8N1lIA,Money Dollar Bill Y'all,1983
7,Sucker D.J.'s (I Will Survive),Dimples D.,3GdWz9M6RrwipY72BIdhMA,Sucker D.J.'s (I Will Survive),1983
8,Jam On Revenge,Newcleus,1lB2kyB5h9ceZ388GBfC9L,Jam On It,1984
9,RUN-DMC (Expanded Edition),Run–D.M.C.,2J6QnTjHIWwXErNWyF0RUC,It's Like That,1984


#### Use Spotify's API to pull Spotify's built-in audio features to augument the ones we'll make later.

In [10]:
spotify_feature_df = pd.DataFrame()
for fl in spotify.audio_features(tracks = list(df['id'])):
    spotify_feature_df = spotify_feature_df.append({'danceability':fl['danceability'],
                              'energy':fl['energy'],
                              'loudness':fl['loudness'],
                              'speechiness':fl['speechiness'],
                              'acousticness':fl['acousticness'],
                             'valence':fl['valence'],
                             'tempo':fl['tempo']}, ignore_index = True)

In [11]:
spotify_feature_df.head()

Unnamed: 0,acousticness,danceability,energy,loudness,speechiness,tempo,valence
0,0.22,0.905,0.692,-14.143,0.199,111.292,0.943
1,0.0987,0.91,0.576,-13.256,0.109,112.877,0.907
2,0.332,0.827,0.928,-7.266,0.133,115.023,0.261
3,0.298,0.871,0.937,-8.536,0.195,106.494,0.675
4,0.011,0.915,0.807,-9.636,0.13,127.35,0.708


#### To match these songs with the audio files, we're going to have to find matches in strings, and manually cover the rest

In [12]:
matches = [0] * len(df)
for i in range(len(df)):
    song = df['title'][i]
    for name in os.listdir('./data'):
        if name.lower()[:-4] in song.lower():
            matches[i] = './data/' + name

In [14]:
df['file'] = pd.Series(matches)
df

Unnamed: 0,album,artist,id,title,year,file
0,The Sugarhill Gang,The Sugarhill Gang,1FnxNtKli7rcB9IU4J8MEg,Rapper's Delight,1980,./data/Rapper's Delight.mp3
1,Kurtis Blow,Kurtis Blow,5TvcggVu3s2P1fMAS8BupX,The Breaks,1980,./data/The Breaks.mp3
2,8th Wonder,The Sugarhill Gang,4UCkX8nrBlpxjrrEqtb46a,Apache,1981,./data/Apache.mp3
3,8th Wonder,The Sugarhill Gang,6m0wOLtrzaNwMofVVJTJqs,8th Wonder,1981,./data/8th Wonder.mp3
4,Planet Rock,Afrika Bambaataa,3uy0jtkM8QYVTsBazkli1x,Planet Rock,1982,./data/Planet Rock.mp3
5,The Message,Grandmaster Flash & The Furious Five,5DuTNKFEjJIySAyJH1yNDU,The Message (feat. Melle Mel & Duke Bootee),1982,./data/The Message.mp3
6,Money Dollar Bill Y'all,Jimmy Spicer,3Bu69QHyCAyBy8TQ8N1lIA,Money Dollar Bill Y'all,1983,./data/Money Dollar Bill Y'All.mp3
7,Sucker D.J.'s (I Will Survive),Dimples D.,3GdWz9M6RrwipY72BIdhMA,Sucker D.J.'s (I Will Survive),1983,./data/Sucker D.J.'s (I Will Survive).mp3
8,Jam On Revenge,Newcleus,1lB2kyB5h9ceZ388GBfC9L,Jam On It,1984,./data/Jam On It.mp3
9,RUN-DMC (Expanded Edition),Run–D.M.C.,2J6QnTjHIWwXErNWyF0RUC,It's Like That,1984,./data/It's Like That.mp3


#### Phew. That took a while. Now onto the actual feature extraction.

In [28]:
ss = sklearn.preprocessing.StandardScaler()
def filename_to_features(fname):
    song, rate = librosa.load(fname)

    # 12 MFCC's
    mfccs = librosa.feature.mfcc(y = song, sr = rate, n_mfcc = 12).T
    mfcc_scaled = ss.fit_transform(X = mfccs)
    mfcc_series = pd.DataFrame(mfcc_scaled).mean()

    # 12 chroma features
    chroma = librosa.feature.chroma_stft(y = song, sr = rate)
    chroma_scaled = ss.fit_transform(X = chroma)
    chroma_series = pd.DataFrame(mfcc_scaled).mean()

    zcr = pd.Series(np.mean(librosa.feature.zero_crossing_rate(y = song)))
    bandwidth = pd.Series(np.mean(librosa.feature.spectral_bandwidth(y = song)))
    centroid = pd.Series(np.mean(librosa.feature.spectral_centroid(y = song)))
    rolloff = pd.Series(np.mean(librosa.feature.spectral_rolloff(y = song)))
    return pd.concat([mfcc_series, chroma_series, zcr, bandwidth, centroid, rolloff]).reset_index(drop = True)

In [30]:
features = df['file'].apply(filename_to_features)







In [32]:
features.to_csv('audio_features.csv')

In [36]:
full_features = pd.concat([df, features, spotify_feature_df], axis = 1)
full_features.to_csv('full_data.csv')

In [37]:
pd.read_csv('full_data.csv')

Unnamed: 0.1,Unnamed: 0,album,artist,id,title,year,file,mfcc_0,mfcc_1,mfcc_2,...,bandwidth,centroid,rolloff,acousticness,danceability,energy,loudness,speechiness,tempo,valence
0,0,The Sugarhill Gang,The Sugarhill Gang,1FnxNtKli7rcB9IU4J8MEg,Rapper's Delight,1980,./data/Rapper's Delight.mp3,1.890811e-08,-3.804202e-07,-2.639254e-07,...,2932.269422,3271.149826,6835.88842,0.22,0.905,0.692,-14.143,0.199,111.292,0.943
1,1,Kurtis Blow,Kurtis Blow,5TvcggVu3s2P1fMAS8BupX,The Breaks,1980,./data/The Breaks.mp3,1.057499e-07,6.93338e-08,-1.520881e-07,...,2574.581462,2369.193568,5234.249962,0.0987,0.91,0.576,-13.256,0.109,112.877,0.907
2,2,8th Wonder,The Sugarhill Gang,4UCkX8nrBlpxjrrEqtb46a,Apache,1981,./data/Apache.mp3,-1.414954e-07,1.60602e-08,-4.006917e-08,...,2535.741911,2907.329677,5791.839284,0.332,0.827,0.928,-7.266,0.133,115.023,0.261
3,3,8th Wonder,The Sugarhill Gang,6m0wOLtrzaNwMofVVJTJqs,8th Wonder,1981,./data/8th Wonder.mp3,3.262833e-07,-1.683564e-08,-7.625627e-08,...,2537.327894,3135.83475,5981.420399,0.298,0.871,0.937,-8.536,0.195,106.494,0.675
4,4,Planet Rock,Afrika Bambaataa,3uy0jtkM8QYVTsBazkli1x,Planet Rock,1982,./data/Planet Rock.mp3,-8.671024e-08,7.907207e-08,-5.019072e-08,...,2803.020324,3522.282559,6855.14545,0.011,0.915,0.807,-9.636,0.13,127.35,0.708
5,5,The Message,Grandmaster Flash & The Furious Five,5DuTNKFEjJIySAyJH1yNDU,The Message (feat. Melle Mel & Duke Bootee),1982,./data/The Message.mp3,-2.832823e-08,2.935369e-09,1.472571e-07,...,2574.355798,2934.204205,5828.924053,0.0249,0.947,0.607,-10.58,0.202,100.619,0.732
6,6,Money Dollar Bill Y'all,Jimmy Spicer,3Bu69QHyCAyBy8TQ8N1lIA,Money Dollar Bill Y'all,1983,./data/Money Dollar Bill Y'All.mp3,-2.792772e-08,1.844505e-07,7.472578e-08,...,2159.502236,1746.548657,3682.36835,0.0303,0.862,0.669,-10.26,0.158,99.379,0.668
7,7,Sucker D.J.'s (I Will Survive),Dimples D.,3GdWz9M6RrwipY72BIdhMA,Sucker D.J.'s (I Will Survive),1983,./data/Sucker D.J.'s (I Will Survive).mp3,1.63549e-07,1.327147e-07,-1.987773e-09,...,2233.021289,2369.362194,4649.272644,0.038,0.868,0.832,-9.08,0.136,107.979,0.931
8,8,Jam On Revenge,Newcleus,1lB2kyB5h9ceZ388GBfC9L,Jam On It,1984,./data/Jam On It.mp3,-8.541609e-08,1.585952e-08,1.381151e-07,...,2907.043492,3021.514286,6548.568717,0.00302,0.804,0.782,-6.224,0.0606,116.806,0.346
9,9,RUN-DMC (Expanded Edition),Run–D.M.C.,2J6QnTjHIWwXErNWyF0RUC,It's Like That,1984,./data/It's Like That.mp3,-1.686745e-07,-4.395201e-08,1.53506e-08,...,2478.655649,2936.570346,5778.77703,0.0591,0.964,0.889,-5.169,0.147,121.144,0.552
