In [4]:
import billboard
from datetime import datetime
import numpy as np
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import logging

In [5]:
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

In [6]:
def get_billboard_charts(start_year = 1990, end_year = 2020, chart_name = 'rap-song', top_x = 25):
    
    """Fetch tracks from monthly billboard chart
    
    Args: 
        start_year (int): starting year of query
        end_year (int): ending year of query
        chart_name (int): name of billboard chart
        top_x(int): save top x songs from chart
        
    Return:
        query_results (obj): 'list' of artist, song, chart date, and chart name
    
    """
    
    currentmonth = datetime.now().month
    currentyear = datetime.now().year
    
    # Ensure end year is not in the future
    if end_year > currentyear:
        end_year = currentyear
    
    # Ensure start year is not before 1990 (no rap chart)
    if start_year < 1990:
        start_year = 1990
    
    # Create list of monthly dates from start_year to year before end_year
    years = [x for x in range(start_year, end_year)]
    months = ['-01-01','-02-01','-03-01','-04-01','-05-01',  '-06-01', '-07-01','-08-01','-09-01','-10-01','-11-01','-12-01']
    query_dates = [str(y) + d for y in years for d in months]
    
    # Append monthly dates of end_year, accounting for current date
    if end_year < currentyear:
        cur_year_dates = [str(end_year) + d for d in months]
        query_dates.extend(cur_year_dates)
    else:
        cur_year_dates = [str(end_year) + d for d in months[:currentmonth]]
        query_dates.extend(cur_year_dates)
        
    # Fetch chart data from Billboard API

    retry = [] # Unsuccessful queries from first try
    query_results = [] # Results from query
    unsuccessful = [] # Final queries not successful after second try 
    
    
    for qdate in query_dates:
        
        try:
            chart = billboard.ChartData(chart_name, date = qdate)
            results = [[chart[i].artist, chart[i].title, qdate, chart_name] for i in range(0, top_x)]
            query_results.extend(results)
            
            logger.info("{} ... successful query".format(qdate))

        except:
            retry.append(qdate) # if unsuccessfull, add to retry list
            print('{} not found'.format(qdate))
    
    for qdate in retry:
        
        try:
            chart = billboard.ChartData(chart_name, date = qdate)
            results = [[chart[i].artist, chart[i].title, qdate, chart_name] for i in range(0, top_x)]
            query_results.extend(results)
            
            logger.info("{} ... successful query".format(qdate))

        except:
            unsuccessful.append(qdate) # if unsuccessfull, add to retry list
            print('{} not found'.format(qdate))
            
    logger.warning("{} unsuccessful queries".format(len(unsuccessful)))
    
    return query_results

In [76]:
def prep_spotify_query(query_results):
    
    """ Manipulate and concatenate artist and song information based on Spotify API query format. Remove duplicate songs.
    
    Args:
        query_results (obj): 'List' of queries from get_billboard_charts
    
    Return:
        reesults_df (obj): 'DataFrame' of songs with Spotify query column
    
    """
    
    # Remove duplicates
    results_df = pd.DataFrame(query_results, columns = ['artist', 'track', 'date', 'chart'])
    results_df = (results_df.sort_values(by = 'date', ascending = False)
                            .groupby(['artist', 'track']).head(1)
                            .reset_index(drop = True))
    
    
    # Remove parts of the song/artist name that can prevent a successful Spotify query
    
    results_df['artist_q'] = results_df.artist.str.split(' Feat', 1).str[0]
    results_df['artist_q'] = results_df.artist_q.str.split('\(Feat', 1).str[0]
    results_df['artist_q'] = results_df.artist_q.str.split(' &', 1).str[0]
    results_df['artist_q'] = results_df.artist_q.str.split(' ,', 1).str[0]
    results_df['artist_q'] = results_df.artist_q.str.split(" Tell'em", 1).str[0]
    results_df['artist_q'] = results_df.artist_q.str.split(" Tell 'em", 1).str[0]
    results_df['artist_q'] = results_df.artist_q.str.replace(' X ', ' ')
    results_df['artist_q'] = results_df.artist_q.str.replace(" Co-Starring ", " ")
    results_df['artist_q'] = results_df.artist_q.str.replace("F/", "")
    results_df['artist_q'] = results_df.artist_q.str.replace(' Duet With ', ' ')
    results_df['track_q'] = results_df.track.str.split(" \(", 1).str[0]
    results_df['track_q'] = results_df.track_q.str.split(" Feat", 1).str[0]
    results_df['track_q'] = results_df.track_q.str.split("\(Feat", 1).str[0]
    results_df['track_q'] = results_df.track_q.str.split("/", 1).str[0]
    
    # Create final query column
    results_df['spotify_query'] = (results_df.artist_q + ' ' + results_df.track_q).str.replace(' ', '+')
    
    results_df.drop(['artist_q', 'track_q'], axis = 1, inplace = True)
    
    return results_df

In [61]:
def concat_charts(rap_df, all_df):
    
    """ Create dataframe of songs from hot-100 and rap-song charts. Remove hot-100 songs that are also in rap-song 
    
    """
    
    rap_songs = rap_df.track.values.tolist() # list of songs from rap chart
    
    all_df_no_rap = all_df.loc[~all_df.track.isin(rap_songs), :] # remove rap chart songs from hot-100 chart
    
    concat_df = pd.concat([rap_df, all_df_no_rap]) # combine dataframes
    
    logger.info("{} songs removed from Hot 100".format(len(all_df) - len(all_df_no_rap)))
    
    return concat_df

In [62]:
def get_spotify_metadata(query_df, cid, secret):
    
    """Obtain song metadata from Spotify API using Spotipy library
    
    Args:
    
        query_df (obj): 'DataFrame' with spotify_query column
        cid (str): CID Spotify credentials
        secret (str): Secret key for Spotify credentials
        
    Return:
    
        results (obj): 'DataFrame' of Spotify metadata
    
    """
    
    
    # Configure spotipy with Spotify credentials
    client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
    sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

    
    search_list = query_df['spotify_query'].values.tolist()
    
    query_found = []
    audio_features = []
    not_found = []
    
    # Fetch track metadata from Spotify
    for search in search_list:

        try:
            query = sp.search(search)
            song_id = query['tracks']['items'][0]['id']

            query_found.append(search)
            audio_feature = sp.audio_features(song_id)[0]
            audio_features.append(audio_feature)

        except:
            not_found.append(search)
            print(search, ' not found')
            
    logger.warning("{} songs not found".format(len(not_found)))
            
    spotify_features = pd.DataFrame(audio_features)
    spotify_features['spotify_query'] = query_found
    
    results = query_df.merge(spotify_features, on = 'spotify_query')
    
    return results

In [10]:
hot_100 = get_billboard_charts(chart_name = 'hot-100', top_x = 50)

1992-12-01 not found
1994-11-01 not found
1996-01-01 not found
1997-04-01 not found
1997-08-01 not found
1997-09-01 not found
1999-12-01 not found
2000-02-01 not found
2000-07-01 not found
2000-10-01 not found
2000-12-01 not found
2001-01-01 not found
2001-02-01 not found
2001-08-01 not found
2001-10-01 not found
2002-05-01 not found
2002-07-01 not found
2002-10-01 not found
2003-05-01 not found
2003-11-01 not found
2003-12-01 not found
2004-06-01 not found
2004-07-01 not found
2004-11-01 not found
2005-09-01 not found
2005-12-01 not found
2006-03-01 not found
2006-06-01 not found
2006-07-01 not found
2007-03-01 not found
2007-04-01 not found
2007-05-01 not found
2008-10-01 not found
2009-04-01 not found
2011-04-01 not found
2014-04-01 not found
2016-07-01 not found
1992-12-01 not found
2001-01-01 not found
2001-02-01 not found
2001-08-01 not found
2002-05-01 not found
2003-11-01 not found


In [78]:
hot100_df = prep_spotify_query(hot_100)

In [25]:
toprap = get_billboard_charts(chart_name = 'rap-song', top_x = 25)

In [79]:
toprap_df = prep_spotify_query(toprap)

In [80]:
all_df = concat_charts(toprap_df, hot100_df)

In [81]:
all_df.to_csv("all_df_0510.csv")

In [41]:
cid = "4d873f4f1fe442d399438bdef5efc4a6"
secret = "5edc8c8b39764a0fbf920ca326a3cbbc"

In [83]:
spotify_df = get_spotify_metadata(all_df, cid, secret)

Lil+Tecca+Ran$om  not found
retrying after...3secs
Yella+Beezy,+Gucci+Mane+Bacc+At+It+Again  not found
retrying after...1secs
Quavo+W+O+R+K+I+N++M+E  not found
J.+Cole+Kevin's+Heart  not found
Derez+De'Shon+Hardaway  not found
ScHoolboy+Q,+2+Chainz+X  not found
Chris+Brown+Pills+And+Automobiles  not found
retrying after...2secs
2+Chainz+x+Gucci+Mane+x+Quavo+Good+Drank  not found
Kendrick+Lamar+Untitled+02+l+06.23.2014.  not found
Kendrick+Lamar+Untitled+07+l+Levitate  not found
iLoveMemphis+Lean+&+Dabb  not found
retrying after...3secs
retrying after...1secs
retrying after...1secs
Steve+Aoki,+Chris+Lake+Delirious  not found
Wiz+Khalifa+Feautring+Project+Pat+KK  not found
QUE.+OG+Bobby+Johnson  not found
retrying after...1secs
A$AP+Rocky+Kendrick+Lamar,+Joey+Bada$$,+YelaWolf,+Danny+Brown,+Action+Bronson+1Train  not found
retrying after...2secs
E-40+With+YG,+iAMSU!+Function  not found
retrying after...2secs
J.+Cole+Who+Dat  not found
T.I.+I'm+Back  not found
Fat+Joe+(HaHa)+Slow+Down  not

White+Dawg+Restless  not found
Baby+DC+Bounce,+Rock,+Skate,+Roll  not found
Missy+"Misdemeanor"+Elliott+She's+A+Bitch  not found
C.+Webb+Gangsta!+Gangsta!  not found
retrying after...1secs
Mos+Def+Respiration  not found
The+2+Live+Crew+The+Real+One  not found
Chuck+Smooth+Who+Let+The+Dogs+Out?  not found
Infamous+Syndicate+Here+I+Go  not found
Mo+Thugs+Family+Ghetto+Cowboy  not found
Lox,+DMX,+Drag-On+Ryde+Or+Die  not found
No+Good-N-Jiggie+Lizard-Lizard  not found
Too+$hort+Invasion+Of+The+Flat+Booty+B*****s  not found
DJ+S&S+Beat+Of+The+Day  not found
Mr.+Money+Loc+Throw+Yo+Hood+Up  not found
B+Da+Outta+Sight+Child+Free+&+Single  not found
Tee+Kee+Every+Thing+I+Want  not found
Redman+Da+Goodness  not found
KRS-One+Buckshot,+Cam'ron,+Keith+Murray,+Killah+Priest,+Prodigy,+Redman,+Run+5+Boroughs  not found
Raheem+You+Scared,+You+Scared  not found
Medina+Green+Crosstown+Beef  not found
Mag+7+The+Street+Mix  not found
5CENT+Never+Enough  not found
Monie+Get+At+Me  not found
General+Grant+

M.C.+Breed+Ain't+Too+Much+Worried  not found
Zhigge+Toss+It+Up  not found
The+College+Boyz+Hollywood+Paradox  not found
Double+XX+Head+Cracker  not found
A.D.O.R.+Let+It+All+Hang+Out  not found
MC+Ren+The+Final+Frontier  not found
M.C.+Brains+Brainstorming  not found
Brothers+Uv+Da+Blakmarket+Livin'+In+Da+Bottle  not found
Chi-Ali+Roadrunner  not found
Ho+Frat+Ho!+Ho+Frat+Swing  not found
Yo-Yo+Home+Girl+Don't+Play+Dat  not found
Da+Youngsta's+Pass+Da+Mic  not found
Monie+Love+Full+Term+Love  not found
K-Solo+I+Can't+Hold+Back  not found
Kwame+Nastee  not found
M.C.+Breed+Ain't+To+Be+F...ed+With  not found
Dr.+Dre+Introducing+Snoop+Doggy+Dogg+Deep+Cover  not found
Penthouse+Players+Clique+Explanation+Of+A+Playa  not found
The+College+Boyz+Victim+Of+The+Ghetto  not found
Doug+E.+Fresh+Bustin'+Out  not found
Ultramagnetic+MC's+Poppa+Large  not found
Organized+Konfusion+Walk+Into+The+Sun  not found
BDP+13+And+Good  not found
The+Future+Sound+Lady  not found
M.C.+Brains+Everybody's+Talking

retrying after...1secs
retrying after...2secs
retrying after...2secs
retrying after...1secs
retrying after...2secs
Lil'+Mo+Superwoman+Pt.+II  not found
retrying after...1secs
Olivia+Bizounce  not found
retrying after...1secs
retrying after...1secs
Kenny+Rogers+With+Alison+Krauss+Buy+Me+A+Rose  not found
retrying after...1secs
retrying after...2secs
Marc+Nelson+15+Minutes  not found
Silkk+The+Shocker+It+Ain't+My+Fault+1+&+2  not found
retrying after...1secs
Myron+Destiny  not found
Link+Whatcha+Gone+Do?  not found
INOJ/LATHUN+Love+You+Down  not found
Missy+"Misdemeanor"+Elliott+Sock+It+2+Me  not found
retrying after...2secs
Various+Artists+ESPN+Presents+The+Jock+Jam  not found
Raybon+Bros.+Butterfly+Kisses  not found
3rd+Party+Can+U+Feel+It  not found
retrying after...2secs
Bill+Engvall+With+Special+Guest+Travis+Tritt+Here's+Your+Sign  not found
retrying after...2secs
Adam+Clayton+Theme+From+Mission:+Impossible  not found
Solo+Where+Do+U+Want+Me+To+Put+It  not found
retrying after...2se

In [87]:
spotify_df.to_csv("spotify_df_0510.csv")