In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from bs4 import BeautifulSoup
import requests

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import time 

# Cache of Billboard scrape results, keyed by the lower-cased, hyphenated artist
# name. artist_billboard stores 0 or 1 here and returns the stored value on a
# repeat lookup instead of requesting the page again.
ARTIST_BB = {}

In [None]:
# function to scrape billboard site given artist name
def artist_billboard(artist_name):
    """Scrape Billboard's artist page and flag artists with Hot 100 history.

    The name is lower-cased and its spaces replaced with hyphens to build the
    page URL. Every definitive answer (including "no") is stored in ARTIST_BB
    under that slug, so each artist is requested at most once.

    Args:
        artist_name: Artist display name. Non-string values (for example the
            NaN pandas uses for a missing artist) return 0 without a request.

    Returns:
        1 when the page's first chart-history section is "Billboard Hot 100™"
        and the number in its ``artist-stat-3`` element is greater than 1;
        otherwise 0.
    """
    # A missing artist cannot be looked up; do not call .lower() on a float.
    if not isinstance(artist_name, str):
        return 0

    # convert artist name to - if spaces
    artist_name = artist_name.lower().replace(' ', '-')
    if artist_name in ARTIST_BB:
        return ARTIST_BB[artist_name]

    page = requests.get('https://www.billboard.com/artist/' + artist_name)
    if page.status_code == 404:
        ARTIST_BB[artist_name] = 0
        return 0
    if not page.ok:
        # Any other error status may be temporary, so answer 0 for this row
        # but leave the cache empty so a later row can retry the artist.
        return 0

    soup = BeautifulSoup(page.content, "html.parser")
    flag = 0
    top_100 = soup.find_all("div", class_="artist-chart-history-sticky-wrapper lrv-u-position-relative")
    if len(top_100) > 0:
        # The section heading is the first tab-separated token of the wrapper text.
        text = top_100[0].text.strip('\n').strip('\t').strip().split('\t')[0]
        if text == "Billboard Hot 100™":
            results = soup.find_all("span", class_="c-span a-font-primary-bold u-font-size-34 u-line-height-120 u-letter-spacing-0063 artist-stat-3")
            if len(results) > 0:
                try:
                    hits = int(results[0].text.strip())
                except ValueError:
                    # Stat text that is not a plain integer counts as no hits.
                    hits = 0
                # NOTE(review): an artist whose stat is exactly 1 is flagged 0
                # by this threshold -- confirm that is intended.
                if hits > 1:
                    flag = 1

    # Cache negatives as well as positives; previously the fall-through 0
    # was never stored and those artists were re-scraped for every row.
    ARTIST_BB[artist_name] = flag
    return flag

In [None]:
# Load the WRUV spins export and keep only the plays dated 2024-01-01 or later
dfs = []  # the chunking loops in later cells append their per-chunk frames here
directory = "dataset"
spins_path = os.path.join(directory, "Spins-search-results-9-23-23-3-22-24-for-WRUV.csv")
df = pd.read_csv(spins_path)

# Order rows chronologically while the raw Date-time column still exists
df = df.sort_values("Date-time", ignore_index=True)

# Drop columns that we already know we don't need
df = df.drop(columns=["Playlist Category", "Playlist Duration", "DJ Email", "Date-time", "Composer"])

# Parse Date and filter on an unambiguous timestamp. The explicit copy makes
# the result an independent frame, so later column assignments do not raise
# SettingWithCopyWarning.
df["Date"] = pd.to_datetime(df["Date"])
df = df[df["Date"] >= pd.Timestamp("2024-01-01")].copy()

df

In [None]:
# Tag each play with artistBB in ten chunks, writing every chunk to
# slice<i>.csv as soon as it is finished.
n_slices = 10
splits = len(df) // n_slices  # rows per chunk; a later cell reuses this value
print(splits)
print(type(splits))
for i in range(n_slices):
    lo = i * splits
    # The final chunk runs to the end of the frame so the len(df) % 10
    # leftover rows are processed instead of being silently dropped.
    hi = len(df) if i == n_slices - 1 else (i + 1) * splits
    print(i, lo, hi)
    # Copy so the new column is written to an independent frame rather than
    # to a slice of df (avoids SettingWithCopyWarning).
    chunk = df.iloc[lo:hi, :].copy()
    chunk['artistBB'] = chunk['Artist'].apply(artist_billboard)
    dfs.append(chunk)
    chunk.to_csv("slice" + str(i) + ".csv")
dfs

In [None]:
# Stack the per-chunk frames row-wise (concat's default axis) and save the result
df = pd.concat(dfs)
df.to_csv("2024-with-billboard-artists.csv")

In [None]:
# Clean the null values from the boolean columns New and Local.
# "N" / "L" mark a flagged row and become 1; everything else, including NaN,
# becomes 0. Accepting an existing 1 keeps the cell idempotent: re-running it
# no longer turns the already-encoded column into NaN.
df["New"] = df["New"].isin(["N", 1]).astype("int64")
df["Local"] = df["Local"].isin(["L", 1]).astype("int64")

# Since there are only 56 instances where artist, song, or release are null,
# we feel comfortable dropping those instances
df = df.dropna(subset=["Artist", "Song", "Release"])

# Count null values again
df.isna().sum()


In [None]:
def billboard_songs(artist_name, song_name):
    """Check whether a song appears in an artist's Billboard chart history.

    Two pages are requested: the artist page, to read the number in its
    ``artist-stat-3`` element, and then ``<artist page>/chart-history/hsi``,
    whose result rows are compared against ``song_name``.

    Args:
        artist_name: Artist display name; lower-cased and hyphenated to build
            the URL.
        song_name: Title compared for exact equality with the first
            tab-separated token of each result row.

    Returns:
        1 if one of the first ``hits`` result rows equals ``song_name``
        (a message is printed as well); otherwise 0. Also 0 when either
        argument is not a string, when a page request fails, or when the
        stat is missing, not an integer, or not greater than 1.
    """
    # Missing values arrive from pandas as NaN floats; nothing to look up.
    if not isinstance(artist_name, str) or not isinstance(song_name, str):
        return 0

    # convert artist name to - if spaces
    artist_name = artist_name.lower().replace(' ', '-')
    url = 'https://www.billboard.com/artist/' + artist_name
    page = requests.get(url)
    if not page.ok:
        return 0

    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find_all("span",
                            class_="c-span a-font-primary-bold u-font-size-34 u-line-height-120 u-letter-spacing-0063 artist-stat-3")
    if len(results) < 1:
        return 0
    try:
        hits = int(results[0].text.strip())
    except ValueError:
        return 0
    if hits <= 1:
        return 0

    page = requests.get(url + "/chart-history/hsi")
    if not page.ok:
        return 0
    soup = BeautifulSoup(page.content, "html.parser")
    songs = soup.find_all("div", class_="o-chart-results-list__item // lrv-u-flex lrv-u-flex-direction-column lrv-u-flex-grow-1 lrv-u-justify-content-center lrv-u-border-b-1 u-border-b-0@mobile-max lrv-u-border-color-grey-light lrv-u-padding-lr-2 lrv-u-padding-lr-1@mobile-max lrv-u-padding-tb-050@mobile-max")
    # Slice rather than index: the page can list fewer rows than `hits`,
    # which used to raise IndexError part-way through a long scrape.
    for entry in songs[:hits]:
        song = entry.text.strip('\n').strip('\t').strip().split('\t')[0]
        if song_name == song:
            print(f'billboard! {song_name} by {artist_name}')
            return 1
    return 0
# songBB: default every row to 0, then look up the song on Billboard only for
# rows whose artist was already flagged (artistBB == 1).
df['songBB'] = 0
on_billboard = df['artistBB'] == 1
# Apply to the flagged rows only. Applying to the whole frame scraped every
# row and then discarded the results for the artistBB == 0 rows.
if on_billboard.any():
    df.loc[on_billboard, 'songBB'] = df.loc[on_billboard].apply(
        lambda row: billboard_songs(row['Artist'], row['Song']), axis=1)
# next, run spotify and save it as csv
df.to_csv("2024-all-billboard.csv")

In [None]:

def spotify_connect():
    """Create a Spotify client authenticated with the client-credentials flow.

    The client id and secret are read from the SPOTIPY_CLIENT_ID and
    SPOTIPY_CLIENT_SECRET environment variables so that no secret is stored
    in the notebook. Credentials that were previously committed in this file
    should be treated as leaked and rotated.

    Returns:
        A ``spotipy.Spotify`` instance; this is the connection needed for any
        future query.

    Raises:
        RuntimeError: If either environment variable is unset or empty.
    """
    client_id = os.environ.get("SPOTIPY_CLIENT_ID")
    client_secret = os.environ.get("SPOTIPY_CLIENT_SECRET")
    if not client_id or not client_secret:
        raise RuntimeError(
            "Set SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before connecting to Spotify"
        )

    auth_manager = SpotifyClientCredentials(
        client_id=client_id,
        client_secret=client_secret,
        requests_session=True)

    return spotipy.Spotify(auth_manager=auth_manager)
    
# Create the client once at module level for efficiency, so the functions
# below reuse this connection instead of reconnecting on every call.
sp = spotify_connect()

# Returns a dictionary of audio-feature values, or an empty dictionary when
# the song cannot be matched.
def spotify_search_song(artist_name, song_name):
    """Look up a track on Spotify and return its audio features.

    Searches for ``track:<song_name>`` (up to 10 results) and picks the first
    result whose title equals ``song_name`` and whose first listed artist
    equals ``artist_name`` (exact, case-sensitive comparison).

    Args:
        artist_name: Expected name of the track's first artist.
        song_name: Expected track title; also used as the search query.

    Returns:
        A dict with the keys danceability, energy, key, loudness, mode,
        speechiness, acousticness, instrumentalness, liveness, valence,
        tempo and duration_ms. An empty dict is returned when either argument
        is not a string, no result matches, or Spotify has no features for
        the matched track.
    """
    feature_names = ('danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                     'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
                     'duration_ms')

    # Missing values arrive from pandas as NaN floats; nothing to search for.
    if not isinstance(artist_name, str) or not isinstance(song_name, str):
        return {}

    results = sp.search(q='track:' + song_name, type='track', limit=10)
    items = results['tracks']['items']

    # Scan the actual result list for a track matching BOTH title and artist.
    # The previous loop bounded itself on len(results) (the response dict, not
    # the items), stopped on a partial match, and read items[i] after the
    # increment, so it could pick the wrong track or raise IndexError.
    match = None
    for track in items:
        artists = track['artists']
        if track['name'] == song_name and artists and artists[0]['name'] == artist_name:
            match = track
            break
    if match is None:
        return {}

    # these are audio features!! like danceability, energy, key, loudness..
    # stored in a list of length 1 containing a dictionary (key = audio feature, value = value of that feature)
    features = sp.audio_features(match['id'])
    if not features or features[0] is None:
        return {}

    # 'key' now comes from the 'key' field; it was previously filled from 'energy'.
    return {name: features[0][name] for name in feature_names}

# Add Spotify audio features chunk by chunk, writing each chunk to slice<i>.csv.
# we have slice 0 and 1 rn (need 2 - 9)
feature_columns = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                   'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
                   'duration_ms']
first_slice, last_slice = 2, 9

# Start from an empty list: dfs still holds the Billboard-only chunks from the
# earlier loop, and appending to it would duplicate rows in the next concat.
dfs = []
for i in range(first_slice, last_slice + 1):
    lo = i * splits
    # The last chunk runs to the end of the frame so no trailing rows are lost.
    hi = len(df) if i == last_slice else (i + 1) * splits
    print(i, lo, hi)
    chunk = df.iloc[lo:hi, :].copy()
    # Build the feature table explicitly. Unmatched songs return {}, which
    # becomes a row of NaN, and reindex guarantees all twelve columns exist
    # even when no song in the chunk matched.
    feats = pd.DataFrame(
        [spotify_search_song(artist, song) for artist, song in zip(chunk['Artist'], chunk['Song'])],
        index=chunk.index,
    ).reindex(columns=feature_columns)
    chunk = chunk.assign(**{name: feats[name] for name in feature_columns})
    dfs.append(chunk)
    chunk.to_csv("slice" + str(i) + ".csv")
dfs

In [None]:
# Combine the chunks processed in this session
df = pd.concat(dfs, axis=0)

# Reload the chunks already saved to disk.
# right now we have 2 slices, hopefully we will get a lot more
n_saved_slices = 2
# index_col=0 restores the row index that to_csv wrote as the first column,
# instead of leaving it behind as an "Unnamed: 0" data column.
slices = [pd.read_csv(f'slice{i}.csv', index_col=0) for i in range(n_saved_slices)]
small = pd.concat(slices, axis=0)
# Parse small's own Date column. It was previously taken from df, a different
# frame whose index does not line up with the reloaded rows.
small["Date"] = pd.to_datetime(small["Date"])
small