# Data Collection

## Collecting Popular Songs from Billboard Data Scraping

In [3]:
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [2]:
# get Hot 100 for each year
def get_hot100(year=2023):
    # get data from billboard Year-End Hot-100 Songs Chart
    url = 'https://www.billboard.com/charts/year-end/' + str(year) + '/hot-100-songs/'
    page = urlopen(url)
    html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")
    # get text only, remove \t and \n
    html_text = soup.get_text().replace('\t','').split('\n')
    # remove empty strings and whitespaces
    html_text[:] = [i for i in html_text if i!='' and not i.isspace()]
    # convert to list
    ls = []
    for rank in range(1,101):
        i = html_text.index(str(rank))
        ls.append(html_text[i:i+3])
    # convert to pd dataframe
    df = pd.DataFrame(ls, columns = ['billboard_ranking', 'song', 'artist'])
    df['year'] = year
    df['popular_or_not']=1
    df = df.convert_dtypes()
    return df

In [3]:
hot100_2023 = get_hot100(2023)

In [19]:
# # save data to local file
# hot100_2023.to_csv('billboardHot100_2023.csv',index=False)

## Collecting Necessary Information of Popular Songs from Spotify
This section, using above 100 songs from billboard Hot 100, retrieves their track ID, album name, album ID, popularity score (out of 100), and url link to a 30s song snippet from Spotify API. These info are necessary for collecting unpopular songs, song snippet links will be used in audio feature extraction.

In [4]:
import pandas as pd
from urllib.parse import quote

In [7]:
# Connect to Spotify API
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

spotify = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

In [6]:
# Search a specific Spotify track ID based on song name and artist name
def get_track_id(tracks, artists):
    track_id=[]

    for t, a in zip(tracks, artists):
        a = a.replace('Featuring ','')
        query = quote('track:' + t + ' artist:' + a)
        search_result = spotify.search(q=query, type="track", market="AU", limit=1, offset=0)['tracks']['items']
        if len(search_result)>0:
            track_id.append(search_result[0]['id'])
        else:
            track_id.append(None)

    return track_id

In [7]:
hot100_2023['track_id'] = get_track_id(hot100_2023['song'], hot100_2023['artist'])

In [10]:
hot100_2023['track_id'].count()

100

In [9]:
# hot100_2023.to_csv('billboard_hot100_2023.csv', index = False)

In [None]:
hot100_2023

Unnamed: 0,billboard_ranking,song,artist,year,popular_or_not,track_id
0,1,Last Night,Morgan Wallen,2023,1,7K3BhSpAxZBznislvUMVtn
1,2,Flowers,Miley Cyrus,2023,1,1ZJwhLQLt2quexYKP2Q3gk
2,3,Kill Bill,SZA,2023,1,2oqB9en8cOf6Sl9NMEeu7H
3,4,Anti-Hero,Taylor Swift,2023,1,779ooI3rBd0CLqCiiJmtVo
4,5,Creepin',"Metro Boomin, The Weeknd & 21 Savage",2023,1,2dHHgzDwk4BJdRwy9uXhTO
...,...,...,...,...,...,...
95,96,"Bzrp Music Sessions, Vol. 53",Bizarrap & Shakira,2023,1,4nrPB8O7Y7wsOCJdgXkthe
96,97,Meltdown,Travis Scott Featuring Drake,2023,1,0O9WkgPDfZ7A9DMDocF6ur
97,98,Put It On Da Floor Again,Latto Featuring Cardi B,2023,1,6c6WmIHcHlhccEwSFBhzNa
98,99,Bloody Mary,Lady Gaga,2023,1,53jnnqFSRGMDB9ADrNriCA


In [None]:
def get_data(track_id):
    arr = [[] for _ in range(4)]
    
    for id in track_id:
        track_info = spotify.track(track_id=id, market='AU')
        arr[0].append(track_info['album']['name'])
        arr[1].append(track_info['album']['id'])
        arr[2].append(track_info['popularity'])
        arr[3].append(track_info['preview_url'])
    
    return arr

In [None]:
arr = get_data(hot100_2023['track_id'])

In [None]:

hot100_2023['album'], hot100_2023['album_id'], hot100_2023['spotify_popularity'], hot100_2023['audio_link'] = arr[0], arr[1], arr[2], arr[3]

In [14]:
hot100_2023

Unnamed: 0,billboard_ranking,song,artist,year,popular_or_not,track_id,spotify_popularity,album,album_id,audio_link
0,1,Last Night,Morgan Wallen,2023,1,7K3BhSpAxZBznislvUMVtn,85,One Thing At A Time,6i7mF7whyRJuLJ4ogbH2wh,
1,2,Flowers,Miley Cyrus,2023,1,7DSAEUvxU8FajXtRloy8M0,92,Endless Summer Vacation,5DvJgsMLbaR1HmAI6VhfcQ,https://p.scdn.co/mp3-preview/5184d19d1b7fcc3e...
2,3,Kill Bill,SZA,2023,1,3OHfY25tqY28d16oZczHc8,83,SOS,07w0rG5TETcyihsEIZR3qG,https://p.scdn.co/mp3-preview/49dfe4ea78c5d3c5...
3,4,Anti-Hero,Taylor Swift,2023,1,0V3wPSX9ygBnCm8psDIegu,89,Midnights,151w1FgRZfnKZA9FEcg9Z3,
4,5,Creepin',"Metro Boomin, The Weeknd & 21 Savage",2023,1,2dHHgzDwk4BJdRwy9uXhTO,87,HEROES & VILLAINS,7txGsnDSqVMoRl6RQ9XyZP,
...,...,...,...,...,...,...,...,...,...,...
95,96,"Bzrp Music Sessions, Vol. 53",Bizarrap & Shakira,2023,1,4nrPB8O7Y7wsOCJdgXkthe,64,Las Mujeres Ya No Lloran,3fonA82Hl7huJiQCwKkzGA,https://p.scdn.co/mp3-preview/3ba573cb752298ab...
96,97,Meltdown,Travis Scott Featuring Drake,2023,1,0O9WkgPDfZ7A9DMDocF6ur,83,UTOPIA,18NOKLkZETa4sWwLMIm0UZ,https://p.scdn.co/mp3-preview/7c7e0e4419d101bc...
97,98,Put It On Da Floor Again,Latto Featuring Cardi B,2023,1,6c6WmIHcHlhccEwSFBhzNa,69,Put It On Da Floor Again (feat. Cardi B),4A43tzEN3jILvseI1HeXGG,https://p.scdn.co/mp3-preview/3cd934af8742a4c0...
98,99,Bloody Mary,Lady Gaga,2023,1,53jnnqFSRGMDB9ADrNriCA,77,Born This Way (International Special Edition V...,6LY3AerY6KNGOPsNPL63Kk,


In [None]:
# hot100_2023.to_csv('billboard_hot100_2023.csv', index = False)

## Collecting Unpopular Songs

Spotify songs in the same album all have songs snippets (preview_url) or all don't  
Since we are getting songs from the same album, and need audio link to get audio features to train ML model, only tracks with a link are useful

In [21]:
import pandas as pd
import time

In [2]:
# Connect to Spotify API
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

spotify = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

In [22]:
hot100_2023 = pd.read_csv('billboard_hot100_2023.csv')

In [23]:
# drop all songs without links
popular_songs = hot100_2023.dropna(subset=['audio_link']).reset_index(drop=True)

In [24]:
len(popular_songs)

55

In [25]:
popular_songs.to_csv('popular_songs.csv', index=False)

In [6]:
unpopular_songs = pd.DataFrame({
    'album': popular_songs['album'],
    'album_id': popular_songs['album_id'],
    'artist': popular_songs['artist'],
    'year': popular_songs['year'],
    'billboard_ranking': 0,
    'popular_or_not': 0
})

In [7]:
unpopular_songs.head(5)

Unnamed: 0,album,album_id,artist,year,billboard_ranking,popular_or_not
0,Endless Summer Vacation,5DvJgsMLbaR1HmAI6VhfcQ,Miley Cyrus,2023,0,0
1,SOS,07w0rG5TETcyihsEIZR3qG,SZA,2023,0,0
2,Gettin' Old,5Uly85dJHHDfHQCsyUQ8gw,Luke Combs,2023,0,0
3,SOS,07w0rG5TETcyihsEIZR3qG,SZA,2023,0,0
4,Bebe,0FWLTTcCKsd3b1PiW9GTEN,David Guetta & Bebe Rexha,2023,0,0


In [9]:
def get_unpopular_track(album_ids):
    arr = [[] for _ in range(4)]

    for id in album_ids:
        search_result = spotify.album_tracks(album_id=id, market='AU')
        # get all tracks in an album
        album_tracks = {track['id']: None for track in search_result['items']}
        # record their popularity scores
        for id in album_tracks:
            track = spotify.track(track_id=id, market='AU')
            album_tracks[id] = track['popularity']
        # find the one with lowest score
        min_key = min(album_tracks, key=lambda k: album_tracks[k])
        track = spotify.track(track_id=min_key, market='AU')
        arr[0].append(track['name'])
        arr[1].append(track['id'])
        arr[2].append(track['popularity'])
        arr[3].append(track['preview_url'])
        # add a 30s timer to avoid Spotify API rate limit
        time.sleep(30)
    
    return arr

In [10]:
# because need to work around spotify's api rate limit, this operation will take 30min
arr = get_unpopular_track(unpopular_songs['album_id'])

In [11]:
unpopular_songs['song'], unpopular_songs['track_id'], unpopular_songs['spotify_popularity'], unpopular_songs['audio_link'] = arr[0], arr[1], arr[2], arr[3]

In [26]:
unpopular_songs.head(5)

Unnamed: 0,album,album_id,artist,year,billboard_ranking,popular_or_not,song,track_id,spotify_popularity,audio_link
0,Endless Summer Vacation,5DvJgsMLbaR1HmAI6VhfcQ,Miley Cyrus,2023,0,0,Handstand,6eiNVtWCRsU7N6JDPXFI2W,56,https://p.scdn.co/mp3-preview/c55b106ce42d91f0...
1,SOS,07w0rG5TETcyihsEIZR3qG,SZA,2023,0,0,Too Late,4rAg5bbrdZX00mXXhLvYXj,66,https://p.scdn.co/mp3-preview/cfd6bdd6babe9cea...
2,Gettin' Old,5Uly85dJHHDfHQCsyUQ8gw,Luke Combs,2023,0,0,A Song Was Born,4AhZSJyGTfYqUnFjn5eGQa,55,https://p.scdn.co/mp3-preview/3cb99b6f94fb1007...
3,SOS,07w0rG5TETcyihsEIZR3qG,SZA,2023,0,0,Too Late,4rAg5bbrdZX00mXXhLvYXj,66,https://p.scdn.co/mp3-preview/cfd6bdd6babe9cea...
4,Bebe,0FWLTTcCKsd3b1PiW9GTEN,David Guetta & Bebe Rexha,2023,0,0,Born Again,29y1cN7e9sTX0gBx4y7OCv,32,https://p.scdn.co/mp3-preview/2cfae3cf9aa275f7...


In [13]:

unpopular_songs.to_csv('unpopular_songs.csv', index=False)

## Composing all_songs Data

In [1]:
import pandas as pd

In [2]:
popular_songs = pd.read_csv('popular_songs.csv')
unpopular_songs = pd.read_csv('unpopular_songs.csv')

In [5]:
all_songs = pd.concat([popular_songs,unpopular_songs], axis=0)

In [6]:
all_songs

Unnamed: 0,billboard_ranking,song,artist,year,popular_or_not,track_id,spotify_popularity,album,album_id,audio_link
0,2,Flowers,Miley Cyrus,2023,1,7DSAEUvxU8FajXtRloy8M0,92,Endless Summer Vacation,5DvJgsMLbaR1HmAI6VhfcQ,https://p.scdn.co/mp3-preview/5184d19d1b7fcc3e...
1,3,Kill Bill,SZA,2023,1,3OHfY25tqY28d16oZczHc8,83,SOS,07w0rG5TETcyihsEIZR3qG,https://p.scdn.co/mp3-preview/49dfe4ea78c5d3c5...
2,8,Fast Car,Luke Combs,2023,1,1Lo0QY9cvc8sUB2vnIOxDT,87,Gettin' Old,5Uly85dJHHDfHQCsyUQ8gw,https://p.scdn.co/mp3-preview/1dc0426c95058783...
3,9,Snooze,SZA,2023,1,4iZ4pt7kvcaH6Yo8UoZ4s2,91,SOS,07w0rG5TETcyihsEIZR3qG,https://p.scdn.co/mp3-preview/8c53920b5fd2c317...
4,10,I'm Good (Blue),David Guetta & Bebe Rexha,2023,1,4uUG5RXrOk84mYEfFvj3cK,90,Bebe,0FWLTTcCKsd3b1PiW9GTEN,https://p.scdn.co/mp3-preview/c1de960c1a98f7ab...
...,...,...,...,...,...,...,...,...,...,...
46,0,Hot Grease,Young Nudy Featuring 21 Savage,2023,0,3guv5zn5hDfpPpKfcmCTxS,39,Gumbo,1UbeEAPS49eulB659XSU9g,https://p.scdn.co/mp3-preview/b245449843dff7ea...
47,0,"Bzrp Music Sessions, Vol. 53 - Tiësto Remix",Bizarrap & Shakira,2023,0,22Q0eVNpSUIWG0prN6cVmI,63,Las Mujeres Ya No Lloran,3fonA82Hl7huJiQCwKkzGA,https://p.scdn.co/mp3-preview/3ba573cb752298ab...
48,0,PARASAIL (feat. Yung Lean & Dave Chappelle),Travis Scott Featuring Drake,2023,0,2Q0aElTZQtEUsoiaQfizu6,68,UTOPIA,18NOKLkZETa4sWwLMIm0UZ,https://p.scdn.co/mp3-preview/82f3c0d5844693c4...
49,0,Put It On Da Floor Again - Instrumental,Latto Featuring Cardi B,2023,0,3H2tSGsIkHnhT2tuHiHtAF,25,Put It On Da Floor Again (feat. Cardi B),4A43tzEN3jILvseI1HeXGG,https://p.scdn.co/mp3-preview/7844fd6714a5ee13...


In [8]:
all_songs = all_songs.drop(columns=['artist','year','track_id','album','album_id'])

In [9]:
all_songs

Unnamed: 0,billboard_ranking,song,popular_or_not,spotify_popularity,audio_link
0,2,Flowers,1,92,https://p.scdn.co/mp3-preview/5184d19d1b7fcc3e...
1,3,Kill Bill,1,83,https://p.scdn.co/mp3-preview/49dfe4ea78c5d3c5...
2,8,Fast Car,1,87,https://p.scdn.co/mp3-preview/1dc0426c95058783...
3,9,Snooze,1,91,https://p.scdn.co/mp3-preview/8c53920b5fd2c317...
4,10,I'm Good (Blue),1,90,https://p.scdn.co/mp3-preview/c1de960c1a98f7ab...
...,...,...,...,...,...
46,0,Hot Grease,0,39,https://p.scdn.co/mp3-preview/b245449843dff7ea...
47,0,"Bzrp Music Sessions, Vol. 53 - Tiësto Remix",0,63,https://p.scdn.co/mp3-preview/3ba573cb752298ab...
48,0,PARASAIL (feat. Yung Lean & Dave Chappelle),0,68,https://p.scdn.co/mp3-preview/82f3c0d5844693c4...
49,0,Put It On Da Floor Again - Instrumental,0,25,https://p.scdn.co/mp3-preview/7844fd6714a5ee13...


In [10]:
all_songs.to_csv("all_songs.csv", index=False)