In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import json

In [2]:
# Load the ceremony table; the "Network" column is not used anywhere below.
awards = pd.read_csv("awards-information.csv").drop(columns=["Network"])
# Replace each ceremony date with (calendar year of that date - 1).
awards["Date"] = pd.to_datetime(awards["Date"]).apply(lambda stamp: stamp.year - 1)
# Keep 1960 through 2023 inclusive and renumber the rows from zero.
awards = awards[awards["Date"].between(1960, 2023)].reset_index(drop=True)

In [3]:
awards.head()

Unnamed: 0,Edition,Date
0,3rd,1960
1,4th,1961
2,5th,1962
3,6th,1963
4,7th,1964


In [4]:
def get_url(edition, year):
    """Return the grammy.com awards-page URL for one ceremony.

    Args:
        edition: Ordinal label of the ceremony, e.g. "3rd" or "66th".
        year: Year value associated with the ceremony.

    Returns:
        The page URL. For years below 2017 the slug ends after
        "annual-grammy-awards"; from 2017 on, "-<year>" is appended.
    """
    base = f"https://www.grammy.com/awards/{edition}-annual-grammy-awards"
    return base if year < 2017 else f"{base}-{year}"

In [5]:
# Build every ceremony's page URL from its edition label and year.
awards['url'] = [
    get_url(edition, year)
    for edition, year in zip(awards['Edition'], awards['Date'])
]

In [6]:
awards.head()

Unnamed: 0,Edition,Date,url
0,3rd,1960,https://www.grammy.com/awards/3rd-annual-gramm...
1,4th,1961,https://www.grammy.com/awards/4th-annual-gramm...
2,5th,1962,https://www.grammy.com/awards/5th-annual-gramm...
3,6th,1963,https://www.grammy.com/awards/6th-annual-gramm...
4,7th,1964,https://www.grammy.com/awards/7th-annual-gramm...


In [7]:
test = awards["url"][0]
test

'https://www.grammy.com/awards/3rd-annual-grammy-awards'

In [8]:
awards.tail(5)

Unnamed: 0,Edition,Date,url
59,62nd,2019,https://www.grammy.com/awards/62nd-annual-gram...
60,63rd,2020,https://www.grammy.com/awards/63rd-annual-gram...
61,64th,2021,https://www.grammy.com/awards/64th-annual-gram...
62,65th,2022,https://www.grammy.com/awards/65th-annual-gram...
63,66th,2023,https://www.grammy.com/awards/66th-annual-gram...


In [9]:
# Re-point the sample URL at row 58 of the ceremony table; the cells below fetch and parse this page.
test = awards["url"][58]

In [10]:
awards["url"][63]

'https://www.grammy.com/awards/66th-annual-grammy-awards-2023'

In [11]:
import requests
from bs4 import BeautifulSoup

# URL of the page you want to scrape
url = test

# Send a request to the page; the timeout keeps a stalled connection from
# hanging the kernel indefinitely
response = requests.get(url, timeout=30)
# Stop here on an HTTP error status instead of parsing an error page and
# failing later with an unrelated-looking exception
response.raise_for_status()

# Create a BeautifulSoup object and specify the parser
soup = BeautifulSoup(response.content, 'html.parser')

In [12]:
# The page data is embedded as JSON in the <script id="__NEXT_DATA__"> tag; parse it in one step.
data = json.loads(
    soup.find('script', id="__NEXT_DATA__", type="application/json").string
)

In [13]:
# Award categories listed under the first hit of the embedded page data.
categories = data["props"]["pageProps"]["pageContent"]["getAwardsYears"]["hits"][0]["categoryDetails"]

# The same hit also exposes the show year; printed here as a quick check.
print(data["props"]["pageProps"]["pageContent"]["getAwardsYears"]["hits"][0]["showYear"])

2018


In [14]:
# Keep the nominations of the first category whose name mentions
# "song of the year"; info stays None when no category matches.
info = None
for category in categories:
    if "song of the year" not in category["title"][0]['name'].lower():
        continue
    info = category["nominations"]
    break

In [None]:
# Single-cell version of the fetch -> parse -> select steps for the page at `url`.
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
data = json.loads(
    soup.find('script', id="__NEXT_DATA__", type="application/json").string
)
categories = data["props"]["pageProps"]["pageContent"]["getAwardsYears"]["hits"][0]["categoryDetails"]

# Keep the nominations of the first category whose name mentions
# "song of the year"; info stays None when no category matches.
info = None
for category in categories:
    if "song of the year" not in category["title"][0]['name'].lower():
        continue
    info = category["nominations"]
    break

In [15]:
info

[{'__typename': 'Nominations',
  'title': 'This Is America',
  'isWinner': True,
  'nomineeOrder': 8,
  'displayLine1': '"This Is America"',
  'displayLine2': '',
  'displayLine3': 'Donald Glover, Ludwig Göransson & Jeffery Lamar Williams, songwriters (Childish Gambino)',
  'creditedArtists': [{'__typename': 'Artist',
    'title': 'Childish Gambino',
    'creditedAs': 'Songwriter',
    'imageType': 'Default',
    'manualImage': '',
    'slug': 'artists/childish-gambino/18365',
    'tivoInfo': {'__typename': 'TivoInfo',
     'idField': None,
     'titleField': 'Childish Gambino',
     'damDynamic': None,
     'damThumbnail': None}},
   {'__typename': 'Artist',
    'title': 'Ludwig Goransson',
    'creditedAs': 'Songwriter',
    'imageType': 'Default',
    'manualImage': '',
    'slug': 'artists/ludwig-goransson/187476',
    'tivoInfo': {'__typename': 'TivoInfo',
     'idField': None,
     'titleField': 'Ludwig Goransson',
     'damDynamic': None,
     'damThumbnail': None}},
   {'__type

In [16]:
# Print one line per nomination: a Winner/Nominee label, the title, and the text
# inside the first pair of parentheses in displayLine3 (None when it has none).
for i in info:
    print('Winner:' if i['isWinner'] else 'Nominee:', i['title'], "-", (lambda s: re.search(r'\((.*?)\)', s).group(1) if re.search(r'\((.*?)\)', s) else None)(i['displayLine3']))

Winner: This Is America - Childish Gambino
Nominee: Shallow - Lady Gaga & Bradley Cooper
Nominee: The Middle - Zedd, Maren Morris & Grey
Nominee: The Joke - Brandi Carlile
Nominee: In My Blood - Shawn Mendes
Nominee: God's Plan - Drake
Nominee: Boo'd Up - Ella Mai
Nominee: All The Stars - Kendrick Lamar & SZA


In [17]:
from lyricsgenius import Genius
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from dotenv import load_dotenv
import os
import time

In [18]:
# Load variables via python-dotenv, then read the API credentials from the
# process environment (each is None when the variable is not set).
load_dotenv()
GENIUS_API_KEY = os.environ.get('GENIUS_API_KEY')
SPOTIFY_CLIENT_ID = os.environ.get('SPOTIFY_CLIENT_ID')
SPOTIFY_CLIENT_SECRET = os.environ.get('SPOTIFY_CLIENT_SECRET')

In [19]:
# Shared Genius client used by get_song; the API key is read from the environment above.
# NOTE(review): option semantics (sleep_time, retries, ...) are defined by lyricsgenius — confirm against its docs.
genius = Genius(GENIUS_API_KEY,
                skip_non_songs=True,
                remove_section_headers=True,
                verbose=False,
                sleep_time=1,
                retries=3
            )

def get_song(name, artist=None):
    """Look a song up on Genius and return its lyrics with page boilerplate removed.

    Args:
        name: Song title to search for.
        artist: Optional artist name; when falsy the search uses the title only.

    Returns:
        The lyrics text after stripping everything up to the first "Lyrics"
        heading, a trailing "<digits>Embed" marker, and the "See ... Live",
        "Get tickets ... $<amount>" and "You might also like" inserts.

    Raises:
        LookupError: If the Genius search returns no song.
    """
    pattern = r'^[\s\S]*?Lyrics\s*|(\d*)Embed$|See.*Live|Get tickets.*?\$[\d,]+|You might also like'
    song = genius.search_song(name, artist) if artist else genius.search_song(name)
    if song is None:
        # Fail with a descriptive error rather than an AttributeError on None;
        # callers that catch Exception to fall back to another source still do.
        raise LookupError(f"Genius returned no result for {name!r} (artist: {artist!r})")
    return re.sub(pattern, '', song.lyrics, flags=re.MULTILINE)

def get_song_alt(name, artist):
    """Fetch lyrics from the lyrics.ovh API, used as a fallback lyrics source.

    Args:
        name: Song title.
        artist: Artist name.

    Returns:
        The value of the "lyrics" field of the JSON response, or "" when the
        request times out (10 second limit).

    Raises:
        KeyError: If the JSON response has no "lyrics" field.
        Other request or JSON-decoding errors propagate unchanged.
    """
    from urllib.parse import quote

    # Percent-encode both path segments so characters such as "/", "?" or "#"
    # in a title or artist name cannot change the URL's structure.
    artist_part = quote(str(artist), safe="")
    name_part = quote(str(name), safe="")
    try:
        return requests.get(f"https://api.lyrics.ovh/v1/{artist_part}/{name_part}", timeout=10).json()["lyrics"]
    except requests.Timeout:
        return ""

In [20]:
# Placeholder audio-feature record: every feature field present, all set to None.
empty = dict.fromkeys(
    (
        'danceability',
        'energy',
        'key',
        'loudness',
        'mode',
        'speechiness',
        'acousticness',
        'instrumentalness',
        'liveness',
        'valence',
        'tempo',
        'duration_ms',
        'time_signature',
    )
)

In [21]:
# Shared Spotify client used by get_audio_features, authenticated with the app's client ID/secret.
# NOTE(review): requests_timeout is presumably in seconds and retries a per-request retry cap — confirm in spotipy docs.
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=SPOTIFY_CLIENT_ID, 
                                                           client_secret=SPOTIFY_CLIENT_SECRET),
                    requests_timeout=15, retries=5)

def get_audio_features(song_name, artist_name):
    """Return Spotify audio features for the best search match of a song.

    Args:
        song_name: Track title used in the Spotify search query.
        artist_name: Artist name used in the Spotify search query.

    Returns:
        A dict of audio features with the bookkeeping fields
        (analysis_url, track_href, id, uri, type) removed. When the search
        finds no track, or no features are available for it, a fresh copy of
        the all-None `empty` record is returned instead.
    """
    results = sp.search(q=f'track:{song_name} artist:{artist_name}', type='track', limit=1)
    items = results['tracks']['items']
    if not items:
        # Copy so callers can never mutate the shared placeholder.
        return dict(empty)

    audio_features = sp.audio_features(items[0]['id'])

    # The reply is a list; its single entry can be None when Spotify has no
    # analysis for the track, which previously crashed on `.pop`.
    if not audio_features or audio_features[0] is None:
        return dict(empty)

    features = dict(audio_features[0])
    for bookkeeping_key in ('analysis_url', 'track_href', 'id', 'uri', 'type'):
        # Tolerate a missing key instead of raising KeyError.
        features.pop(bookkeeping_key, None)
    return features

In [22]:
# Work on a 10-ceremony sample (fewer if the table is shorter). The fixed seed
# makes a fresh "Restart & Run All" pick the same ceremonies every time.
awards = awards.sample(n=min(10, len(awards)), random_state=42)

In [23]:
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

# Convert the pandas DataFrame to a Dask DataFrame split into 4 partitions,
# so that make_dataset can later be mapped over the partitions in parallel.
ddf = dd.from_pandas(awards, npartitions=4)  

In [24]:
def _clean_song_title(title):
    """Strip release-type tags, soundtrack/theme notes and control characters from a title."""
    remove_extra = r"(\s*[\(\[](?:.*?(?:Adapted|Theme|Single|Track|Album|Motion Picture|Film).*?)[\)\]]|-\s*(?=.*?(?:Adapted|Theme|Single|Track|Album|Motion Picture|Film)).*)"
    song = re.sub(r'\s*(\((Single|Track|Album)\))\s', '', title)
    song = re.sub(remove_extra, '', song)
    return re.sub(r'[\r\t\n]+', '', song)


def _extract_artist(credit_line):
    """Return the lead artist named in a nominee credit line, or None if none can be found."""
    if not isinstance(credit_line, str):
        return None
    # Prefer the parenthesised name right after "songwriter(s)"; otherwise
    # fall back to the first parenthesised group anywhere in the line.
    match = re.search(r'(songwriters|songwriter)\s*\(([^)]+)\)', credit_line)
    if match:
        artist = match.group(2)
    else:
        match = re.search(r'\((.*?)\)', credit_line)
        if match is None:
            return None
        artist = match.group(1)
    # Cut everything from the first featuring/and/with marker or separator onwards.
    artist = re.sub(r"(?i)(\b(?:feat|featuring|ft|&|and|with)\b.*)|((,|\(|&|\[|(- Various Artists)).*$)", "", artist, flags=re.MULTILINE).strip()
    # NOTE(review): the literal below is a mis-decoded spelling, presumably as it
    # appears in the scraped data — confirm before "correcting" it.
    artist = re.sub(r"(BeyoncÃ©)", "Beyonce", artist, flags=re.MULTILINE).strip()
    return re.sub(r'[\r\t\n]+', '', artist)


def _fetch_song_of_the_year(url, timeout=30):
    """Download one awards page and return (show_year, nominations).

    nominations is None when the page lists no "song of the year" category.
    Raises a requests exception on network/HTTP failure, ValueError when the
    embedded JSON payload is missing or invalid, and KeyError/IndexError when
    the payload does not have the expected structure.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    tag = soup.find('script', id="__NEXT_DATA__", type="application/json")
    if tag is None or not tag.string:
        raise ValueError(f"no __NEXT_DATA__ payload found at {url}")
    hit = json.loads(tag.string)["props"]["pageProps"]["pageContent"]["getAwardsYears"]["hits"][0]

    nominations = None
    for category in hit["categoryDetails"]:
        if "song of the year" in category["title"][0]['name'].lower():
            nominations = category["nominations"]
            break
    return hit["showYear"], nominations


def _lookup_lyrics(song, artist):
    """Return lyrics from Genius, falling back to lyrics.ovh; None when both fail."""
    try:
        return get_song(song, artist)
    except Exception:
        # Genius failed or found nothing. The fallback API needs an artist
        # in its URL, so there is nothing more to try without one.
        if artist is None:
            return None
    try:
        lyrics = get_song_alt(song, artist)
        print(f"\nTrying alt lyrics for {song} by {artist}")
        return lyrics
    except Exception:
        print("\nThe alt also failed")
        return None


def make_dataset(awards, delay=4):
    """Scrape Song Of The Year nominees for each ceremony and enrich them.

    Args:
        awards: DataFrame with at least the columns "Edition" and "url"
            (one row per ceremony page to scrape).
        delay: Seconds to pause after each nominee and after each ceremony,
            to stay polite towards the scraped site and the APIs.

    Returns:
        A DataFrame with one row per nominee containing edition, year,
        status ("Winner"/"Nominee"), name, artist, lyrics, plus the audio
        feature fields. `artist` is None when no artist could be parsed from
        the credit line; `lyrics` is None when both lyric sources fail.
        Ceremonies whose page cannot be fetched or parsed are reported and
        skipped rather than aborting the whole run.
    """
    records = []

    for _, row in awards.iterrows():
        try:
            year, nominations = _fetch_song_of_the_year(row["url"])
        except (requests.RequestException, KeyError, IndexError, TypeError, ValueError, AttributeError) as err:
            # One unreachable or malformed page must not discard every row
            # collected so far for this partition.
            print(f"\nSkipping the {row['Edition']} Grammys ({row['url']}): {err!r}")
            time.sleep(delay)
            continue

        for nominee in nominations or []:
            status = 'Winner' if nominee['isWinner'] else 'Nominee' 
            song = _clean_song_title(nominee['title'])

            # Always rebind `artist` for this nominee so a parse failure can
            # never reuse the previous nominee's artist (or hit a NameError).
            artist = _extract_artist(nominee.get('displayLine3'))
            if artist is None:
                print(row["Edition"], song)

            lyrics = _lookup_lyrics(song, artist)

            features = empty
            try:
                features = get_audio_features(song, artist)
            except Exception as e:
                print(f"\nAn error occured getting audio features for {song} by {artist}")
                print(e)

            new_row = {
                "edition": row["Edition"],
                "year": year,
                "status": status,
                "name": song,
                "artist": artist,
                "lyrics": lyrics
            }
            new_row.update(features)
            records.append(new_row)

            time.sleep(delay)

        time.sleep(delay)

    return pd.DataFrame(records)

In [None]:
# Run make_dataset on every partition and collect the results into one pandas DataFrame.
with ProgressBar():
    # Expected output columns, passed to Dask as `meta` so it knows the result layout up front.
    columns = ["edition", "year", "status", "name", "artist", "lyrics", 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']
    result_ddf = ddf.map_partitions(make_dataset, meta=pd.DataFrame(columns=columns))
    # compute() triggers the actual scraping work.
    final_df = result_ddf.compute()

[                                        ] | 0% Completed | 356.00 us

In [None]:
# Old code, without parallelization

# final_df = make_dataset(awards)
# final_df.head()

In [None]:
# final_df.loc[final_df["name"] == "Theme From Exodus", "lyrics"] = ""

In [None]:
final_df.head()

Unnamed: 0,edition,year,status,name,artist,lyrics,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,5th,1962,Winner,What Kind Of Fool Am I,Sammy Davis,What kind of fool am I\nWho never fell in love...,0.231,0.31,11.0,-8.959,0.0,0.0301,0.96,0.000134,0.383,0.161,94.009,202617.0,4.0
1,5th,1962,Nominee,The Sweetest Sounds,Peggy Lee,The sweetest sounds I'll ever hear\nAre still ...,0.436,0.453,5.0,-13.835,1.0,0.156,0.892,0.0,0.949,0.611,152.92,101602.0,4.0
2,5th,1962,Nominee,My Coloring Book,Sandy Stewart,For those who fancy colouring books\nAnd lots ...,,,,,,,,,,,,,
3,5th,1962,Nominee,I Left My Heart In San Francisco,Tony Bennett,The loveliness of Paris seems somehow sadly ga...,0.313,0.078,0.0,-17.583,1.0,0.0332,0.955,4e-06,0.169,0.125,128.065,170960.0,4.0
4,5th,1962,Nominee,As Long As He Needs Me,Shirley Bassey and by Della Reese,#\n03' Adolescence - J. Cole\n100 Degrees - Ri...,,,,,,,,,,,,,


In [None]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 61 entries, 0 to 17
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   edition           61 non-null     object 
 1   year              61 non-null     int64  
 2   status            61 non-null     object 
 3   name              61 non-null     object 
 4   artist            61 non-null     object 
 5   lyrics            61 non-null     object 
 6   danceability      59 non-null     float64
 7   energy            59 non-null     float64
 8   key               59 non-null     float64
 9   loudness          59 non-null     float64
 10  mode              59 non-null     float64
 11  speechiness       59 non-null     float64
 12  acousticness      59 non-null     float64
 13  instrumentalness  59 non-null     float64
 14  liveness          59 non-null     float64
 15  valence           59 non-null     float64
 16  tempo             59 non-null     float64
 17  dura

In [None]:
final_df.to_excel('lyrics-dataset.xlsx', index=False, engine='openpyxl')

In [None]:
final_df.sample(5)

Unnamed: 0,edition,year,status,name,artist,lyrics,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
9,64th,2021,Nominee,Peaches,Justin Bieber,"I got my peaches out in Georgia (Oh, yeah, shi...",0.677,0.696,0.0,-6.181,1.0,0.119,0.321,0.0,0.42,0.464,90.03,198082.0,4.0
6,19th,1976,Nominee,The Wreck Of The Edmund Fitzgerald,Gordon Lightfoot,The legend lives on from the Chippewa on down\...,0.308,0.376,4.0,-15.24,1.0,0.0626,0.116,3e-06,0.166,0.158,186.047,389280.0,3.0
5,36th,1993,Nominee,The River Of Dreams,Billy Joel,"In the middle of the, I go walking in the\nIn ...",0.631,0.749,7.0,-8.15,1.0,0.0576,0.205,5e-06,0.0667,0.437,89.646,247627.0,4.0
5,19th,1976,Nominee,This Masquerade,George Benson,​\nAre we really happy here with this lonely g...,0.625,0.373,5.0,-12.549,0.0,0.0383,0.5,0.000524,0.132,0.43,88.543,483133.0,4.0
1,35th,1992,Nominee,Save The Best For Last,Vanessa Williams,Sometimes the snow comes down in June\nSometim...,0.528,0.234,3.0,-15.784,1.0,0.0283,0.641,0.0,0.183,0.191,95.911,218733.0,4.0
