In [17]:
import pandas as pd
import requests
from lyricsgenius import Genius
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from dotenv import load_dotenv
import os
import re
import time

In [18]:
# Populate the process environment via python-dotenv, then read the API
# credentials from it. Each constant is None when its variable is not set.
load_dotenv()
GENIUS_API_KEY = os.environ.get('GENIUS_API_KEY')
SPOTIFY_CLIENT_ID = os.environ.get('SPOTIFY_CLIENT_ID')
SPOTIFY_CLIENT_SECRET = os.environ.get('SPOTIFY_CLIENT_SECRET')

In [19]:
# Module-level Genius client that get_song() queries for lyrics.
genius = Genius(
    GENIUS_API_KEY,
    retries=3,
    sleep_time=1,
    verbose=False,
    remove_section_headers=True,
    skip_non_songs=True,
)

def get_song(name, artist=None, max_retries=3, retry_delay=10):
    """Search Genius for a song and return its lyrics with page boilerplate removed.

    Args:
        name: Song title to search for.
        artist: Optional artist name; when falsy, the search uses the title only.
        max_retries: Number of extra attempts made after a timed-out search.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        The cleaned lyrics string, or None when the search finds nothing, the
        result carries no lyrics, or every attempt timed out.
    """
    print(f"Doing {name} by {artist}", end='\r', flush=True)
    # Strips the header up to "Lyrics", the trailing "<n>Embed" marker and the
    # ticket / "You might also like" inserts from the scraped text.
    pattern = r'^[\s\S]*?Lyrics\s*|(\d+)Embed$|See.*Live|Get tickets.*?\$[\d,]+|You might also like'

    attempts = max(int(max_retries), 0) + 1
    for attempt in range(1, attempts + 1):
        try:
            song = genius.search_song(name, artist) if artist else genius.search_song(name)
        except (TimeoutError, requests.exceptions.Timeout):
            # NOTE(review): lyricsgenius appears to surface timeouts as
            # requests.exceptions.Timeout, which is not a builtin TimeoutError,
            # so both are handled here -- confirm against the installed version.
            if attempt == attempts:
                # Bounded retries instead of unbounded recursion.
                return None
            time.sleep(retry_delay)
            continue

        # search_song yields None when nothing matches; avoid None.lyrics.
        lyrics = getattr(song, 'lyrics', None)
        if lyrics is None:
            return None
        return re.sub(pattern, '', lyrics, flags=re.MULTILINE)

    return None

def get_song_alt(name, artist):
    """Fetch lyrics from the lyrics.ovh API as an alternative source.

    Args:
        name: Song title.
        artist: Artist name.

    Returns:
        The lyrics string, or "" when the request fails, the response is not
        valid JSON, or the payload carries no "lyrics" entry.
    """
    from urllib.parse import quote  # local import keeps this cell self-contained

    # Percent-encode each path segment so characters such as '/', '?' or '#'
    # in a title or artist (e.g. "AC/DC") cannot alter the URL structure.
    artist_segment = quote(str(artist), safe='')
    title_segment = quote(str(name), safe='')
    url = f"https://api.lyrics.ovh/v1/{artist_segment}/{title_segment}"

    try:
        payload = requests.get(url, timeout=10).json()
    except (requests.RequestException, ValueError):
        # Covers timeouts, connection failures and undecodable bodies alike.
        return ""

    if not isinstance(payload, dict):
        return ""
    # A payload without a "lyrics" key (e.g. an error body) yields "".
    return payload.get("lyrics") or ""

In [20]:
# Placeholder returned by get_audio_features() when no features are available:
# every feature name is present and mapped to None.
empty = dict.fromkeys(
    (
        'danceability',
        'energy',
        'key',
        'loudness',
        'mode',
        'speechiness',
        'acousticness',
        'instrumentalness',
        'liveness',
        'valence',
        'tempo',
        'duration_ms',
        'time_signature',
    )
)

In [21]:
# Spotify client authenticated with the client-credentials pair read from the
# environment; used by get_audio_features().
_spotify_credentials = SpotifyClientCredentials(
    client_id=SPOTIFY_CLIENT_ID,
    client_secret=SPOTIFY_CLIENT_SECRET,
)
sp = spotipy.Spotify(
    auth_manager=_spotify_credentials,
    retries=5,
    requests_timeout=15,
)

def get_audio_features(song_name, artist_name):
    """Look up a track on Spotify and return its audio features.

    Args:
        song_name: Track title used in the search query.
        artist_name: Artist name used in the search query.

    Returns:
        A dict of the audio features of the first search hit, without the
        'analysis_url', 'track_href', 'id', 'uri' and 'type' entries. When the
        search has no hits or no features come back, a fresh copy of the
        module-level `empty` placeholder is returned instead.
    """
    print(f"Doing {song_name} by {artist_name}", end='\r', flush=True)

    results = sp.search(q=f'track:{song_name} artist:{artist_name}', type='track', limit=1)
    items = results['tracks']['items']
    if not items:
        # Copy so callers can never mutate the shared placeholder.
        return dict(empty)

    features_list = sp.audio_features(items[0]['id'])
    # The list may be empty or hold None for a track without features;
    # a bare truthiness check on the list would let [None] through.
    features = features_list[0] if features_list else None
    if not features:
        return dict(empty)

    # Build a filtered copy rather than popping: tolerates missing keys and
    # leaves the client's response object untouched.
    bookkeeping = ('analysis_url', 'track_href', 'id', 'uri', 'type')
    return {key: value for key, value in features.items() if key not in bookkeeping}

In [22]:
# Read the track list and preview its first five rows.
df = pd.read_csv('newtracklist.csv')
df.head(n=5)

Unnamed: 0,Track,Artist
0,Good Luck Babe!,Chappell Roan
1,Not Like Us,Kendrick Lamar
2,Fortnight,Taylor Swift
3,A Bar Song (Tipsy),Shaboozey
4,Birds of a Feather,Billie Eilish


In [23]:
# Normalise every column label to lower case.
df.columns = df.columns.map(str.lower)

In [24]:
def _lyrics_for_row(row):
    """Fetch the lyrics for one row using its 'track' and 'artist' values."""
    return get_song(row['track'], row['artist'])


# One get_song() lookup per row; the results become the 'lyrics' column.
df['lyrics'] = df.apply(_lyrics_for_row, axis=1)

Doing Texas Hold 'Em by Beyoncéabrina Carpenters

In [25]:
def _features_for_row(row):
    """Fetch the audio-feature dict for one row's 'track' and 'artist'."""
    return get_audio_features(row['track'], row['artist'])


# result_type='expand' turns each returned dict into one column per key;
# the feature columns are then appended to the right of the existing frame.
audio_features_df = df.apply(_features_for_row, axis=1, result_type='expand')
df = pd.concat([df, audio_features_df], axis='columns')

Doing Texas Hold 'Em by Beyoncéabrina Carpenters

In [26]:
# Tag every row with the same edition, year and status values.
for _column, _value in (('edition', 67), ('year', 2025), ('status', 'Nominee')):
    df[_column] = _value

In [27]:
# Preview the first three rows of the assembled frame.
df.iloc[:3]

Unnamed: 0,track,artist,lyrics,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,edition,year,status
0,Good Luck Babe!,Chappell Roan,"It's fine, it's cool\nYou can say that we are ...",0.7,0.582,11.0,-5.96,0.0,0.0356,0.0502,0.0,0.0881,0.785,116.712,218424.0,4.0,67,2025,Nominee
1,Not Like Us,Kendrick Lamar,"Psst, I see dead people\n(Mustard on the beat,...",0.898,0.472,1.0,-7.001,1.0,0.0776,0.0107,0.0,0.141,0.214,101.061,274192.0,4.0,67,2025,Nominee
2,Fortnight,Taylor Swift,I was supposed to be sent away\nBut they forgo...,0.504,0.386,11.0,-10.976,1.0,0.0308,0.502,1.5e-05,0.0961,0.281,192.004,228965.0,4.0,67,2025,Nominee


In [28]:
# Preview the last three rows of the assembled frame.
df.iloc[-3:]

Unnamed: 0,track,artist,lyrics,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,edition,year,status
5,Die With a Smile,Lady Gaga & Bruno Mars,"(Ooh, ooh)\n\nI, I just woke up from a dream\n...",0.521,0.592,6.0,-7.777,0.0,0.0304,0.308,0.0,0.122,0.535,157.969,251668.0,3.0,67,2025,Nominee
6,Please Please Please,Sabrina Carpenter,"I know I have good judgment, I know I have goo...",0.675,0.586,9.0,-6.086,1.0,0.0531,0.257,0.0,0.104,0.621,107.015,186365.0,4.0,67,2025,Nominee
7,Texas Hold 'Em,Beyoncé,This ain't Texas (Woo)\nAin't no hold 'em (Hey...,0.727,0.711,2.0,-6.549,1.0,0.078,0.582,0.0,0.158,0.375,110.012,233457.0,4.0,67,2025,Nominee


In [29]:
# Write the assembled frame to disk without the row index.
df.to_csv(path_or_buf='new-dataset.csv', index=False)

In [30]:
# Number of columns in the final frame.
df.shape[1]

19