In [None]:
from time import sleep
from typing import Optional

import pandas as pd
from dotenv import dotenv_values
from lyricsgenius import Genius
from requests.exceptions import Timeout

In [None]:
# Key/value settings read from the local .env file; the Genius API token is
# looked up in this mapping when the client is created.
config = dotenv_values(".env")
# Pickle file used both to reload previously fetched lyrics and to save the result.
output_path = "data/all_songs_with_lyrics.pkl"

In [None]:
# Genius API client, created with the access token stored in the .env config.
# A missing GENIUS_CLIENT_ACCESS_TOKEN entry raises KeyError here.
genius = Genius(config["GENIUS_CLIENT_ACCESS_TOKEN"])

In [None]:
def get_lyrics(artist_name: str, song_name: str) -> Optional[str]:
    """
        Search Genius for a song and return its lyrics.

        Args:
            artist_name: Artist name passed to the Genius search.
            song_name: Song title passed to the Genius search.

        Returns:
            The lyrics of the song returned by the search, or None when the
            search yields no song.
    """
    # search_song takes the title first and the artist second.
    song = genius.search_song(song_name, artist_name)
    return song.lyrics if song else None


In [None]:
# Load the two pickled song tables, one per playlist.
# NOTE(review): presumably produced by an earlier notebook in this project — confirm.
df_apres_ski = pd.read_pickle("data/df_apres_ski.pkl")
df_top_2000 = pd.read_pickle("data/df_top_2000.pkl")


In [None]:
# Tag every song with the playlist it came from, then stack both tables into
# one frame with a fresh 0..n-1 index.
labelled_playlists = [
    df_apres_ski.assign(playlist="Après Ski"),
    df_top_2000.assign(playlist="Top 2000"),
]
df_all_songs = pd.concat(labelled_playlists, ignore_index=True)

In [None]:
# Load the lyrics saved by a previous run of this notebook. The file is only
# written by the final cell, so on a first run it does not exist yet; in that
# case fall back to an empty frame with the same columns (plus "lyrics") so the
# rest of the notebook finds no cached lyrics and fetches everything.
try:
    df_all_songs_with_lyrics = pd.read_pickle(output_path)
except FileNotFoundError:
    df_all_songs_with_lyrics = df_all_songs.head(0).assign(lyrics=None)

In [None]:
# Carry over lyrics found on a previous run so that only songs that are new, or
# that we didn't find before, are looked up again.
# Lyrics are matched per song id rather than requiring the whole id column to be
# identical: otherwise a single added or removed song would throw away every
# lyric already fetched.
# Assumes "id" identifies a song; if an id occurs more than once in the cache,
# the first row that has lyrics wins.
cached_lyrics = (
    df_all_songs_with_lyrics
    .dropna(subset=["lyrics"])
    .drop_duplicates(subset="id")
    .set_index("id")["lyrics"]
)
# astype(object) keeps the column able to hold strings even if nothing matched
# (an all-missing result would otherwise be a float column).
df_all_songs["lyrics"] = df_all_songs["id"].map(cached_lyrics).astype(object)

In [None]:
# Retry-on-timeout approach based on:
# https://github.com/johnwmillr/LyricsGenius/issues/121
total_songs = len(df_all_songs)
for row_label, song_row in df_all_songs.iterrows():
    artist_name, song_name = song_row["artist_names"], song_row["name"]
    print(f"{row_label:04d}/{total_songs}: {artist_name} - {song_name}")

    # Songs that already have lyrics are skipped without hitting the API.
    if not pd.isnull(song_row["lyrics"]):
        print("already have lyrics")
        continue

    print("looking for lyrics")
    # At most three attempts; each timeout is followed by a 30 second pause.
    for _attempt in range(3):
        try:
            df_all_songs.loc[row_label, "lyrics"] = get_lyrics(artist_name, song_name)
        except Timeout:
            print("Encountered a TimeOut")
            sleep(30)
        else:
            break
    # Try to avoid time outs
    sleep(2)

In [None]:
# Flag the songs for which lyrics are present.
df_all_songs["has_lyrics"] = df_all_songs["lyrics"].notnull()

In [None]:
# Number of songs per playlist, split by whether lyrics were found.
df_all_songs.groupby(["playlist", "has_lyrics"])["id"].count()

With more time, I might try a few more things to extract more data, such as:
- cleaning up the song and/or artist name before searching;
- adding more robust retry logic (the loop above only retries on timeouts, at most three times per song).

In [None]:
# Save the songs with their lyrics; this is the same file that is read back
# earlier in the notebook to reuse lyrics on a later run.
df_all_songs.to_pickle(output_path)