In [1]:
from time import sleep
from typing import Tuple
import tekore as tk
import re
import urllib
from datetime import datetime
import pickle
from tqdm import tqdm
ARABIAN_DECIMAL_SEPARATOR = "\N{ARABIC DECIMAL SEPARATOR}"
class Song:

    def __init__(self, ts, title, artists, id):
        self.title = title
        self.artists_raw = list(artists)
        self.artists = ", ".join(self.artists_raw).replace(ARABIAN_DECIMAL_SEPARATOR, ',')
        self.id = id
        self.ts = datetime.utcfromtimestamp(int(ts))
        self.a_n_t = f"{self.artists} - {self.title}"

    def __str__(self):
        return self.a_n_t

    def __repr__(self):
        return str(self)

    def __hash__(self):
        return hash(self.__repr__())

    def __eq__(self, other):
        return self.a_n_t.lower() == other.a_n_t.lower() or self.id == other.id

def song_from_track(t):
    return Song(0, t.name, map(lambda x: x.name, t.artists), t.id)

def get_site(URL):
    #print("Loading...")
    html = ''
    try:
        html = urllib.request.urlopen(URL).read()
    except Exception as e:
        print("Unexpected Error: ", e, " with url: ", URL)
    #print("Done.")
    return html


def getGenresOfName(artist_name: str):
    URL = f"http://everynoise.com/lookup.cgi?who={urllib.parse.quote(artist_name)}"
    html = str(get_site(URL))
    genres = re.findall('t>([A-z0-9- ]*)<', html)
    return genres

def getTop10Tracks(artist_id: str):
    return [song_from_track(t) for t in spotify.artist_top_tracks(artist_id, "US")]

def getTopNLyrics(artist_info: Tuple[str, str], n: int = 10):
    artist_name, artist_id = artist_info
    lyrics = []
    for song in getTop10Tracks(artist_id)[:n]:
        try:
            song_lyrics = genius.search_song(song.title, artist_name, get_full_info=False)
            sleep(0.5)
            lyrics.append((artist_name, artist_id, song, song_lyrics))
        except Exception as e:
            sleep(2.0)
            lyrics.append((artist_name, artist_id, song, "FAILED"))
            print(e)
    return lyrics

In [2]:
conf = tk.config_from_file("../tekore.conf", return_refresh=True)
client_id, client_secret, redirect_uri, user_refresh = conf
token = tk.refresh_user_token(client_id, client_secret, user_refresh)
sender = tk.RetryingSender(sender=tk.SyncSender())
spotify = tk.Spotify(token, sender=sender, max_limits_on=True, chunked_on=True)

In [3]:
with open("../artist_data_pickle.pth", "rb") as f:
    artists = pickle.load(f)
    
#remove artists with id = None
for name, data in list(artists.items()):
    if data["id"] is None:
        artists.pop(name)

In [4]:
from lyricsgenius import Genius

genius = Genius()
genius.verbose = False
genius.skip_non_songs = True

In [5]:
#Threading
from multiprocessing.pool import ThreadPool as Pool
from functools import partial

NUM_TOP_SONGS = 1

try:
    with open(f"lyrics_cache_top{NUM_TOP_SONGS}.pkl", "rb") as f:
        lyrics_cache = pickle.load(f)
except:
    lyrics_cache = []

updated_artists = set()
for line in lyrics_cache:
    for _, artist_id, _, song_lyrics in line:
        if song_lyrics != "FAILED":
            updated_artists.add(artist_id)
    
artists_to_update = {}
for name, data in artists.items():
    if data["id"] in updated_artists:
        continue
    artists_to_update[name] = data

In [6]:
len(artists_to_update)

3730

In [7]:
def getLyrics(from_idx, to_idx):
    #top 1
    return pool.map(partial(getTopNLyrics, n=NUM_TOP_SONGS), [(name, data["id"]) for name, data in list(artists_to_update.items())[from_idx:to_idx]])

#Load the lyrics 1000 at a time and cache inbetween
with tqdm(total=len(artists_to_update)) as pbar:
    for i in range(0, len(artists_to_update), 1000):
        with Pool(processes=12) as pool:
            lyrics_cache.extend(getLyrics(i, i+1000))
            pbar.update(1000)
            with open(f"lyrics_cache_top{NUM_TOP_SONGS}.pkl", "wb") as f:
                pickle.dump(lyrics_cache, f)

  0%|          | 0/26523 [00:00<?, ?it/s]

Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)


  4%|▍         | 1000/26523 [02:15<57:36,  7.38it/s]

Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)
Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed ou

 19%|█▉        | 5000/26523 [09:55<40:37,  8.83it/s]

Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)


 41%|████▏     | 11000/26523 [20:56<28:42,  9.01it/s]

Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)


 68%|██████▊   | 18000/26523 [33:43<15:32,  9.14it/s]

Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)


 72%|███████▏  | 19000/26523 [35:30<13:36,  9.21it/s]

Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)


 79%|███████▉  | 21000/26523 [39:12<10:06,  9.11it/s]

Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)


 83%|████████▎ | 22000/26523 [41:00<08:14,  9.15it/s]

Request timed out:
HTTPSConnectionPool(host='genius.com', port=443): Read timed out. (read timeout=5)


27000it [49:15,  9.14it/s]                           
