# Download lyrics from GENIUS
From the list of songs (each identified by artist and title) in df_tracks.csv, this notebook searches for the lyrics on Genius and downloads them.

In [None]:
import requests
import pandas as pd
import numpy as np
import unidecode

import urllib.parse
from bs4 import BeautifulSoup
import os
import re
import os.path
from requests.utils import requote_uri
import pickle

import pandas as pd
# Track list, read from a ';'-separated CSV. The processing loop in this
# notebook reads the 'track_id', 'artists' and 'name' columns of each row.
df_tracks = pd.read_csv('lyrizz/csv/df_tracks.csv', sep=';')

# GENIUS API
# NOTE(review): this looks like a placeholder value -- presumably it has to be
# replaced with a real Genius API access token before running; confirm.
TOKEN_GENIUS = 'YOUR***GENIUS***TOKEN'
# Bearer-token Authorization header passed to the api.genius.com requests.
HEADERS = {'Authorization': f'Bearer {TOKEN_GENIUS}'}

### Function definitions

In [None]:
def filter_title(name):
    """
    Simplify a track title so that it is easier to find on Genius.

    Everything after the first " - " (e.g. "- Remastered 2011") and
    everything from the first "(" (e.g. "(Remastered ...)") is dropped, then
    surrounding whitespace is stripped.

    If dropping the parenthesised part would leave nothing -- the title
    starts with "(", as in "(I Can't Get No) Satisfaction" -- the less
    aggressively cleaned title is returned instead of an empty string.
    """
    # Try to remove "- Remastered ..."
    base = name.split(' - ')[0].strip()

    # Try to remove " (Remastered ...)"
    short = base.split('(')[0].strip()

    # An empty title would make the search meaningless and would match any
    # page title, so fall back to the longer form.
    return short or base or name.strip()

def filter_artist(name):
    """
    Reduce an artist string to the first (main) artist name.

    Everything after the first "," (additional artists) and everything from
    the first "(" (e.g. "(Feat ...)") is dropped, then surrounding whitespace
    is stripped.

    If dropping the parenthesised part would leave nothing -- the name starts
    with "(", as in "(G)I-DLE" -- the less aggressively cleaned name is
    returned instead of an empty string.
    """
    # Try to remove the other artists
    first = name.split(',')[0].strip()

    # Try to remove " (Feat ...)"
    short = first.split('(')[0].strip()

    # Never return an empty artist name when there was something to keep.
    return short or first or name.strip()

In [None]:
def search_song(artist, title):
    """
    Search on Genius from artist and title and describe the first hit.

    Returns a tuple (url, img, spotify_url, apple_id):
      - url: the 'url' field of the first search hit
      - img: its 'header_image_url' field
      - spotify_url: the url of the hit's single 'spotify' media entry, or
        None when the hit has no 'media' field or not exactly one such entry
      - apple_id: the 'apple_music_id' field of the song, fetched with a
        second request to /songs/<id>
    All four values are None when the search returns no hit.
    """
    # Let requests build the query string: characters such as '&', '#' or '+'
    # in the artist/title are then percent-encoded instead of being read as
    # URL syntax (which would cut the query short).
    r = requests.get(
        "https://api.genius.com/search",
        params={'q': f"{artist} - {title}"},
        headers=HEADERS,
    )
    hits = r.json()['response']['hits']
    # No response in search
    if not hits:
        return None, None, None, None

    search = hits[0]['result']
    img = search['header_image_url']
    url2 = search['url']
    # 'api_path' ends with the song id, which the /songs/ endpoint expects.
    id_song = search['api_path'].split('/')[-1]

    # Keep the Spotify link only when it is unambiguous (exactly one entry).
    media = search.get('media') or []
    spotify_urls = [e['url'] for e in media if e['provider'] == 'spotify']
    spotify_url = spotify_urls[0] if len(spotify_urls) == 1 else None

    url3 = requote_uri(f"https://api.genius.com/songs/{id_song}")
    r3 = requests.get(url3, headers=HEADERS)
    search3 = r3.json()['response']
    apple_id = search3['song']['apple_music_id']

    return url2, img, spotify_url, apple_id

In [None]:
def process_text(s):
    """
    Normalise a string for loose, containment-based comparison.

    The text is lower-cased, transliterated to ASCII with unidecode, the
    words 'genius' and 'lyrics' are removed, and every non-alphanumeric
    character (including '_') is dropped.
    """
    # Lower-case and transliterate first, so that the removals below also hit
    # capitalised forms ("Genius", "Lyrics"). Otherwise an artist/title
    # containing such a word is normalised differently from the (already
    # lower-cased) page title it is compared against.
    s = unidecode.unidecode(s.lower())
    s = s.replace('genius', '')
    s = s.replace('lyrics', '')
    # Raw string: '\W' in a plain string literal is an invalid escape sequence.
    s = re.sub(r'[\W_]', '', s)

    return s

In [None]:
def get_raw_lyrics(url, artist, title):
    """
    From Genius lyric page url, get lyrics and check (True if lyrics seem to be correct)

    Returns a tuple (lyrics, check):
      - lyrics: list of paragraphs, each a list of lines, taken from the text
        of the lyrics container; None when no known container is found
      - check: True when both the normalised artist and the normalised title
        occur in the normalised page <title>; False otherwise (including when
        the page has no <title>); None when no lyrics container is found
    """
    page = requests.get(url)
    html = BeautifulSoup(page.text, "html.parser")
    # Turn <br> tags into real newlines so that get_text() keeps line breaks.
    for br in html.find_all("br"):
        br.replace_with("\n")

    # Try the known lyrics containers in order.
    div = html.find("div", id="lyrics-root")
    if div is None:
        div = html.find("div", class_="lyrics")
    if div is None:
        div = html.find("div", class_="Lyrics__Container-sc-1ynbvzw-2 jgQsqn")
    if div is None:
        return None, None

    # Blank lines separate paragraphs; single newlines separate lines.
    text = div.get_text()
    lyrics = [part.split("\n") for part in text.split("\n\n")]

    # Strip the embed/share widget text that may trail the last line.
    lyrics[-1][-1] = re.sub(r'\d*EmbedShare URLCopyEmbedCopy', '', lyrics[-1][-1])

    ### Check
    title_tag = html.find("title")
    if title_tag is None:
        # Nothing to verify against: report the lyrics as unconfirmed instead
        # of failing with an AttributeError.
        return lyrics, False

    infos = process_text(title_tag.get_text().lower().replace(u'\xa0', u' '))
    check = process_text(artist) in infos and process_text(title) in infos

    return lyrics, check

In [None]:
def write_txt_file(lyrics, track_id):
    """
    Write lyrics to 'lyrizz/txt/<track_id>.txt'.

    lyrics is a list of paragraphs, each paragraph being a list of lines.
    Every line is written followed by a newline, and every paragraph is
    followed by one blank line.
    """
    chunks = []
    for parts in lyrics:
        chunks.extend(line + "\n" for line in parts)
        chunks.append("\n")
    # Explicit UTF-8: lyrics routinely contain non-ASCII characters, and the
    # platform default encoding may not be able to represent them.
    with open(f'lyrizz/txt/{track_id}.txt', 'w', encoding='utf-8') as f:
        f.write("".join(chunks))

In [None]:
def is_available(lyrics, check, spotify_url, apple_id):
    """
    Tell whether a track can be kept.

    Returns True only when lyrics were found (lyrics is not None) and the
    check flag is truthy. spotify_url and apple_id are accepted but not used
    in the decision.
    """
    return lyrics is not None and bool(check)

In [None]:
def save_image(img_url, track_id):
    """
    Download the image at img_url and save it as
    'lyrizz/images/<track_id>.<ext>'.

    The extension is taken from the last path segment of the URL; 'jpg' is
    used when that segment has no extension.
    """
    img_data = requests.get(img_url).content

    # Look only at the URL path, so that a query string, a fragment or the
    # host name is never mistaken for part of the file extension.
    file_name = urllib.parse.urlparse(img_url).path.split('/')[-1]
    ext = file_name.split('.')[-1] if '.' in file_name else ''
    if not ext:
        ext = 'jpg'

    with open(f'lyrizz/images/{track_id}.{ext}', 'wb') as handler:
        handler.write(img_data)

### Process
- Clean artist and title
- Search song on Genius API
- If song exists and lyrics available on Genius, download lyrics and image

In [None]:
# Ids of the tracks for which no usable lyrics were found during this run.
LIST_BUG = []
for _, track in df_tracks.iterrows():
    track_id = track['track_id']
    artist = filter_artist(track['artists'])
    title = filter_title(track['name'])

    # Lyrics already downloaded (e.g. by a previous run): nothing to do.
    if os.path.isfile(f'lyrizz/txt/{track_id}.txt'):
        print('[ALREADY]', artist, title)
        continue

    # Track already failed earlier in this run: do not query Genius again.
    if track_id in LIST_BUG:
        print('[BUG]', artist, title)
        continue

    url, img, spotify_url, apple_id = search_song(artist, title)

    if url is None:
        available = False
    else:
        lyrics, check = get_raw_lyrics(url, artist, title)
        available = is_available(lyrics, check, spotify_url, apple_id)

    if available:
        print(artist, title, track_id)
        save_image(img, track_id)
        write_txt_file(lyrics, track_id)
    else:
        LIST_BUG.append(track_id)
        print('[BUG]', artist, title, url)