In [422]:
from bs4 import BeautifulSoup
import requests 
import numpy as np 
import json
import time

In [423]:
from bs4 import BeautifulSoup
import numpy as np 
import pandas as pd 
import os 
import re
import requests

In [424]:
def make_soup(url, timeout=10):
    """Download ``url`` and parse the response into a BeautifulSoup object.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` tree built with ``html.parser`` when the server
        responds with status code 200. ``None`` (after printing an error
        line) for any other status code or when the request itself fails.
    """
    try:
        my_page = requests.get(url, headers={"Accept": "text/html"}, timeout=timeout)
    except requests.RequestException as err:
        # Report network failures through the same channel as bad status
        # codes so callers only have one failure signal (None) to handle.
        print("ERROR: {} could not be fetched: {}\n".format(url, err))
        return None
    if my_page.status_code == 200:
        return BeautifulSoup(my_page.text, 'html.parser')
    print("ERROR: {} threw status code {}\n".format(url, my_page.status_code))
    return None


In [425]:
def get_alb_links(url):
    """Collect the relative links to an artist's albums.

    Args:
        url: URL of the artist's page.

    Returns:
        A list with the ``href`` of every ``a.album_link`` tag found on the
        artist's album-listing page. The list is empty when either page could
        not be loaded (``make_soup`` returned ``None``) or when the artist page
        contains no ``full_width_button`` link.
    """
    artist_soup = make_soup(url)
    if artist_soup is None:
        return []

    # The last full_width_button is an <a> tag whose href leads to a page
    # listing all of the artist's albums.
    buttons = artist_soup.find_all("a", class_="full_width_button")
    if not buttons:
        print("ERROR: {} has no link to an album listing\n".format(url))
        return []

    albums_soup = make_soup("http://genius.com" + buttons[-1]["href"])
    if albums_soup is None:
        return []

    return [album["href"] for album in albums_soup.find_all("a", class_="album_link")]

In [426]:
def get_writers(page, artist_name):
    """Extract the writers credited on a song page.

    Works on the raw page text (not a BeautifulSoup object): on pages that
    carry writer info, the text "Written By" is followed by ``<a>`` tags naming
    the writers, up to the next closing ``</div``.

    Args:
        page: HTML of the song page as a string.
        artist_name: Name of the artist being scraped; it is always included
            in the result as given.

    Returns:
        A sorted list of unique writer names. Names read from the page are
        stripped and lower-cased. Sorting makes the output reproducible, since
        set iteration order is not.
    """
    # Rappers are by default considered a writer of their own song (unless it
    # is a cover, which is outside the scope of this project).
    writers = {artist_name}

    begin = page.find("Written By")
    if begin != -1:
        end = page.find("</div", begin)
        if end != -1:
            # Name the parser explicitly so the result does not depend on
            # which parsers happen to be installed.
            sub_sect = BeautifulSoup(page[begin:end], "html.parser")
            for writer_tag in sub_sect.find_all("a"):
                name = writer_tag.get_text().strip().lower()
                if name:
                    writers.add(name)
    return sorted(writers)

In [427]:
def get_song_lyrics(soup, artist_name):
    """Return only the desired artist's contribution to a song.

    The lyrics text is divided into sections, each introduced by a bracketed
    header such as ``[Chorus: Drake]``. A section is kept when its header has
    no colon (treated as a one-author song) or when the text after the colon
    is exactly the desired artist; for instance, a chorus credited to a
    featured artist is dropped.

    Args:
        soup: BeautifulSoup object of the song page.
        artist_name: The desired artist's name in lowercase.

    Returns:
        The kept sections concatenated into one string, or ``""`` when the
        page has no ``div`` with class ``lyrics``.
    """
    lyrics_div = soup.find('div', class_='lyrics')
    if lyrics_div is None:
        # No lyrics container on this page: nothing to attribute.
        return ""

    wanted_credit = ": " + artist_name
    kept = []
    for section in re.split(r"\n\[", lyrics_div.get_text()):
        # A well-formed section splits into exactly two parts: header, lyric.
        head_lyr = re.split(r"]\n", section)
        if len(head_lyr) != 2:
            continue
        header, body = head_lyr
        # Songs with multiple authors carry a colon in the header; keep the
        # section only if it lists the desired artist exclusively.
        credit = re.search(":.*", header)
        if credit is None or credit.group(0).lower() == wanted_credit:
            kept.append(body)
    return "".join(kept)

In [428]:
def _album_year(alb_soup):
    """Return the first 20xx year in the album page's first metadata_unit div, or None."""
    meta = alb_soup.find('div', class_="metadata_unit")
    if meta is None:
        return None
    found = re.search(r"20\d\d", meta.get_text())
    return found.group(0) if found else None


def _album_name(alb_soup):
    """Return the album name cut out of the page <title>, or None if there is no <title>."""
    title_tag = alb_soup.find("title")
    if title_tag is None:
        return None
    album = title_tag.get_text()
    start = album.find("-")
    end = album.find("Lyrics")
    if start != -1 and end != -1:
        album = album[start + 2:end]
    # The slice stops right before "Lyrics", so a separating space would
    # otherwise end up in the album name and in the output file name.
    return album.strip()


def _scrape_song(song_url, artist_name, year, album):
    """Fetch one song page and return its JSON record, or None if it cannot be scraped."""
    try:
        # The page is fetched by hand because get_writers works on the raw
        # page text rather than on a BeautifulSoup object.
        page = requests.get(song_url, headers={"Accept": "text/html"}, timeout=10)
    except requests.RequestException as err:
        print("ERROR: {} could not be fetched: {}\n".format(song_url, err))
        return None
    if page.status_code != 200:
        print("ERROR: {} threw  status code {}\n".format(song_url, page.status_code))
        return None

    song_soup = BeautifulSoup(page.text, 'html.parser')
    title_tag = song_soup.find("h1", class_="header_with_cover_art-primary_info-title")
    if title_tag is None or song_soup.find('div', class_='lyrics') is None:
        print("ERROR: {} is missing its title or lyrics markup\n".format(song_url))
        return None

    return {
        "title": title_tag.get_text(),
        "year": year,
        "album": album,
        "writers": get_writers(page.text, artist_name),
        "artist": artist_name,
        "lyrics": get_song_lyrics(song_soup, artist_name),
    }


def main(url, artist_name, data_dir="data", delay=2):
    """Scrape an artist's albums and write one JSON file per album.

    For every album link returned by ``get_alb_links`` the album page is
    loaded, each song linked through an ``a.u-display_block`` tag is scraped,
    and the resulting list of song records (title, year, album, writers,
    artist, lyrics) is dumped to
    ``<data_dir>/<artist_name>/<album>.json`` with spaces replaced by
    underscores.

    Args:
        url: URL of the artist's Genius page.
        artist_name: Artist name; it is stripped and lower-cased before use.
        data_dir: Root directory for the output files. The artist's
            sub-directory is created if it does not exist yet.
        delay: Seconds to pause after each song request.

    Returns:
        The number of song records successfully collected.
    """
    artist_name = artist_name.strip().lower()
    out_dir = os.path.join(data_dir, artist_name.replace(' ', '_'))
    # Create the target directory up front so a whole album's worth of
    # scraping is not lost to a FileNotFoundError at write time.
    os.makedirs(out_dir, exist_ok=True)

    song_count = 0
    for alb in get_alb_links(url):
        alb_url = "http://genius.com" + alb
        alb_soup = make_soup(alb_url)
        if alb_soup is None:
            # make_soup already reported the failure; move on to the next album.
            continue

        # Year comes from the first "metadata_unit" div; currently only
        # years after 2000 are recognised.
        year = _album_year(alb_soup)
        album = _album_name(alb_soup)
        if album is None:
            print("ERROR: {} has no title to derive an album name from\n".format(alb_url))
            continue

        json_lst = []
        # Links to the songs are found in u-display_block tags on the album page.
        for song in alb_soup.find_all("a", class_="u-display_block"):
            song_url = song.get("href")
            if not song_url:
                continue
            record = _scrape_song(song_url, artist_name, year, album)
            if record is not None:
                json_lst.append(record)
                song_count += 1
            # Pause after every request, successful or not.
            time.sleep(delay)

        # "/" in an album name would be read as a directory separator.
        safe_album = album.replace(" ", "_").replace("/", "_")
        file_name = os.path.join(out_dir, safe_album + '.json')
        with open(file_name, 'w') as file:
            json.dump(json_lst, file)
    return song_count


In [429]:
# Scrape Drake's Genius page; the printed value is the song count returned by main().
url, artist_name = 'https://genius.com/artists/Drake', 'drake'
print(main(url=url, artist_name=artist_name))

ERROR: https://genius.com/Drake-inst-lyrics threw  status code404

ERROR: https://genius.com/Drake-acapella-lyrics threw  status code404

259


In [430]:
# Scrape Quentin Miller's Genius page; the printed value is the song count returned by main().
url, artist_name = 'https://genius.com/artists/Quentin-miller', 'quentin miller'
print(main(url=url, artist_name=artist_name))

ERROR: https://genius.com/Quentin-miller-no-scrimage-lyrics threw  status code404

ERROR: https://genius.com/Quentin-miller-grey-steel-lyrics threw  status code404

ERROR: https://genius.com/Quentin-miller-shanes-introduction-lyrics threw  status code404

ERROR: https://genius.com/Quentin-miller-taste-lyrics threw  status code404

ERROR: https://genius.com/Quentin-miller-5-oh-x-two-lyrics threw  status code404

ERROR: https://genius.com/Quentin-miller-acquisistion-lyrics threw  status code404

ERROR: https://genius.com/Quentin-miller-reckless-lyrics threw  status code404

ERROR: https://genius.com/Quentin-miller-apply-pressure-lyrics threw  status code404

ERROR: https://genius.com/Quentin-miller-eden-lyrics threw  status code404

ERROR: https://genius.com/Quentin-miller-free-tacos-lyrics threw  status code404

ERROR: https://genius.com/Quentin-miller-cul-cha-lyrics threw  status code404

ERROR: https://genius.com/Quentin-miller-love-below-zero-lyrics threw  status code404

ERROR: http