In [259]:
from bs4 import BeautifulSoup
import requests 
import numpy as np 
import json
import time
import pdb
import os 
import re

In [260]:
def make_soup(url, timeout=10):
    """Download ``url`` and parse the response body with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout a stalled connection would block the scrape forever.

    Returns
    -------
    BeautifulSoup or None
        The parsed page when the server answers with status 200. ``None``
        (after printing an error line) for any other status code or when the
        request itself fails.
    """
    try:
        my_page = requests.get(url, headers={"Accept": "text/html"}, timeout=timeout)
    except requests.RequestException as err:
        # Network failures follow the same "print and return None" contract as bad status codes.
        print("ERROR: {} could not be fetched: {}\n".format(url, err))
        return None
    if my_page.status_code != 200:
        print("ERROR: {} threw status code {}\n".format(url, my_page.status_code))
        return None
    return BeautifulSoup(my_page.text, 'html.parser')


In [261]:
def get_alb_links(url):
    """Return the relative links to the albums listed for an artist.

    Parameters
    ----------
    url : str
        URL of the artist's page.

    Returns
    -------
    list of str
        The ``href`` of every ``a.album_link`` tag on the artist's album index
        page. Empty when either page cannot be fetched or the artist page has
        no ``full_width_button`` link.
    """
    soup = make_soup(url)
    if soup is None:
        return []
    # The last "full_width_button" anchor links to the page listing all of the artist's albums.
    buttons = soup.find_all("a", class_="full_width_button")
    if not buttons:
        print("ERROR: no album index link found on {}\n".format(url))
        return []
    albums_soup = make_soup("http://genius.com" + buttons[-1]["href"])
    if albums_soup is None:
        return []
    return [album["href"] for album in albums_soup.find_all("a", class_="album_link")]

In [262]:
def pg_to_songs(url, artist_name, ls, taboo='drake'):
    """Collect song links from a paginated song list, following "next page" links.

    Parameters
    ----------
    url : str
        URL of the first page of the artist's songs.
    artist_name : str
        Artist name with hyphens for spaces. Only links containing it are kept,
        because some listed songs are just production credits. Compared
        case-insensitively.
    ls : list
        Accumulator of links found so far. It is extended in place and returned.
    taboo : str, optional
        Links containing this substring (case-insensitive) are skipped; used to
        screen out 'Drake' reference tracks. An empty string or ``None``
        disables the filter.

    Returns
    -------
    list of str
        ``ls`` with the matching links from every visited page appended. If a
        page cannot be fetched, the links gathered up to that point are returned.
    """
    artist_key = artist_name.lower()
    taboo_key = taboo.lower() if taboo else ''
    # Iterate instead of recursing so the same filter applies to every page
    # and long song lists do not grow the call stack.
    while True:
        soup = make_soup(url)
        if soup is None:
            break
        for song in soup.find_all(class_='song_name work_in_progress song_link'):
            href = song['href'].lower()
            if taboo_key and taboo_key in href:
                continue
            if artist_key in href:
                ls.append(song['href'])
        next_link = soup.find('a', class_='next_page')
        if next_link is None:
            break
        # Pause between page requests.
        time.sleep(1)
        url = 'http://genius.com' + next_link['href']
    return ls

In [263]:
def get_song_links(url, artist_name, taboo='drake'):
    """Return links to all of an artist's songs, bypassing the album pages.

    Unlike ``get_alb_links`` this walks the artist's paginated song list, so
    songs that are not part of any album are included.

    Parameters
    ----------
    url : str
        URL of the artist's page.
    artist_name : str
        Artist name with hyphens for spaces; lowercased before matching.
    taboo : str, optional
        Substring passed to ``pg_to_songs``; links containing it are skipped.

    Returns
    -------
    list of str
        Song links collected by ``pg_to_songs``. Empty when the artist page
        cannot be fetched or has no ``full_width_button`` link.
    """
    artist_name = artist_name.lower()
    soup = make_soup(url)
    if soup is None:
        return []
    # The first "full_width_button" anchor links to the page listing all of the artist's songs.
    buttons = soup.find_all("a", class_="full_width_button")
    if not buttons:
        print("ERROR: no song index link found on {}\n".format(url))
        return []
    songs_url = "http://genius.com" + buttons[0]["href"]
    return pg_to_songs(songs_url, artist_name, [], taboo)

In [264]:
def get_writers(page, artist_name):
    """Extract the writers credited on a song page using string operations.

    Parameters
    ----------
    page : str
        Raw HTML text of the song page (not a BeautifulSoup object).
    artist_name : str
        The artist being scraped. Always included in the result: rappers are
        by default considered a writer of their own song (covers are omitted
        elsewhere).

    Returns
    -------
    list of str
        Sorted, de-duplicated writer names. Names read from the page are
        lowercased; ``artist_name`` is added as given.
    """
    writers = {artist_name}
    # On pages with writer info, "Written By" is followed by <a> tags naming
    # the writers, up to the next closing div.
    begin = page.find("Written By")
    if begin != -1:
        end = page.find("</div", begin)
        if end != -1:
            # Name the parser explicitly so results do not depend on which
            # optional parsers happen to be installed.
            sub_sect = BeautifulSoup(page[begin:end], 'html.parser')
            for writer_tag in sub_sect.find_all("a"):
                writers.add(writer_tag.get_text().lower())
    return sorted(writers)

In [265]:
def get_song_lyrics(soup, artist_name):
    """Return only the desired artist's sections of a song's lyrics.

    Lyrics are split into sections introduced by bracketed headers, e.g.
    ``[Chorus: Drake]``. A section is kept when its header has no colon
    (treated as a one-author song) or when the text after the colon is exactly
    the desired artist, so a chorus sung by a featured artist is dropped.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed song page.
    artist_name : str
        Artist name with hyphens or spaces between words; matched
        case-insensitively against section headers.

    Returns
    -------
    str
        The kept sections concatenated in page order. Empty when the page has
        no ``div.lyrics`` element or no section qualifies.
    """
    artist_name = artist_name.replace('-', ' ').lower()
    lyrics_div = soup.find('div', class_='lyrics')
    if lyrics_div is None:
        # Callers treat '' as "no lyrics", so a page without the div is skipped rather than crashing.
        return ''
    lyr_list = []
    for section in re.split(r"\n\[", lyrics_div.get_text()):
        # Header and body: exactly two parts for a well-formed section in the body of the song.
        head_lyr = re.split(r"]\n", section)
        if len(head_lyr) != 2:
            continue
        header, body = head_lyr
        # Headers crediting performers contain a colon, e.g. "Verse 1: Quentin Miller".
        credit = re.search(":.*", header)
        if credit is None or credit.group(0).lower() == ": " + artist_name:
            lyr_list.append(body)
    return "".join(lyr_list)

In [266]:
def song_links_to_json(song_links, artist_name, timeout=10):
    """Scrape each song link and write the results to one JSON file.

    One record is written per song that has lyrics by ``artist_name``. Each
    record stores ``year`` as 0 and ``album`` as an empty string as
    placeholders, for formatting. The file is
    ``data/<artist_name with hyphens and spaces as underscores>/all.json``;
    the directory is created if it does not exist.

    Parameters
    ----------
    song_links : iterable of str
        Absolute URLs of song pages.
    artist_name : str
        Artist name in lowercase with hyphens for spaces (e.g. quentin-miller).
    timeout : float, optional
        Seconds to wait for each page before giving up (default 10).

    Returns
    -------
    int
        Number of songs written, i.e. songs whose lyrics were not empty.
    """
    json_ls = []
    for song in song_links:
        # Fetch the page manually: get_writers works on the raw text, not on a soup object.
        try:
            page = requests.get(song, headers={"Accept": "text/html"}, timeout=timeout)
        except requests.RequestException as err:
            print("ERROR: {} could not be fetched: {}\n".format(song, err))
            continue
        if page.status_code != 200:
            print("ERROR: {} threw  status code {}\n".format(song, page.status_code))
            continue
        soup = BeautifulSoup(page.text, 'html.parser')
        lyrics = get_song_lyrics(soup, artist_name)
        if lyrics != '':
            head = soup.find('div', class_='header_with_cover_art-primary_info_container')
            title = head.find(class_='header_with_cover_art-primary_info-title').contents[0]
            json_ls.append({
                "title": title,
                "year": 0,
                "album": '',
                "writers": get_writers(page.text, artist_name),
                "artist": artist_name,
                "lyrics": lyrics,
            })
        # Pause between song requests.
        time.sleep(2)
    out_dir = "data/" + artist_name.replace('-', '_').replace(' ', '_')
    os.makedirs(out_dir, exist_ok=True)
    with open(out_dir + '/all.json', 'w') as file:
        json.dump(json_ls, file)
    return len(json_ls)

In [267]:
def song_main(url, artist_name):
    """Scrape every song of an artist and store them in a single JSON file.

    url: the artist's genius homepage.
    artist_name: all lowercase with hyphens as spaces (eg quentin-miller).

    The song links come from get_song_links and are handed to
    song_links_to_json, which writes the file; its return value, the number
    of songs successfully retrieved and written, is passed back to the caller.
    """
    return song_links_to_json(get_song_links(url, artist_name), artist_name)


In [272]:
def _album_metadata(soup):
    """Return ``(album_title, year)`` read from a parsed album page."""
    # The year comes from the first "metadata_unit" div; only years 2000-2099 are recognised.
    year = None
    meta = soup.find('div', class_="metadata_unit")
    if meta is not None:
        match = re.search(r"20\d\d", meta.get_text())
        if match:
            year = match.group(0)
    # The album name is the part of the <title> text between the first "-" and "Lyrics".
    album = ''
    title_tag = soup.find("title")
    if title_tag is not None:
        album = title_tag.get_text()
        start = album.find("-")
        end = album.find("Lyrics")
        if start != -1 and end != -1:
            album = album[start + 1:end]
    return album.strip(), year


def album_main(url, artist_name, timeout=10):
    """Scrape an artist's albums and write one JSON file per album.

    Each file holds a list of records (title, year, album, writers, artist,
    lyrics) for the album's songs that have lyrics by ``artist_name``. Files
    are written to ``data/<artist>/<album>.json``, where spaces and hyphens in
    the artist name and spaces in the album name become underscores and
    characters that are unsafe in file names are replaced with underscores.
    The directory is created if needed.

    Parameters
    ----------
    url : str
        URL of the artist's genius homepage.
    artist_name : str
        Artist name; surrounding whitespace is stripped and it is lowercased.
    timeout : float, optional
        Seconds to wait for each song page before giving up (default 10).

    Returns
    -------
    int
        Number of songs successfully written across all albums. Albums whose
        page cannot be fetched are skipped.
    """
    artist_name = artist_name.strip().lower()
    out_dir = "data/" + re.sub(r"[-\s]+", "_", artist_name)
    os.makedirs(out_dir, exist_ok=True)
    song_count = 0
    for alb in get_alb_links(url):
        album_soup = make_soup("http://genius.com" + alb)
        if album_soup is None:
            continue
        album, year = _album_metadata(album_soup)
        json_ls = []
        # Links to the album's songs are the "u-display_block" anchors on the album page.
        for song in album_soup.find_all("a", class_="u-display_block"):
            song_url = song["href"]
            # Fetch the page manually: get_writers works on the raw text, not on a soup object.
            try:
                page = requests.get(song_url, headers={"Accept": "text/html"}, timeout=timeout)
            except requests.RequestException as err:
                print("ERROR: {} could not be fetched: {}\n".format(song_url, err))
                continue
            if page.status_code != 200:
                print("ERROR: {} threw  status code {}\n".format(song_url, page.status_code))
                continue
            song_soup = BeautifulSoup(page.text, 'html.parser')
            lyrics = get_song_lyrics(song_soup, artist_name)
            if lyrics != '':
                title = song_soup.find("h1", class_="header_with_cover_art-primary_info-title").get_text()
                json_ls.append({
                    "title": title,
                    "year": year,
                    "album": album,
                    "writers": get_writers(page.text, artist_name),
                    "artist": artist_name,
                    "lyrics": lyrics,
                })
                song_count += 1
            # Pause between song requests.
            time.sleep(2)
        # Album titles can contain characters that are path separators or invalid in file names.
        file_stem = re.sub(r'[\\/:*?"<>|]', '_', album.replace(" ", "_")) or 'untitled'
        with open(out_dir + '/' + file_stem + '.json', 'w') as file:
            json.dump(json_ls, file)
    return song_count


In [258]:
# Scrape every song listed for Quentin Miller via song_main and report how many were written.
# The hyphenated, lowercase name is the form the song-link filter matches against song URLs.
url = "https://genius.com/artists/Quentin-miller"
count = song_main(url, 'quentin-miller')
print("Found {} Quentin-miller songs".format(count))

Found 138 Quentin-miller songs


In [242]:
# Spot-check get_song_lyrics on a single song page; the bare last expression displays the result.
# NOTE(review): make_soup returns None when the page does not load, which get_song_lyrics does not handle.
url = 'https://genius.com/Quentin-miller-4-21-freestyle-lyrics'
soup = make_soup(url)
get_song_lyrics(soup, 'quentin-miller')

'Yeah, I apologize, slipped a couple times\nSlipped into a dark place running towards the light\nI slipped, like DMX in \'98 I owned up to my mistakes\nI had a couple records leak\nThat dark cloud been rainin\' on my career since then, shits been- Difficult, but then again I can’t recall when it hasn\'t been\nThis much alcohol is hazardous but fuck being sober I’d rather this Somehow the word "Lit" became synonymous with happiness\nI remember back when I ain\'t have to twist a cap for me to spit a rap\nDreams do come true I’m really livin\' that\nBut nightmares do too, see what I’m gettin\' at?\nThe signs was right there and I was missin\' that\nAll along it was in me, I was just livin\' timid\nAfraid to say that I’m just as good as my competition\nIf not better, strange feeling, the same feeling\nWhen I was a kid and lookin\' out the window\nAt the other children, get on the bus and shit\nI felt I wasn’t living, the grass is always greener\nI know the deeper meaning, hold up\nWhile I’

In [273]:
# Scrape Drake's albums (one JSON file per album) and print the number of songs written.
url = 'https://genius.com/artists/Drake'
artist_name = 'drake'
print(album_main(url, artist_name))

ERROR: https://genius.com/Drake-inst-lyrics threw  status code 404

ERROR: https://genius.com/Drake-acapella-lyrics threw  status code 404

260


In [430]:
# Scrape Quentin Miller's albums (one JSON file per album) and print the number of songs written.
# The name is given with a space here: album_main turns spaces into underscores for the output folder.
url = 'https://genius.com/artists/Quentin-miller'
artist_name = 'quentin miller'
print(album_main(url, artist_name))

ERROR: https://genius.com/Quentin-miller-no-scrimage-lyrics threw  status code404

ERROR: https://genius.com/Quentin-miller-grey-steel-lyrics threw  status code404

ERROR: https://genius.com/Quentin-miller-shanes-introduction-lyrics threw  status code404

ERROR: https://genius.com/Quentin-miller-taste-lyrics threw  status code404

ERROR: https://genius.com/Quentin-miller-5-oh-x-two-lyrics threw  status code404

ERROR: https://genius.com/Quentin-miller-acquisistion-lyrics threw  status code404

ERROR: https://genius.com/Quentin-miller-reckless-lyrics threw  status code404

ERROR: https://genius.com/Quentin-miller-apply-pressure-lyrics threw  status code404

ERROR: https://genius.com/Quentin-miller-eden-lyrics threw  status code404

ERROR: https://genius.com/Quentin-miller-free-tacos-lyrics threw  status code404

ERROR: https://genius.com/Quentin-miller-cul-cha-lyrics threw  status code404

ERROR: https://genius.com/Quentin-miller-love-below-zero-lyrics threw  status code404

ERROR: http

In [28]:
# For future exploration: add similar artists such as Partynextdoor and J. Cole as noise.
# album_main looks up the album links itself, so no separate get_alb_links call is needed.
for url, artist_name in [
    ('https://genius.com/artists/Partynextdoor', 'partynextdoor'),
    ('https://genius.com/artists/J-cole', 'j cole'),
]:
    print(album_main(url, artist_name))

In [141]:
def _first_word_token(text, fallback):
    """Return the capitalized first word of ``text`` (or of ``fallback`` if ``text`` has no words), without ``-_.`` or spaces."""
    words = text.split()
    word = words[0] if words else fallback
    return re.sub(r'[-_\. ]', '', word.capitalize())


def json_to_txt(path):
    """Convert an artist's album JSON files into text files under ``data/texts/``.

    The R stylo package works better with text files named
    ``<class/author>_<title>.txt``, so each output file is named after the
    first word of the artist and the first word of the album, both taken from
    the first song in the JSON file. When that name is already taken (e.g.
    several editions of a mixtape share a first word) a numeric suffix
    1, 2, ... is appended; files left over from an earlier run count as taken.

    Parameters
    ----------
    path : str
        Folder containing the album JSON files, with or without a trailing
        separator.

    Returns
    -------
    int
        Number of songs written. Songs whose lyrics are 10 characters or
        shorter are skipped.

    Notes
    -----
    JSON files holding an empty list are skipped. When the first song has an
    empty album name, the JSON file's own name is used in its place.
    """
    txt_path = 'data/texts/'
    os.makedirs(txt_path, exist_ok=True)
    song_count = 0
    # Sorted so that suffix assignment does not depend on directory listing order.
    for filename in sorted(os.listdir(path)):
        if not filename.endswith('.json'):
            continue
        # os.path.join uses the platform's separator instead of a hard-coded backslash.
        with open(os.path.join(path, filename)) as f:
            data = json.load(f)
        if not data:
            continue
        # Use the first song to build the file name following stylo naming conventions.
        name = _first_word_token(data[0]['artist'], 'Unknown')
        album = _first_word_token(data[0]['album'], os.path.splitext(filename)[0])
        txt_file = ''.join([txt_path, name, '_', album, '.txt'])
        suffix = 0
        while os.path.isfile(txt_file):
            suffix += 1
            txt_file = ''.join([txt_path, name, '_', album, str(suffix), '.txt'])
        # errors='ignore' drops characters the default file encoding cannot represent.
        with open(txt_file, 'w', errors='ignore') as f:
            # For now include every song, even those with multiple writers, due to the small corpus.
            for song in data:
                if len(song['lyrics']) > 10:
                    f.write(song['lyrics'])
                    song_count += 1
    return song_count

In [144]:
# Convert the JSON files in Drake's data folder to text files and print how many songs were written.
path = 'data/drake/'
drake_count = json_to_txt(path)
print(drake_count)

243


In [145]:
# Convert the JSON files in Quentin Miller's data folder to text files and print how many songs were written.
path = 'data/quentin_miller/'
miller_count = json_to_txt(path)
print(miller_count)

75
