### Scrapping top 50 Hip Hop artists in 2020 from BillBoard

In [None]:
import requests 
from bs4 import BeautifulSoup
artists=list()

In [12]:
url = 'https://www.billboard.com/charts/year-end/2020/top-r-and-b-hip-hop-artists' # 2010~2020
req = requests.get(url)
page = req.text
soup = BeautifulSoup(page, 'html.parser')

# scrap names of top 50 hip hop artists in 2020 from billboard
    #<div class="ye-chart-item__title">
    #<a href="/music/juice-wrld">
    #Juice WRLD
    #</a>
    #</div> 
#artists=list()
for div in soup.find_all("div", "ye-chart-item__title"):
    # print(div.a.string.strip())
    artists.append(div.a.string.strip())
print('In total, we scrapped {} names of artists'.format(len(artists)))

In total, we scrapped 548 names of artists


In [14]:
import pickle
with open('data/498artists.txt', 'wb') as fp:
    pickle.dump(artists, fp)
    
with open ('data/498artists.txt', 'rb') as fp:
    list_1 = pickle.load(fp)    

In [15]:
unique_artists=list(set(artists))
print('In total, we have {} names of unique artists'.format(len(unique_artists)))

with open('data/498artists_u.txt', 'wb') as fp:
    pickle.dump(unique_artists, fp)

In total, we have 233 names of unique artists


In [16]:
'Migos' in unique_artists

True

In [17]:
import lyricsgenius as lg

file = open("data/lyrics.txt", "w")

# ignore lyrics that aren’t from official songs and disregard live performances and remixes
# focus solely on spoken song lyrics and to exclude song metadata
genius = lg.Genius('aegW44YYCCF2mKwO8fvQvnWEA68HHIfTl6-7QCh53p3i_c6ZAxoI3QCtWg6wYBmq', skip_non_songs=True, excluded_terms=["(Remix)", "(Live)"], remove_section_headers=True)

In [18]:
# FUNCTION : GET LYRICS OF TOP 10 SONGS OF EACH ARTIST
def get_lyrics(arr, k):
    '''get_lyrics() function takes a list of artist names and k number of songs we want to grab for each artist as parameters. 
    The function will print the name of each song collected and the total number of songs successfully grabbed for each artist and then write the lyrics to the .txt file.
    
    Parameters
    ----------
    arr : list
        list of artist names
    k : int
        number of songs to collect    
    '''
    # counter c used to keep track of the number of sets of lyrics written to the .txt file.
    c = 0
    
    # loop through the list of names arr
    for name in arr:
        try:
            # pass name to the lyricsgenius.Genius.search_artist() along with the k number of songs desired and sort the songs by popularity 
            # so that each artist’s most popular songs are grabbed first (up to limit k), a list of song names are returned
            songs = (genius.search_artist(name, max_songs=k, sort='popularity')).songs
            # list comprehension loop through songs, adding each song’s lyrics song.lyrics to a new list s
            s = [song.lyrics for song in songs]
            # call file.write() and pass “ ”.join(s) to compress the list of strings into a single string and write the newly made string (which represents all lyrics grabbed) to the .txt file. 
            # A more conspicuous separator like “\n \n <|endoftext|> \n \n” which will make it much easier to read through the text file as each set of single-song lyrics will be succeeded by that separator.
            file.write("\n \n \n \n".join(s))
            c += 1
            print(f"Songs grabbed:{len(s)}")
        except:
            print(f"some exception at {name}: {c}")

In [19]:
get_lyrics(unique_artists, 30)

Searching for songs by Kevin Gates...

Song 1: "2 Phones"
Song 2: "Really Really"
Song 3: "Perfect Imperfection"
Song 4: "I Don’t Get Tired"
Song 5: "Time for That"
Song 6: "Imagine That"
Song 7: "Not the Only One"
Song 8: "Satellites"
Song 9: "Hard For"
Song 10: "Kno One"
Song 11: "Cut Her Off Freestyle"
Song 12: "Pride"
Song 13: "Paper Chasers"
Song 14: "In My Feelings"
Song 15: "Out the Mud"
Song 16: "Had To"
Song 17: "Change Lanes"
Song 18: "Posed To Be In Love"
Song 19: "Jam"
Song 20: "John Gotti"
Song 21: "The Prayer"
Song 22: "Pourin the Syrup"
Song 23: "Narco Trafficante"
Song 24: "IDGAF"
Song 25: "Twilight"
Song 26: "Wish I Had It"
Song 27: "Sit Down"
Song 28: "100it Gang (Marijuana Time)"
Song 29: "One Thing"
Song 30: "4:30 AM"

Reached user-specified song limit (30).
Done. Found 30 songs.
Songs grabbed:30
Searching for songs by Jennifer Hudson...

Song 1: "And I Am Telling You I’m Not Going"
Song 2: "One Night Only"
Song 3: "Memory"
Song 4: "Spotlight"
Song 5: "Remember Me"


In [20]:
!wc -l -w -c data/lyrics.txt

  482170  3248539 16356405 data/lyrics.txt


In [21]:
!ls -hl data/lyrics.txt

-rw-r--r-- 1 yi-ting yi-ting 16M janv. 23 09:27 data/lyrics.txt


In [22]:
!tr ':;,?!\"' ' ' < data/lyrics.txt | tr -s ' ' '\n' | awk '!a[$0]++{c++} END{print c}'

72901


### Split scrapped lyrics to 2 sets - Train and Validation 

In [23]:
import re

# elimnate unnecessary parts like [chorus:...] or [outro: ...] or multiline ()
with open("data/lyrics.txt", "r") as data:
    text=data.read()
    text=re.sub(r'\[.*(\n.*){0,10}\]','',text.strip())
    text=re.sub(r'\(\n.*(\n)?\)','',text.strip())
    lines=list(text.split('\n'))

In [None]:
#with open("data/lyrics.txt", "r") as data:
#    for line in data:
#        # remove leading and trailing whitespace
#        pure_line = line.strip()
#
#        # if pure_line is not the empty string,
#        if pure_line:
#            # append it to the list
#            lines.append(pure_line)

In [None]:
# for i, line in enumerate(lines):
#     # convert to all lowercase
#     lines[i] = line.lower()

# print(f"Number of lines: {len(lines)}")
# print(f"Sample line at position 0 {lines[0]}")
# print(f"Sample line at position 999 {lines[999]}")

In [24]:
train_size=round(len(lines)*.85)
eval_size=len(lines)-train_size

train_lines = lines[:train_size]
eval_lines = lines[-eval_size:]

with open('data/train.txt','w') as f:
    for line in train_lines:
        f.write(line+'\n')

with open('data/eval.txt','w') as f:
    for line in eval_lines:
        f.write(line+'\n')
        
print(f"Number of lines for training: {len(train_lines)}")
print(f"Number of lines for validation: {len(eval_lines)}")

Number of lines for training: 402628
Number of lines for validation: 71052


In [25]:
train_lines[0:5]

['Hello? What the deal? Bet',
 '',
 'I got two phones, one for the plug and one for the load',
 'I got two phones, one for the bitches and one for the dough',
 'Think I need two more, line bumpin’ I’m ring, ring, ringin’']

In [35]:
# new_train_lines = lines[:round(train_size*.5)]

# with open('data/small_train.txt','w') as f:
#     for line in new_train_lines:
#         f.write(line+'\n')