# Mini-Project in NLP

In [491]:
import sys
import os
import zipfile
import re

import requests
from bs4 import BeautifulSoup

import pandas as pd
from IPython.display import display, HTML
from pprint import pprint


from lyricsgenius import Genius
from credentials import CLIENT_ACCESS_TOKEN

import seaborn
import matplotlib as plt

In [492]:
# loading billboard top 100 years 1958-2021
# original dataset from: https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs/data

data_directory = "data"
csv_file_name = "charts.csv.zip"
lyrics_csv_file_name = f"lyrics_charts.csv.xz"

csv_file_path = os.path.join(data_directory, csv_file_name)
lyrics_csv_file_path = os.path.join(data_directory, lyrics_csv_file_name)

GENIUS_API_GATEWAY = Genius(
    access_token=CLIENT_ACCESS_TOKEN,
    
    verbose=False,
    retries=10,
    skip_non_songs=True,
    
    remove_section_headers=True,
)

In [493]:
# if not os.path.isfile(lyrics_csv_file_path) and not os.path.isfile(csv_file_path):
#     print("extracted CSV not found")
    
#     with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
#         zip_ref.extractall(data_directory)

In [494]:
def covert_date_column(dataframe):
    print(f'{dataframe["date"].dtype = }')

    dataframe["date"] = dataframe[ ["date"] ].apply(pd.to_datetime, format="%Y-%m-%d")
    print(f'{dataframe["date"].dtype = }')

In [495]:
# if not os.path.isfile(lyrics_csv_file_path):
charts = pd.read_csv(
    csv_file_path, 
    compression="zip", 
    # converters={"date": lambda x: pd.to_datetime(x, format="%Y-%m-%d")}
)

covert_date_column(charts)

dataframe["date"].dtype = dtype('O')
dataframe["date"].dtype = dtype('<M8[ns]')


In [496]:
if not os.path.isfile(lyrics_csv_file_path):
    charts.shape

In [497]:
def print_dataframe(dataframe):
    display(HTML(
        dataframe
        .to_html())
    )

In [498]:
print_dataframe(charts.head(3))
print_dataframe(charts.tail(3))

Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board
0,2021-11-06,1,Easy On Me,Adele,1.0,1,3
1,2021-11-06,2,Stay,The Kid LAROI & Justin Bieber,2.0,1,16
2,2021-11-06,3,Industry Baby,Lil Nas X & Jack Harlow,3.0,1,14


Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board
330084,1958-08-04,98,Little Serenade,The Ames Brothers,,98,1
330085,1958-08-04,99,I'll Get By (As Long As I Have You),Billy Williams,,99,1
330086,1958-08-04,100,Judy,Frankie Vaughan,,100,1


In [499]:
charts = charts[["date", "song", "artist"]]

In [500]:
def sanitize_lyrics(lyrics):
    splitting_pattern = \
        r"[\d,]*Embed$" \
        r"|\{.*?\}"
        # r"|\[.*?\]"
    lyrics = ''.join(re.split(splitting_pattern, lyrics))
    
    lyrics = lyrics.split("Lyrics", maxsplit=1)[1]
    lyrics = ''.join(lyrics.rsplit("You might also like", maxsplit=1))
    
    lyrics = re.sub(r"(\n\s*){2,}\n", r"\n\n", lyrics)
    lyrics = lyrics.strip()
    
    return lyrics
    
    
def download_lyrics(artist, title):
    song = GENIUS_API_GATEWAY.search_song(
        artist=artist,
        title=title,
        get_full_info=False,
    )
    return song.lyrics if song is not None else None


def lyrics_from_row(row):
   return sanitize_lyrics(
       download_lyrics(
           artist=row["artist"], 
           title=row["song"]
        )
    )

In [501]:
def get_n_lines(num_lines, text):
    line_separator = "\n"
    
    return line_separator.join(
        text.split(line_separator)
        [ : num_lines]
    )

In [502]:
relevant_columns = ["date", "artist", "song",]

In [503]:
# example of downloading lyrics for a single song
if not os.path.isfile(lyrics_csv_file_path):

    # example_song = charts.loc[:, relevant_columns].iloc[-1: , :]
    example_song = charts.loc[ : , relevant_columns].iloc[1:2 , :]
    print_dataframe(example_song)

    raw_lyrics = download_lyrics(example_song.iloc[0].artist, example_song.iloc[0].song)
    num_lines = 10
    print("="*30)
    print("Raw lyrics:")
    print("-"*30)
    print(get_n_lines(num_lines, raw_lyrics))
    print("...")
    print()

    print("="*30)
    print("Sanitized lyrics:")
    print("-"*30)
    sanitized_lyrics = sanitize_lyrics(raw_lyrics)
    print(get_n_lines(num_lines, sanitized_lyrics))
    print("...")

In [504]:
# example of downloading songs for each row in the dataset's dataframe
# TODO isolate a dataframe of wanted songs, download all needed lyrics to dataframe, and save for future use

if not os.path.isfile(lyrics_csv_file_path):
    example_songs_copy = charts.loc[:, relevant_columns].iloc[0:3 , :].copy()

    example_songs_copy["lyrics"] = example_songs_copy.apply(lyrics_from_row, axis=1)

    example_songs_copy.to_csv(lyrics_csv_file_path, index=False, compression="xz")
else:
    example_songs_copy = pd.read_csv(lyrics_csv_file_path, compression="xz")
    covert_date_column(example_songs_copy)
    
print_dataframe(example_songs_copy)
print(example_songs_copy["lyrics"].iloc[1])

dataframe["date"].dtype = dtype('O')
dataframe["date"].dtype = dtype('<M8[ns]')


Unnamed: 0,date,artist,song,lyrics
0,2021-11-06,Adele,Easy On Me,"There ain't no gold in this river\nThat I've been washin' my hands in forever\nI know there is hope in these waters\nBut I can't bring myself to swim\nWhen I am drowning in this silence\nBaby, let me in\nGo easy on me, baby\nI was still a child\nDidn't get the chance to\nFeel the world around me\nI had no time to choose what I chose to do\nSo go easy on me\n\nThere ain't no room for things to change\nWhen we are both so deeply stuck in our ways\nYou can't deny how hard I've tried\nI changed who I was to put you both first\nBut now I give up\nGo easy on mе, baby\nI was still a child\nDidn't get the chance to\nFeel thе world around me\nHad no time to choose what I chose to do\nSo go easy on me\nSee Adele LiveGet tickets as low as $126\nI had good intentions\nAnd the highest hopes\nBut I know right now\nIt probably doesn't even show\nGo easy on me, baby\nI was still a child\nI didn't get the chance to\nFeel the world around me\nI had no time to choose what I chose to do\nSo go easy on me"
1,2021-11-06,The Kid LAROI & Justin Bieber,Stay,"I do the same thing I told you that I never would\nI told you I'd change, even when I knew I never could\nI know that I can’t find nobody else as good as you\nI need you to stay, need you to stay, hey (Oh)\n\nI get drunk, wake up, I'm wasted still\nI realize the time that I wasted here\nI feel like you can't feel the way I feel\nOh, I’ll be fucked up if you can't be right here\n\nOh, ooh-woah (Oh, ooh-woah, ooh-woah)\nOh, ooh-woah (Oh, ooh-woah, ooh-woah)\nOh, ooh-woah (Oh, ooh-woah, ooh-woah)\nOh, I'll be fucked up if you can't be right here\n\nI do the same thing I told you that I never would\nI told you I'd change, even when I knew I never could\nI know that I can't find nobody else as good as you\nI need you to stay, need you to stay, hey\nI do the same thing I told you that I never would\nI told you I'd change, even when I knew I never could\nI know that I can't find nobody else as good as you\nI need you to stay, need you to stay, hey\n\nWhen I’m away from you, I miss your touch (Ooh)\nYou’re the reason I believe in love\nIt's been difficult for me to trust (Ooh)\nAnd I’m afraid that I'ma fuck it up\nAin't no way that I can leave you stranded\n'Cause you ain’t ever left me empty-handed\nAnd you know that I know that I can't live without you\nSo, baby, stay\n\nOh, ooh-woah (Oh, ooh-woah, ooh-woah)\nOh, ooh-woah (Oh, ooh-woah, ooh-woah)\nOh, ooh-woah (Oh, ooh-woah, ooh-woah)\nI'll be fucked up if you can't be right here\n\nI do the same thing I told you that I never would\nI told you I'd change, even when I knew I never could\nI know that I can't find nobody else as good as you\nI need you to stay, need you to stay, hey\nI do the same thing I told you that I never would\nI told you I'd change, even when I knew I never could\nI know that I can't find nobody else as good as you\nI need you to stay, need you to stay, hey\nWoah-oh\nI need you to stay, need you to stay, hey"
2,2021-11-06,Lil Nas X & Jack Harlow,Industry Baby,"(D-D-Daytrip took it to ten, hey)\nBaby back, ayy, couple racks, ayy\nCouple Grammys on him, couple plaques, ayy\nThat's a fact, ayy, throw it back, ayy\nThrow it back, ayy\nAnd this one is for the champions\nI ain't lost since I began, yeah\nFunny how you said it was the end, yeah\nThen I went did it again, yeah\n\nI told you long ago on the road\nI got what they waiting for\nI don't run from nothing, dog\nGet your soldiers, tell 'em I ain't layin' low\nYou was never really rooting for me anyway\nWhen I'm back up at the top, I wanna hear you say\nHe don't run from nothin', dog\nGet your soldiers, tell 'em that the break is over\n\nUh, need to, uh\nNeed to get this album done\nNeed a couple number onеs\nNeed a plaque on every song\nNeed mе like one with Nicki now\nTell a rap nigga I don't see ya, hah\nI'm a pop nigga like Bieber, hah\nI don't fuck bitches, I'm queer, hah\nBut these niggas bitches like Madea, yeah, yeah, yeah, ayy (Yeah)\nOh, let's do it\nI ain't fall off, I just ain't release my new shit\nI blew up, now everybody tryna sue me\nYou call me Nas, but the hood call me Doobie, yeah\n\nAnd this one is for the champions\nI ain't lost since I began, yeah\nFunny how you said it was the end, yeah\nThen I went did it again, yeah\n\nI told you long ago on the road\nI got what they waiting for (I got what they're waiting for)\nI don't run from nothing, dog\nGet your soldiers, tell 'em I ain't layin' low (Bitch, I ain't runnin' from nowhere)\nYou was never really rooting for me anyway (Ooh, ooh)\nWhen I'm back up at the top, I wanna hear you say (Ooh, ooh)\nHe don't run from nothin', dog\nGet your soldiers, tell 'em that the break is over (Yeah)\n\nMy track record so clean, they couldn't wait to just bash me\nI must be gettin' too flashy, y'all shouldn't have let the world gas me (Woo)\nIt's too late 'cause I'm here to stay and these girls know that I'm nasty (Mmm)\nI sent her back to her boyfriend with my handprint on her ass cheek\nCity talkin', we takin' notes\nTell 'em all to keep makin' posts\nWish he could, but he can't get close\nOG so proud of me that he chokin' up while he makin' toasts\nI'm the type that you can't control, said I would, then I made it so\nI don't clear up rumors (Ayy), where's y'all sense of humor? (Ayy)\nI'm done makin' jokes 'cause they got old like baby boomers\nTurned my haters to consumers, I make vets feel like they juniors (Juniors)\nSay your time is comin' soon, but just like Oklahoma (Mmm)\nMine is comin' sooner (Mmm), I'm just a late bloomer (Mmm)\nI didn't peak in high school, I'm still out here gettin' cuter (Woo)\nAll these social networks and computers\nGot these pussies walkin' 'round like they ain't losers\nI told you long ago on the road\nI got what they waiting for (I got what they waiting for)\nI don't run from nothing, dog\nGet your soldiers, tell 'em I ain't layin' low (Bitch, I ain't runnin' from nowhere)\nYou was never really rooting for me anyway\nWhen I'm back up at the top, I wanna hear you say\nHe don't run from nothin', dog\nGet your soldiers, tell 'em that the break is over\nYeah\nI'm the industry baby, mmm\nI'm the industry baby\nYeah"


I do the same thing I told you that I never would
I told you I'd change, even when I knew I never could
I know that I can’t find nobody else as good as you
I need you to stay, need you to stay, hey (Oh)

I get drunk, wake up, I'm wasted still
I realize the time that I wasted here
I feel like you can't feel the way I feel
Oh, I’ll be fucked up if you can't be right here

Oh, ooh-woah (Oh, ooh-woah, ooh-woah)
Oh, ooh-woah (Oh, ooh-woah, ooh-woah)
Oh, ooh-woah (Oh, ooh-woah, ooh-woah)
Oh, I'll be fucked up if you can't be right here

I do the same thing I told you that I never would
I told you I'd change, even when I knew I never could
I know that I can't find nobody else as good as you
I need you to stay, need you to stay, hey
I do the same thing I told you that I never would
I told you I'd change, even when I knew I never could
I know that I can't find nobody else as good as you
I need you to stay, need you to stay, hey

When I’m away from you, I miss your touch (Ooh)
You’re the reason 

In [505]:
# scraping top 100 lists prior to 1958

wikipedia_link = "https://en.wikipedia.org"
wikipedia_billboard_years_link = f"{wikipedia_link}/wiki/Template:Hot_100_year-end_charts"

wikipedia_billboard_years_html = requests.get(wikipedia_billboard_years_link)
billboard_years_html_soup = BeautifulSoup(wikipedia_billboard_years_html.content, 'html.parser')
year_link_tags = billboard_years_html_soup.select("tbody td a")

links_by_year = {int(tag.text): tag.attrs["href"] for tag in year_link_tags}
# pprint(links_by_year)


def full_wikipedia_link(short_wikipedia_link):
    return f'{wikipedia_link}{short_wikipedia_link}'


def sanitize_wikipedia_table(table:pd.DataFrame):
    table["Artist(s)"] = table["Artist(s)"].str.replace("(with |[&,]).*$", "", regex=True)
    table["Title"] = table["Title"].str.strip("\" ")
    
    # TODO ?? sanitize rank
    

def scrap_year(year:int):
    list_link = full_wikipedia_link(links_by_year[year])
    print(f"following {list_link = }")
    response = requests.get(list_link)
    soup = BeautifulSoup(response.content, 'html.parser')

    list = soup.select("table.wikitable.sortable")[0]
    # print(list)
    dataframe = pd.read_html(list.prettify())[0]
    dataframe["year"] = year
    
    sanitize_wikipedia_table(dataframe)
    
    # dataframe.rename(columns={"No.": "rank", "No. (Rank)": "rank"}, inplace=True)
    dataframe.rename(columns={"Title": "song", "Artist(s)": "artist"}, inplace=True)
    dataframe.drop(columns=["No.", "No. (Rank)"], errors="ignore", inplace=True)
    
    return dataframe


year = 1950
print_dataframe(scrap_year(year))

following list_link = 'https://en.wikipedia.org/wiki/Billboard_year-end_top_30_singles_of_1950'


  dataframe = pd.read_html(list.prettify())[0]


Unnamed: 0,song,artist,year
0,Goodnight Irene,Gordon Jenkins,1950
1,Mona Lisa,Nat King Cole,1950
2,Third Man Theme,Anton Karas,1950
3,Sam's Song,Gary,1950
4,Play a Simple Melody,Gary,1950
5,"Music, Music, Music",Teresa Brewer,1950
6,Third Man Theme,Guy Lombardo,1950
7,Chattanoogie Shoe Shine Boy,Red Foley,1950
8,Harbor Lights,Sammy Kaye,1950
9,It Isn't Fair,Sammy Kaye,1950


In [506]:
# sanitizing wikipedia-extracted data

# temp = scrap_year(1950)
# print_dataframe(temp)
# sanitize_wikipedia_table(temp)
# print_dataframe(temp)

In [507]:
# %timeit temp["Artist(s)"].apply(lambda artist: re.split("with|&|,", artist)[0]),
# %timeit temp["Artist(s)"].str.extract("^(.*?)(?= with|[&,]|$)"), 
# %timeit temp["Artist(s)"].str.replace("(with |[&,]).*$", "", regex=True),
# %timeit temp["Artist(s)"].str.replace("(with.*$|[&,].*$)", "", regex=True),
# %timeit temp["Artist(s)"].str.replace("(with.*$|&.*$|,.*$)", "", regex=True),
# %timeit temp["Artist(s)"].str.replace("(with.*|&.*|,.*)", "", regex=True),

# print_dataframe(pd.concat([
#             temp["Artist(s)"].apply(lambda artist: re.split("with|&|,", artist)[0]),
#             temp["Artist(s)"].str.extract("^(.*?)(?= with|[&,]|$)"), 
#             temp["Artist(s)"].str.replace("(with |[&,]).*$", "", regex=True),
#             temp["Artist(s)"].str.replace("(with.*$|[&,].*$)", "", regex=True),
#             temp["Artist(s)"].str.replace("(with.*$|&.*$|,.*$)", "", regex=True),
#             temp["Artist(s)"].str.replace("(with.*|&.*|,.*)", "", regex=True),
#         ],
#         axis=1
#     ))

In [508]:
# sanitize_lyrics(download_lyrics(title="The Cry of the Wild Goose", artist="Frankie Laine with Carl T. Fischer"))
# print(sanitize_lyrics(download_lyrics(title=" I Can Dream, Can't I", artist="Andrews Sisters")))

In [509]:
# getting only year-ends' top 100 from downloaded dataset

charts_indexed_by_date = charts.set_index("date")
dates_series = charts["date"]

year_ends = dates_series.groupby(charts_indexed_by_date.index.year).first()
year_ends = year_ends[year_ends.index <= 2020]

# print(charts_index_by_date.index)
print(year_ends)

year_end_charts = charts_indexed_by_date.loc[year_ends]
year_end_charts.index = year_end_charts.index.year
year_end_charts.index.names = ["year"]
year_end_charts.reset_index(inplace=True)
year_end_charts

date
1958   1958-12-29
1959   1959-12-28
1960   1960-12-26
1961   1961-12-25
1962   1962-12-29
          ...    
2016   2016-12-31
2017   2017-12-30
2018   2018-12-29
2019   2019-12-28
2020   2020-12-26
Name: date, Length: 63, dtype: datetime64[ns]


Unnamed: 0,year,song,artist
0,1958,The Chipmunk Song,The Chipmunks With David Seville
1,1958,Smoke Gets In Your Eyes,The Platters
2,1958,"To Know Him, Is To Love Him",The Teddy Bears
3,1958,One Night,Elvis Presley
4,1958,Problems,The Everly Brothers
...,...,...,...
6294,2020,Good Time,Niko Moon
6295,2020,Throat Baby (Go Baby),BRS Kash
6296,2020,Errbody,Lil Baby
6297,2020,Favorite Time Of Year,Carrie Underwood


In [510]:
billboard_missing_years = range(min(links_by_year.keys()), year_ends.index.min())
missing_years_dataframes = [scrap_year(year) for year in billboard_missing_years]
missing_years_concatenated = pd.concat(missing_years_dataframes)
missing_years_concatenated

following list_link = 'https://en.wikipedia.org/wiki/Billboard_year-end_top_singles_of_1946'


  dataframe = pd.read_html(list.prettify())[0]


following list_link = 'https://en.wikipedia.org/wiki/Billboard_year-end_top_singles_of_1947'


  dataframe = pd.read_html(list.prettify())[0]


following list_link = 'https://en.wikipedia.org/wiki/Billboard_year-end_top_singles_of_1948'


  dataframe = pd.read_html(list.prettify())[0]


following list_link = 'https://en.wikipedia.org/wiki/Billboard_year-end_top_30_singles_of_1949'


  dataframe = pd.read_html(list.prettify())[0]


following list_link = 'https://en.wikipedia.org/wiki/Billboard_year-end_top_30_singles_of_1950'


  dataframe = pd.read_html(list.prettify())[0]


following list_link = 'https://en.wikipedia.org/wiki/Billboard_year-end_top_30_singles_of_1951'


  dataframe = pd.read_html(list.prettify())[0]


following list_link = 'https://en.wikipedia.org/wiki/Billboard_year-end_top_30_singles_of_1952'


  dataframe = pd.read_html(list.prettify())[0]


following list_link = 'https://en.wikipedia.org/wiki/Billboard_year-end_top_30_singles_of_1953'


  dataframe = pd.read_html(list.prettify())[0]


following list_link = 'https://en.wikipedia.org/wiki/Billboard_year-end_top_30_singles_of_1954'


  dataframe = pd.read_html(list.prettify())[0]


following list_link = 'https://en.wikipedia.org/wiki/Billboard_year-end_top_30_singles_of_1955'


  dataframe = pd.read_html(list.prettify())[0]


following list_link = 'https://en.wikipedia.org/wiki/Billboard_year-end_top_50_singles_of_1956'


  dataframe = pd.read_html(list.prettify())[0]


following list_link = 'https://en.wikipedia.org/wiki/Billboard_year-end_top_50_singles_of_1957'


  dataframe = pd.read_html(list.prettify())[0]


Unnamed: 0,song,artist,year
0,Prisoner of Love,Perry Como,1946
1,To Each His Own,Eddy Howard,1946
2,The Gypsy,The Ink Spots,1946
3,Five Minutes More,Frank Sinatra,1946
4,Rumors Are Flying,Frankie Carle,1946
...,...,...,...
45,Old Cape Cod,Patti Page,1957
46,Mr. Lee,The Bobbettes,1957
47,Blueberry Hill,Fats Domino,1957
48,Whispering Bells,The Del-Vikings,1957


In [511]:
entirety_of_data = pd.concat([missing_years_concatenated, year_end_charts])
entirety_of_data

Unnamed: 0,song,artist,year
0,Prisoner of Love,Perry Como,1946
1,To Each His Own,Eddy Howard,1946
2,The Gypsy,The Ink Spots,1946
3,Five Minutes More,Frank Sinatra,1946
4,Rumors Are Flying,Frankie Carle,1946
...,...,...,...
6294,Good Time,Niko Moon,2020
6295,Throat Baby (Go Baby),BRS Kash,2020
6296,Errbody,Lil Baby,2020
6297,Favorite Time Of Year,Carrie Underwood,2020
