# Mini-Project in NLP

In [1]:
import sys
import os
import zipfile
import re

import requests
from bs4 import BeautifulSoup

import pandas as pd
from IPython.display import display, HTML
from pprint import pprint


from lyricsgenius import Genius
from credentials import CLIENT_ACCESS_TOKEN

import seaborn
import matplotlib as plt

In [2]:
# loading billboard top 100 years 1958-2021
# original dataset from: https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs/data

data_directory = "data"
csv_file_name = "charts.csv.zip"
lyrics_csv_file_name = f"lyrics_charts.csv.xz"

csv_file_path = os.path.join(data_directory, csv_file_name)
lyrics_csv_file_path = os.path.join(data_directory, lyrics_csv_file_name)

GENIUS_API_GATEWAY = Genius(
    access_token=CLIENT_ACCESS_TOKEN,
    
    verbose=False,
    retries=10,
    skip_non_songs=True,
    
    remove_section_headers=True,
)

In [3]:
# if not os.path.isfile(lyrics_csv_file_path) and not os.path.isfile(csv_file_path):
#     print("extracted CSV not found")
    
#     with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
#         zip_ref.extractall(data_directory)

In [4]:
def covert_date_column(dataframe):
    print(f'{dataframe["date"].dtype = }')

    dataframe["date"] = dataframe[ ["date"] ].apply(pd.to_datetime, format="%Y-%m-%d")
    print(f'{dataframe["date"].dtype = }')

In [5]:
if not os.path.isfile(lyrics_csv_file_path):
    charts = pd.read_csv(
        csv_file_path, 
        compression="zip", 
        # converters={"date": lambda x: pd.to_datetime(x, format="%Y-%m-%d")}
    )
    
    covert_date_column(charts)

In [6]:
if not os.path.isfile(lyrics_csv_file_path):
    charts.shape

In [7]:
def print_dataframe(dataframe):
    display(HTML(
        dataframe
        .to_html())
    )

In [8]:
if not os.path.isfile(lyrics_csv_file_path):
    print_dataframe(charts.head(3))
    print_dataframe(charts.tail(3))

In [9]:
def sanitize_lyrics(lyrics):
    splitting_pattern = \
        r"[\d,]*Embed$"\
        # r"|\[.*?\]"
    lyrics = ''.join(re.split(splitting_pattern, lyrics))
    
    lyrics = lyrics.split("Lyrics", maxsplit=1)[1]
    lyrics = ''.join(lyrics.rsplit("You might also like", maxsplit=1))
    
    lyrics = re.sub(r"(\n\s*){2,}\n", r"\n\n", lyrics)
    lyrics = lyrics.strip()
    
    return lyrics
    
    
def download_lyrics(artist, title):
    song = GENIUS_API_GATEWAY.search_song(
        artist=artist,
        title=title,
        get_full_info=False,
    )
    return song.lyrics if song is not None else None


def lyrics_from_row(row):
   return sanitize_lyrics(
       download_lyrics(
           artist=row["artist"], 
           title=row["song"]
        )
    )

In [10]:
def get_n_lines(num_lines, text):
    line_separator = "\n"
    
    return line_separator.join(
        text.split(line_separator)
        [ : num_lines]
    )

In [11]:
relevant_columns = ["date", "artist", "song",]

In [12]:
# example of downloading lyrics for a single song
if not os.path.isfile(lyrics_csv_file_path):

    # example_song = charts.loc[:, relevant_columns].iloc[-1: , :]
    example_song = charts.loc[ : , relevant_columns].iloc[1:2 , :]
    print_dataframe(example_song)

    raw_lyrics = download_lyrics(example_song.iloc[0].artist, example_song.iloc[0].song)
    num_lines = 10
    print("="*30)
    print("Raw lyrics:")
    print("-"*30)
    print(get_n_lines(num_lines, raw_lyrics))
    print("...")
    print()

    print("="*30)
    print("Sanitized lyrics:")
    print("-"*30)
    sanitized_lyrics = sanitize_lyrics(raw_lyrics)
    print(get_n_lines(num_lines, sanitized_lyrics))
    print("...")

In [13]:
# example of downloading songs for each row in the dataset's dataframe
# TODO isolate a dataframe of wanted songs, download all needed lyrics to dataframe, and save for future use

if not os.path.isfile(lyrics_csv_file_path):
    example_songs_copy = charts.loc[:, relevant_columns].iloc[1:3 , :].copy()

    example_songs_copy["lyrics"] = example_songs_copy.apply(lyrics_from_row, axis=1)

    example_songs_copy.to_csv(lyrics_csv_file_path, compression="xz")
else:
    example_songs_copy = pd.read_csv(lyrics_csv_file_path, compression="xz")
    covert_date_column(example_songs_copy)
    
print_dataframe(example_songs_copy)
print(example_songs_copy["lyrics"].iloc[1])

dataframe["date"].dtype = dtype('O')
dataframe["date"].dtype = dtype('<M8[ns]')


Unnamed: 0.1,Unnamed: 0,date,artist,song,lyrics
0,1,2021-11-06,The Kid LAROI & Justin Bieber,Stay,"I do the same thing I told you that I never would\nI told you I'd change, even when I knew I never could\nI know that I can’t find nobody else as good as you\nI need you to stay, need you to stay, hey (Oh)\n\nI get drunk, wake up, I'm wasted still\nI realize the time that I wasted here\nI feel like you can't feel the way I feel\nOh, I’ll be fucked up if you can't be right here\n\nOh, ooh-woah (Oh, ooh-woah, ooh-woah)\nOh, ooh-woah (Oh, ooh-woah, ooh-woah)\nOh, ooh-woah (Oh, ooh-woah, ooh-woah)\nOh, I'll be fucked up if you can't be right here\n\nI do the same thing I told you that I never would\nI told you I'd change, even when I knew I never could\nI know that I can't find nobody else as good as you\nI need you to stay, need you to stay, hey\nI do the same thing I told you that I never would\nI told you I'd change, even when I knew I never could\nI know that I can't find nobody else as good as you\nI need you to stay, need you to stay, hey\n\nWhen I’m away from you, I miss your touch (Ooh)\nYou’re the reason I believe in love\nIt's been difficult for me to trust (Ooh)\nAnd I’m afraid that I'ma fuck it up\nAin't no way that I can leave you stranded\n'Cause you ain’t ever left me empty-handed\nAnd you know that I know that I can't live without you\nSo, baby, stay\n\nOh, ooh-woah (Oh, ooh-woah, ooh-woah)\nOh, ooh-woah (Oh, ooh-woah, ooh-woah)\nOh, ooh-woah (Oh, ooh-woah, ooh-woah)\nI'll be fucked up if you can't be right here\n\nI do the same thing I told you that I never would\nI told you I'd change, even when I knew I never could\nI know that I can't find nobody else as good as you\nI need you to stay, need you to stay, hey\nI do the same thing I told you that I never would\nI told you I'd change, even when I knew I never could\nI know that I can't find nobody else as good as you\nI need you to stay, need you to stay, hey\nWoah-oh\nI need you to stay, need you to stay, hey"
1,2,2021-11-06,Lil Nas X & Jack Harlow,Industry Baby,"(D-D-Daytrip took it to ten, hey)\nBaby back, ayy, couple racks, ayy\nCouple Grammys on him, couple plaques, ayy\nThat's a fact, ayy, throw it back, ayy\nThrow it back, ayy\nAnd this one is for the champions\nI ain't lost since I began, yeah\nFunny how you said it was the end, yeah\nThen I went did it again, yeah\n\nI told you long ago on the road\nI got what they waiting for\nI don't run from nothing, dog\nGet your soldiers, tell 'em I ain't layin' low\nYou was never really rooting for me anyway\nWhen I'm back up at the top, I wanna hear you say\nHe don't run from nothin', dog\nGet your soldiers, tell 'em that the break is over\n\nUh, need to, uh\nNeed to get this album done\nNeed a couple number onеs\nNeed a plaque on every song\nNeed mе like one with Nicki now\nTell a rap nigga I don't see ya, hah\nI'm a pop nigga like Bieber, hah\nI don't fuck bitches, I'm queer, hah\nBut these niggas bitches like Madea, yeah, yeah, yeah, ayy (Yeah)\nOh, let's do it\nI ain't fall off, I just ain't release my new shit\nI blew up, now everybody tryna sue me\nYou call me Nas, but the hood call me Doobie, yeah\n\nAnd this one is for the champions\nI ain't lost since I began, yeah\nFunny how you said it was the end, yeah\nThen I went did it again, yeah\n\nI told you long ago on the road\nI got what they waiting for (I got what they're waiting for)\nI don't run from nothing, dog\nGet your soldiers, tell 'em I ain't layin' low (Bitch, I ain't runnin' from nowhere)\nYou was never really rooting for me anyway (Ooh, ooh)\nWhen I'm back up at the top, I wanna hear you say (Ooh, ooh)\nHe don't run from nothin', dog\nGet your soldiers, tell 'em that the break is over (Yeah)\n\nMy track record so clean, they couldn't wait to just bash me\nI must be gettin' too flashy, y'all shouldn't have let the world gas me (Woo)\nIt's too late 'cause I'm here to stay and these girls know that I'm nasty (Mmm)\nI sent her back to her boyfriend with my handprint on her ass cheek\nCity talkin', we takin' notes\nTell 'em all to keep makin' posts\nWish he could, but he can't get close\nOG so proud of me that he chokin' up while he makin' toasts\nI'm the type that you can't control, said I would, then I made it so\nI don't clear up rumors (Ayy), where's y'all sense of humor? (Ayy)\nI'm done makin' jokes 'cause they got old like baby boomers\nTurned my haters to consumers, I make vets feel like they juniors (Juniors)\nSay your time is comin' soon, but just like Oklahoma (Mmm)\nMine is comin' sooner (Mmm), I'm just a late bloomer (Mmm)\nI didn't peak in high school, I'm still out here gettin' cuter (Woo)\nAll these social networks and computers\nGot these pussies walkin' 'round like they ain't losers\nI told you long ago on the road\nI got what they waiting for (I got what they waiting for)\nI don't run from nothing, dog\nGet your soldiers, tell 'em I ain't layin' low (Bitch, I ain't runnin' from nowhere)\nYou was never really rooting for me anyway\nWhen I'm back up at the top, I wanna hear you say\nHe don't run from nothin', dog\nGet your soldiers, tell 'em that the break is over\nYeah\nI'm the industry baby, mmm\nI'm the industry baby\nYeah"


(D-D-Daytrip took it to ten, hey)
Baby back, ayy, couple racks, ayy
Couple Grammys on him, couple plaques, ayy
That's a fact, ayy, throw it back, ayy
Throw it back, ayy
And this one is for the champions
I ain't lost since I began, yeah
Funny how you said it was the end, yeah
Then I went did it again, yeah

I told you long ago on the road
I got what they waiting for
I don't run from nothing, dog
Get your soldiers, tell 'em I ain't layin' low
You was never really rooting for me anyway
When I'm back up at the top, I wanna hear you say
He don't run from nothin', dog
Get your soldiers, tell 'em that the break is over

Uh, need to, uh
Need to get this album done
Need a couple number onеs
Need a plaque on every song
Need mе like one with Nicki now
Tell a rap nigga I don't see ya, hah
I'm a pop nigga like Bieber, hah
I don't fuck bitches, I'm queer, hah
But these niggas bitches like Madea, yeah, yeah, yeah, ayy (Yeah)
Oh, let's do it
I ain't fall off, I just ain't release my new shit
I blew up

In [23]:
# scraping top 100 lists prior to 1958

wikipedia_link = "https://en.wikipedia.org"
wikipedia_billboard_years_link = f"{wikipedia_link}/wiki/Template:Hot_100_year-end_charts"
wikipedia_billboard_years_html = requests.get(wikipedia_billboard_years_link)

html_soup = BeautifulSoup(wikipedia_billboard_years_html.content, 'html.parser')
year_link_tags = html_soup.select("tbody td a")

links_by_year = {int(tag.text): tag.attrs["href"] for tag in year_link_tags}
# pprint(links_by_year)


def full_wikipedia_link(short_wikipedia_link):
    return f'{wikipedia_link}{short_wikipedia_link}'


def list_by_year(year:int):
    list_link = full_wikipedia_link(links_by_year[year])
    print(f"following {list_link = }")
    response = requests.get(list_link)
    soup = BeautifulSoup(response.content, 'html.parser')

    list = soup.select("table.wikitable.sortable")[0]
    # print(list)
    dataframe = pd.read_html(list.prettify())
    return dataframe
    a.strip("\"")
    
    # titles = soup.select("table.wikitable.sortable tbody tr td:nth-child(2)")
    # artists = soup.select("table.wikitable.sortable tbody tr td:nth-child(3)")
    
    # artists_without_rowspan = []
    # for artist in artists:
    #     rowspan = int(artist.attrs.get("rowspan", "1"))
    #     artists_without_rowspan.extend(rowspan * [artist])
    
    # assert (len(titles) == len(artists_without_rowspan))
    
    # parsed_list = []
    # for list_item in list:
    #     split_text = list_item.text.split(" - ")
        
    #     parsed_list.append({
    #         "song": split_text[0],
    #         "artist": split_text[1],
    #     })

    # return parsed_list


year = 1950
print_dataframe(list_by_year(year)[0])

following list_link = 'https://en.wikipedia.org/wiki/Billboard_year-end_top_30_singles_of_1950'


  return pd.read_html(list.prettify())


Unnamed: 0,No.,Title,Artist(s)
0,1,""" Goodnight Irene """,Gordon Jenkins & The Weavers
1,2,""" Mona Lisa """,Nat King Cole with Les Baxter
2,3,""" Third Man Theme """,Anton Karas
3,4,""" Sam's Song """,Gary & Bing Crosby with Matty Matlock
4,5,""" Play a Simple Melody """,Gary & Bing Crosby with Matty Matlock
5,6,""" Music, Music, Music """,Teresa Brewer
6,7,""" Third Man Theme """,Guy Lombardo
7,8,""" Chattanoogie Shoe Shine Boy """,Red Foley
8,9,""" Harbor Lights """,Sammy Kaye
9,10,""" It Isn't Fair """,Sammy Kaye & Don Cornell


In [None]:
# sanitize_lyrics(download_lyrics(title="Sam's Song", artist="Gary & Bing Crosby with Matty Matlock"))