# Mini-Project in NLP

In [199]:
import sys
import os
import zipfile
import re

import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from IPython.display import display, HTML
from io import StringIO
import seaborn
import matplotlib as plt
from pprint import pprint
import mapply
import multiprocessing
from tqdm import tqdm

from lyricsgenius import Genius
from credentials import CLIENT_ACCESS_TOKEN
from bs4 import BeautifulSoup
import requests

In [200]:
# tqdm.pandas()
ProgressBar().register()

In [201]:
# loading billboard top 100 years 1958-2021
# original dataset from: https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs/data

data_directory = "data"
csv_file_name = "charts.csv.zip"
lyrics_csv_file_name = f"lyrics_charts.csv.xz"

csv_file_path = os.path.join(data_directory, csv_file_name)
lyrics_csv_file_path = os.path.join(data_directory, lyrics_csv_file_name)

GENIUS_API_GATEWAY = Genius(
    access_token=CLIENT_ACCESS_TOKEN,
    
    verbose=False,
    timeout=30,
    retries=20,
    skip_non_songs=True,
    remove_section_headers=True,
)

In [202]:
# if not os.path.isfile(lyrics_csv_file_path) and not os.path.isfile(csv_file_path):
#     print("extracted CSV not found")
    
#     with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
#         zip_ref.extractall(data_directory)

In [203]:
def covert_date_column(dataframe):
    print(f'{dataframe["date"].dtype = }')

    dataframe["date"] = dataframe[ ["date"] ].apply(pd.to_datetime, format="%Y-%m-%d")
    print(f'{dataframe["date"].dtype = }')

In [204]:
# if not os.path.isfile(lyrics_csv_file_path):
charts = pd.read_csv(
    csv_file_path, 
    compression="zip", 
    # converters={"date": lambda x: pd.to_datetime(x, format="%Y-%m-%d")}
)

covert_date_column(charts)

dataframe["date"].dtype = dtype('O')
dataframe["date"].dtype = dtype('<M8[ns]')


In [205]:
if not os.path.isfile(lyrics_csv_file_path):
    charts.shape

In [206]:
def print_dataframe(dataframe):
    display(HTML(
        dataframe
        .to_html())
    )

In [207]:
print_dataframe(charts.head(3))
print_dataframe(charts.tail(3))

Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board
0,2021-11-06,1,Easy On Me,Adele,1.0,1,3
1,2021-11-06,2,Stay,The Kid LAROI & Justin Bieber,2.0,1,16
2,2021-11-06,3,Industry Baby,Lil Nas X & Jack Harlow,3.0,1,14


Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board
330084,1958-08-04,98,Little Serenade,The Ames Brothers,,98,1
330085,1958-08-04,99,I'll Get By (As Long As I Have You),Billy Williams,,99,1
330086,1958-08-04,100,Judy,Frankie Vaughan,,100,1


In [208]:
relevant_columns = ["date", "artist", "song",]
charts = charts[relevant_columns]

In [209]:
def sanitize_lyrics(lyrics):
    splitting_pattern = \
        r"[\d,]*Embed$" \
        r"|\{.*?\}"
        # r"|\[.*?\]"
    try:
        lyrics = ''.join(re.split(splitting_pattern, lyrics))
    except Exception as exception:
        print(f"exception when {lyrics = }")
        raise exception
    
    lyrics = lyrics.split("Lyrics", maxsplit=1)[1]
    lyrics = ''.join(lyrics.rsplit("You might also like", maxsplit=1))
    
    lyrics = re.sub(r"(\n\s*){2,}\n", r"\n\n", lyrics)
    lyrics = lyrics.strip()
    
    return lyrics
    
    
def download_lyrics(artist, title):
    song = GENIUS_API_GATEWAY.search_song(
        artist=artist,
        title=title,
        get_full_info=False,
    )
    return song.lyrics if song is not None else None


def lyrics_from_row(row):
    lyrics = download_lyrics(
        artist=row["artist"], 
        title=row["song"]
    )
    
    return sanitize_lyrics(lyrics) if lyrics is not None else None

In [210]:
def get_n_lines(num_lines, text):
    line_separator = "\n"
    
    return line_separator.join(
        text.split(line_separator)
        [ : num_lines]
    )

In [211]:
# example of downloading lyrics for a single song
if not os.path.isfile(lyrics_csv_file_path):

    # example_song = charts.loc[:, relevant_columns].iloc[-1: , :]
    example_song = charts.loc[ : , relevant_columns].iloc[1:2 , :]
    print_dataframe(example_song)

    raw_lyrics = download_lyrics(example_song.iloc[0].artist, example_song.iloc[0].song)
    num_lines = 10
    print("="*30)
    print("Raw lyrics:")
    print("-"*30)
    print(get_n_lines(num_lines, raw_lyrics))
    print("...")
    print()

    print("="*30)
    print("Sanitized lyrics:")
    print("-"*30)
    sanitized_lyrics = sanitize_lyrics(raw_lyrics)
    print(get_n_lines(num_lines, sanitized_lyrics))
    print("...")

Unnamed: 0,date,artist,song
1,2021-11-06,The Kid LAROI & Justin Bieber,Stay


Raw lyrics:
------------------------------
136 ContributorsTranslationsTürkçeEspañolPortuguêsItalianoDeutschFrançaisفارسیРусскийSTAY Lyrics
I do the same thing I told you that I never would
I told you I'd change, even when I knew I never could
I know that I can’t find nobody else as good as you
I need you to stay, need you to stay, hey (Oh)

I get drunk, wake up, I'm wasted still
I realize the time that I wasted here
I feel like you can't feel the way I feel
Oh, I’ll be fucked up if you can't be right here
...

Sanitized lyrics:
------------------------------
I do the same thing I told you that I never would
I told you I'd change, even when I knew I never could
I know that I can’t find nobody else as good as you
I need you to stay, need you to stay, hey (Oh)

I get drunk, wake up, I'm wasted still
I realize the time that I wasted here
I feel like you can't feel the way I feel
Oh, I’ll be fucked up if you can't be right here

...


In [212]:
# example of downloading songs for each row in the dataset's dataframe
# TODO isolate a dataframe of wanted songs, download all needed lyrics to dataframe, and save for future use

# if not os.path.isfile(lyrics_csv_file_path):
#     example_songs_copy = charts.loc[:, relevant_columns].iloc[0:3 , :].copy()

#     example_songs_copy["lyrics"] = example_songs_copy.apply(lyrics_from_row, axis=1)

#     example_songs_copy.to_csv(lyrics_csv_file_path, index=False, compression="xz")
# else:
#     example_songs_copy = pd.read_csv(lyrics_csv_file_path, compression="xz")
#     covert_date_column(example_songs_copy)
    
# print_dataframe(example_songs_copy)
# print(example_songs_copy["lyrics"].iloc[1])

In [213]:
# scraping top 100 lists prior to 1958

wikipedia_url = "https://en.wikipedia.org"
wikipedia_billboard_years_url = f"{wikipedia_url}/wiki/Template:Hot_100_year-end_charts"

wikipedia_billboard_years_html = requests.get(wikipedia_billboard_years_url)
billboard_years_html_soup = BeautifulSoup(wikipedia_billboard_years_html.content, 'html.parser')
year_link_tags = billboard_years_html_soup.select("tbody td a")

links_by_year = {int(tag.text): tag.attrs["href"] for tag in year_link_tags}


def full_wikipedia_link(short_wikipedia_link):
    return f'{wikipedia_url}{short_wikipedia_link}'


def sanitize_wikipedia_table(table:pd.DataFrame):
    table["artist"] = table["artist"] \
        .str.replace("(?i)(with |and |[&,]).*$", "", regex=True) \
        .str.strip("\" ")
        
    table["song"] = table["song"].str.strip("\" ")
    # table.reset_index(drop=True, inplace=True)
    

def scrap_year(year:int):
    wikipedia_list_url = full_wikipedia_link(links_by_year[year])
    # print(f"following {wikipedia_list_url = }")
    wikipedia_response = requests.get(wikipedia_list_url)
    wikipedia_list_html_soup = BeautifulSoup(wikipedia_response.content, 'html.parser')

    chart_html = wikipedia_list_html_soup.select("table.wikitable.sortable")[0]
    # chart = pd.read_html(StringIO(chart_html.prettify()))[0]
    chart = pd.read_html(StringIO(str(chart_html)))[0]
    chart["year"] = year
    
    # chart.rename(columns={"No.": "rank", "No. (Rank)": "rank"}, inplace=True)
    chart.drop(columns=["No.", "No. (Rank)"], errors="ignore", inplace=True)
    chart.rename(columns={"Title": "song", "Artist(s)": "artist"}, inplace=True)
    
    sanitize_wikipedia_table(chart)
    
    # print_dataframe(chart[chart.index.duplicated()])
    
    # chart.reindex(range(len(chart)))
    
    # assert not chart.index.has_duplicates
    
    return chart


# year = 1950
# print_dataframe(scrap_year(year))

In [214]:
# sanitizing wikipedia-extracted data

# temp = scrap_year(1950)
# print_dataframe(temp)
# sanitize_wikipedia_table(temp)
# print_dataframe(temp)

In [215]:
# %timeit temp["Artist(s)"].apply(lambda artist: re.split("with|&|,", artist)[0]),
# %timeit temp["Artist(s)"].str.extract("^(.*?)(?= with|[&,]|$)"), 
# %timeit temp["Artist(s)"].str.replace("(with |[&,]).*$", "", regex=True),
# %timeit temp["Artist(s)"].str.replace("(with.*$|[&,].*$)", "", regex=True),
# %timeit temp["Artist(s)"].str.replace("(with.*$|&.*$|,.*$)", "", regex=True),
# %timeit temp["Artist(s)"].str.replace("(with.*|&.*|,.*)", "", regex=True),

# print_dataframe(pd.concat([
#             temp["Artist(s)"].apply(lambda artist: re.split("with|&|,", artist)[0]),
#             temp["Artist(s)"].str.extract("^(.*?)(?= with|[&,]|$)"), 
#             temp["Artist(s)"].str.replace("(with |[&,]).*$", "", regex=True),
#             temp["Artist(s)"].str.replace("(with.*$|[&,].*$)", "", regex=True),
#             temp["Artist(s)"].str.replace("(with.*$|&.*$|,.*$)", "", regex=True),
#             temp["Artist(s)"].str.replace("(with.*|&.*|,.*)", "", regex=True),
#         ],
#         axis=1
#     ))

In [216]:
# sanitize_lyrics(download_lyrics(title="The Cry of the Wild Goose", artist="Frankie Laine with Carl T. Fischer"))
# print(sanitize_lyrics(download_lyrics(title=" I Can Dream, Can't I", artist="Andrews Sisters")))

In [217]:
# getting only year-ends' top 100 from downloaded dataset

charts_indexed_by_date = charts.set_index("date")
dates_series = charts["date"]

year_ends = dates_series.groupby(charts_indexed_by_date.index.year).first()
year_ends = year_ends[year_ends.index <= 2020]

# print(charts_index_by_date.index)
print(year_ends)

year_end_charts = charts_indexed_by_date.loc[year_ends]
year_end_charts.index = year_end_charts.index.year
# year_end_charts.index.names = ["year"]
year_end_charts.reset_index(inplace=True)
year_end_charts.rename(columns={"date": "year"}, inplace=True)
year_end_charts

date
1958   1958-12-29
1959   1959-12-28
1960   1960-12-26
1961   1961-12-25
1962   1962-12-29
          ...    
2016   2016-12-31
2017   2017-12-30
2018   2018-12-29
2019   2019-12-28
2020   2020-12-26
Name: date, Length: 63, dtype: datetime64[ns]


Unnamed: 0,year,artist,song
0,1958,The Chipmunks With David Seville,The Chipmunk Song
1,1958,The Platters,Smoke Gets In Your Eyes
2,1958,The Teddy Bears,"To Know Him, Is To Love Him"
3,1958,Elvis Presley,One Night
4,1958,The Everly Brothers,Problems
...,...,...,...
6294,2020,Niko Moon,Good Time
6295,2020,BRS Kash,Throat Baby (Go Baby)
6296,2020,Lil Baby,Errbody
6297,2020,Carrie Underwood,Favorite Time Of Year


In [218]:
billboard_missing_years = range(min(links_by_year.keys()), year_ends.index.min())
missing_years_dataframes = [scrap_year(year) for year in billboard_missing_years]
missing_years_concatenated = pd.concat(missing_years_dataframes, ignore_index=True)
missing_years_concatenated

Unnamed: 0,song,artist,year
0,Prisoner of Love,Perry Como,1946
1,To Each His Own,Eddy Howard,1946
2,The Gypsy,The Ink Spots,1946
3,Five Minutes More,Frank Sinatra,1946
4,Rumors Are Flying,Frankie Carle,1946
...,...,...,...
432,Old Cape Cod,Patti Page,1957
433,Mr. Lee,The Bobbettes,1957
434,Blueberry Hill,Fats Domino,1957
435,Whispering Bells,The Del-Vikings,1957


In [219]:
print(any(x.index.has_duplicates for x in missing_years_dataframes))
missing_years_concatenated.index.has_duplicates

False


False

In [220]:
entirety_of_data = pd.concat([missing_years_concatenated, year_end_charts], ignore_index=True)
entirety_of_data

Unnamed: 0,song,artist,year
0,Prisoner of Love,Perry Como,1946
1,To Each His Own,Eddy Howard,1946
2,The Gypsy,The Ink Spots,1946
3,Five Minutes More,Frank Sinatra,1946
4,Rumors Are Flying,Frankie Carle,1946
...,...,...,...
6731,Good Time,Niko Moon,2020
6732,Throat Baby (Go Baby),BRS Kash,2020
6733,Errbody,Lil Baby,2020
6734,Favorite Time Of Year,Carrie Underwood,2020


In [232]:
if not os.path.isfile(lyrics_csv_file_path):
    # entirety_of_data["lyrics"] = entirety_of_data.apply(lyrics_from_row, axis=1)
    # entirety_of_data["lyrics"] = entirety_of_data.progress_apply(lyrics_from_row, axis=1)
    
    dask_dataframe = dd.from_pandas(entirety_of_data, npartitions=len(entirety_of_data))
    lyrics_column = dask_dataframe.apply(lyrics_from_row, axis=1, meta=(None, str)).compute() # parallel execution
    entirety_of_data["lyrics"] = lyrics_column
    entirety_of_data.to_csv(lyrics_csv_file_path, index=False, compression="xz")
else:
    entirety_of_data = pd.read_csv(lyrics_csv_file_path, compression="xz")

print_dataframe(entirety_of_data.head())
print_dataframe(entirety_of_data.tail())

Unnamed: 0,song,artist,year,lyrics
0,Prisoner of Love,Perry Como,1946,"Alone from night to night you'll find me\nToo weak to break the chains that bind me\nI need no shackles to remind me\nI'm just a prisoner of love!\n\nFor one command I stand and wait now\nFrom one who's master of my fate now\nI can't escape for it's too late now\nI'm just a prisoner of love!\n\nWhat's the good of my caring\nIf someone is sharing those arms with me!\nAlthough she has another\nI can't have another, for I'm not free!\n\nShe's in my dreams awake or sleeping\nUpon my knees to her I'm creeping\nMy very life is in her keeping\nI'm just a prisoner of love\n\nWhat's the good of my caring\nIf someone is sharing those arms with me!\nAlthough she has another\nI can't have another, for I'm not free!\n\nShe's in my dreams awake or sleeping\nUpon my knees to her I'm creeping\nMy very life is in her keeping\nI'm just a prisoner of love"
1,To Each His Own,Eddy Howard,1946,"A rose must remain with the sun and the rain\nOr its lovely promise won't come true\nTo each his own, to each his own\nAnd my own is you\nWhat good is a song if the words just don't belong?\nAnd a dream must be a dream for two\nNo good alone, to each his own\nFor me there's you\n\nIf a flame is to grow there must be a glow\nTo open each door there's a key\nI need you, I know, I can't let you go\nYour touch means too much to me\nTwo lips must insist on two more to be kissed\nOr they'll never know what love can do\nTo each his own, I've found my own\nOne and only you\n\nTwo lips must insist on two more to be kissed\nOr they'll never know what love can do\nTo each his own, I've found my own\nOne and only you"
2,The Gypsy,The Ink Spots,1946,"In a quaint caravan\nThere's a lady they call the Gypsy\nShe can look in the future\nAnd drive away all your fears\nEverything will come right\nIf you only believe the Gypsy\nShe could tell at a glance\nThat my heart was so full of tears\nShe looked at my hand and told me\nMy lover was always true\nAnd yet in my heart I knew, dear\nSomebody else was kissing you\nBut I'll go there again\n'Cause I want to believe the Gypsy\nThat my lover is true\nAnd will come back to me someday\n\nYou see, she looked at my hand and told me that my baby would always be true\nAnd yet, in my heart I knew, dear\nThat somebody else was kissing you\n\nBut I'll go there again\n'Cause I want to believe the Gypsy\nThat my lover is true\nAnd will come back to me someday"
3,Five Minutes More,Frank Sinatra,1946,"Dear, this evening seemed to go so awfully fast\nWe had so much fun and now you're home at last\nI look forward to a kiss or two at the garden gate\nBut she gave me just a peck and insisted it was late\nGive me five minutes more, only five minutes more\nLet me stay, let me stay in your arms\nHere am I, begging for only five minutes more\nOnly five minutes more of your charms\nAll week long I dreamed about our Saturday date\nDon't you know that Sunday morning you can sleep late?\nGive me five minutes more, only five minutes more\nLet me stay, let me stay in your arms\nAll week long I dreamed about our Saturday date\nDon't you know that Sunday morning you can sleep late?\nGive me five minutes more, only five minutes more\nLet me stay, let me stay in your arms\nAh, come on"
4,Rumors Are Flying,Frankie Carle,1946,"Rumors are flying\nThat you've got me sighing\nThat I'm in a crazy kind of a daze\nA lazy sort of a haze\nWhen I go walking\nI hear people talking\nThey say our affair is not just a passing phase\nAnd they whisper 'bout the flowers\nYou keep sending me by the dozen\nAnd they wonder 'bout the hours\nThat you spend with me, it keeps them buzzin'\nRumors are flying\nAnd I'm not denying\nThat people are sure I'm falling in love with you\n'Cause for a change, darling\nAll the rumors are true\n------ guitar solo ------\nAnd they whisper all about the flowers\nThe many little flowers you keep sending me by the dozen\nAnd they wonder all about the hours\nThat you spend with your baby, what do you do-do-do\nThat keeps 'em buzzin' all the time, all the time\nRumors are flying\nAnd I'm not denying\nThat people are sure I'm falling in love with you\n'Cause for a change, darling\nAll the rumors are true"


Unnamed: 0,song,artist,year,lyrics
6731,Good Time,Niko Moon,2020,
6732,Throat Baby (Go Baby),BRS Kash,2020,
6733,Errbody,Lil Baby,2020,
6734,Favorite Time Of Year,Carrie Underwood,2020,
6735,Beautiful Trip,Kid Cudi,2020,


In [231]:

print(download_lyrics(title="Kid Cudi", artist="Beautiful Trip"))
print(sanitize_lyrics(download_lyrics(title="Beautiful Trip", artist="Kid Cudi")))

47 ContributorsTranslationsEspañolNorsk (bokmål / riksmål)PortuguêsРусскийBeautiful Trip Lyrics
Oh, oh
Oh
Three (Three, two, one)See Kid Cudi LiveGet tickets as low as $37You might also like3Embed
Oh, oh
Oh
Three (Three, two, one)See Kid Cudi LiveGet tickets as low as $37


In [222]:
print(missing_years_concatenated.index.has_duplicates)
print(year_end_charts.index.has_duplicates)
print(entirety_of_data.index.has_duplicates)

False
False
False


In [223]:
entirety_of_data["lyrics"] = lyrics_column
entirety_of_data.to_csv(lyrics_csv_file_path, index=False, compression="xz")

In [224]:
entirety_of_data.loc[entirety_of_data.lyrics.isna()]

Unnamed: 0,song,artist,year,lyrics
26,Hey! Ba-Ba-Re-Bop,Tex Beneke,1946,
42,Peg o' My Heart,The Harmonicats,1947,
47,Peg o' My Heart,The Three Suns,1947,
55,Temptation (Tim-Tayshun),Red Ingle,1947,
88,Twelfth Street Rag,Pee Wee Hunt,1948,
...,...,...,...,...
6731,Good Time,Niko Moon,2020,
6732,Throat Baby (Go Baby),BRS Kash,2020,
6733,Errbody,Lil Baby,2020,
6734,Favorite Time Of Year,Carrie Underwood,2020,


In [225]:
temp = entirety_of_data.loc[3]
artist = temp.artist
song = temp.song
sanitized_lyrics = sanitize_lyrics(GENIUS_API_GATEWAY.search_song(artist=song, title=artist).lyrics)
entirety_of_data[entirety_of_data["lyrics"] == sanitized_lyrics]

Unnamed: 0,song,artist,year,lyrics
3,Five Minutes More,Frank Sinatra,1946,"Dear, this evening seemed to go so awfully fas..."


In [226]:
# entirety_of_data.lyrics.isna().sum()