# Mini-Project in NLP

In [1]:
import sys
import os
import zipfile
import re

import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from IPython.display import display, HTML
from io import StringIO
import seaborn
import matplotlib as plt
from pprint import pprint
import mapply
import multiprocessing
from tqdm import tqdm

from lyricsgenius import Genius
from credentials import CLIENT_ACCESS_TOKEN
from bs4 import BeautifulSoup
import requests

## Data collection and cleaning

In [2]:
# tqdm.pandas()
ProgressBar().register() # progress of download process when using dask-dataframe's "apply"

In [3]:
# loading billboard top 100 years 1958-2021
# original dataset from: https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs/data

data_directory = "data"
csv_file_name = "charts.csv.zip"
lyrics_csv_file_name = f"lyrics_charts.csv.xz"

csv_file_path = os.path.join(data_directory, csv_file_name)
lyrics_csv_file_path = os.path.join(data_directory, lyrics_csv_file_name)

GENIUS_API_GATEWAY = Genius(
    access_token=CLIENT_ACCESS_TOKEN,
    
    verbose=False,
    timeout=200,
    retries=20,
    sleep_time=3,
    
    skip_non_songs=True,
    remove_section_headers=True,
)

In [4]:
# if not os.path.isfile(lyrics_csv_file_path) and not os.path.isfile(csv_file_path):
#     print("extracted CSV not found")
    
#     with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
#         zip_ref.extractall(data_directory)

In [5]:
def covert_date_column(dataframe):
    print(f'{dataframe["date"].dtype = }')

    dataframe["date"] = dataframe["date"].apply(pd.to_datetime, format="%Y-%m-%d")
    print(f'{dataframe["date"].dtype = }')

In [6]:
# if not os.path.isfile(lyrics_csv_file_path):
charts = pd.read_csv(
    csv_file_path, 
    compression="zip", 
    # converters={"date": lambda x: pd.to_datetime(x, format="%Y-%m-%d")}
)

covert_date_column(charts)

dataframe["date"].dtype = dtype('O')
dataframe["date"].dtype = dtype('<M8[ns]')


In [7]:
charts.shape

(330087, 7)

In [8]:
charts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 330087 entries, 0 to 330086
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   date            330087 non-null  datetime64[ns]
 1   rank            330087 non-null  int64         
 2   song            330087 non-null  object        
 3   artist          330087 non-null  object        
 4   last-week       297775 non-null  float64       
 5   peak-rank       330087 non-null  int64         
 6   weeks-on-board  330087 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(2)
memory usage: 17.6+ MB


In [9]:
def print_dataframe(dataframe):
    display(HTML(
        dataframe
        .to_html())
    )

In [10]:
print_dataframe(charts.head(3))
print_dataframe(charts.tail(3))

Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board
0,2021-11-06,1,Easy On Me,Adele,1.0,1,3
1,2021-11-06,2,Stay,The Kid LAROI & Justin Bieber,2.0,1,16
2,2021-11-06,3,Industry Baby,Lil Nas X & Jack Harlow,3.0,1,14


Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board
330084,1958-08-04,98,Little Serenade,The Ames Brothers,,98,1
330085,1958-08-04,99,I'll Get By (As Long As I Have You),Billy Williams,,99,1
330086,1958-08-04,100,Judy,Frankie Vaughan,,100,1


In [11]:
relevant_columns = ["date", "artist", "song",]
charts = charts[relevant_columns]

In [12]:
splitting_pattern = "".join((
    "(?i)(", # case-insensitive
    "|".join((
        r"[\d,]*Embed$",
        r"\{.*?\}",
        r"see .* live",
        r"get tickets as low as \$[\d.]+",
        # r"|\[.*?\]",
    )),
    ")",
))


def sanitize_lyrics(lyrics):
    # try:
    lyrics = ''.join(re.split(splitting_pattern, lyrics))
    # except Exception as exception:
        # print(f"exception when {lyrics = }")
        # raise exception
    
    lyrics = lyrics.split("Lyrics", maxsplit=1)[1]
    # lyrics = ''.join(lyrics.rsplit("You might also like", maxsplit=1))
    lyrics = ''.join(lyrics.split("You might also like"))
    
    lyrics = re.sub(r"(\n\s*){2,}\n", r"\n\n", lyrics)
    lyrics = lyrics.strip()
    
    return lyrics
    
    
def download_lyrics(artist, title):
    song = GENIUS_API_GATEWAY.search_song(
        artist=artist,
        title=title,
        get_full_info=False,
    )
    return song.lyrics if song is not None else None


def lyrics_from_row(row):
    lyrics = download_lyrics(
        artist=row["artist"], 
        title=row["song"]
    )
    
    return sanitize_lyrics(lyrics) if lyrics is not None else None

In [13]:
def get_n_lines(num_lines, text):
    line_separator = "\n"
    
    return line_separator.join(
        text.split(line_separator)
        [ : num_lines]
    )

In [14]:
# example of downloading lyrics for a single song
if not os.path.isfile(lyrics_csv_file_path):

    # example_song = charts.loc[:, relevant_columns].iloc[-1: , :]
    example_song = charts.loc[ : , relevant_columns].iloc[1:2 , :]
    print_dataframe(example_song)

    raw_lyrics = download_lyrics(example_song.iloc[0].artist, example_song.iloc[0].song)
    num_lines = 10
    print("="*30)
    print("Raw lyrics:")
    print("-"*30)
    print(get_n_lines(num_lines, raw_lyrics))
    print("...")
    print()

    print("="*30)
    print("Sanitized lyrics:")
    print("-"*30)
    sanitized_lyrics = sanitize_lyrics(raw_lyrics)
    print(get_n_lines(num_lines, sanitized_lyrics))
    print("...")

In [15]:
# example of downloading songs for each row in the dataset's dataframe
# TODO isolate a dataframe of wanted songs, download all needed lyrics to dataframe, and save for future use

# if not os.path.isfile(lyrics_csv_file_path):
#     example_songs_copy = charts.loc[:, relevant_columns].iloc[0:3 , :].copy()

#     example_songs_copy["lyrics"] = example_songs_copy.apply(lyrics_from_row, axis=1)

#     example_songs_copy.to_csv(lyrics_csv_file_path, index=False, compression="xz")
# else:
#     example_songs_copy = pd.read_csv(lyrics_csv_file_path, compression="xz")
#     covert_date_column(example_songs_copy)
    
# print_dataframe(example_songs_copy)
# print(example_songs_copy["lyrics"].iloc[1])

In [16]:
# scraping top 100 lists prior to 1958

wikipedia_url = "https://en.wikipedia.org"
wikipedia_billboard_years_url = f"{wikipedia_url}/wiki/Template:Hot_100_year-end_charts"

wikipedia_billboard_years_html = requests.get(wikipedia_billboard_years_url)
billboard_years_html_soup = BeautifulSoup(wikipedia_billboard_years_html.content, 'html.parser')
year_link_tags = billboard_years_html_soup.select("tbody td a")

links_by_year = {int(tag.text): tag.attrs["href"] for tag in year_link_tags}


def full_wikipedia_link(short_wikipedia_link):
    return f'{wikipedia_url}{short_wikipedia_link}'


artist_replacement_pattern = "".join((
    "(?i)(", # case-insensitive
    "|".join((
        " with ",
        " and ",
        "[&,]",
        " x ",
        r" ft\.",
        " ft ",
        " feat ",
        " featuring",
        " Co-Starring",
        " Starring",
        " Introducing",
        " Presents",
    )),
    ").*$", # up to the end
))

song_replacement_pattern = "".join((
    "(?i)(", # case-insensitive
    "|".join((
        "\(from .*\)",
        "\(\".*\"\)",
    )),
    ").*$", # up to the end
))


def sanitize_artist_song_columns(table:pd.DataFrame):
    table["artist"] = table["artist"] \
        .str.replace(artist_replacement_pattern, "", regex=True) \
        .str.strip("\" ")
        
    table["song"] = table["song"] \
        .str.replace(song_replacement_pattern, "", regex=True) \
        .str.strip("\" ")
    # table.reset_index(drop=True, inplace=True)
    

def scrap_year(year:int):
    wikipedia_list_url = full_wikipedia_link(links_by_year[year])
    # print(f"following {wikipedia_list_url = }")
    wikipedia_response = requests.get(wikipedia_list_url)
    wikipedia_list_html_soup = BeautifulSoup(wikipedia_response.content, 'html.parser')

    chart_html = wikipedia_list_html_soup.select("table.wikitable.sortable")[0]
    # chart = pd.read_html(StringIO(chart_html.prettify()))[0]
    chart = pd.read_html(StringIO(str(chart_html)))[0]
    chart["year"] = year
    
    # chart.rename(columns={"No.": "rank", "No. (Rank)": "rank"}, inplace=True)
    chart.drop(columns=["No.", "No. (Rank)"], errors="ignore", inplace=True)
    chart.rename(columns={"Title": "song", "Artist(s)": "artist"}, inplace=True)
    
    sanitize_artist_song_columns(chart)
    
    # print_dataframe(chart[chart.index.duplicated()])
    
    # chart.reindex(range(len(chart)))
    
    # assert not chart.index.has_duplicates
    
    return chart


# year = 1950
# print_dataframe(scrap_year(year))

In [17]:
# sanitizing wikipedia-extracted data

# temp = scrap_year(1950)
# print_dataframe(temp)
# sanitize_wikipedia_table(temp)
# print_dataframe(temp)

In [18]:
# %timeit temp["Artist(s)"].apply(lambda artist: re.split("with|&|,", artist)[0]),
# %timeit temp["Artist(s)"].str.extract("^(.*?)(?= with|[&,]|$)"), 
# %timeit temp["Artist(s)"].str.replace("(with |[&,]).*$", "", regex=True),
# %timeit temp["Artist(s)"].str.replace("(with.*$|[&,].*$)", "", regex=True),
# %timeit temp["Artist(s)"].str.replace("(with.*$|&.*$|,.*$)", "", regex=True),
# %timeit temp["Artist(s)"].str.replace("(with.*|&.*|,.*)", "", regex=True),

# print_dataframe(pd.concat([
#             temp["Artist(s)"].apply(lambda artist: re.split("with|&|,", artist)[0]),
#             temp["Artist(s)"].str.extract("^(.*?)(?= with|[&,]|$)"), 
#             temp["Artist(s)"].str.replace("(with |[&,]).*$", "", regex=True),
#             temp["Artist(s)"].str.replace("(with.*$|[&,].*$)", "", regex=True),
#             temp["Artist(s)"].str.replace("(with.*$|&.*$|,.*$)", "", regex=True),
#             temp["Artist(s)"].str.replace("(with.*|&.*|,.*)", "", regex=True),
#         ],
#         axis=1
#     ))

In [19]:
# sanitize_lyrics(download_lyrics(title="The Cry of the Wild Goose", artist="Frankie Laine with Carl T. Fischer"))
# print(sanitize_lyrics(download_lyrics(title=" I Can Dream, Can't I", artist="Andrews Sisters")))

In [20]:
# getting only year-ends' top 100 from downloaded dataset

charts_indexed_by_date = charts.set_index("date")
dates_series = charts["date"]

year_ends = dates_series.groupby(charts_indexed_by_date.index.year).first()
year_ends = year_ends[year_ends.index <= 2020]

# print(charts_index_by_date.index)
print(year_ends)

year_end_charts = charts_indexed_by_date.loc[year_ends]
year_end_charts.index = year_end_charts.index.year
# year_end_charts.index.names = ["year"]
year_end_charts.reset_index(inplace=True)
year_end_charts.rename(columns={"date": "year"}, inplace=True)
year_end_charts

date
1958   1958-12-29
1959   1959-12-28
1960   1960-12-26
1961   1961-12-25
1962   1962-12-29
          ...    
2016   2016-12-31
2017   2017-12-30
2018   2018-12-29
2019   2019-12-28
2020   2020-12-26
Name: date, Length: 63, dtype: datetime64[ns]


Unnamed: 0,year,artist,song
0,1958,The Chipmunks With David Seville,The Chipmunk Song
1,1958,The Platters,Smoke Gets In Your Eyes
2,1958,The Teddy Bears,"To Know Him, Is To Love Him"
3,1958,Elvis Presley,One Night
4,1958,The Everly Brothers,Problems
...,...,...,...
6294,2020,Niko Moon,Good Time
6295,2020,BRS Kash,Throat Baby (Go Baby)
6296,2020,Lil Baby,Errbody
6297,2020,Carrie Underwood,Favorite Time Of Year


In [21]:
billboard_missing_years = range(min(links_by_year.keys()), year_ends.index.min())
missing_years_dataframes = [scrap_year(year) for year in billboard_missing_years]
missing_years_concatenated = pd.concat(missing_years_dataframes, ignore_index=True)
missing_years_concatenated

Unnamed: 0,song,artist,year
0,Prisoner of Love,Perry Como,1946
1,To Each His Own,Eddy Howard,1946
2,The Gypsy,The Ink Spots,1946
3,Five Minutes More,Frank Sinatra,1946
4,Rumors Are Flying,Frankie Carle,1946
...,...,...,...
432,Old Cape Cod,Patti Page,1957
433,Mr. Lee,The Bobbettes,1957
434,Blueberry Hill,Fats Domino,1957
435,Whispering Bells,The Del-Vikings,1957


In [22]:
# print(any(x.index.has_duplicates for x in missing_years_dataframes))
# missing_years_concatenated.index.has_duplicates

In [23]:
entirety_of_data = pd.concat([missing_years_concatenated, year_end_charts], ignore_index=True)
entirety_of_data

Unnamed: 0,song,artist,year
0,Prisoner of Love,Perry Como,1946
1,To Each His Own,Eddy Howard,1946
2,The Gypsy,The Ink Spots,1946
3,Five Minutes More,Frank Sinatra,1946
4,Rumors Are Flying,Frankie Carle,1946
...,...,...,...
6731,Good Time,Niko Moon,2020
6732,Throat Baby (Go Baby),BRS Kash,2020
6733,Errbody,Lil Baby,2020
6734,Favorite Time Of Year,Carrie Underwood,2020


In [24]:
def table_download_lyrics(dataframe):
    dask_dataframe = dd.from_pandas(dataframe, npartitions=len(dataframe))
    dataframe["lyrics"] = dask_dataframe.apply( # parallel execution
        lyrics_from_row, 
        axis=1, 
        meta=(None, str),
    ).compute()

In [25]:
if not os.path.isfile(lyrics_csv_file_path):
    # entirety_of_data["lyrics"] = entirety_of_data.apply(lyrics_from_row, axis=1)
    # entirety_of_data["lyrics"] = entirety_of_data.progress_apply(lyrics_from_row, axis=1)
    
    table_download_lyrics(entirety_of_data)
    entirety_of_data.to_csv(lyrics_csv_file_path, index=False, compression="xz")
else:
    entirety_of_data = pd.read_csv(lyrics_csv_file_path, compression="xz")

print_dataframe(entirety_of_data.head())
print_dataframe(entirety_of_data.tail())

Unnamed: 0,song,artist,year,lyrics
0,Prisoner of Love,Perry Como,1946,"Alone from night to night you'll find me\nToo weak to break the chains that bind me\nI need no shackles to remind me\nI'm just a prisoner of love!\n\nFor one command I stand and wait now\nFrom one who's master of my fate now\nI can't escape for it's too late now\nI'm just a prisoner of love!\n\nWhat's the good of my caring\nIf someone is sharing those arms with me!\nAlthough she has another\nI can't have another, for I'm not free!\n\nShe's in my dreams awake or sleeping\nUpon my knees to her I'm creeping\nMy very life is in her keeping\nI'm just a prisoner of love\n\nWhat's the good of my caring\nIf someone is sharing those arms with me!\nAlthough she has another\nI can't have another, for I'm not free!\n\nShe's in my dreams awake or sleeping\nUpon my knees to her I'm creeping\nMy very life is in her keeping\nI'm just a prisoner of love"
1,To Each His Own,Eddy Howard,1946,"A rose must remain with the sun and the rain\nOr its lovely promise won't come true\nTo each his own, to each his own\nAnd my own is you\nWhat good is a song if the words just don't belong?\nAnd a dream must be a dream for two\nNo good alone, to each his own\nFor me there's you\n\nIf a flame is to grow there must be a glow\nTo open each door there's a key\nI need you, I know, I can't let you go\nYour touch means too much to me\nTwo lips must insist on two more to be kissed\nOr they'll never know what love can do\nTo each his own, I've found my own\nOne and only you\n\nTwo lips must insist on two more to be kissed\nOr they'll never know what love can do\nTo each his own, I've found my own\nOne and only you"
2,The Gypsy,The Ink Spots,1946,"In a quaint caravan\nThere's a lady they call the Gypsy\nShe can look in the future\nAnd drive away all your fears\nEverything will come right\nIf you only believe the Gypsy\nShe could tell at a glance\nThat my heart was so full of tears\nShe looked at my hand and told me\nMy lover was always true\nAnd yet in my heart I knew, dear\nSomebody else was kissing you\nBut I'll go there again\n'Cause I want to believe the Gypsy\nThat my lover is true\nAnd will come back to me someday\n\nYou see, she looked at my hand and told me that my baby would always be true\nAnd yet, in my heart I knew, dear\nThat somebody else was kissing you\n\nBut I'll go there again\n'Cause I want to believe the Gypsy\nThat my lover is true\nAnd will come back to me someday"
3,Five Minutes More,Frank Sinatra,1946,"Dear, this evening seemed to go so awfully fast\nWe had so much fun and now you're home at last\nI look forward to a kiss or two at the garden gate\nBut she gave me just a peck and insisted it was late\nGive me five minutes more, only five minutes more\nLet me stay, let me stay in your arms\nHere am I, begging for only five minutes more\nOnly five minutes more of your charms\nAll week long I dreamed about our Saturday date\nDon't you know that Sunday morning you can sleep late?\nGive me five minutes more, only five minutes more\nLet me stay, let me stay in your arms\nAll week long I dreamed about our Saturday date\nDon't you know that Sunday morning you can sleep late?\nGive me five minutes more, only five minutes more\nLet me stay, let me stay in your arms\nAh, come on"
4,Rumors Are Flying,Frankie Carle,1946,"Rumors are flying\nThat you've got me sighing\nThat I'm in a crazy kind of a daze\nA lazy sort of a haze\nWhen I go walking\nI hear people talking\nThey say our affair is not just a passing phase\nAnd they whisper 'bout the flowers\nYou keep sending me by the dozen\nAnd they wonder 'bout the hours\nThat you spend with me, it keeps them buzzin'\nRumors are flying\nAnd I'm not denying\nThat people are sure I'm falling in love with you\n'Cause for a change, darling\nAll the rumors are true\n------ guitar solo ------\nAnd they whisper all about the flowers\nThe many little flowers you keep sending me by the dozen\nAnd they wonder all about the hours\nThat you spend with your baby, what do you do-do-do\nThat keeps 'em buzzin' all the time, all the time\nRumors are flying\nAnd I'm not denying\nThat people are sure I'm falling in love with you\n'Cause for a change, darling\nAll the rumors are true"


Unnamed: 0,song,artist,year,lyrics
6731,Good Time,Niko Moon,2020,"We just tryna catch a good time\nEven if it takes all night\nPass that bottle 'round the campfire\nSippin' apple pie moonshine\nYeah, we pickin' on them guitars just right\nEverybody singin' ""Dixieland Delight""\nLike a bobber on a wet line\nWe just tryna catch a good time\n\nEighty degrees and the sun ain't even out\nWe got a spot a couple miles outta town\nWhen that moon comes up, you know what's goin' down\nWe got them foldin' chairs leanin' way back\nNo other plans other than relax\nWe ain't worried 'bout tomorrow from where we at\n\nWe just tryna catch a good time\nEven if it takes all night\nPass that bottle 'round the campfire\nSippin' apple pie moonshine\nYeah, we pickin' on them guitars just right\nEverybody singing ""Dixieland Delight""\nLike a bobber on a wet line\nWe just tryna catch a good time\nSee Niko Moon LiveGet tickets as low as $28\nWay down here, we all got that Southern drawl\nTake our time when we talkin', ""Hey Y'all""\nYeah, it don't take much for us to have it all (That's right)\nSomething 'bout a night this clear\nMakes your problems disappear\nWe just gonna stay right here and let the world go by\n\nWe just tryna catch a good time\nEven if it takes all night\nPass that bottle 'round the campfire\nSippin' apple pie moonshine\nYeah, we pickin' on them guitars just right\nEverybody singing ""Dixieland Delight""\nLike a bobber on a wet line\nWe just tryna catch a good time\n\nSomething 'bout a night this clear\nMakes your problems disappear\nSo we just gonna stay right here and let the world go by\n\nWe just tryna catch a good time\nEven if it takes all night, al night\nPass that bottle 'round the campfire\nSippin' apple pie moonshine, sippin' apple pie moonshine\nYeah, we pickin' on them guitars just right\nEverybody singing ""Dixieland Delight""\nLike a bobber on a wet line\nWe just tryna catch a good time\nTryna catch a good time (Good time)\nGood time\nTryna catch a good time\nGood time\nYeah, we try to catch a good time (Good time)\nGood time (Good time)\nTryna catch a good time (Good time, baby)\nGood time\nAw, I think I got a bite"
6732,Throat Baby (Go Baby),BRS Kash,2020,"(What's happenin', Chi Chi?)\n\nSexy lil' bitch, sexy lil' ho\nI love the way you walk, love the way you talk\nLet a young nigga come play in your throat\nDeep stroke your throat 'til I make you choke\nThroat babies, I'm tryna give 'em to you\nThroat babies, I'm tryna bust all on you\nSexy lil' bitch, sexy lil' ho\nI love the way you walk, love the way you talk\nLet a young nigga come play in your throat\nDeep stroke your throat 'til I make you choke\nThroat babies, I'm tryna give 'em to you\nThroat babies, I'm tryna bust all on you\n\nGot a lil' hundred, that's if you want it\nExtra lil' forty, gotta come right now\nAnd I got beans you bite down\nSuper good head, no cap, gown\nSince the first time, I been fiendin', couldn't even believe it\nHead like that, shit, pussy, don't need it\nWhen she get hungry, she eat my semen\nBalls and all, with her mouth, she clean it\nWashing machine trick, ooh\nShawty got mean grip, ooh\nKnow what she came to do\nSuck this dick, then she dip, ooh\nHead monster, set an appointment\nMy soul wet, need your anointing\nWay too good, may buy you somethin'\nMade a whole song, see, I ain't frontin'\nI shoot the world for you\nI go to war for you\nDamn, bae got me trippin'\nBOA head, make me cum instant\nShe ain't even gotta ask for attention (Why?)\n'Cause I'ma give it to her (What else?)\nMoney, I give it to her\nClock on rocks, I give it to her\n\nSexy lil' bitch, sexy lil' ho\nI love the way you walk, love the way you talk\nLet a young nigga come play in your throat\nDeep stroke your throat 'til I make you choke\nThroat babies, I'm tryna give 'em to you\nThroat babies, I'm tryna bust all on you\nSexy lil' bitch, sexy lil' ho\nI love the way you walk, love the way you talk\nLet a young nigga come play in your throat\nDeep stroke your throat 'til I make you choke\nThroat babies, I'm tryna give 'em to you\nThroat babies, I'm tryna bust all on you\n\nSexy lil' redbone, yeah, got a good head on her\nSexy lil' redbone, yeah, I spend a bag on her\nWith a mouth like that, she can talk her way out goin' to jail\nWith a mouth like that, if you go to jail, I'll pay your bail\nI need you, wanna see you\nSuck me up for an hour, I can't help but buy you flowers\nWhen we meet, I get excited\nShe gon' blow and she gon' ride it\nNo relations, feeling, fighting\nGrip like pliers, I feel it tighten\nGirl, I got plans for you, got a couple bands for you\nI like the stance on you, good mouth got me like, ""Ooh, ah""\nOoh, ah-ah, ooh, ah, ah-ah\nLovin' the, lovin' the mouth\nSexy lil' bitch, sexy lil' ho\nI love the way you walk, love the way you talk\nLet a young nigga come play in your throat\nDeep stroke your throat 'til I make you choke\nThroat babies, I'm tryna give 'em to you\nThroat babies, I'm tryna bust all on you\nSexy lil' bitch, sexy lil' ho\nI love the way you walk, love the way you talk\nLet a young nigga come play in your throat\nDeep stroke your throat 'til I make you choke\nThroat babies, I'm tryna give 'em to you\nThroat babies, I'm tryna bust all on you"
6733,Errbody,Lil Baby,2020,"Firin' at everybody\nFirin' at everybody\n(Section 8 just straight cooked this motherfucker up)\nFirin' at everybody\n\nI'm firin' at everybody\nHelicopter the middle of the hood, I'm flyer than everybody\nAll this fame these niggas be chasin', I swear I don't care 'bout it\nBoy, you play, you gon' die front of everybody\nLeave some blood on the street, buy some red bottoms\nCaught an L, but I wish that they headshot 'em\nFree my nigga Longway, yeah, the feds got him\n\nI used to pray for a plug who gon' go out the way\nAnd come back with a lot of them bitches\nI used to dream about condos when we leavin' Lenox\nNow I stay on top of them bitches\nI wouldn't give a damn if he ran him up a whole billion\nI still wouldn't acknowledge the nigga\nHe put my name in a song, what the fuck is he on?\nI don't care, boy, we gotta go get him\nHouse big as fuck in the middle of nowhere\nI dick her down, I don't care what my ho wear\nI made Lil' Cam spin the block on a four-wheel\nNah, for real, spin the block on a four-wheel\nFive million dollars, for me, that ain't no deal\nI done made that times six in like three years\nOff this rap shit, I can't even talk about this trap shit\nI just hope they can take it, I keep me a stallion like Megan\nA residency out in Vegas\nMight fly to L.A. and go fuck with LeBron and the Lakers\nCome back to Atlanta with Jayda\nBruh told us to take care of the neighbors and stack all the paper\nWhatever we want, buy it later\nI know how to shit on a hater and handle the business\nWhatever God give me, I'm grateful\n\nI'm firin' at everybody\nHelicopter the middle of the hood, I'm flyer than everybody\nAll this fame these niggas be chasin', I swear I don't care 'bout it\nBoy, you play, you gon' die front of everybody\nLeave some blood on the street, buy some red bottoms\nCaught an L, but I wish that they headshot 'em\nFree my nigga Longway, yeah, the feds got him\n\nCartier watches for everyone 'round me\nI ran it up, how the fuck could you down me?\nShe set for life, how the fuck could you clown her?\nBought all this water, I'm never gon' drown\nI'm in the 'yo, I'm perfectin' my sound\nI helped 'em out and they let me down again\nI ain't comin' back around again\nFell in love and I found a friend\nI done geared up the motor\nMy brother in prison in Polo\nI jumped in the game and went loco\nI put on these clothes like a hobo\nThat's layers of drip\nI showed my ass in Phipps\nYou play, ya mans get killed\nSaid that before, but I'm dead for real\nLil' bro jumpin' 'round like his leg done healed\nI ran it up off of vacuum seal\nIf I fall off today, I'm a legend still\nBro showed me the way, I ain't never steal\nThis Glock ain't no prop, it'll pop for real\nI fuck with that syrup, I don't pop no pills\nMy young niggas turnt, they don't got no deals\nI need me a billion so I can chill\nI'm drivin' like I ain't got no license still\nMy Trackhawk move like it got nitrogen, I'm poppin' seals\nI'm firin' at everybody\nHelicopter the middle of the hood, I'm flyer than everybody\nAll this fame these niggas be chasin', I swear I don't care 'bout it\nBoy, you play, you gon' die front of everybody\nLeave some blood on the street, buy some red bottoms\nCaught an L, but I wish that they headshot 'em\nFree my nigga Longway, yeah, the feds got him\n\nI paid attention and found out the recipe\nI take a seat and they still won't get 'head of me\nI'm on my shit, now a bitch can't get next to me\nYou ain't got a hundred thou', you can't have sex with me\nAin't showed the best of me yet\nAnd ain't in nobody debt, I'm somethin' like a vet\nI bought my big brother a 'Vette\nMy numbers so good, my label probably owe me a check\nThey play with Lil Baby, I swear it's gon' go down\nThe biggest lil' nigga that's over at Motown\nJumped off the porch and went straight kicked the door down\nFlip the clip, switch the clip, then some more rounds\nRappers fake and be hatin', I don't go 'round\nI took 12 on a chase, I had four ounces\nAsk around, we them guys, I do no clownin'\nIf I can, I'ma try keep the smoke down\nOnce it's up, then it's stuck, it can't go down\nLike to switch where I sleep, I got four houses\nEvery time the heat come, I make four thousand\nBig done gave me the game and I ran with it\nMilk the game 'til I quit, I ain't playin' with it\nCould've been hit your bitch, I be sparin' niggas\nMake me mad, I'ma call up them grave diggers\nBought my mom a new crib, and it's way bigger\nNever been to it once, that's a great feelin'\nI'ma go there when I get a chance\nI lost five hundred racks before I signed a deal\nSwear to God, I still got my advance\nYeah, I'm lit, I been savin' this guala up\nYeah, I hit, but she didn't get a follow-up\nAin't no reason to beef with no one about none of these women\nI promise they swallow us all\nI'm firin' at everybody\nHelicopter the middle of the hood, I'm flyer than everybody\nAll this fame these niggas be chasin', I swear I don't care 'bout it\nBoy, you play, you gon' die front of everybody\nLeave some blood on the street, buy some red bottoms\nCaught an L, but I wish that they headshot 'em\nFree my nigga Longway, yeah, the feds got him"
6734,Favorite Time Of Year,Carrie Underwood,2020,"Yeah\n\nBreak out the tinsel, unpack the lights\nWe're 'bout to bring on the merry and bright\nI hear those jingle bells in the air\nRinging out the sound of joy everywhere\n\n'Tis the season we've been waiting for\n(Waiting, don't keep me waiting)\nCan't you feel it knocking at your door?\n(Knock, knock, it's knocking at your door)\n\nLove pouring out like the snow from the sky\nSnuggled up together sitting by the fire\nOh, it's magical, it's my favorite time\nPretty little wishes tied up in a bow\nAnd every little kiss is like we're underneath the mistletoe\nIt's music to my ears\nChristmas is my favorite time of year\n\nStockings on chimneys, angels on trees\nSugar and cinnamon, it's the little things\nBlankets and movies in black and white\nTogether they make up this colorful life\nSee Carrie Underwood LiveGet tickets as low as $104\n'Tis the season we've been waiting for\n(Waiting, don't keep me waiting)\nOh, can't you feel it knocking at your door?\n(Knock, knock, it's knocking at your door)\n\nLove pouring out like the snow from the sky\nSnuggled up together sitting by the fire\nOh, it's magical, it's my favorite time\nPretty little wishes tied up in a bow\nAnd every little kiss is like we're underneath the mistletoe\nIt's music to my ears\nChristmas is my favorite time of year\n\nChristmas is my favorite time\nChristmas is my favorite time\nChristmas is my favorite time\nOh, don't keep me waiting\n\nLove pouring out like the snow from the sky\nSnuggled up together sitting by the fire\nOh, it's magical, it's my favorite time\nPretty little wishes tied up in a bow\nAnd every little kiss is like we're underneath the mistletoe\nIt's music to my ears\nChristmas is my favorite time of year\n(Christmas is my favorite time of year)\nChristmas is my favorite time of year\nOh, yes it is"
6735,Beautiful Trip,Kid Cudi,2020,"Oh, oh\nOh\nThree (Three, two, one)See Kid Cudi LiveGet tickets as low as $48"


In [25]:

print(download_lyrics(artist="Kid Cudi", title="Beautiful Trip"))
print("=" * 30)
print(sanitize_lyrics(download_lyrics(title="Beautiful Trip", artist="Kid Cudi")))

47 ContributorsTranslationsEspañolNorsk (bokmål / riksmål)PortuguêsРусскийBeautiful Trip Lyrics
Oh, oh
Oh
Three (Three, two, one)See Kid Cudi LiveGet tickets as low as $48You might also like3Embed
Oh, oh
Oh
Three (Three, two, one)


In [26]:
# print(missing_years_concatenated.index.has_duplicates)
# print(year_end_charts.index.has_duplicates)
# print(entirety_of_data.index.has_duplicates)

In [27]:
# entirety_of_data.loc[entirety_of_data.lyrics.str.contains("Get tickets")]
with_lyrics = entirety_of_data[entirety_of_data.lyrics.notna()]
# with_lyrics[with_lyrics.lyrics.str.contains("Get tickets")].year.hist()
print_dataframe(with_lyrics[with_lyrics.lyrics.str.contains("See .* Live")].head())
print(with_lyrics[with_lyrics.lyrics.str.contains("See .* Live")].iloc[0].lyrics)

Unnamed: 0,song,artist,year,lyrics
110,Baby Face,Art Mooney,1948,"Ya playin'\nThrowin' that name all up on me\nYa playin'\nKnowin' damn well that you want me\nI'm tired of waitin', conversatin'\nWhy you playin' game, games?\nMakin' love up in the club\nGive a fuck about your name, name\nYa playin'\nThrowin' that name all up on me\nYa playin'\nKnowin' damn well that you want me\nI'm tired of waitin', conversatin'\nWhy you playin' game, games?\nMakin' love up in the club\nGive a fuck about your name, name (Look, ha)\n\nWhat you playin' for?\nI can see it in your eyes, you ain't 'bout that shit\nI'm pelican fly, I left the weather inside\nI got it rainin' in this bitch\nDollar after dollar\nDon't play with a nigga, hop up on this dick\nOne drunk night will change your life\nNow you famous, bitch\nOkay, I'm off that Ace and that Rosé\nJosé, no way\nI'm swervin'\nWith the white CÎROC and that OJ\nMolly? I think I know her face\nI know them titties real but her ass fake\nI keep runnin' with the money at a fast face\nSo baby girl, just meet me at the valet\nTell your girls to pick your car up\nTell her you rollin' with me\nAnd you gon' call that bitch tomorrow\nFirst thing in the mornin', cookin' me breakfast\nYou wearin' my necklace\nSo, girl, why the fuck you out here flexin'?\nFuckin' with these peasants? What?\nSee Chris Brown LiveGet tickets as low as $60\nYa playin'\nThrowin' that name all up on me\nYa playin'\nKnowin' damn well that you want me\nI'm tired of waitin', conversatin'\nWhy you playin' game, games?\nMakin' love up in the club\nGive a fuck about your name, name\nYa playin'\nThrowin' that name all up on me\nYa playin'\nKnowin' damn well that you want me\nI'm tired of waitin', conversatin'\nWhy you playin' game, games?\nMakin' love up in the club\nGive a fuck about your name, name"
410,Diana,Paul Anka,1957,"I'm so young and you're so old\nThis, my darling, I've been told\nI don't care just what they say\n'Cause forever I will pray\nYou and I will be as free\nAs the birds up in the trees\n\nOh, please stay by me, Diana\n\nThrills I get when you hold me close\nOh, my darling, you're the most\nI love you but do you love me?\nOh, Diana, can't you see?\nI love you with all my heart\nAnd I hope we will never part\n\nOh, please stay by me, Diana\n\nOh, my darling, oh, my lover\nTell me that there is no other\nI love you with my heart\nOh-oh, oh-oh, oh don't you know I love you so\nSee Paul Anka LiveGet tickets as low as $42\nOnly you can-a take my heart\nOnly you can-a tear it apart\nWhen you hold me in your loving arms\nI can feel you giving all your charms\nHold me, darling, hold me tight\nSqueeze me baby with all your might\n\nOh, please stay by me, Diana\n\nOh, please, Diana\nOh, please, Diana\nOh, please, Diana"
420,It's Not for Me to Say,Johnny Mathis,1957,"It's not for me to say, you love me\nIt's not for me to say, you'll always care\nOh.. but here for the moment\nI can hold you fast\nAnd press your lips to mine\nAnd dream that love will last\nAs far as I can see, this is heaven\nAnd speaking just for me, It's ours to share\nPerhaps the glow\nOf love will grow\nWith every passing day\nOr we may never meet again\nBut then\nIt's not for me to say\n(Break)\nAnd speaking just for me\nIt's ours to share\nPerhaps the glow\nOf love will grow\nWith every passing day\nOr we may never meet again\nBut then\nIt's not for me to saySee Johnny Mathis LiveGet tickets as low as $47"
425,Chances Are,Johnny Mathis,1957,"Chances are, 'cause I wear a silly grin\nThe moment you come into view\nChances are, you think that I'm in love with you\nJust because my composure sort of slips\nThe moment that your lips meet mine\nChances are, you think my heart's your valentine\n\nIn the magic of moonlight\nWhen I sigh, ""Hold me close, dear""\nChances are, you'll believe the stars\nThat fill the skies are in my eyes\n\nGuess you feel you'll always be\nThe one and only one for me\nAnd if you think you could\nWell, chances are, your chances are awfully good\n\nChances are, you'll believe the stars\nThat fill the skies are in my eyes\n\nGuess you feel you'll always be\nThe one and only one for me\nAnd if you think you could\nWell, chances are, your chances are awfully good\nSee Johnny Mathis LiveGet tickets as low as $47\nThe chances are, your chances are awfully good"
446,Whole Lotta Loving,Fats Domino,1958,"All right\nWe're here to pay tribute\nTo the legendary\nMr. Fats Domino\nI got a whole lotta loving for you\nGood good loving for you\nI got a whole lotta loving for you\nI got a whole lotta kissin' for you\nWhole kissin lovin' for you\nI got a whole lotta kissin' for you\nI got a whole lotta uh-uh to do\nWhole lotta uh-uh to do\nAnd I'm so glad to see you\nTrombone Shorty\nI got a whole lotta loving for you\nGood good loving for you\nI got a whole lotta loving for you\nI got a whole lotta kissin' for you\nWhole lotta kissin' for you\nI got a whole lotta kissin' for you\nSee Lenny Kravitz LiveGet tickets as low as $67I got a whole lotta uh-uh to do\nWhole lotta uh-uh to do\nAnd I'm so glad to see you\nFred Wesley\nPeewee Ellis\nMaceo\nTrombone Shorty\nRebirth Brass Band\nFats Domino\nFats Domino\nAlright, we outta here now\nDon't stop what you're doin'\nWe love you, Fats\nA whole lotta love\nNew Orleans, Louisiana\nKeep the faith, yeah"


Ya playin'
Throwin' that name all up on me
Ya playin'
Knowin' damn well that you want me
I'm tired of waitin', conversatin'
Why you playin' game, games?
Makin' love up in the club
Give a fuck about your name, name
Ya playin'
Throwin' that name all up on me
Ya playin'
Knowin' damn well that you want me
I'm tired of waitin', conversatin'
Why you playin' game, games?
Makin' love up in the club
Give a fuck about your name, name (Look, ha)

What you playin' for?
I can see it in your eyes, you ain't 'bout that shit
I'm pelican fly, I left the weather inside
I got it rainin' in this bitch
Dollar after dollar
Don't play with a nigga, hop up on this dick
One drunk night will change your life
Now you famous, bitch
Okay, I'm off that Ace and that Rosé
José, no way
I'm swervin'
With the white CÎROC and that OJ
Molly? I think I know her face
I know them titties real but her ass fake
I keep runnin' with the money at a fast face
So baby girl, just meet me at the valet
Tell your girls to pick your car

In [28]:
# entirety_of_data2["year"] = entirety_of_data2["year"].apply(pd.to_datetime, format="%Y")
# entirety_of_data2.set_index("year", inplace=True, drop=False)

without_lyrics = entirety_of_data.loc[entirety_of_data.lyrics.isna()]
sanitize_artist_song_columns(without_lyrics)
without_lyrics

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["artist"] = table["artist"] \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["song"] = table["song"] \


Unnamed: 0,song,artist,year,lyrics
26,Hey! Ba-Ba-Re-Bop,Tex Beneke,1946,
42,Peg o' My Heart,The Harmonicats,1947,
47,Peg o' My Heart,The Three Suns,1947,
55,Temptation (Tim-Tayshun),Red Ingle,1947,
88,Twelfth Street Rag,Pee Wee Hunt,1948,
...,...,...,...,...
6480,Happy Xmas (War Is Over),John,2018,
6492,Close To Me,Ellie Goulding,2018,
6497,Arms Around You,XXXTENTACION,2018,
6585,It's Beginning To Look A Lot Like Christmas,Perry Como,2019,


In [29]:
table_download_lyrics(without_lyrics)

[########                                ] | 21% Completed | 43.21 ss


KeyboardInterrupt: 

In [None]:
without_lyrics

Unnamed: 0,song,artist,year,lyrics
26,Hey! Ba-Ba-Re-Bop,Tex Beneke,1946,
42,Peg o' My Heart,The Harmonicats,1947,
47,Peg o' My Heart,The Three Suns,1947,
55,Temptation (Tim-Tayshun),Red Ingle,1947,
88,Twelfth Street Rag,Pee Wee Hunt,1948,
...,...,...,...,...
6480,Happy Xmas (War Is Over),John,2018,"Happy Christmas, Kyoko\nHappy Christmas, Julia..."
6492,Close To Me,Ellie Goulding,2018,Even though we both know we're liars\nAnd we s...
6497,Arms Around You,XXXTENTACION,2018,"Oh-oh (Ooh, ooh)\n(Lil Pump\nMaluma, baby)\nOh..."
6585,It's Beginning To Look A Lot Like Christmas,Perry Como,2019,It's beginning to look a lot like Christmas\nE...


In [None]:
# try to show that number of songs without found lyrics is relatively small
# maybe find lyrics by hand

print(f'{without_lyrics.query("lyrics.isna()").year.value_counts().sum()}')
print(f'{without_lyrics.query("lyrics.isna()").year.value_counts().idxmax()}')
print(f'{100 * without_lyrics.query("year <= 1960 and lyrics.isna()").year.value_counts(normalize=True)}')
print(f'{without_lyrics.query("year <= 1960 and lyrics.isna()").year.value_counts()}')
without_lyrics.query("year <= 1960 and lyrics.isna()")


134
1960
year
1958    16.326531
1960    16.326531
1953    12.244898
1956    10.204082
1952     8.163265
1959     8.163265
1947     6.122449
1948     6.122449
1949     6.122449
1946     2.040816
1950     2.040816
1951     2.040816
1954     2.040816
1955     2.040816
Name: proportion, dtype: float64
year
1958    8
1960    8
1953    6
1956    5
1952    4
1959    4
1947    3
1948    3
1949    3
1946    1
1950    1
1951    1
1954    1
1955    1
Name: count, dtype: int64


Unnamed: 0,song,artist,year,lyrics
26,Hey! Ba-Ba-Re-Bop,Tex Beneke,1946,
42,Peg o' My Heart,The Harmonicats,1947,
47,Peg o' My Heart,The Three Suns,1947,
55,Temptation (Tim-Tayshun),Red Ingle,1947,
88,Twelfth Street Rag,Pee Wee Hunt,1948,
120,(I'd Like to Get You on a) Slow Boat to China,Kay Kyser,1948,
123,The Dicky-Bird Song,Freddy Martin,1948,
151,Whispering Hope,Jo Stafford,1949,
153,Careless Hands,Sammy Kaye Orchestra,1949,
155,The Hucklebuck,Tommy Dorsey Orchestra,1949,


In [None]:
# vietnam war lasted from 1955-11-1 to 1975-4-30

print(f'{len(entirety_of_data.query("lyrics.notna() and year < 1955"))}')
print(f'{len(entirety_of_data.query("lyrics.notna() and 1955 <= year <= 1975"))}')
print(f'{len(entirety_of_data.query("lyrics.notna() and 1975 < year"))}')

283
1805
4399


## Data processing 

In [None]:
# %%capture
# !pip install -U spacy
# !python -m spacy download en_core_web_lg

In [26]:
import spacy
from spacy import displacy

In [29]:
# spacy.cli.download("en_core_web_lg")

In [27]:
# NLP_API = spacy.load("en_core_web_sm")
NLP_API = spacy.load("en_core_web_lg")

In [28]:
entirety_of_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6736 entries, 0 to 6735
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   song    6736 non-null   object
 1   artist  6736 non-null   object
 2   year    6736 non-null   int64 
 3   lyrics  6487 non-null   object
dtypes: int64(1), object(3)
memory usage: 210.6+ KB


In [29]:
text = entirety_of_data.iloc[-2].lyrics
text

"Yeah\n\nBreak out the tinsel, unpack the lights\nWe're 'bout to bring on the merry and bright\nI hear those jingle bells in the air\nRinging out the sound of joy everywhere\n\n'Tis the season we've been waiting for\n(Waiting, don't keep me waiting)\nCan't you feel it knocking at your door?\n(Knock, knock, it's knocking at your door)\n\nLove pouring out like the snow from the sky\nSnuggled up together sitting by the fire\nOh, it's magical, it's my favorite time\nPretty little wishes tied up in a bow\nAnd every little kiss is like we're underneath the mistletoe\nIt's music to my ears\nChristmas is my favorite time of year\n\nStockings on chimneys, angels on trees\nSugar and cinnamon, it's the little things\nBlankets and movies in black and white\nTogether they make up this colorful life\nSee Carrie Underwood LiveGet tickets as low as $104\n'Tis the season we've been waiting for\n(Waiting, don't keep me waiting)\nOh, can't you feel it knocking at your door?\n(Knock, knock, it's knocking 

In [30]:
doc = NLP_API(text)
doc

Yeah

Break out the tinsel, unpack the lights
We're 'bout to bring on the merry and bright
I hear those jingle bells in the air
Ringing out the sound of joy everywhere

'Tis the season we've been waiting for
(Waiting, don't keep me waiting)
Can't you feel it knocking at your door?
(Knock, knock, it's knocking at your door)

Love pouring out like the snow from the sky
Snuggled up together sitting by the fire
Oh, it's magical, it's my favorite time
Pretty little wishes tied up in a bow
And every little kiss is like we're underneath the mistletoe
It's music to my ears
Christmas is my favorite time of year

Stockings on chimneys, angels on trees
Sugar and cinnamon, it's the little things
Blankets and movies in black and white
Together they make up this colorful life
See Carrie Underwood LiveGet tickets as low as $104
'Tis the season we've been waiting for
(Waiting, don't keep me waiting)
Oh, can't you feel it knocking at your door?
(Knock, knock, it's knocking at your door)

Love pouring o

In [31]:
for token in doc[:5]:
    print(f"{token.i} '{token}'")
print("=" * 30)

for token in doc[:10]:
    print(f"'{token.lemma_}'")
print("=" * 30)

for token in doc[:5]:
    print(f"{token.head.i} '{token.head}'")
print("=" * 30)

for token in doc[:5]:
    print(f"'{spacy.explain(token.tag_)}'")
print("=" * 30)
    
for token in doc[:5]:
    print(f"'{token.morph.to_dict()}'")
print("=" * 30)
    
for token in doc[:5]:
    print(f"'{spacy.explain(token.dep_)}'")
print("=" * 30)

displacy.render(doc[:5], style="dep", options={"compact": True})
displacy.render(doc[:5], style="ent",)

0 'Yeah'
1 '

'
2 'Break'
3 'out'
4 'the'
'yeah'
'

'
'break'
'out'
'the'
'tinsel'
','
'unpack'
'the'
'light'
2 'Break'
0 'Yeah'
2 'Break'
2 'Break'
5 'tinsel'
'interjection'
'whitespace'
'verb, base form'
'adverb, particle'
'determiner'
'{}'
'{}'
'{'VerbForm': 'Inf'}'
'{}'
'{'Definite': 'Def', 'PronType': 'Art'}'
'interjection'
'unclassified dependent'
'root'
'particle'
'determiner'




In [34]:
# sentence segmentation

# for token in tuple(doc.sents)[:5]:
#     print(f"'''{token}'''")
#     print("=" * 30)

# named entity recognition
for ent in doc.ents:
    print(f"'{ent.text}' {ent.label_}")
print("=" * 30)

'Christmas' DATE
'year' DATE
'Carrie Underwood LiveGet' PERSON
'as low as $104' MONEY
'Christmas' DATE
'year

' DATE
'Christmas' DATE
'Christmas' DATE
'Christmas' DATE
'Christmas' DATE
'Christmas' DATE
'year' DATE
