# Mini-Project in NLP

In [None]:
import sys
import os
import zipfile
import re

import requests
from bs4 import BeautifulSoup

import pandas as pd
from IPython.display import display, HTML
from pprint import pprint


from lyricsgenius import Genius
from credentials import CLIENT_ACCESS_TOKEN

import seaborn
import matplotlib as plt

In [None]:
# Load the Billboard top 100 charts for the years 1958-2021.
# Original dataset from: https://www.kaggle.com/datasets/dhruvildave/billboard-the-hot-100-songs/data

data_directory = "data"
csv_file_name = "charts.csv.zip"
lyrics_csv_file_name = "lyrics_charts.csv.xz"

# Both the source charts and the lyrics cache live in the data directory.
csv_file_path = os.path.join(data_directory, csv_file_name)
lyrics_csv_file_path = os.path.join(data_directory, lyrics_csv_file_name)

# Single lyricsgenius client shared by every lyrics lookup in this notebook.
GENIUS_API_GATEWAY = Genius(
    access_token=CLIENT_ACCESS_TOKEN,
    verbose=False,
    retries=10,
    skip_non_songs=True,
    remove_section_headers=True,
)

In [None]:
# if not os.path.isfile(lyrics_csv_file_path) and not os.path.isfile(csv_file_path):
#     print("extracted CSV not found")
    
#     with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
#         zip_ref.extractall(data_directory)

In [None]:
def covert_date_column(dataframe):
    """Convert the ``date`` column of *dataframe* to datetimes, in place.

    The column is parsed with the ``%Y-%m-%d`` format. The column dtype is
    printed before and after the conversion; nothing is returned.
    """
    print(f'{dataframe["date"].dtype = }')

    parsed_dates = pd.to_datetime(dataframe["date"], format="%Y-%m-%d")
    dataframe["date"] = parsed_dates

    print(f'{dataframe["date"].dtype = }')

In [None]:
# Read the zipped charts CSV. The "date" column is converted after loading
# (by covert_date_column) rather than through a read-time `converters=` hook.
charts = pd.read_csv(csv_file_path, compression="zip")
covert_date_column(charts)

In [None]:
# Report the size of the charts table when no lyrics cache file exists yet.
# The explicit print is required: only a cell's last top-level expression is
# echoed by the notebook, so a bare expression nested in an `if` shows nothing.
if not os.path.isfile(lyrics_csv_file_path):
    print(charts.shape)

In [None]:
def print_dataframe(dataframe):
    """Render *dataframe* in the notebook as an HTML table."""
    table_markup = dataframe.to_html()
    display(HTML(table_markup))

In [None]:
# Preview the first three and the last three rows of the charts table.
for preview in (charts.head(3), charts.tail(3)):
    print_dataframe(preview)

In [None]:
def sanitize_lyrics(lyrics):
    """Strip page boilerplate from raw lyrics text.

    The following are removed: a trailing ``<digits/commas>Embed`` marker,
    single-line ``{...}`` spans, everything up to and including the first
    occurrence of ``Lyrics`` (presumably the title header), and the last
    occurrence of ``You might also like``. Runs of three or more line
    breaks (optionally separated by whitespace) collapse to one blank line,
    and surrounding whitespace is stripped.

    Args:
        lyrics: Raw lyrics text, or ``None`` when no song was found.

    Returns:
        The cleaned lyrics, or ``None`` if *lyrics* is ``None``.
    """
    # download_lyrics() returns None for songs it cannot find; pass that
    # through instead of failing inside the regex calls.
    if lyrics is None:
        return None

    noise_pattern = (
        r"[\d,]*Embed$"
        r"|\{.*?\}"
        # r"|\[.*?\]"
    )
    lyrics = re.sub(noise_pattern, "", lyrics)

    # Drop everything through the first "Lyrics"; when that marker is absent
    # keep the whole text rather than raising IndexError.
    _, marker, after_marker = lyrics.partition("Lyrics")
    if marker:
        lyrics = after_marker

    # Remove only the last "You might also like" banner, if there is one.
    lyrics = "".join(lyrics.rsplit("You might also like", maxsplit=1))

    lyrics = re.sub(r"(\n\s*){2,}\n", r"\n\n", lyrics)
    return lyrics.strip()
    
    
def download_lyrics(artist, title):
    """Search for a song via the shared Genius client and return its lyrics.

    Returns ``None`` when the search yields no song.
    """
    found_song = GENIUS_API_GATEWAY.search_song(
        title=title,
        artist=artist,
        get_full_info=False,
    )
    if found_song is None:
        return None
    return found_song.lyrics


def lyrics_from_row(row):
    """Download and sanitize the lyrics for one row.

    *row* is read through its ``"artist"`` and ``"song"`` keys.
    """
    raw_lyrics = download_lyrics(artist=row["artist"], title=row["song"])
    return sanitize_lyrics(raw_lyrics)

In [None]:
def get_n_lines(num_lines, text):
    """Return the first *num_lines* newline-separated lines of *text*."""
    leading_lines = text.split("\n")[:num_lines]
    return "\n".join(leading_lines)

In [None]:
# Columns selected from the charts table before looking up lyrics.
relevant_columns = ["date", "artist", "song"]

In [None]:
# Example: download the lyrics of a single song and show the first lines of
# the raw text next to the sanitized text. Skipped once the lyrics cache exists.
if not os.path.isfile(lyrics_csv_file_path):

    # example_song = charts.loc[:, relevant_columns].iloc[-1: , :]
    example_song = charts.loc[:, relevant_columns].iloc[1:2, :]
    print_dataframe(example_song)

    example_row = example_song.iloc[0]
    raw_lyrics = download_lyrics(example_row.artist, example_row.song)
    num_lines = 10

    if raw_lyrics is None:
        # download_lyrics() returns None when the search finds no song;
        # slicing or sanitizing None would raise, so report it instead.
        print("No lyrics found for the example song.")
    else:
        print("="*30)
        print("Raw lyrics:")
        print("-"*30)
        print(get_n_lines(num_lines, raw_lyrics))
        print("...")
        print()

        print("="*30)
        print("Sanitized lyrics:")
        print("-"*30)
        sanitized_lyrics = sanitize_lyrics(raw_lyrics)
        print(get_n_lines(num_lines, sanitized_lyrics))
        print("...")

In [None]:
# Example: attach lyrics to each row of a small slice of the charts table,
# caching the result on disk so later runs can reload it.
# TODO isolate a dataframe of wanted songs, download all needed lyrics to dataframe, and save for future use

lyrics_cache_exists = os.path.isfile(lyrics_csv_file_path)
if lyrics_cache_exists:
    # Reload the cached table; its date column is converted again after loading.
    example_songs_copy = pd.read_csv(lyrics_csv_file_path, compression="xz")
    covert_date_column(example_songs_copy)
else:
    # No cache yet: look up lyrics for the first three rows and save them.
    first_songs = charts.loc[:, relevant_columns].iloc[0:3, :]
    example_songs_copy = first_songs.copy()
    example_songs_copy["lyrics"] = example_songs_copy.apply(lyrics_from_row, axis=1)
    example_songs_copy.to_csv(lyrics_csv_file_path, index=False, compression="xz")

print_dataframe(example_songs_copy)
second_song_lyrics = example_songs_copy["lyrics"].iloc[1]
print(second_song_lyrics)

In [None]:
# Scraping top 100 lists prior to 1958: first collect the per-year article
# links from the Wikipedia year-end charts template page.

wikipedia_link = "https://en.wikipedia.org"
wikipedia_billboard_years_link = f"{wikipedia_link}/wiki/Template:Hot_100_year-end_charts"

wikipedia_billboard_years_html = requests.get(wikipedia_billboard_years_link)
# Fail loudly on an HTTP error instead of parsing an error page.
wikipedia_billboard_years_html.raise_for_status()
billboard_years_html_soup = BeautifulSoup(wikipedia_billboard_years_html.content, 'html.parser')
year_link_tags = billboard_years_html_soup.select("tbody td a")

# Map year -> link taken from the anchor's href. Anchors whose text is not a
# plain number, or that carry no href, are skipped so that one stray link
# cannot abort the whole mapping with ValueError/KeyError.
links_by_year = {
    int(tag.text): tag.attrs["href"]
    for tag in year_link_tags
    if tag.text.strip().isdecimal() and "href" in tag.attrs
}
# pprint(links_by_year)


def full_wikipedia_link(short_wikipedia_link):
    """Prefix *short_wikipedia_link* with the ``wikipedia_link`` base URL."""
    absolute_link = f'{wikipedia_link}{short_wikipedia_link}'
    return absolute_link


def list_by_year(year:int):
    """Fetch the Wikipedia chart page for *year* and return its table.

    Args:
        year: Key into ``links_by_year``.

    Returns:
        The first DataFrame pandas parses from the page's first
        ``table.wikitable.sortable`` element.

    Raises:
        KeyError: If *year* is not a key of ``links_by_year``.
        IndexError: If the page contains no matching table.
    """
    from io import StringIO

    list_link = full_wikipedia_link(links_by_year[year])
    print(f"following {list_link = }")
    response = requests.get(list_link)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Named chart_table rather than `list` so the builtin is not shadowed.
    chart_table = soup.select("table.wikitable.sortable")[0]
    # print(chart_table)
    # read_html takes a path, URL or file-like object; passing literal HTML
    # text is deprecated, hence the StringIO wrapper.
    dataframes = pd.read_html(StringIO(chart_table.prettify()))
    return dataframes[0]


# Try the scraper on a single year and display the resulting table.
year = 1950
year_chart = list_by_year(year)
print_dataframe(year_chart)

In [None]:
def sanitize_wikipedia_table(table:pd.DataFrame):
    """Clean the artist and title columns of a scraped chart table, in place.

    ``Artist(s)`` is cut at the first standalone word ``with``, ``&`` or
    comma, and everything from there on is dropped together with the
    whitespace in front of it, leaving no trailing space behind.
    ``Title`` loses its surrounding double quotes and spaces.

    Args:
        table: DataFrame with string columns ``Artist(s)`` and ``Title``;
            both columns are overwritten.

    Returns:
        None. The table is modified in place.
    """
    table["Artist(s)"] = (
        table["Artist(s)"]
        # \b keeps "with " from matching the tail of a longer word.
        .str.replace(r"\s*(?:\bwith |[&,]).*$", "", regex=True)
        .str.strip()
    )
    table["Title"] = table["Title"].str.strip("\" ")
    

In [None]:
# Sanitizing Wikipedia-extracted data: fetch the 1950 table, display it,
# sanitize it, and display it again for comparison.

temp = list_by_year(1950)
print_dataframe(temp)
sanitize_wikipedia_table(temp)
print_dataframe(temp)

In [None]:
# %timeit temp["Artist(s)"].apply(lambda artist: re.split("with|&|,", artist)[0]),
# %timeit temp["Artist(s)"].str.extract("^(.*?)(?= with|[&,]|$)"), 
# %timeit temp["Artist(s)"].str.replace("(with |[&,]).*$", "", regex=True),
# %timeit temp["Artist(s)"].str.replace("(with.*$|[&,].*$)", "", regex=True),
# %timeit temp["Artist(s)"].str.replace("(with.*$|&.*$|,.*$)", "", regex=True),
# %timeit temp["Artist(s)"].str.replace("(with.*|&.*|,.*)", "", regex=True),

# print_dataframe(pd.concat([
#             temp["Artist(s)"].apply(lambda artist: re.split("with|&|,", artist)[0]),
#             temp["Artist(s)"].str.extract("^(.*?)(?= with|[&,]|$)"), 
#             temp["Artist(s)"].str.replace("(with |[&,]).*$", "", regex=True),
#             temp["Artist(s)"].str.replace("(with.*$|[&,].*$)", "", regex=True),
#             temp["Artist(s)"].str.replace("(with.*$|&.*$|,.*$)", "", regex=True),
#             temp["Artist(s)"].str.replace("(with.*|&.*|,.*)", "", regex=True),
#         ],
#         axis=1
#     ))

In [None]:
# sanitize_lyrics(download_lyrics(title="The Cry of the Wild Goose", artist="Frankie Laine with Carl T. Fischer"))
# print(sanitize_lyrics(download_lyrics(title=" I Can Dream, Can't I", artist="Andrews Sisters")))

In [297]:
# Getting only the year-end top 100 (each year's last chart) from the
# downloaded dataset.

charts_index_by_date = charts.set_index("date")
dates_series = charts["date"]

# Latest chart date within each calendar year. max() gives that date whatever
# the row order of the CSV is, whereas first() depends on the file's ordering.
year_ends = dates_series.groupby(charts_index_by_date.index.year).max()
# Years after 2020 are left out.
year_ends = year_ends[year_ends.index <= 2020]

# print(charts_index_by_date.index)
print(year_ends)

# Last expression of the cell: the notebook displays the selected rows.
charts_index_by_date.loc[year_ends]

date
1958   1958-12-29
1959   1959-12-28
1960   1960-12-26
1961   1961-12-25
1962   1962-12-29
          ...    
2016   2016-12-31
2017   2017-12-30
2018   2018-12-29
2019   2019-12-28
2020   2020-12-26
Name: date, Length: 63, dtype: datetime64[ns]


Unnamed: 0_level_0,rank,song,artist,last-week,peak-rank,weeks-on-board
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1958-12-29,1,The Chipmunk Song,The Chipmunks With David Seville,1.0,1,5
1958-12-29,2,Smoke Gets In Your Eyes,The Platters,2.0,2,7
1958-12-29,3,"To Know Him, Is To Love Him",The Teddy Bears,3.0,1,15
1958-12-29,4,One Night,Elvis Presley,6.0,4,8
1958-12-29,5,Problems,The Everly Brothers,4.0,2,8
...,...,...,...,...,...,...
2020-12-26,96,Good Time,Niko Moon,79.0,71,11
2020-12-26,97,Throat Baby (Go Baby),BRS Kash,81.0,69,9
2020-12-26,98,Errbody,Lil Baby,41.0,41,2
2020-12-26,99,Favorite Time Of Year,Carrie Underwood,80.0,80,3
