In [None]:
import pandas as pd
import re
import spacy


from spacy.language import Language
from spacy_language_detection import LanguageDetector

In [None]:
# Load the songs table; its "lyrics" column is the raw text cleaned further below.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
df = pd.read_pickle("data/all_songs_with_lyrics.pkl")

In [None]:
def clean_lyrics(lyrics: str) -> "str | None":
    """
    Strip scraping artefacts from a raw lyrics string.

    Steps, in order:
        1. Drop everything up to and including the first occurrence of
           "Lyrics". Text that does not contain the marker is kept whole.
        2. Remove every [...] annotation that sits on a single line,
           e.g. [Refrain]. Nested brackets are not handled.
        3. Remove a trailing "Embed" marker together with the digits
           directly in front of it, e.g. "11Embed".

    Args:
        lyrics: Raw lyrics text, or a null value (None / NaN).

    Returns:
        The cleaned lyrics, or None when ``lyrics`` is null.
    """
    if pd.isnull(lyrics):
        return None

    # str.partition yields an empty separator when the marker is absent; in
    # that case keep the text untouched. (Slicing from find() + len("Lyrics")
    # would start at index 5 and silently drop the first five characters.)
    _, marker, remainder = lyrics.partition("Lyrics")
    new_lyrics = remainder if marker else lyrics

    # Raw strings avoid the invalid "\[" / "\]" escape sequences.
    # "." does not match newlines, so only single-line annotations are removed.
    new_lyrics = re.sub(r"\[.*?\]", "", new_lyrics)

    # Trailing digits + "Embed" at the end of the lyrics, e.g. 11Embed.
    new_lyrics = re.sub(r"[0-9]*Embed$", "", new_lyrics)
    return new_lyrics

# Clean the single "lyrics" column directly instead of applying row-wise over
# the whole frame; null lyrics stay null because clean_lyrics returns None.
df["clean_lyrics"] = df["lyrics"].apply(clean_lyrics)


In [None]:
# Missing-value count for the cleaned lyrics column (True = missing).
df["clean_lyrics"].isna().value_counts()

In [None]:
# Missing-value count for the raw lyrics column (True = missing).
df["lyrics"].isna().value_counts()

In [None]:
# If spacy.load fails because the English model is not installed, download it first:
# !python -m spacy download en_core_web_sm

In [None]:
# Idea from this blog: https://towardsdatascience.com/4-python-libraries-to-detect-english-and-non-english-language-c82ad3efd430
def get_lang_detector(nlp, name):
    """Build the language-detection pipeline component.

    ``nlp`` and ``name`` are accepted but not used here. The detector is
    created with a fixed seed of 42.
    """
    detector = LanguageDetector(seed=42)
    return detector

nlp_model = spacy.load("en_core_web_sm")
# Register the factory only once per kernel. Re-running this cell would
# otherwise try to register a second function under the same factory name,
# which spaCy can reject with an "already exists" error.
# NOTE: after editing get_lang_detector, restart the kernel so the new
# version is the one that gets registered.
if not Language.has_factory("language_detector"):
    Language.factory("language_detector", func=get_lang_detector)
nlp_model.add_pipe('language_detector', last=True)

In [None]:
# Language table downloaded from
# https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
# Lookup from language code to language name, built from the
# "ISO-639-1-Code" and "ISO language name" columns of the CSV.
language_map = (
    pd.read_csv("data/languages.csv", sep=";")
    .rename(columns={"ISO-639-1-Code": "language_code", "ISO language name": "language"})
    .set_index("language_code")["language"]
    .to_dict()
)
# {'ab': 'Abkhazian',
#  'aa': 'Afar',
#  'af': 'Afrikaans',
#  'ak': 'Akan',
#  'sq': 'Albanian'}

In [None]:

def get_language(text: str) -> "str | None":
    """Return the name of the language that nlp_model detects in ``text``.

    The detector's result is read from ``doc._.language["language"]`` and then
    translated through ``language_map``, so the value returned is the language
    name stored in that map (e.g. 'Afrikaans'), not the code itself.

    Args:
        text: Text to classify; may be null (None / NaN).

    Returns:
        The mapped language name, or None when ``text`` is null, when the
        detector result has no "language" entry, or when the detected code
        has no entry in ``language_map``.
    """
    if pd.isnull(text):
        # Nothing to detect for a missing value; skip the model call.
        return None
    doc = nlp_model(text)
    language_code = doc._.language.get("language")
    return language_map.get(language_code)

In [None]:
# Detect the language from the cleaned lyrics column only (no row-wise pass
# over the whole frame); songs without lyrics get a null language.
df["language"] = df["clean_lyrics"].apply(
    lambda lyrics: None if pd.isnull(lyrics) else get_language(lyrics)
)

In [None]:
# Flag songs whose cleaned lyrics are present (True = lyrics available).
df["has_lyrics"] = df["clean_lyrics"].notna()

In [None]:
# Count of non-null ids per playlist / lyrics availability / detected language;
# dropna=False keeps the groups whose key is null.
df.groupby(["playlist", "has_lyrics", "language"], dropna=False)["id"].count()

In [None]:
# Print the first cleaned lyrics that were classified as Indonesian.
print(df.loc[df["language"] == "Indonesian", "clean_lyrics"].iloc[0])

In [None]:
# Export the selected columns to CSV; the row index is written under the
# header "index".
df.to_csv(
    "data/all_songs_with_lyrics_cleaned.csv",
    columns=["artist_names", "name", "playlist", "lyrics", "clean_lyrics", "has_lyrics", "language"],
    index_label="index",
)

In [None]:
# Persist the full DataFrame, including the clean_lyrics, language and
# has_lyrics columns added above, as a pickle file.
df.to_pickle("data/all_songs_with_lyrics_cleaned.pkl")