In [1]:
import pandas as pd
import re
import spacy


from spacy.language import Language
from spacy_language_detection import LanguageDetector

In [2]:
# Load the scraped songs and keep only the columns this notebook works with.
# NOTE: unpickling executes arbitrary code, so only load a pickle you trust.
df = pd.read_pickle("data/all_songs_with_lyrics.pkl")[
    ["artist_names", "name", "playlist", "lyrics"]
]

In [3]:
def clean_lyrics(lyrics: str) -> str:
    """
    Clean up raw lyrics text.

    Steps, in order:
        1. drop everything up to and including the first occurrence of
           "Lyrics"; when that marker is absent the text is kept whole
        2. drop bracketed annotations such as [Refrain] (non-greedy and
           limited to a single line; nested brackets are not handled)
        3. drop the trailing "<digits>Embed" marker, e.g. "11Embed"

    Args:
        lyrics: raw lyrics text, or a null value (None / NaN).

    Returns:
        The cleaned lyrics, or None when `lyrics` is null.
    """
    if pd.isnull(lyrics):
        return None

    # str.partition leaves `marker` empty when "Lyrics" does not occur. The
    # earlier find()-based slice turned the -1 "not found" result into index 5
    # and silently chopped off the first five characters of the text.
    _, marker, remainder = lyrics.partition("Lyrics")
    new_lyrics = remainder if marker else lyrics

    # Raw strings keep the backslashes for the regex engine instead of relying
    # on Python passing through an unrecognised string escape.
    new_lyrics = re.sub(r"\[.*?\]", "", new_lyrics)

    # remove numbers and Embed at the end of the lyrics, e.g. 11Embed
    new_lyrics = re.sub(r"[0-9]*Embed$", "", new_lyrics)
    return new_lyrics

# clean_lyrics handles missing values itself, so it can be applied to the
# lyrics column directly.
df["clean_lyrics"] = df["lyrics"].apply(clean_lyrics)


In [6]:
# Sanity check: how many songs have no cleaned lyrics?
df["clean_lyrics"].isna().value_counts()

False    1931
True      353
Name: clean_lyrics, dtype: int64

In [7]:
# Same count on the raw column, for comparison with the cleaned one.
df["lyrics"].isna().value_counts()

False    1931
True      353
Name: lyrics, dtype: int64

In [None]:
# If spacy.load fails because the model is missing, install the language package first:
# !python -m spacy download en_core_web_sm

In [8]:
# Idea from this blog: https://towardsdatascience.com/4-python-libraries-to-detect-english-and-non-english-language-c82ad3efd430
def get_lang_detector(nlp, name):
    """
    Build the language detector component.

    `nlp` and `name` are accepted but not used; the detector is always
    created with a fixed seed of 42.
    """
    detector = LanguageDetector(seed=42)
    return detector

# Register the detector factory under the name used by add_pipe below.
Language.factory("language_detector", func=get_lang_detector)

# Load the English model and append the detector as the final pipeline step.
nlp_model = spacy.load("en_core_web_sm")
nlp_model.add_pipe("language_detector", last=True)

<spacy_language_detection.spacy_language_detector.LanguageDetector at 0x1075a8c10>

In [9]:
# Language table downloaded from
# https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
# The result maps each language code to its language name.
language_map = (
    pd.read_csv("data/languages.csv", delimiter=";")
    .rename(columns={"ISO-639-1-Code": "language_code", "ISO language name": "language"})
    .set_index("language_code")["language"]
    .to_dict()
)
# {'ab': 'Abkhazian',
#  'aa': 'Afar',
#  'af': 'Afrikaans',
#  'ak': 'Akan',
#  'sq': 'Albanian'}

In [10]:

def get_language(text: str) -> str:
    """
    Estimate the language of `text` with nlp_model.

    The detector result is read under its "language" key and that code is
    translated through language_map, so the return value is the language
    name rather than the code. Returns None when either lookup misses.
    """
    detection = nlp_model(text)._.language
    code = detection.get("language")
    return language_map.get(code)

In [11]:
# Songs without cleaned lyrics get no language; the detector is only run on
# real text.
df["language"] = df["clean_lyrics"].apply(
    lambda lyrics: None if pd.isnull(lyrics) else get_language(lyrics)
)

In [12]:
# Flag the songs for which cleaned lyrics are available.
df["has_lyrics"] = df["clean_lyrics"].notna()

In [13]:
# Non-null counts per playlist / lyrics availability / detected language;
# dropna=False keeps the groups whose language is missing.
df.groupby(
    by=["playlist", "has_lyrics", "language"],
    dropna=False,
).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,artist_names,name,lyrics,clean_lyrics
playlist,has_lyrics,language,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Après Ski,False,,103,103,0,0
Après Ski,True,"Dutch, Flemish",77,77,77,77
Après Ski,True,English,42,42,42,42
Après Ski,True,Estonian,1,1,1,1
Après Ski,True,French,1,1,1,1
Après Ski,True,German,60,60,60,60
Après Ski,True,Indonesian,1,1,1,1
Après Ski,True,"Spanish, Castilian",1,1,1,1
Top 2000,False,,250,250,0,0
Top 2000,True,Afrikaans,5,5,5,5


In [14]:
# Inspect the first song that was classified as Indonesian.
print(df.loc[df["language"] == "Indonesian", "clean_lyrics"].iloc[0])


All we ever hear from you is blah blah blah
So, all we ever do is go ya ya ya
And we don't even care about what they say cause it's
Ya ya ya ya
Blah blah blah blah

All we ever hear from you is blah blah blah
So, all we ever do is go ya ya ya
And we don't even care about what they say cause it's
Ya ya ya ya
Blah blah blah blah

All we ever hear from you is blah blah blah
Blah blah blah
Blah blah blah
Blah blah blah
Blah blah blah
Blah blah blah
Blah blah blah.....


All we ever hear from you is blah blah blah
So, all we ever do is go ya ya ya
And we don't even care about what they say cause it's
Ya ya ya ya
Blah blah blah blah
All we ever hear from you is blah blah blah
So, all we ever do is go ya ya ya
And we don't even care about what they say cause it's
Ya ya ya ya
Blah blah blah blah


All we ever hear from you is blah blah blah
So, all we ever do is go ya ya ya
And we don't even care about what they say cause it's
Ya ya ya ya
Blah blah blah blah
You might also likeAll we ever hea

In [15]:
# Persist the cleaned table; the index is written under the column name "index".
df.to_csv(
    path_or_buf="data/all_songs_with_lyrics_cleaned.csv",
    index_label="index",
)