In [None]:
# everything NLTK
import nltk
nltk.download('stopwords')
# WordNetLemmatizer reads the WordNet corpus; without it the first
# lemmatize() call raises LookupError on a fresh environment
nltk.download('wordnet')
# NOTE(review): some NLTK releases also load the Open Multilingual WordNet
# together with WordNet — downloading it is harmless where it is not needed
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# prepare NLTK stop words, stemmer and lemmatizer shared by the cleaning cells
stop_words = stopwords.words('english')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
# import libraries

import re
import numpy as np
import pandas as pd
import os

# Gensim
import gensim
from gensim.utils import simple_preprocess


In [None]:
# languages and translation
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

@Language.factory("language_detector")
def get_lang_detector(nlp, name):
  """spaCy factory for the "language_detector" pipeline component.

  Both arguments are supplied by spaCy when the component is created and
  are not used here; a fresh ``LanguageDetector`` is returned every time.
  """
  return LanguageDetector()


# load the small English pipeline and append the detector as its last component
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('language_detector', last=True)

In [None]:
# tokenise sentences (this is the step that removes punctuation)
def sent_to_words(sentences: list[str]):
  """Lazily yield one token list per entry of ``sentences``.

  Every entry is converted with ``str()`` and passed through gensim's
  ``simple_preprocess`` with ``deacc=True``; the token lists come out in
  the same order as the input.
  """
  for raw_text in sentences:
    tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
    yield tokens


# use spaCy to detect the language of a text
def detect_language(text: str):
  """Return the language code stored by the pipeline's language detector.

  ``text`` is run through the module-level ``nlp`` pipeline and the
  ``'language'`` entry of the resulting ``doc._.language`` mapping is
  returned.
  """
  doc = nlp(text)
  detection = doc._.language
  return detection['language']


In [None]:
# export folder path: cleaned CSV files are written here
export_folder_path = os.path.join("..", "datasets", "CleanData")
# exist_ok=True creates the folder (and missing parents) only when needed,
# without a separate check-then-create step that could race
os.makedirs(export_folder_path, exist_ok=True)

In [None]:
start_year = 2017

# load the compiled Spotify + lyrics dataset for the chosen year
path = os.path.join("..", "datasets", "CompileData", str(start_year) + "_spotify_data_and_lyrics.csv")
df: pd.DataFrame = pd.read_csv(path, header=0)

# drop rows with na value (aka drop songs without lyrics)
# NOTE(review): this drops rows with NA in *any* column, not only 'lyrics' — confirm that is intended
df = df.dropna()
# keep only the English songs; .copy() makes df_english an independent frame
# so the column edits below do not trigger SettingWithCopyWarning
df_english = df[df['lyrics'].apply(detect_language) == 'en'].copy()

lyrics_list: list[str] = df_english['lyrics'].values.tolist()
# lowercase and replace new line characters with spaces
lyrics_list = [re.sub('\\n', ' ', str(song_lyric).lower()) for song_lyric in lyrics_list]
# tokenise (removes punctuation): one list of words per song
lyrics_words = list(sent_to_words(lyrics_list))

# remove stopwords; the tokens are filtered directly instead of being
# re-tokenised, and a set makes each membership test constant-time
stop_word_set = set(stop_words)
lyrics_no_stop_words = [
  [word for word in song_words if word not in stop_word_set]
  for song_words in lyrics_words
]
# stemming (on the stop-word-free tokens, so stop words do not reach the output)
stemmed_lyrics = [
  [stemmer.stem(word) for word in song_words]
  for song_words in lyrics_no_stop_words
]
# lemmatizing (on the stop-word-free tokens as well)
lemmatize_lyrics = [
  [lemmatizer.lemmatize(word) for word in song_words]
  for song_words in lyrics_no_stop_words
]

# combine cleaned words into 1 sentence per song
stemmed_cleaned_lyrics = [" ".join(song_words) for song_words in stemmed_lyrics]
lemmatize_cleaned_lyrics = [" ".join(song_words) for song_words in lemmatize_lyrics]

# replace the raw lyrics column with the two cleaned versions; the lists are
# in the same row order as df_english, so positional assignment lines up
df_english = df_english.drop(columns=["lyrics"])
df_english['stemmed_lyrics'] = stemmed_cleaned_lyrics
df_english['lemmatized_lyrics'] = lemmatize_cleaned_lyrics

# utf-8-sig writes a BOM at the start of the file
df_english.to_csv(
  os.path.join(export_folder_path, str(start_year) + "_cleaned_songs_lyrics.csv"),
  encoding="utf-8-sig"
)
