In [None]:
# everything nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 

# prepare NLTK resources: stop words, stemmer and lemmatizer
# WordNetLemmatizer looks words up in the WordNet corpus, which is a separate
# download from 'stopwords'; without it lemmatize() fails with a LookupError
# on a fresh environment. nltk.download leaves already-installed data alone.
nltk.download('wordnet')
stop_words = stopwords.words('english')
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
# import libraries

import re
import numpy as np
import pandas as pd
import os

# Gensim
import gensim
from gensim.utils import simple_preprocess


In [None]:
# languages and translation
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

@Language.factory("language_detector")
def get_lang_detector(nlp, name):
  """Factory registered with spaCy under the name "language_detector".

  Both arguments are accepted but unused; a fresh LanguageDetector is
  returned on every call.
  """
  return LanguageDetector()


# load the English model, then append the detector as the last pipeline step
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('language_detector', last=True)

In [None]:
# use spacy to detect language
def detect_language(text: str):
  """Detect the language of `text` using the module-level spaCy pipeline `nlp`.

  The text is reduced to its distinct whitespace-separated words (kept in
  first-occurrence order) before detection, so repeated words such as a
  chorus are only counted once.

  Args:
    text: the raw text to classify.

  Returns:
    The detected language code when the reported score is at least 0.9,
    otherwise the string 'NaN'.
  """
  # split() breaks on any whitespace, so words separated by newlines are
  # de-duplicated as well. dict.fromkeys keeps first-occurrence order, which
  # (unlike iterating a set of strings) does not change with Python's
  # per-process hash seed.
  unique_sentence = " ".join(dict.fromkeys(text.split()))

  # `_.language` is read as a mapping with 'language' and 'score' entries
  detected = nlp(unique_sentence)._.language

  # return language only if very confident in the result (0.90)
  if detected['score'] >= 0.9:
    return detected['language']
  return 'NaN'

# confirm accuracy of language detection
def en_language_accuracy(text: str, trials: int = 50, threshold: float = 0.9):
  """Check that `text` is consistently detected as English.

  detect_language is called up to `trials` times on the same text and the
  share of 'en' results is compared with `threshold`.
  NOTE(review): repeating the call only makes sense if the detector's answer
  can differ between runs - presumably it is not deterministic; confirm.

  Args:
    text: the text to check.
    trials: maximum number of detection runs (default 50).
    threshold: minimum fraction of runs that must return 'en' (default 0.9).

  Returns:
    'en' when at least `threshold` of the `trials` runs detect English,
    otherwise the string 'NaN'.

  Raises:
    ValueError: if `trials` is smaller than 1.
  """
  if trials < 1:
    raise ValueError("trials must be a positive integer")

  # detect_language already reduces the text to unique words, so the text is
  # passed through as-is instead of being de-duplicated a second time here
  en_hits = 0
  for completed in range(1, trials + 1):
    if detect_language(text) == 'en':
      en_hits += 1

    # every run is a full pipeline pass, so stop as soon as the verdict is
    # fixed; both checks use the same hits / trials ratio as the final test,
    # so the returned value is the same as after running all trials
    if en_hits / trials >= threshold:
      return 'en'
    remaining = trials - completed
    if (en_hits + remaining) / trials < threshold:
      return 'NaN'

  return 'NaN'



# remove stop words, then stem and lemmatize the remaining words
def lyrics_processing(lyrics: str) -> tuple[list[str], list[str], list[str]]:
  """Tokenise lyrics and produce cleaned, stemmed and lemmatized word lists.

  Args:
    lyrics: the lyrics of one song as a single string.

  Returns:
    A tuple of three equally long lists:
      - the tokens from simple_preprocess (deacc=True) that are not in
        `stop_words`,
      - those tokens passed through `stemmer.stem`,
      - those tokens passed through `lemmatizer.lemmatize`.
  """
  # a set makes each membership test constant-time instead of a scan over
  # the whole stop word list for every token
  stop_word_set = set(stop_words)

  kept_words = [word for word in simple_preprocess(lyrics, deacc=True) if word not in stop_word_set]
  stemmed_words = [stemmer.stem(word) for word in kept_words]
  lemmatized_words = [lemmatizer.lemmatize(word) for word in kept_words]

  return kept_words, stemmed_words, lemmatized_words


In [None]:
# export folder path
export_folder_path = os.path.join("..", "datasets", "CleanData")
# exist_ok=True creates the folder (and missing parents) when needed and does
# nothing when it is already there, with no separate existence check that
# another process could invalidate in between
os.makedirs(export_folder_path, exist_ok=True)

In [None]:
# For every year in [start_year, end_year]: read that year's compiled CSV,
# keep the English songs, derive cleaned / stemmed / lemmatized lyrics and
# write the result to the export folder.
start_year = 2017
end_year = 2021

for year in range(start_year, end_year + 1):
  print("Staring with", year)
  path = os.path.join("..", "datasets", "CompileData", str(year) + "_spotify_data_and_lyrics.csv")
  df: pd.DataFrame = pd.read_csv(path, header=0, index_col=0)

  # drop rows with an na value in any column (this includes songs without lyrics)
  df = df.dropna()

  # keep only rows whose lyrics are detected as english; copy() gives an
  # independent frame so the column edits below do not act on a slice of df
  df_english = df[df['lyrics'].apply(detect_language) == 'en'].copy()

  # replace new line characters with spaces
  lyrics_list: list[str] = [re.sub('\\n', ' ', str(song_lyric)) for song_lyric in df_english['lyrics']]

  # get cleaned, stemmed and lemmatized lyrics, one joined string per song
  cleaned_lyrics: list[str] = []
  stemmed_cleaned_lyrics: list[str] = []
  lemmatize_cleaned_lyrics: list[str] = []

  for lyrics in lyrics_list:
    clean, stemmed, lemmatize = lyrics_processing(lyrics)
    cleaned_lyrics.append(" ".join(clean))
    stemmed_cleaned_lyrics.append(" ".join(stemmed))
    lemmatize_cleaned_lyrics.append(" ".join(lemmatize))

  # replace the raw lyrics column with the processed variants
  df_english = df_english.drop(columns=["lyrics"])
  df_english['cleaned_lyrics'] = cleaned_lyrics
  df_english['stemmed_lyrics'] = stemmed_cleaned_lyrics
  df_english['lemmatized_lyrics'] = lemmatize_cleaned_lyrics

  # drop rows which are not in english again (since we removed the stop words);
  # the filtered frame has to be assigned back, otherwise the check has no effect
  still_english = df_english['cleaned_lyrics'].apply(en_language_accuracy) == 'en'
  df_english = df_english[still_english].reset_index(drop=True)
  print(year, 'final', len(df_english.index))

  # export to csv
  df_english.to_csv(
    os.path.join(export_folder_path, str(year) + "_cleaned_songs_lyrics.csv"),
    encoding="utf-8-sig"
  )
  print(year, "written to file")

print("End of Program")