In [2]:
import pandas as pd
import numpy as np
import json
import sys
sys.path.append('../py_scripts/')
from lyrics_preprocessing import *
from lyrics_sentiment_emotion_morals_annotations import *
from py_lex import EmoLex # https://github.com/dropofwill/py-lex
import moralstrength
from moralstrength.moralstrength import estimate_morals
from moralstrength import lexicon_use
# Select the "latest" version of the MoralStrength lexicon before any moral scoring is run.
lexicon_use.select_version("latest")

#### 1. Read the dataset:
Here we have the initial dataset, which contains up to 10 songs for each artist/band, including the **original lyrics**, the **cleaned lyrics** (for cleaning, use the `clean_lyrics` method from lyrics_preprocessing.py) and the **detected language column** (language detection can be performed by running lyrics_spacy_language_detection.py).
Be aware that this data frame is not fully cleaned: it is what we extracted from lyrics genius, based on artist/musician page names filtered by category name on Facebook. We clean the data as we progress with further experiments.

In [2]:
# Load the initial artist/lyrics table, then show its dimensions and its first rows.
artist_lyrics_dt = pd.read_csv(
    filepath_or_buffer='../data/artist_lyrics_initial_data.csv',
)
print(artist_lyrics_dt.shape)
artist_lyrics_dt.head(n=5)

(40225, 5)


Unnamed: 0,Artist,title,original_lyrics,cleaned_lyrics,lang_detect_spacy
0,*NSYNC,Bye Bye Bye,"[Intro: Justin & All]\nHey, hey\nBye bye bye\n...","Hey, hey Bye bye bye Bye bye! Bye bye! I'm ...",en
1,*NSYNC,It’s Gonna Be Me,[Intro: Justin]\n(It's gonna be me)\nOooh yeah...,"Oooh yeah You might've been hurt, babe That ...",en
2,*NSYNC,Tearin’ Up My Heart,[Chorus: JC & Justin]\nIt's tearin' up my hear...,It's tearin' up my heart when I'm with you But...,en
3,*NSYNC,Gone,[Verse 1: Justin]\nThere's a thousand words th...,There's a thousand words that I could say To m...,en
4,*NSYNC,"Merry Christmas, Happy Holidays","[Intro: Justin, All & JC]\nOooh, ooh ooh\nMerr...","Oooh, ooh ooh Merry Christmas Happy holidays M...",en


In [8]:
# Number of distinct artist names; dropna=False keeps a missing name as its own value,
# which matches counting the entries of .unique().
n_initial_artists = artist_lyrics_dt['Artist'].nunique(dropna=False)
print(f'Initial number of the artists is: {n_initial_artists}')

Initial number of the artists is: 4043


#### 2. Keep only English songs:

In [9]:
# Retain only the rows whose detected language is English and renumber them from 0.
is_english = artist_lyrics_dt['lang_detect_spacy'] == 'en'
en_artist_lyrics_dt = artist_lyrics_dt.loc[is_english].reset_index(drop=True)
en_artist_lyrics_dt.head()

Unnamed: 0,Artist,title,original_lyrics,cleaned_lyrics,lang_detect_spacy
0,*NSYNC,Bye Bye Bye,"[Intro: Justin & All]\nHey, hey\nBye bye bye\n...","Hey, hey Bye bye bye Bye bye! Bye bye! I'm ...",en
1,*NSYNC,It’s Gonna Be Me,[Intro: Justin]\n(It's gonna be me)\nOooh yeah...,"Oooh yeah You might've been hurt, babe That ...",en
2,*NSYNC,Tearin’ Up My Heart,[Chorus: JC & Justin]\nIt's tearin' up my hear...,It's tearin' up my heart when I'm with you But...,en
3,*NSYNC,Gone,[Verse 1: Justin]\nThere's a thousand words th...,There's a thousand words that I could say To m...,en
4,*NSYNC,"Merry Christmas, Happy Holidays","[Intro: Justin, All & JC]\nOooh, ooh ooh\nMerr...","Oooh, ooh ooh Merry Christmas Happy holidays M...",en


In [10]:
# (rows, columns) left after the English-only filter.
en_artist_lyrics_dt.shape

(33761, 5)

In [11]:
# Number of distinct artists that still have at least one row after the language filter.
n_english_artists = en_artist_lyrics_dt['Artist'].nunique(dropna=False)
print(f'The number of the artists with only english detected songs is: {n_english_artists}')

The number of the artists with only english detected songs is: 3640


#### 3. Filter out songs with fewer than 150 characters:

In [12]:
# Keep only the songs whose cleaned lyrics are at least 150 characters long.
cleaned_lyrics_length = en_artist_lyrics_dt['cleaned_lyrics'].str.len()
en_artist_lyrics_dt = en_artist_lyrics_dt.loc[cleaned_lyrics_length.ge(150)]
en_artist_lyrics_dt.shape

(33496, 5)

#### 4. Filter out duplicated songs and remix versions of songs:

In [13]:
# Keep the first occurrence of every (Artist, title) pair and renumber the rows from 0.
en_artist_lyrics_dt = (
    en_artist_lyrics_dt
    .drop_duplicates(subset=['Artist', 'title'])
    .reset_index(drop=True)
)
en_artist_lyrics_dt.shape

(31952, 5)

In [14]:
# Drop every song whose title contains the literal, case-sensitive text "Remix".
# - regex=False: "Remix" is a plain substring, not a pattern.
# - na=True: a missing title is flagged as a match and therefore dropped. The previous
#   `== False` comparison did this only implicitly (NaN == False evaluates to False),
#   so the selected rows are unchanged while the NaN handling is now explicit.
has_remix_in_title = en_artist_lyrics_dt["title"].str.contains("Remix", regex=False, na=True)
en_artist_lyrics_dt = en_artist_lyrics_dt.loc[~has_remix_in_title].reset_index(drop=True)
en_artist_lyrics_dt.shape

(31729, 5)

Let's check whether there are any null values in the columns:

In [15]:
# Count the missing values in each column.
en_artist_lyrics_dt.isna().sum()

Artist               0
title                0
original_lyrics      0
cleaned_lyrics       0
lang_detect_spacy    0
dtype: int64

In [16]:
# Show any rows whose cleaned lyrics are exactly the 'No-lyrics-found' placeholder string.
is_placeholder = en_artist_lyrics_dt['cleaned_lyrics'].eq('No-lyrics-found')
en_artist_lyrics_dt.loc[is_placeholder]

Unnamed: 0,Artist,title,original_lyrics,cleaned_lyrics,lang_detect_spacy


#### 5. Lyrics sentiment annotation with VADER lexicon:
For VADER, we don't need to lemmatize or tokenize the text. VADER is also case- and punctuation-sensitive.

In [17]:
# Run the project's VADER helper over the frame; the returned frame shown below carries the
# extra columns vader_neg, vader_neu, vader_pos and vader_comp.
# NOTE(review): the helper is defined in an external module — which text column it scores
# is not visible here; confirm it reads `cleaned_lyrics`.
en_artist_lyrics_dt = apply_vader_sentiment(en_artist_lyrics_dt)

In [18]:
# Preview the first rows, now including the VADER columns.
en_artist_lyrics_dt.head()

Unnamed: 0,Artist,title,original_lyrics,cleaned_lyrics,lang_detect_spacy,vader_neg,vader_neu,vader_pos,vader_comp
0,*NSYNC,Bye Bye Bye,"[Intro: Justin & All]\nHey, hey\nBye bye bye\n...","Hey, hey Bye bye bye Bye bye! Bye bye! I'm ...",en,0.083,0.746,0.171,0.9887
1,*NSYNC,It’s Gonna Be Me,[Intro: Justin]\n(It's gonna be me)\nOooh yeah...,"Oooh yeah You might've been hurt, babe That ...",en,0.083,0.728,0.189,0.9887
2,*NSYNC,Tearin’ Up My Heart,[Chorus: JC & Justin]\nIt's tearin' up my hear...,It's tearin' up my heart when I'm with you But...,en,0.18,0.747,0.073,-0.9927
3,*NSYNC,Gone,[Verse 1: Justin]\nThere's a thousand words th...,There's a thousand words that I could say To m...,en,0.076,0.772,0.152,0.9904
4,*NSYNC,"Merry Christmas, Happy Holidays","[Intro: Justin, All & JC]\nOooh, ooh ooh\nMerr...","Oooh, ooh ooh Merry Christmas Happy holidays M...",en,0.014,0.478,0.508,0.9998


#### 6. Lyrics preprocessing: tokenization, lemmatization and stopwords
For the NRC emotion lexicon and the MoralStrength lexicon, we tokenize and lemmatize the lyrics and also remove the stopwords.

*6.1. Tokenization:*

In [20]:
# Tokenize the cleaned lyrics and attach both the token lists and the per-song token counts.
tokens_list, tokens_count = tokenize_lyrics(en_artist_lyrics_dt['cleaned_lyrics'])
en_artist_lyrics_dt = en_artist_lyrics_dt.assign(
    words=tokens_list,
    word_counts=tokens_count,
)

In [21]:
# Inspect one row to check the new `words` and `word_counts` columns.
en_artist_lyrics_dt.head(1)

Unnamed: 0,Artist,title,original_lyrics,cleaned_lyrics,lang_detect_spacy,vader_neg,vader_neu,vader_pos,vader_comp,words,word_counts
0,*NSYNC,Bye Bye Bye,"[Intro: Justin & All]\nHey, hey\nBye bye bye\n...","Hey, hey Bye bye bye Bye bye! Bye bye! I'm ...",en,0.083,0.746,0.171,0.9887,"[hey, hey, bye, bye, bye, bye, bye, bye, bye, ...",409


*6.2. Lemmatization and stopwords:*

In [22]:
# Lemmatize the lower-cased cleaned lyrics with the project's spaCy helper.
# This is the slow step of the notebook, which is why the result is saved to disk further down.
lyrics_spacy_lemmas = spacy_lematization(en_artist_lyrics_dt.cleaned_lyrics.str.lower())

In [23]:
# Strip stop words from the lemma lists, then attach the filtered lemmas and their counts.
lyrics_spacy_lemmas, lemma_counts = remove_stop_words(lyrics_spacy_lemmas)
en_artist_lyrics_dt = en_artist_lyrics_dt.assign(
    lemmas=lyrics_spacy_lemmas,
    lemma_counts=lemma_counts,
)

In [24]:
# Inspect one row to check the new `lemmas` and `lemma_counts` columns.
en_artist_lyrics_dt.head(1)

Unnamed: 0,Artist,title,original_lyrics,cleaned_lyrics,lang_detect_spacy,vader_neg,vader_neu,vader_pos,vader_comp,words,word_counts,lemmas,lemma_counts
0,*NSYNC,Bye Bye Bye,"[Intro: Justin & All]\nHey, hey\nBye bye bye\n...","Hey, hey Bye bye bye Bye bye! Bye bye! I'm ...",en,0.083,0.746,0.171,0.9887,"[hey, hey, bye, bye, bye, bye, bye, bye, bye, ...",409,"[tonight, probably, going, start, fight, know,...",117


In [26]:
# Sanity check: the number of lemma lists should equal the number of songs in the frame.
len(lyrics_spacy_lemmas)

31729

We save the lemmas in a JSON file so that we don't need to lemmatize the lyrics every time, as this process takes a bit of time:

In [27]:
# Persist the lemma lists as JSON so the lemmatization step does not have to be repeated.
with open('../data/lyrics_lemmas', mode="w") as lemma_file:
    json.dump(lyrics_spacy_lemmas, fp=lemma_file)

If we want to read the lemmas, we use the command below: 

In [28]:
# Reload the previously saved lemma lists from their JSON file.
with open('../data/lyrics_lemmas', mode='r') as lemma_file:
    lyrics_spacy_lemmas = json.load(fp=lemma_file)

#### 7. NRC lyrics word emotion annotation:
To use the word-emotion association lexicon, please download the lexicon from the official page: http://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm, and then add it to your preferred path, as I did below:

In [29]:
# Load the NRC word-level emotion lexicon through py_lex's EmoLex wrapper.
# The path is relative, so the lexicon file must sit in the notebook's working directory.
en_lexicon = EmoLex("NRC-Emotion-Lexicon-Wordlevel-v0.92.txt")

In [30]:
# Annotate every song using the NRC lexicon, passing 'lemmas' as the column to read.
# The returned frame shown below carries the extra columns negative, positive, anger,
# disgust, fear, sadness, anticipation, surprise, joy and trust.
# NOTE(review): how the helper normalises these scores is not visible here — confirm in
# lyrics_sentiment_emotion_morals_annotations.py.
en_artist_lyrics_dt = apply_nrc_sentiment_emo(en_artist_lyrics_dt, 
                                              en_lexicon, 'lemmas')

In [31]:
# Preview the first five rows, now including the NRC emotion columns.
en_artist_lyrics_dt.head(5)

Unnamed: 0,Artist,title,original_lyrics,cleaned_lyrics,lang_detect_spacy,vader_neg,vader_neu,vader_pos,vader_comp,words,...,negative,positive,anger,disgust,fear,sadness,anticipation,surprise,joy,trust
0,*NSYNC,Bye Bye Bye,"[Intro: Justin & All]\nHey, hey\nBye bye bye\n...","Hey, hey Bye bye bye Bye bye! Bye bye! I'm ...",en,0.083,0.746,0.171,0.9887,"[hey, hey, bye, bye, bye, bye, bye, bye, bye, ...",...,0.25641,0.128205,0.136752,0.136752,0.051282,0.17094,0.025641,0.025641,0.102564,0.017094
1,*NSYNC,It’s Gonna Be Me,[Intro: Justin]\n(It's gonna be me)\nOooh yeah...,"Oooh yeah You might've been hurt, babe That ...",en,0.083,0.728,0.189,0.9887,"[oooh, yeah, you, might, ve, been, hurt, babe,...",...,0.107527,0.182796,0.086022,0.129032,0.064516,0.086022,0.064516,0.16129,0.150538,0.053763
2,*NSYNC,Tearin’ Up My Heart,[Chorus: JC & Justin]\nIt's tearin' up my hear...,It's tearin' up my heart when I'm with you But...,en,0.18,0.747,0.073,-0.9927,"[it, s, tearin, up, my, heart, when, i, m, wit...",...,0.094118,0.070588,0.0,0.0,0.082353,0.082353,0.023529,0.0,0.070588,0.011765
3,*NSYNC,Gone,[Verse 1: Justin]\nThere's a thousand words th...,There's a thousand words that I could say To m...,en,0.076,0.772,0.152,0.9904,"[there, s, a, thousand, words, that, i, could,...",...,0.051613,0.116129,0.025806,0.0,0.045161,0.012903,0.03871,0.045161,0.077419,0.070968
4,*NSYNC,"Merry Christmas, Happy Holidays","[Intro: Justin, All & JC]\nOooh, ooh ooh\nMerr...","Oooh, ooh ooh Merry Christmas Happy holidays M...",en,0.014,0.478,0.508,0.9998,"[oooh, ooh, ooh, merry, christmas, happy, holi...",...,0.06422,0.302752,0.073394,0.06422,0.045872,0.036697,0.348624,0.137615,0.302752,0.192661


In [32]:
# Summary statistics of the numeric columns (VADER scores, word/lemma counts, NRC scores).
en_artist_lyrics_dt.describe()

Unnamed: 0,vader_neg,vader_neu,vader_pos,vader_comp,word_counts,lemma_counts,negative,positive,anger,disgust,fear,sadness,anticipation,surprise,joy,trust
count,31729.0,31729.0,31729.0,31729.0,31729.0,31729.0,31729.0,31729.0,31729.0,31729.0,31729.0,31729.0,31729.0,31729.0,31729.0,31729.0
mean,0.110826,0.751808,0.137364,0.158827,273.363138,108.195815,0.113801,0.126722,0.0525,0.036906,0.067384,0.067795,0.073214,0.046343,0.080985,0.069054
std,0.086308,0.106759,0.09019,0.869141,167.700472,68.967334,0.082536,0.082825,0.0545,0.045523,0.063583,0.061036,0.05896,0.045115,0.072809,0.056843
min,0.0,0.0,0.0,-1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.048,0.687,0.074,-0.9137,169.0,66.0,0.053763,0.07,0.014925,0.005917,0.022472,0.025,0.033708,0.016667,0.032258,0.030769
50%,0.092,0.76,0.12,0.679,241.0,94.0,0.098361,0.111111,0.038462,0.02381,0.05102,0.052632,0.060606,0.036145,0.062016,0.057471
75%,0.155,0.826,0.182,0.9789,332.0,130.0,0.15534,0.165517,0.073529,0.051282,0.09434,0.09375,0.098592,0.0625,0.109091,0.092593
max,0.836,1.0,0.797,0.9999,3330.0,1448.0,1.0,0.906977,1.0,1.0,0.888889,0.64574,0.826087,0.934783,0.883721,0.731183


#### 8. MoralStrength Lexicon:
Here we apply the MoralStrength lexicon by Araque et al. 2019 (https://github.com/oaraque/moral-foundations).

In [33]:
# Turn each song's list of lemmas back into one text string per song; these joined lemmas
# are the input for the MoralStrength scoring further down.
preprocessed_and_lemmatized_lyrics = list_of_lemmas_to_text(lyrics_spacy_lemmas)

The joined lemmas are used as input for the *MoralStrength* method, since the lexicon contains only lemmas:

In [34]:
# Show the joined-lemma text of the first song as a sample.
preprocessed_and_lemmatized_lyrics[:1] 

['tonight probably going start fight know right baby come love endlessly time leave make alone know take lie wanna see door baby wanna fool player game hate lie baby really wanna make tough tell enough sound crazy lie baby hit truth girl welcome give good reason baby come live really come see life much well go know take lie wanna see door baby wanna fool player game hate lie baby really wanna make tough tell enough sound crazy lie baby give know sure wanna reason love checkin signin want loser enough wanna fool game leave wanna make tough enough lie wanna fool player game wanna fool lie baby really wanna make tough tell enough sound crazy lie']

In [35]:
# Score every joined-lemma text with the project's MoralStrength helper. The result shown
# below has the columns care, fairness, loyalty, authority and purity, with NaN where a
# song received no score for that column.
lyrics_moral_strength_score = calculate_moral_scores(preprocessed_and_lemmatized_lyrics)



In [36]:
# Preview the raw moral scores before the missing values are filled.
lyrics_moral_strength_score.head()

Unnamed: 0,care,fairness,loyalty,authority,purity
0,4.0,8.166667,,,8.0
1,2.285714,,,,
2,,,,,
3,,8.166667,,,8.0
4,,8.166667,8.0,,8.0


8.1. The lyrics that are not annotated with any of the moral scores (NaN values for the morals) are considered neutral (5 is considered the neutral value by the authors of the lexicon).

In [37]:
# Replace every missing moral score with 5, the value the lexicon authors treat as neutral.
lyrics_moral_strength_score = lyrics_moral_strength_score.fillna(value=5)
lyrics_moral_strength_score

Unnamed: 0,care,fairness,loyalty,authority,purity
0,4.000000,8.166667,5.0,5.000000,8.0
1,2.285714,5.000000,5.0,5.000000,5.0
2,5.000000,5.000000,5.0,5.000000,5.0
3,5.000000,8.166667,5.0,5.000000,8.0
4,5.000000,8.166667,8.0,5.000000,8.0
...,...,...,...,...,...
31724,5.000000,5.000000,5.0,5.000000,5.0
31725,7.800000,5.000000,5.0,5.000000,5.0
31726,2.833333,5.000000,5.0,7.666667,5.0
31727,5.000000,5.000000,5.0,5.000000,5.0


#### 9. Concatenate MoralStrength values with the rest of the lyrics data:
Here we put all together: vader sentiments, nrc word emotion association and moral strength annotation scores.

In [38]:
# Renumber both frames from 0 so the column-wise concat pairs their rows by position.
en_artist_lyrics_dt = en_artist_lyrics_dt.reset_index(drop=True)
lyrics_moral_strength_score = lyrics_moral_strength_score.reset_index(drop=True)

# pd.concat(axis=1) aligns on the index and silently pads the shorter frame with NaN rows,
# so fail loudly if the two frames do not describe the same number of songs.
assert len(en_artist_lyrics_dt) == len(lyrics_moral_strength_score), (
    'Row count mismatch between the lyrics frame and the moral scores: '
    f'{len(en_artist_lyrics_dt)} vs {len(lyrics_moral_strength_score)}'
)

# Final table: lyrics + VADER sentiment + NRC emotions + MoralStrength scores.
en_lyrics_sent_emo_morals_dt = pd.concat(
    [en_artist_lyrics_dt, lyrics_moral_strength_score],
    axis=1,
)

In [39]:
# Preview the combined table, including the moral-score columns appended on the right.
en_lyrics_sent_emo_morals_dt.head(5)

Unnamed: 0,Artist,title,original_lyrics,cleaned_lyrics,lang_detect_spacy,vader_neg,vader_neu,vader_pos,vader_comp,words,...,sadness,anticipation,surprise,joy,trust,care,fairness,loyalty,authority,purity
0,*NSYNC,Bye Bye Bye,"[Intro: Justin & All]\nHey, hey\nBye bye bye\n...","Hey, hey Bye bye bye Bye bye! Bye bye! I'm ...",en,0.083,0.746,0.171,0.9887,"[hey, hey, bye, bye, bye, bye, bye, bye, bye, ...",...,0.17094,0.025641,0.025641,0.102564,0.017094,4.0,8.166667,5.0,5.0,8.0
1,*NSYNC,It’s Gonna Be Me,[Intro: Justin]\n(It's gonna be me)\nOooh yeah...,"Oooh yeah You might've been hurt, babe That ...",en,0.083,0.728,0.189,0.9887,"[oooh, yeah, you, might, ve, been, hurt, babe,...",...,0.086022,0.064516,0.16129,0.150538,0.053763,2.285714,5.0,5.0,5.0,5.0
2,*NSYNC,Tearin’ Up My Heart,[Chorus: JC & Justin]\nIt's tearin' up my hear...,It's tearin' up my heart when I'm with you But...,en,0.18,0.747,0.073,-0.9927,"[it, s, tearin, up, my, heart, when, i, m, wit...",...,0.082353,0.023529,0.0,0.070588,0.011765,5.0,5.0,5.0,5.0,5.0
3,*NSYNC,Gone,[Verse 1: Justin]\nThere's a thousand words th...,There's a thousand words that I could say To m...,en,0.076,0.772,0.152,0.9904,"[there, s, a, thousand, words, that, i, could,...",...,0.012903,0.03871,0.045161,0.077419,0.070968,5.0,8.166667,5.0,5.0,8.0
4,*NSYNC,"Merry Christmas, Happy Holidays","[Intro: Justin, All & JC]\nOooh, ooh ooh\nMerr...","Oooh, ooh ooh Merry Christmas Happy holidays M...",en,0.014,0.478,0.508,0.9998,"[oooh, ooh, ooh, merry, christmas, happy, holi...",...,0.036697,0.348624,0.137615,0.302752,0.192661,5.0,8.166667,8.0,5.0,8.0


*9.1. Some statistical information:*

In [40]:
# Summary statistics of all numeric annotation columns in the combined table.
en_lyrics_sent_emo_morals_dt.describe()

Unnamed: 0,vader_neg,vader_neu,vader_pos,vader_comp,word_counts,lemma_counts,negative,positive,anger,disgust,...,sadness,anticipation,surprise,joy,trust,care,fairness,loyalty,authority,purity
count,31729.0,31729.0,31729.0,31729.0,31729.0,31729.0,31729.0,31729.0,31729.0,31729.0,...,31729.0,31729.0,31729.0,31729.0,31729.0,31729.0,31729.0,31729.0,31729.0,31729.0
mean,0.110826,0.751808,0.137364,0.158827,273.363138,108.195815,0.113801,0.126722,0.0525,0.036906,...,0.067795,0.073214,0.046343,0.080985,0.069054,4.619023,5.862614,5.143091,5.359293,5.318834
std,0.086308,0.106759,0.09019,0.869141,167.700472,68.967334,0.082536,0.082825,0.0545,0.045523,...,0.061036,0.05896,0.045115,0.072809,0.056843,1.727357,1.403884,1.175638,1.035818,1.599695
min,0.0,0.0,0.0,-1.0,20.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,2.714286,1.0,1.2,1.333333
25%,0.048,0.687,0.074,-0.9137,169.0,66.0,0.053763,0.07,0.014925,0.005917,...,0.025,0.033708,0.016667,0.032258,0.030769,4.0,5.0,5.0,5.0,5.0
50%,0.092,0.76,0.12,0.679,241.0,94.0,0.098361,0.111111,0.038462,0.02381,...,0.052632,0.060606,0.036145,0.062016,0.057471,5.0,5.0,5.0,5.0,5.0
75%,0.155,0.826,0.182,0.9789,332.0,130.0,0.15534,0.165517,0.073529,0.051282,...,0.09375,0.098592,0.0625,0.109091,0.092593,5.0,7.833333,5.0,5.0,5.1875
max,0.836,1.0,0.797,0.9999,3330.0,1448.0,1.0,0.906977,1.0,1.0,...,0.64574,0.826087,0.934783,0.883721,0.731183,8.8,9.0,8.9,8.8,9.0


In [41]:
# Report how many songs and how many distinct artists ended up in the annotated table.
n_annotated_songs = en_lyrics_sent_emo_morals_dt.shape[0]
n_annotated_artists = en_lyrics_sent_emo_morals_dt['Artist'].nunique(dropna=False)
print(f'We have annotated {n_annotated_songs} song lyrics from {n_annotated_artists} artists.')

We have annotated 31729 song lyrics from 3625 artists.


Now we check the song lyrics with the lowest number of words and lemmas:

#### 10. Now we save the annotated dataset:

In [42]:
# Write the fully annotated table to CSV without the row index.
# `index` is documented as a boolean; `index=None` only suppressed the index because None is
# falsy, so the explicit `index=False` states the intent and produces the same file.
en_lyrics_sent_emo_morals_dt.to_csv(
    '../data/artist_lyrics_annotated_vader_nrc_moralStrength.csv',
    index=False,
)