In [22]:
import pandas as pd
import numpy as np
import json
import pickle

# Gensim libraries:
import gensim
from gensim import corpora
from gensim.corpora import Dictionary, MmCorpus
from gensim.utils import simple_preprocess
from gensim.models import LdaModel, CoherenceModel, LdaMulticore
import pyLDAvis
from pyLDAvis import gensim_models as gensimvis
pyLDAvis.enable_notebook()
from pprint import pprint

# NLTK:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')


# Matplotlib:
%matplotlib inline
from matplotlib import pyplot as plt
import matplotlib.colors as mcolors

# Word-cloud:
from wordcloud import WordCloud, STOPWORDS

#### 1. Read the dataset:
Here we read the dataset that we have already annotated with VADER, NRC and MoralStrength. The idea is to add the topic scores alongside the other lyrics feature scores.

In [2]:
# Load the lyrics table that already carries the VADER, NRC and MoralStrength columns.
en_lyrics_sent_emo_morals_dt = pd.read_csv('../data/artist_lyrics_annotated_vader_nrc_moralStrength.csv')
# Sanity check: print (rows, columns), then preview the first five rows.
print(en_lyrics_sent_emo_morals_dt.shape)
en_lyrics_sent_emo_morals_dt.head(5)

(31729, 28)


Unnamed: 0,Artist,title,original_lyrics,cleaned_lyrics,lang_detect_spacy,vader_neg,vader_neu,vader_pos,vader_comp,words,...,sadness,anticipation,surprise,joy,trust,care,fairness,loyalty,authority,purity
0,*NSYNC,Bye Bye Bye,"[Intro: Justin & All]\nHey, hey\nBye bye bye\n...","Hey, hey Bye bye bye Bye bye! Bye bye! I'm ...",en,0.083,0.746,0.171,0.9887,"['hey', 'hey', 'bye', 'bye', 'bye', 'bye', 'by...",...,0.17094,0.025641,0.025641,0.102564,0.017094,4.0,8.166667,5.0,5.0,8.0
1,*NSYNC,It’s Gonna Be Me,[Intro: Justin]\n(It's gonna be me)\nOooh yeah...,"Oooh yeah You might've been hurt, babe That ...",en,0.083,0.728,0.189,0.9887,"['oooh', 'yeah', 'you', 'might', 've', 'been',...",...,0.086022,0.064516,0.16129,0.150538,0.053763,2.285714,5.0,5.0,5.0,5.0
2,*NSYNC,Tearin’ Up My Heart,[Chorus: JC & Justin]\nIt's tearin' up my hear...,It's tearin' up my heart when I'm with you But...,en,0.18,0.747,0.073,-0.9927,"['it', 's', 'tearin', 'up', 'my', 'heart', 'wh...",...,0.082353,0.023529,0.0,0.070588,0.011765,5.0,5.0,5.0,5.0,5.0
3,*NSYNC,Gone,[Verse 1: Justin]\nThere's a thousand words th...,There's a thousand words that I could say To m...,en,0.076,0.772,0.152,0.9904,"['there', 's', 'a', 'thousand', 'words', 'that...",...,0.012903,0.03871,0.045161,0.077419,0.070968,5.0,8.166667,5.0,5.0,8.0
4,*NSYNC,"Merry Christmas, Happy Holidays","[Intro: Justin, All & JC]\nOooh, ooh ooh\nMerr...","Oooh, ooh ooh Merry Christmas Happy holidays M...",en,0.014,0.478,0.508,0.9998,"['oooh', 'ooh', 'ooh', 'merry', 'christmas', '...",...,0.036697,0.348624,0.137615,0.302752,0.192661,5.0,8.166667,8.0,5.0,8.0


#### Preprocessing Step:
We skip the preprocessing step here: the lemmas of each song were already saved to a JSON file when we extracted the other lyrics features (see point 6 in *lyrics_vader_nrc_moral_strength_annotations.ipynb*), so we simply read them back.

In [3]:
# Load the pre-computed lemmas (one token list per song) from their JSON file.
# The encoding is pinned explicitly: JSON text is UTF-8 by specification, while a
# bare open() falls back to the platform's locale encoding and can mis-decode
# non-ASCII lyrics on some machines.
with open('../data/lyrics_lemmas', encoding='utf-8') as f:
    lyrics_spacy_lemmas = json.load(f)

In [4]:
# Number of lemma lists loaded; the later column-wise concat assumes one list per row of the lyrics table.
len(lyrics_spacy_lemmas)

31729

#### 2. LDA Dictionary Creation and occurrence-based Filtering:
To perform Latent Dirichlet Allocation we use the well-established Python library gensim, which requires a dictionary representation of the documents: every token is mapped to a unique ID, which reduces the overall dimensionality of the corpus.

In [5]:
# Build the token -> integer id mapping from the lemma lists of all songs.
dictionary = corpora.Dictionary(lyrics_spacy_lemmas)
# Drop tokens that appear in more than 90% of the songs. no_below and keep_n are
# not passed, so gensim's own defaults apply to them.
dictionary.filter_extremes(no_above = 0.9) #no_below = 340,

#### 3. Bag-of-Words and Index to Dictionary Conversion:
Each song (already transformed into a list of tokens) is converted into a bag-of-words, which stores the unique token ID and its count for each song.

In [6]:
# Turn every song's lemma list into its sparse (token_id, count) representation.
gensim_corpus = list(map(dictionary.doc2bow, lyrics_spacy_lemmas))
# Index the dictionary once (the result itself is not used further in this cell).
temp = dictionary[0]
# id2word = dictionary.id2token
# Report vocabulary size, corpus size and the first encoded song.
print(f'Number of unique tokens: {len(dictionary)}')
print(f'Number of documents: {len(gensim_corpus)}')
print(gensim_corpus[:1])

Number of unique tokens: 13902
Number of documents: 31729
[[(0, 1), (1, 9), (2, 1), (3, 3), (4, 3), (5, 2), (6, 1), (7, 5), (8, 1), (9, 5), (10, 4), (11, 1), (12, 2), (13, 1), (14, 1), (15, 1), (16, 2), (17, 1), (18, 4), (19, 2), (20, 9), (21, 1), (22, 1), (23, 1), (24, 2), (25, 5), (26, 1), (27, 3), (28, 1), (29, 4), (30, 2), (31, 1), (32, 3), (33, 1), (34, 3), (35, 1), (36, 1), (37, 2), (38, 3), (39, 1), (40, 1), (41, 4), (42, 1), (43, 12), (44, 1), (45, 1), (46, 1)]]


#### 4. Choosing the number of topics in LDA:
Here we find the optimal number of topics. We need to build many LDA models with different values of the number of topics (k) and pick the one that gives the highest coherence value. Choosing a ‘k’ that marks the end of a rapid growth of topic coherence usually offers meaningful and interpretable topics. Picking an even higher value can sometimes provide more granular sub-topics. If you see the same keywords being repeated in multiple topics, it’s probably a sign that the ‘k’ is too large.

In [7]:
def compute_coherence_and_perplexity_values(dictionary, gensim_corpus,
                             lyrics_spacy_lemmas, limit, start=2, step=2,
                             random_state=None):
    """Train one LDA model per candidate topic count and score each of them.

    Args:
        dictionary: gensim dictionary, passed to LdaModel as ``id2word`` and to
            both CoherenceModel instances.
        gensim_corpus: bag-of-words corpus used for training and for the
            ``log_perplexity`` evaluation.
        lyrics_spacy_lemmas: tokenised texts handed to CoherenceModel.
        limit: exclusive upper bound of the topic counts to try.
        start: first topic count to try.
        step: increment between consecutive topic counts.
        random_state: forwarded to every LdaModel so the sweep can be made
            repeatable. The default ``None`` leaves the models unseeded, which
            is the behaviour callers had before this parameter existed.

    Returns:
        A tuple ``(model_list, coherence_values, u_mass_values,
        perplexity_values)`` of four parallel lists holding one entry per value
        of ``range(start, limit, step)``. ``perplexity_values`` stores what
        ``LdaModel.log_perplexity`` returns, which gensim documents as a
        per-word likelihood bound rather than the perplexity itself.
    """
    model_list = []
    coherence_values = []
    u_mass_values = []
    perplexity_values = []

    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=gensim_corpus, id2word=dictionary,
                         num_topics=num_topics, random_state=random_state)
        model_list.append(model)

        # C_V coherence
        c_v_model = CoherenceModel(model=model, texts=lyrics_spacy_lemmas,
                                   dictionary=dictionary, coherence='c_v')
        coherence_values.append(c_v_model.get_coherence())

        # U_mass coherence
        u_mass_model = CoherenceModel(model=model, texts=lyrics_spacy_lemmas,
                                      dictionary=dictionary, coherence="u_mass")
        u_mass_values.append(u_mass_model.get_coherence())

        # Log-perplexity, evaluated on the training corpus itself
        perplexity_values.append(model.log_perplexity(gensim_corpus))

    return model_list, coherence_values, u_mass_values, perplexity_values

In [15]:
# Candidate topic counts: 2, 4, ..., 18 (limit is exclusive).
start, limit, step = 2, 20, 2

# Train and score one model per candidate topic count.
model_list, coherence_values, u_mass_values, perplexity_values = compute_coherence_and_perplexity_values(
    dictionary=dictionary,
    gensim_corpus=gensim_corpus,
    lyrics_spacy_lemmas=lyrics_spacy_lemmas,
    start=start, limit=limit, step=step)

x = range(start, limit, step)

# Each series gets its name through label= on the plot call. Passing a bare
# string such as ("coherence_values") to plt.legend() is not a one-element
# tuple: the string is iterated character by character, so the line would be
# labelled with its first letter only.

# Show c_v graph
plt.plot(x, coherence_values, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()

# Show u_mass graph
plt.plot(x, u_mass_values, label="u_mass_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score U_mass")
plt.legend(loc='best')
plt.show()

# Show perplexity graph
plt.plot(x, perplexity_values, label="perplexity_values")
plt.xlabel("Num Topics")
plt.ylabel("Perplexity score")
plt.legend(loc='best')
plt.show()

*4.1. The LDA model with the proposed number of k topics based on the coherence and perplexity estimations:*

In [8]:
# Hyper-parameters of the final model.
chunksize = 100
passes = 10
iterations = 100
num_topics = 4
random_state = 100

# random_state is forwarded to LdaModel so that re-running this cell trains the
# same model; defining the seed without passing it leaves training unseeded and
# the topics (and their ids) free to change between runs.
lda_model = LdaModel(corpus=gensim_corpus,
                    id2word=dictionary,
                    chunksize=chunksize,
#                     alpha='auto',
#                     eta='auto',
                    iterations=iterations,
                    num_topics=num_topics,
                    passes=passes,
                    random_state=random_state,
                    per_word_topics = True)

In [25]:
# Show the highest-weighted words of every topic of the trained model.
pprint(lda_model.print_topics())

[(0,
  '0.050*"get" + 0.012*"go" + 0.011*"man" + 0.011*"girl" + 0.009*"say" + '
  '0.009*"fuck" + 0.009*"come" + 0.008*"back" + 0.008*"make" + 0.007*"bitch"'),
 (1,
  '0.019*"come" + 0.015*"light" + 0.014*"night" + 0.014*"see" + 0.012*"eye" + '
  '0.012*"fall" + 0.012*"away" + 0.010*"go" + 0.009*"run" + 0.009*"day"'),
 (2,
  '0.013*"die" + 0.011*"life" + 0.009*"man" + 0.009*"blood" + 0.008*"dead" + '
  '0.008*"live" + 0.008*"fight" + 0.007*"world" + 0.007*"death" + '
  '0.006*"kill"'),
 (3,
  '0.040*"love" + 0.038*"know" + 0.024*"go" + 0.020*"say" + 0.020*"get" + '
  '0.018*"make" + 0.018*"want" + 0.017*"never" + 0.017*"time" + 0.016*"feel"')]


In [26]:
# c_v coherence of the selected model, computed against the lemmatised lyrics.
coherencemodel = CoherenceModel(model=lda_model, texts=lyrics_spacy_lemmas, dictionary=dictionary, coherence='c_v')
print(coherencemodel.get_coherence())

0.4243921924737374


#### We decided to keep 4 general topics for the song lyrics:
Based on the coherence and perplexity estimations, 4-6 topics are best suited to the song lyrics in our dataset. We tried multiple combinations, and the clearest definition of the topics was obtained for k=4. Feel free to try other k values (6, 8, etc.).

We named our 4 topics as follows (the numbering below is for presentation only; the model's own topic ids are 0 = Obscene, 1 = World/Life/Time, 2 = Death/Fear/Violence, 3 = Romantic): 
**1. World/Life/Time:** We chose this name because the topic contains keywords such as light, night, day, time, world, etc., that are related to living and have a more neutral context; 
**2. Obscene:** This topic contains swearing and insulting words;
**3. Romantic:** This topic contains words related to love;
**4. Death/Fear/Violence:** This topic contains words related to war, death and darkness.

#### 5. Save the LDA model:

In [11]:
# Uncomment to write the model currently bound to lda_model to disk.
# NOTE(review): presumably left disabled so that re-running the notebook does
# not overwrite the pickle that the next cell loads — confirm.
# with open("../data/lda_model_best_k4.pk","wb") as f:
#     pickle.dump(lda_model, f)

*5.1. Read the best LDA model that we have saved:*

In [8]:
# Restore the previously saved k=4 model. This rebinds lda_model, replacing any
# model trained earlier in the session.
# NOTE(review): pickle.load can execute arbitrary code contained in the file —
# only load pickles that you produced yourself.
with open('../data/lda_model_best_k4.pk', 'rb') as pickle_file:
    lda_model= pickle.load(pickle_file)

In [9]:
# Show the topics of the model just loaded from disk.
pprint(lda_model.print_topics())

[(0,
  '0.050*"get" + 0.012*"go" + 0.011*"man" + 0.011*"girl" + 0.009*"say" + '
  '0.009*"fuck" + 0.009*"come" + 0.008*"back" + 0.008*"make" + 0.007*"bitch"'),
 (1,
  '0.019*"come" + 0.015*"light" + 0.014*"night" + 0.014*"see" + 0.012*"eye" + '
  '0.012*"fall" + 0.012*"away" + 0.010*"go" + 0.009*"run" + 0.009*"day"'),
 (2,
  '0.013*"die" + 0.011*"life" + 0.009*"man" + 0.009*"blood" + 0.008*"dead" + '
  '0.008*"live" + 0.008*"fight" + 0.007*"world" + 0.007*"death" + '
  '0.006*"kill"'),
 (3,
  '0.040*"love" + 0.038*"know" + 0.024*"go" + 0.020*"say" + 0.020*"get" + '
  '0.018*"make" + 0.018*"want" + 0.017*"never" + 0.017*"time" + 0.016*"feel"')]


#### 6. Visualise the LDA model and save as .html file using pyLDAvis:

In [62]:
# Build the pyLDAvis representation of the model and write it to an HTML file.
vis_data = gensimvis.prepare(lda_model, gensim_corpus, dictionary)
#pyLDAvis.display(vis_data)
# The topic count in the file name is read from the model itself: the global
# num_topics only exists when the training cell has been run, which is not the
# case when lda_model was loaded from the pickle instead.
pyLDAvis.save_html(vis_data, '../data/./lyrics_lda_7_k_'+ str(lda_model.num_topics) +'.html')


#### 7. The Dominant topic and its percentage contribution in each Lyrics
In LDA models, each document is composed of multiple topics. But, typically only one of the topics is dominant. The code below extracts the dominant topic for each lyrics and shows the weight of the topic and the keywords in a nicely formatted output. This way, we will know which lyrics belongs predominantly to which topic.

In [10]:
def format_topics_lyrics(ldamodel=None, corpus=gensim_corpus, texts=lyrics_spacy_lemmas):
    """Return, for every document, its dominant topic, that topic's weight and keywords.

    Args:
        ldamodel: trained LDA model. It is indexed with ``corpus`` and must
            expose ``per_word_topics`` and ``show_topic``.
        corpus: bag-of-words corpus, one entry per document.
        texts: the documents' token lists, in the same order as ``corpus``.

    Returns:
        A DataFrame with one row per document and the columns
        ``Dominant_Topic`` (topic id stored as a float, matching the 0.0-3.0
        values that later cells compare against), ``Perc_Contribution``
        (weight of that topic, rounded to 4 decimals), ``Topic_Keywords``
        (comma-separated top words of the topic) and ``0`` (the tokens from
        ``texts``).

    Raises:
        TypeError: if ``ldamodel`` is None.
    """
    if ldamodel is None:
        raise TypeError("format_topics_lyrics() requires a trained LDA model as 'ldamodel'")

    records = []
    keywords_by_topic = {}  # the keyword string depends on the topic only, so build it once per topic

    # Get main topic in each lyrics
    for row_list in ldamodel[corpus]:
        # With per_word_topics enabled the model yields a tuple whose first
        # item is the document-level topic distribution.
        row = row_list[0] if ldamodel.per_word_topics else row_list

        if not row:
            # Keep a placeholder so that later rows stay aligned with `texts`.
            records.append((float('nan'), float('nan'), None))
            continue

        # max() returns the first of several equally weighted topics, which is
        # the same element a stable descending sort would put first.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])

        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num))

        records.append((float(topic_num),
                        round(float(prop_topic), 4),
                        keywords_by_topic[topic_num]))

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row copies it on every iteration.
    lyrics_topics_dt = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts)
    lyrics_topics_dt = pd.concat([lyrics_topics_dt, contents], axis=1)  # column wise
    return lyrics_topics_dt

In [11]:
# Compute the dominant topic of every song with the selected model.
topic_lyrics_keywords_dt = format_topics_lyrics(ldamodel = lda_model, 
                                                corpus = gensim_corpus, 
                                                texts = lyrics_spacy_lemmas)

In [12]:
# Display the per-song topic table (pandas shows only the head and tail of a frame this long).
topic_lyrics_keywords_dt

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,0
0,3.0,0.8785,"love, know, go, say, get, make, want, never, t...","[tonight, probably, going, start, fight, know,..."
1,3.0,0.9537,"love, know, go, say, get, make, want, never, t...","[hurt, babe, lie, see, come, go, remember, tel..."
2,3.0,0.9905,"love, know, go, say, get, make, want, never, t...","[tearin, heart, apart, feel, matter, feel, pai..."
3,3.0,0.9337,"love, know, go, say, get, make, want, never, t...","[word, say, make, come, home, seem, long, ago,..."
4,3.0,0.6957,"love, know, go, say, get, make, want, never, t...","[wait, year, night, snow, glisten, tree, outsi..."
...,...,...,...,...
31724,1.0,0.8494,"come, light, night, see, eye, fall, away, go, ...","[sun, old, water, yearle, flake, keep, whirl, ..."
31725,3.0,0.5175,"love, know, go, say, get, make, want, never, t...","[moon, star, feel, arrest, unknowable, fade, d..."
31726,3.0,0.5566,"love, know, go, say, get, make, want, never, t...","[get, big, big, distance, outside, world, beco..."
31727,1.0,0.8014,"come, light, night, see, eye, fall, away, go, ...","[come, come, warn, wake, wake, call, back, ope..."


*7.1. Format dominant keyword dataframe*

In [13]:
# Turn the positional index into an explicit document-number column ...
dominant_topic_dt = topic_lyrics_keywords_dt.reset_index(drop=False)
# ... and give all five columns descriptive names.
dominant_topic_dt.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Lyrics',
]
# Re-number the rows from zero and preview the first ten songs.
dominant_topic_dt = dominant_topic_dt.reset_index(drop=True)
dominant_topic_dt.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Lyrics
0,0,3.0,0.8785,"love, know, go, say, get, make, want, never, t...","[tonight, probably, going, start, fight, know,..."
1,1,3.0,0.9537,"love, know, go, say, get, make, want, never, t...","[hurt, babe, lie, see, come, go, remember, tel..."
2,2,3.0,0.9905,"love, know, go, say, get, make, want, never, t...","[tearin, heart, apart, feel, matter, feel, pai..."
3,3,3.0,0.9337,"love, know, go, say, get, make, want, never, t...","[word, say, make, come, home, seem, long, ago,..."
4,4,3.0,0.6957,"love, know, go, say, get, make, want, never, t...","[wait, year, night, snow, glisten, tree, outsi..."
5,5,3.0,0.7666,"love, know, go, say, get, make, want, never, t...","[hang, see, new, boyfriend, make, jealous, hop..."
6,6,3.0,0.88,"love, know, go, say, get, make, want, never, t...","[vision, bring, tear, eye, surround, secret, l..."
7,7,0.0,0.5928,"get, go, man, girl, say, fuck, come, back, mak...","[dirty, pop, sick, tired, people, talk, deal, ..."
8,8,3.0,0.9901,"love, know, go, say, get, make, want, never, t...","[ever, want, ever, need, tell, lance, want, ba..."
9,9,1.0,0.4777,"come, light, night, see, eye, fall, away, go, ...","[cold, salty, sea, water, touch, lip, cold, sa..."


In [15]:
# How many songs fall under each dominant topic.
dominant_topic_dt.Dominant_Topic.value_counts()

3.0    15340
1.0     7140
0.0     5700
2.0     3549
Name: Dominant_Topic, dtype: int64

*7.2. Let's add the topics to the rest of lyrics emotions and sentiment dataframe:*

In [16]:
# Swap the numeric topic ids for the human-readable topic names in a single pass.
dominant_topic_dt['Dominant_Topic'] = dominant_topic_dt['Dominant_Topic'].replace({
    2.0: 'Death/Fear/Violence',
    1.0: 'World/Life/Time',
    3.0: 'Romantic',
    0.0: 'Obscene',
})

#### 8. Concatenate the artist lyrics features dataset with the extracted topics: 

In [17]:
# Append the topic label and its weight as two extra columns. With axis=1 the
# frames are matched on their index, so both must list the songs in the same order.
en_lyrics_sent_emo_morals_topics_dt = pd.concat([en_lyrics_sent_emo_morals_dt,
                                                         dominant_topic_dt[['Dominant_Topic','Topic_Perc_Contrib']]], axis = 1)

In [18]:
# Preview the merged table; the two topic columns are the last ones.
en_lyrics_sent_emo_morals_topics_dt.head(3)

Unnamed: 0,Artist,title,original_lyrics,cleaned_lyrics,lang_detect_spacy,vader_neg,vader_neu,vader_pos,vader_comp,words,...,surprise,joy,trust,care,fairness,loyalty,authority,purity,Dominant_Topic,Topic_Perc_Contrib
0,*NSYNC,Bye Bye Bye,"[Intro: Justin & All]\nHey, hey\nBye bye bye\n...","Hey, hey Bye bye bye Bye bye! Bye bye! I'm ...",en,0.083,0.746,0.171,0.9887,"['hey', 'hey', 'bye', 'bye', 'bye', 'bye', 'by...",...,0.025641,0.102564,0.017094,4.0,8.166667,5.0,5.0,8.0,Romantic,0.8785
1,*NSYNC,It’s Gonna Be Me,[Intro: Justin]\n(It's gonna be me)\nOooh yeah...,"Oooh yeah You might've been hurt, babe That ...",en,0.083,0.728,0.189,0.9887,"['oooh', 'yeah', 'you', 'might', 've', 'been',...",...,0.16129,0.150538,0.053763,2.285714,5.0,5.0,5.0,5.0,Romantic,0.9537
2,*NSYNC,Tearin’ Up My Heart,[Chorus: JC & Justin]\nIt's tearin' up my hear...,It's tearin' up my heart when I'm with you But...,en,0.18,0.747,0.073,-0.9927,"['it', 's', 'tearin', 'up', 'my', 'heart', 'wh...",...,0.0,0.070588,0.011765,5.0,5.0,5.0,5.0,5.0,Romantic,0.9905


#### 9. Save the dataset with all lyrics features:

In [24]:
# Write the full feature table, now including the topic columns, back to the data folder.
# NOTE(review): index = None is falsy, so the row index is presumably omitted
# (as with index=False) — confirm against the written file.
en_lyrics_sent_emo_morals_topics_dt.to_csv('../data/artist_lyrics_annotated_vader_nrc_moralStrength_lda.csv', 
                                           index = None)

#### 10. References:
1. Temporal Analysis and Visualisation of Music (Misael et al 2020): https://sol.sbc.org.br/index.php/eniac/article/view/12155
2. Evaluate Topic Models: Latent Dirichlet Allocation (LDA): https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0
3. What’s in a Song? LDA Topic Modeling of over 120,000 Lyrics:
https://tim-denzler.medium.com/whats-in-a-song-using-lda-to-find-topics-in-over-120-000-songs-53785767b692
4. Evaluation of Topic Modeling: Topic Coherence: https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/