In [1]:
import pandas as pd
import pprint

In [2]:
lyrics = pd.read_csv('../genius_lyricsglobal.csv')

LDA Model

In [3]:
docs = lyrics['lyrics'].tolist() #list of Unicode strings

In [4]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

#Removing stopwords:
from sklearn.feature_extraction import text
sw = ['oh','ooh','yeah','na','la','hey','like','da', 'feat',
      'whoa','uh','huh','doh','doo','ha','eh','ay','ayy','ll','re','ve'] #removing sounds & contractions
stop_words= text.ENGLISH_STOP_WORDS.union(sw)

docs = [[token for token in doc if token not in stop_words] for doc in docs]

In [5]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zylst\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

In [7]:
# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)



In [8]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=15, no_above=0.7)

In [9]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [10]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 4608
Number of documents: 2758


In [11]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) #enable logging

In [12]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 3
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

2021-06-05 18:50:15,524 : INFO : using autotuned alpha, starting with [0.33333334, 0.33333334, 0.33333334]
2021-06-05 18:50:15,526 : INFO : using serial LDA version on this node
2021-06-05 18:50:15,530 : INFO : running online (multi-pass) LDA training, 3 topics, 20 passes over the supplied corpus of 2758 documents, updating model once every 2000 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2021-06-05 18:50:15,532 : INFO : PROGRESS: pass 0, at document #2000/2758
2021-06-05 18:50:20,190 : INFO : optimized alpha [0.33333334, 0.33333334, 0.33333334]
2021-06-05 18:50:20,191 : INFO : merging changes from 2000 documents into a model of 2758 documents
2021-06-05 18:50:20,193 : INFO : topic #0 (0.333): 0.016*"ody" + 0.012*"que" + 0.012*"ah" + 0.010*"love" + 0.008*"know" + 0.007*"night" + 0.007*"te" + 0.007*"lo" + 0.007*"just" + 0.007*"tú"
2021-06-05 18:50:20,194 : INFO : topic #1 (0.333): 0.016*"know" + 0.016*"don" + 0.011*"love" +

2021-06-05 18:50:31,691 : INFO : topic #2 (0.033): 0.018*"just" + 0.017*"don" + 0.014*"time" + 0.013*"know" + 0.010*"mind" + 0.009*"love" + 0.009*"ain" + 0.008*"nah" + 0.008*"way" + 0.008*"wanna"
2021-06-05 18:50:31,691 : INFO : topic diff=0.168959, rho=0.395935
2021-06-05 18:50:31,695 : INFO : PROGRESS: pass 4, at document #2758/2758
2021-06-05 18:50:32,429 : INFO : optimized alpha [0.02457372, 0.0821705, 0.03323737]
2021-06-05 18:50:32,431 : INFO : merging changes from 758 documents into a model of 2758 documents
2021-06-05 18:50:32,433 : INFO : topic #0 (0.025): 0.024*"que" + 0.016*"te" + 0.015*"ft" + 0.014*"lo" + 0.013*"yo" + 0.011*"tú" + 0.011*"el" + 0.009*"remix" + 0.008*"en" + 0.008*"mi"
2021-06-05 18:50:32,434 : INFO : topic #1 (0.082): 0.018*"know" + 0.018*"don" + 0.013*"love" + 0.012*"got" + 0.011*"let" + 0.011*"ft" + 0.009*"just" + 0.008*"nigga" + 0.007*"cause" + 0.007*"baby"
2021-06-05 18:50:32,435 : INFO : topic #2 (0.033): 0.018*"just" + 0.016*"don" + 0.013*"time" + 0.012

2021-06-05 18:50:37,217 : INFO : topic diff=0.075643, rho=0.310400
2021-06-05 18:50:37,220 : INFO : PROGRESS: pass 9, at document #2000/2758
2021-06-05 18:50:37,815 : INFO : optimized alpha [0.02179481, 0.07717559, 0.033234343]
2021-06-05 18:50:37,817 : INFO : merging changes from 2000 documents into a model of 2758 documents
2021-06-05 18:50:37,818 : INFO : topic #0 (0.022): 0.025*"ft" + 0.024*"que" + 0.016*"te" + 0.014*"lo" + 0.013*"yo" + 0.012*"tú" + 0.012*"el" + 0.010*"remix" + 0.008*"en" + 0.008*"mi"
2021-06-05 18:50:37,819 : INFO : topic #1 (0.077): 0.019*"know" + 0.018*"don" + 0.014*"love" + 0.012*"got" + 0.011*"let" + 0.009*"just" + 0.008*"baby" + 0.008*"nigga" + 0.008*"cause" + 0.008*"say"
2021-06-05 18:50:37,820 : INFO : topic #2 (0.033): 0.019*"just" + 0.017*"don" + 0.014*"time" + 0.013*"know" + 0.010*"love" + 0.009*"mind" + 0.009*"got" + 0.009*"ain" + 0.009*"nah" + 0.009*"way"
2021-06-05 18:50:37,821 : INFO : topic diff=0.072367, rho=0.296448
2021-06-05 18:50:37,824 : INFO 

2021-06-05 18:50:41,427 : INFO : optimized alpha [0.020506607, 0.07556628, 0.033133946]
2021-06-05 18:50:41,429 : INFO : merging changes from 758 documents into a model of 2758 documents
2021-06-05 18:50:41,430 : INFO : topic #0 (0.021): 0.034*"ft" + 0.023*"que" + 0.015*"te" + 0.013*"lo" + 0.012*"yo" + 0.010*"tú" + 0.010*"el" + 0.010*"remix" + 0.007*"en" + 0.007*"mi"
2021-06-05 18:50:41,431 : INFO : topic #1 (0.076): 0.019*"know" + 0.018*"don" + 0.014*"love" + 0.012*"got" + 0.011*"let" + 0.009*"just" + 0.008*"nigga" + 0.008*"baby" + 0.008*"cause" + 0.008*"say"
2021-06-05 18:50:41,432 : INFO : topic #2 (0.033): 0.018*"just" + 0.016*"don" + 0.013*"time" + 0.013*"know" + 0.011*"love" + 0.009*"mind" + 0.009*"ain" + 0.009*"got" + 0.009*"nah" + 0.009*"way"
2021-06-05 18:50:41,433 : INFO : topic diff=0.053421, rho=0.254998
2021-06-05 18:50:41,437 : INFO : PROGRESS: pass 14, at document #2000/2758
2021-06-05 18:50:41,974 : INFO : optimized alpha [0.020352282, 0.075003855, 0.03304273]
2021-06-0

2021-06-05 18:50:45,111 : INFO : merging changes from 2000 documents into a model of 2758 documents
2021-06-05 18:50:45,113 : INFO : topic #0 (0.020): 0.032*"ft" + 0.023*"que" + 0.015*"te" + 0.013*"lo" + 0.013*"yo" + 0.011*"tú" + 0.011*"el" + 0.010*"remix" + 0.007*"en" + 0.007*"mi"
2021-06-05 18:50:45,114 : INFO : topic #1 (0.074): 0.020*"know" + 0.019*"don" + 0.014*"love" + 0.012*"got" + 0.011*"let" + 0.009*"just" + 0.008*"nigga" + 0.008*"baby" + 0.008*"cause" + 0.008*"say"
2021-06-05 18:50:45,114 : INFO : topic #2 (0.033): 0.019*"just" + 0.016*"don" + 0.014*"time" + 0.013*"know" + 0.010*"love" + 0.009*"mind" + 0.009*"got" + 0.009*"way" + 0.009*"nah" + 0.009*"ain"
2021-06-05 18:50:45,115 : INFO : topic diff=0.041524, rho=0.221518
2021-06-05 18:50:45,119 : INFO : PROGRESS: pass 18, at document #2758/2758
2021-06-05 18:50:45,334 : INFO : optimized alpha [0.019637346, 0.07390874, 0.03278954]
2021-06-05 18:50:45,335 : INFO : merging changes from 758 documents into a model of 2758 document

In [13]:
top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

2021-06-05 18:50:46,093 : INFO : CorpusAccumulator accumulated stats from 1000 documents
2021-06-05 18:50:46,114 : INFO : CorpusAccumulator accumulated stats from 2000 documents


Average topic coherence: -1.5207.
[([(0.019224603, 'know'),
   (0.018683005, 'don'),
   (0.013990026, 'love'),
   (0.012121269, 'got'),
   (0.011195524, 'let'),
   (0.00902085, 'just'),
   (0.008431328, 'nigga'),
   (0.008003117, 'baby'),
   (0.007880844, 'cause'),
   (0.007778714, 'say'),
   (0.0075719724, 'ain'),
   (0.0071542384, 'need'),
   (0.006693479, 'come'),
   (0.006557477, 'life'),
   (0.006459878, 'make'),
   (0.006455867, 'day'),
   (0.0064523113, 'wanna'),
   (0.006027782, 'bitch'),
   (0.005908841, 'time'),
   (0.005829864, 'tell')],
  -0.8718477108201973),
 ([(0.03493578, 'ft'),
   (0.021996569, 'que'),
   (0.014403172, 'te'),
   (0.013038675, 'lo'),
   (0.012041186, 'yo'),
   (0.010274045, 'tú'),
   (0.010092807, 'el'),
   (0.009979568, 'remix'),
   (0.0071873213, 'lil'),
   (0.007162363, 'en'),
   (0.007060864, 'mi'),
   (0.006873824, 'si'),
   (0.0064940685, 'kidd'),
   (0.0064816833, 'se'),
   (0.006391284, 'keo'),
   (0.0063902405, 'kidd_keo'),
   (0.0059809024, 'p