In [2]:
import pandas as pd
import pprint

In [3]:
lyrics = pd.read_csv('../genius_lyrics21.csv')

LDA Model

In [4]:
docs = lyrics['lyrics'].tolist() #list of Unicode strings

In [5]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

#Removing stopwords:
from sklearn.feature_extraction import text
sw = ['oh','ooh','yeah','na','la','hey','like','da', 'feat',
      'whoa','uh','huh','doh','doo','ha','eh','ay','ayy','ll','re','ve'] #removing sounds & contractions
stop_words= text.ENGLISH_STOP_WORDS.union(sw)

docs = [[token for token in doc if token not in stop_words] for doc in docs]

In [6]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zylst\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

In [8]:
# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)



In [9]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=15, no_above=0.7)

In [10]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [11]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 2132
Number of documents: 1362


In [12]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) #enable logging

In [13]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 3
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

2021-06-05 17:03:11,363 : INFO : using autotuned alpha, starting with [0.33333334, 0.33333334, 0.33333334]
2021-06-05 17:03:11,364 : INFO : using serial LDA version on this node
2021-06-05 17:03:11,367 : INFO : running online (multi-pass) LDA training, 3 topics, 20 passes over the supplied corpus of 1362 documents, updating model once every 1362 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2021-06-05 17:03:11,368 : INFO : PROGRESS: pass 0, at document #1362/1362
2021-06-05 17:03:14,366 : INFO : optimized alpha [0.33333334, 0.33333334, 0.33333334]
2021-06-05 17:03:14,368 : INFO : topic #0 (0.333): 0.017*"got" + 0.015*"nigga" + 0.013*"bitch" + 0.013*"let" + 0.013*"love" + 0.012*"ft" + 0.011*"shit" + 0.010*"ain" + 0.009*"anderson" + 0.008*"way"
2021-06-05 17:03:14,369 : INFO : topic #1 (0.333): 0.018*"mind" + 0.012*"day" + 0.012*"got" + 0.010*"long" + 0.010*"just" + 0.010*"way" + 0.009*"life" + 0.009*"tryin" + 0.009*"cause" + 

2021-06-05 17:03:21,406 : INFO : topic #1 (0.064): 0.017*"mind" + 0.014*"got" + 0.012*"just" + 0.011*"love" + 0.011*"way" + 0.010*"tell" + 0.010*"wanna" + 0.010*"day" + 0.010*"cause" + 0.009*"ain"
2021-06-05 17:03:21,407 : INFO : topic #2 (0.075): 0.014*"ain" + 0.011*"baby" + 0.011*"just" + 0.011*"love" + 0.011*"got" + 0.011*"need" + 0.011*"que" + 0.009*"young" + 0.009*"time" + 0.008*"nah"
2021-06-05 17:03:21,408 : INFO : topic diff=0.056853, rho=0.301511
2021-06-05 17:03:21,410 : INFO : PROGRESS: pass 10, at document #1362/1362
2021-06-05 17:03:21,907 : INFO : optimized alpha [0.06848299, 0.063573465, 0.072007366]
2021-06-05 17:03:21,907 : INFO : topic #0 (0.068): 0.021*"nigga" + 0.019*"got" + 0.015*"bitch" + 0.013*"shit" + 0.013*"let" + 0.013*"love" + 0.011*"ain" + 0.011*"ft" + 0.009*"anderson" + 0.008*"make"
2021-06-05 17:03:21,908 : INFO : topic #1 (0.064): 0.017*"mind" + 0.014*"got" + 0.012*"just" + 0.011*"love" + 0.011*"way" + 0.010*"tell" + 0.010*"wanna" + 0.010*"day" + 0.010*"c

2021-06-05 17:03:25,830 : INFO : topic #1 (0.062): 0.016*"mind" + 0.015*"got" + 0.015*"love" + 0.012*"just" + 0.011*"way" + 0.011*"tell" + 0.010*"wanna" + 0.010*"cause" + 0.010*"ain" + 0.009*"long"
2021-06-05 17:03:25,830 : INFO : topic #2 (0.060): 0.014*"ain" + 0.012*"que" + 0.011*"baby" + 0.011*"need" + 0.011*"just" + 0.009*"young" + 0.009*"got" + 0.009*"nah" + 0.009*"love" + 0.008*"time"
2021-06-05 17:03:25,831 : INFO : topic diff=0.021719, rho=0.218218
2021-06-05 17:03:25,833 : INFO : LdaModel lifecycle event {'msg': 'trained LdaModel(num_terms=2132, num_topics=3, decay=0.5, chunksize=2000) in 14.46s', 'datetime': '2021-06-05T17:03:25.832059', 'gensim': '4.0.1', 'python': '3.8.3 (default, Jul  2 2020, 17:30:36) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}


In [14]:
top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

2021-06-05 17:03:25,875 : INFO : CorpusAccumulator accumulated stats from 1000 documents


Average topic coherence: -1.8952.
[([(0.016109781, 'mind'),
   (0.015322884, 'got'),
   (0.015135687, 'love'),
   (0.0121295545, 'just'),
   (0.011171913, 'way'),
   (0.010941835, 'tell'),
   (0.009910608, 'wanna'),
   (0.009875395, 'cause'),
   (0.00984214, 'ain'),
   (0.009008761, 'long'),
   (0.008997982, 'day'),
   (0.008646027, 'time'),
   (0.008575311, 'life'),
   (0.008149193, 'good'),
   (0.007898341, 'girl'),
   (0.007887297, 'let'),
   (0.007836657, 'make'),
   (0.007789673, 'say'),
   (0.0073592225, 'night'),
   (0.007258794, 'right')],
  -0.9151906930555305),
 ([(0.022344874, 'nigga'),
   (0.019564088, 'got'),
   (0.015909702, 'bitch'),
   (0.013216442, 'shit'),
   (0.01266304, 'let'),
   (0.011929124, 'ain'),
   (0.011274982, 'love'),
   (0.011174873, 'ft'),
   (0.008744023, 'anderson'),
   (0.008187753, 'lil'),
   (0.007881395, 'make'),
   (0.007830005, 'just'),
   (0.0074762865, 'come'),
   (0.007122775, 'gon'),
   (0.006424255, 'em'),
   (0.0063777124, 'blrrrd'),
   (0.