In [1]:
import pandas as pd
import pprint

In [2]:
lyrics = pd.read_csv('../genius_lyrics.csv')

LDA Model

In [3]:
docs = lyrics['lyrics'].tolist() #list of Unicode strings

In [4]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

#Removing stopwords:
from sklearn.feature_extraction import text
sw = ['oh','ooh','yeah','na','la','hey','like','da', 'feat',
      'whoa','uh','huh','doh','doo','ha','eh','ay','ayy','ll','re','ve'] #removing sounds & contractions
stop_words= text.ENGLISH_STOP_WORDS.union(sw)

docs = [[token for token in doc if token not in stop_words] for doc in docs]

In [5]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zylst\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

In [7]:
# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)



In [8]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=15, no_above=0.7)

In [9]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [10]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 1483
Number of documents: 1397


In [11]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) #enable logging

In [12]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 3
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

2021-06-05 12:54:53,437 : INFO : using autotuned alpha, starting with [0.33333334, 0.33333334, 0.33333334]
2021-06-05 12:54:53,439 : INFO : using serial LDA version on this node
2021-06-05 12:54:53,441 : INFO : running online (multi-pass) LDA training, 3 topics, 20 passes over the supplied corpus of 1397 documents, updating model once every 1397 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2021-06-05 12:54:53,442 : INFO : PROGRESS: pass 0, at document #1397/1397
2021-06-05 12:54:56,678 : INFO : optimized alpha [0.33333334, 0.33333334, 0.33333334]
2021-06-05 12:54:56,680 : INFO : topic #0 (0.333): 0.019*"just" + 0.017*"love" + 0.013*"baby" + 0.013*"gonna" + 0.012*"time" + 0.011*"girl" + 0.011*"ain" + 0.010*"got" + 0.010*"want" + 0.010*"make"
2021-06-05 12:54:56,680 : INFO : topic #1 (0.333): 0.022*"just" + 0.018*"love" + 0.017*"got" + 0.017*"let" + 0.014*"girl" + 0.014*"wanna" + 0.013*"make" + 0.013*"want" + 0.013*"say" + 0.

2021-06-05 12:55:09,985 : INFO : topic #1 (0.168): 0.028*"love" + 0.025*"just" + 0.021*"let" + 0.021*"baby" + 0.018*"wanna" + 0.018*"girl" + 0.017*"want" + 0.017*"got" + 0.016*"say" + 0.016*"make"
2021-06-05 12:55:09,986 : INFO : topic #2 (0.103): 0.022*"got" + 0.014*"nigga" + 0.012*"bitch" + 0.011*"ain" + 0.011*"just" + 0.010*"night" + 0.009*"baby" + 0.009*"love" + 0.008*"shit" + 0.008*"money"
2021-06-05 12:55:09,987 : INFO : topic diff=0.055619, rho=0.301511
2021-06-05 12:55:09,989 : INFO : PROGRESS: pass 10, at document #1397/1397
2021-06-05 12:55:10,759 : INFO : optimized alpha [0.08021256, 0.17082067, 0.10118449]
2021-06-05 12:55:10,760 : INFO : topic #0 (0.080): 0.019*"gonna" + 0.018*"love" + 0.016*"just" + 0.016*"away" + 0.012*"life" + 0.011*"time" + 0.011*"shake" + 0.010*"baby" + 0.010*"cause" + 0.010*"come"
2021-06-05 12:55:10,761 : INFO : topic #1 (0.171): 0.028*"love" + 0.025*"just" + 0.022*"baby" + 0.021*"let" + 0.018*"wanna" + 0.018*"girl" + 0.017*"want" + 0.017*"got" + 0.

2021-06-05 12:55:18,820 : INFO : topic #1 (0.196): 0.030*"love" + 0.025*"just" + 0.025*"baby" + 0.021*"let" + 0.018*"girl" + 0.018*"wanna" + 0.018*"got" + 0.018*"want" + 0.017*"say" + 0.015*"make"
2021-06-05 12:55:18,821 : INFO : topic #2 (0.094): 0.022*"got" + 0.016*"nigga" + 0.014*"bitch" + 0.012*"ain" + 0.010*"money" + 0.010*"just" + 0.009*"shit" + 0.009*"fuck" + 0.008*"man" + 0.007*"night"
2021-06-05 12:55:18,821 : INFO : topic diff=0.031365, rho=0.218218
2021-06-05 12:55:18,823 : INFO : LdaModel lifecycle event {'msg': 'trained LdaModel(num_terms=1483, num_topics=3, decay=0.5, chunksize=2000) in 25.38s', 'datetime': '2021-06-05T12:55:18.823058', 'gensim': '4.0.1', 'python': '3.8.3 (default, Jul  2 2020, 17:30:36) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}


In [13]:
top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

2021-06-05 12:55:18,857 : INFO : CorpusAccumulator accumulated stats from 1000 documents


Average topic coherence: -1.0647.
[([(0.030440262, 'love'),
   (0.025213324, 'just'),
   (0.024557464, 'baby'),
   (0.021177616, 'let'),
   (0.017882733, 'girl'),
   (0.017807337, 'wanna'),
   (0.017707216, 'got'),
   (0.01761536, 'want'),
   (0.016698467, 'say'),
   (0.015431519, 'make'),
   (0.015335694, 'cause'),
   (0.0122458385, 'time'),
   (0.012028326, 'way'),
   (0.011631641, 'need'),
   (0.011394326, 'feel'),
   (0.0108841555, 'right'),
   (0.00963318, 'come'),
   (0.00949793, 'ain'),
   (0.0083824545, 'good'),
   (0.0071659125, 'thing')],
  -0.8351752105030299),
 ([(0.022234192, 'got'),
   (0.01596885, 'nigga'),
   (0.013837871, 'bitch'),
   (0.0123980595, 'ain'),
   (0.009765779, 'money'),
   (0.009704002, 'just'),
   (0.008844734, 'shit'),
   (0.008834384, 'fuck'),
   (0.008218589, 'man'),
   (0.007367708, 'night'),
   (0.0070624975, 'lil'),
   (0.0070259366, 'em'),
   (0.0067282226, 'look'),
   (0.0065964675, 'girl'),
   (0.0058549847, 'walk'),
   (0.0056868345, 'young'),
