# Install dependencies

In [1]:
%load_ext autoreload
%autoreload 2
import sys
import plotly.io as io
io.templates.default = 'plotly_white'
# # enable modules import
# sys.path.insert(1, '/kaggle/input/modules')

# decade
decade = [1980, 1990]

# Load and preprocess data

In [2]:
from modules.spark_preprocess import SparkSPreprocessor

# SparkPreprocessor read the big csv file and convert it in the more convenient format parquet
#
sp = SparkSPreprocessor("song_lyrics.csv","./data/", "20g")


23/06/03 16:50:06 WARN Utils: Your hostname, thomas-ThinkPad-T490 resolves to a loopback address: 127.0.1.1; using 192.168.31.207 instead (on interface wlp0s20f3)
23/06/03 16:50:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/03 16:50:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [3]:
df = sp.preprocess_data(freq=0.01, seed = 42, sample_by = 'ddecade')

                                                                                

In [4]:
# load data
import pandas as pd

df = pd.read_csv('./data/preprocessed_data.csv')
len(df)

4308

In [5]:
from modules.preprocess_text import clean_lyrics

df = df[df['decade'].isin(decade)]
df = clean_lyrics(df)
len(df)

1095

# Text preprocessing

In [6]:
import spacy
from modules.preprocess_text import ngram_models, ngram_preprocess

# gpu off
print("set gpu: ", spacy.prefer_gpu())

new_nlp = spacy.load('en_core_web_sm')

# get bigram_model
bigram_model, trigram_model = ngram_models(df)

# set personalised stop words
new_stop_words = {
    'like','know','come','get', 'got',
    'go','to','oh','yeah','la', 'lala', 'lalala','ooh','yeah',
    'hey','whoa','woah', 'ohh', 'was', 'mmm',
    'oooh','yah','yeh','mmm', 'hmm','deh','doh','jah','wa',
} 

set gpu:  False


# Grid search based on LDA

In [7]:
from modules.lda_models import LDATopicModeling
import logging

gensim_log = './gensim.log'

with open(gensim_log, 'w'):
    pass

# Remove all handlers associated with the root logger object.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

#initiate log file
logging.basicConfig(
    filename = gensim_log,
    filemode = 'r+',
    format='%(asctime)s:%(levelname)s:%(message)s',
    level=logging.INFO)

ngram_model = LDATopicModeling(
    df,
    gensim_log=gensim_log,
    decade = decade,
    directory = "./models/",
    lang_preprocess = lambda x : ngram_preprocess(
                        x, new_nlp, bigram_model,
                        trigram_model, new_stop_words),
    grid_search = True,
    n_topics=20,
    chunks=2000,
    worker_nodes=4)

total lda computation:  32
coherence cv:0.26626775794320423, coherence umass:-2.385091138280815
alpha:symmetric
eta:symmetric
topic:5
coherence cv:0.3459887464764611, coherence umass:-3.2107966765843634
alpha:asymmetric
eta:symmetric
topic:5
coherence cv:0.2962170961667201, coherence umass:-2.9830026810482084
alpha:symmetric
eta:symmetric
topic:6
coherence cv:0.3132522276602579, coherence umass:-2.6941937338820097
alpha:asymmetric
eta:symmetric
topic:6
coherence cv:0.2555799900092339, coherence umass:-2.585950767164519
alpha:symmetric
eta:symmetric
topic:7
coherence cv:0.3083829771705017, coherence umass:-2.9809536257556744
alpha:asymmetric
eta:symmetric
topic:7
coherence cv:0.310723395859646, coherence umass:-3.1634219069318696
alpha:symmetric
eta:symmetric
topic:8
coherence cv:0.29220207895849926, coherence umass:-3.380749187489823
alpha:asymmetric
eta:symmetric
topic:8
coherence cv:0.29810529195831675, coherence umass:-3.2530217774665173
alpha:symmetric
eta:symmetric
topic:9
coheren

In [8]:
ngram_model.plot_tsne(2)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1095 samples in 0.000s...
[t-SNE] Computed neighbors for 1095 samples in 0.189s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1095
[t-SNE] Computed conditional probabilities for sample 1095 / 1095
[t-SNE] Mean sigma: 0.022038
[t-SNE] KL divergence after 250 iterations with early exaggeration: 54.734928
[t-SNE] KL divergence after 1000 iterations: 0.386103


In [9]:
ngram_model.dashboard_LDAvis()

In [10]:
ngram_model.plot_likelihood(30)

In [11]:
ngram_model.plot_coherence()

In [12]:
ngram_model.get_cv_results

Unnamed: 0,topics,alpha,eta,c_v,u_mass
0,5,symmetric,symmetric,0.266268,-2.385091
1,5,asymmetric,symmetric,0.345989,-3.210797
2,6,symmetric,symmetric,0.296217,-2.983003
3,6,asymmetric,symmetric,0.313252,-2.694194
4,7,symmetric,symmetric,0.25558,-2.585951
5,7,asymmetric,symmetric,0.308383,-2.980954
6,8,symmetric,symmetric,0.310723,-3.163422
7,8,asymmetric,symmetric,0.292202,-3.380749
8,9,symmetric,symmetric,0.298105,-3.253022
9,9,asymmetric,symmetric,0.310806,-3.724868


In [13]:
ngram_model.dashboard()

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1095 samples in 0.000s...
[t-SNE] Computed neighbors for 1095 samples in 0.158s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1095
[t-SNE] Computed conditional probabilities for sample 1095 / 1095
[t-SNE] Mean sigma: 0.022037
[t-SNE] KL divergence after 250 iterations with early exaggeration: 54.588524
[t-SNE] KL divergence after 1000 iterations: 0.389059


In [14]:
ngram_model.save_current_model()

# U_MASS Coherence grid search

In [15]:
ngram_model = LDATopicModeling(
    df,
    gensim_log=gensim_log,
    decade = decade,
    lang_preprocess = lambda x : ngram_preprocess(
                        x, new_nlp, bigram_model,
                        trigram_model, new_stop_words),
    grid_search = True,
    n_topics=20,
    chunks=2000,
    directory = "./models/",
    metric='u_mass',
    worker_nodes=4)

total lda computation:  32
coherence cv:0.3212332926334624, coherence umass:-2.537261163077799
alpha:symmetric
eta:symmetric
topic:5
coherence cv:0.27929165548103674, coherence umass:-2.541326152114862
alpha:asymmetric
eta:symmetric
topic:5
coherence cv:0.2480469302769439, coherence umass:-2.3259676249008487
alpha:symmetric
eta:symmetric
topic:6
coherence cv:0.2998066375898555, coherence umass:-2.563849397725305
alpha:asymmetric
eta:symmetric
topic:6
coherence cv:0.32143996053316354, coherence umass:-3.834739818212864
alpha:symmetric
eta:symmetric
topic:7
coherence cv:0.2698165156715153, coherence umass:-2.5819810120891695
alpha:asymmetric
eta:symmetric
topic:7
coherence cv:0.2630723300510288, coherence umass:-3.198287766449436
alpha:symmetric
eta:symmetric
topic:8
coherence cv:0.297259087993039, coherence umass:-3.012206747015346
alpha:asymmetric
eta:symmetric
topic:8
coherence cv:0.305128568523609, coherence umass:-3.37241564087471
alpha:symmetric
eta:symmetric
topic:9
coherence cv:0

In [16]:
ngram_model.plot_tsne(2)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1095 samples in 0.001s...
[t-SNE] Computed neighbors for 1095 samples in 0.019s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1095
[t-SNE] Computed conditional probabilities for sample 1095 / 1095
[t-SNE] Mean sigma: 0.003274
[t-SNE] KL divergence after 250 iterations with early exaggeration: 54.809528
[t-SNE] KL divergence after 1000 iterations: 0.304494


In [17]:
ngram_model.dashboard_LDAvis()

In [18]:
ngram_model.plot_likelihood(30)

In [19]:
ngram_model.plot_coherence()

In [20]:
ngram_model.get_cv_results

Unnamed: 0,topics,alpha,eta,c_v,u_mass
0,5,symmetric,symmetric,0.321233,-2.537261
1,5,asymmetric,symmetric,0.279292,-2.541326
2,6,symmetric,symmetric,0.248047,-2.325968
3,6,asymmetric,symmetric,0.299807,-2.563849
4,7,symmetric,symmetric,0.32144,-3.83474
5,7,asymmetric,symmetric,0.269817,-2.581981
6,8,symmetric,symmetric,0.263072,-3.198288
7,8,asymmetric,symmetric,0.297259,-3.012207
8,9,symmetric,symmetric,0.305129,-3.372416
9,9,asymmetric,symmetric,0.319656,-3.726587


In [21]:
ngram_model.dashboard()

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1095 samples in 0.001s...
[t-SNE] Computed neighbors for 1095 samples in 0.022s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1095
[t-SNE] Computed conditional probabilities for sample 1095 / 1095
[t-SNE] Mean sigma: 0.003270
[t-SNE] KL divergence after 250 iterations with early exaggeration: 54.334614
[t-SNE] KL divergence after 1000 iterations: 0.301103


In [22]:
ngram_model.save_current_model()

# Bertopic

Preprocess data before clustering (not advised by documentation).

In [23]:
# lyrics preprocessing
preprocess_lyrics = df['lyrics'] \
    .apply(lambda x : ' '.join(
            ngram_preprocess(
            x, new_nlp, bigram_model, 
            trigram_model, new_stop_words)))

# clean lyrics
docs = preprocess_lyrics.values

In [24]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP

sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs)

umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True)

# Train BERTopic
topic_model = BERTopic(nr_topics = 30, umap_model=umap_model, hdbscan_model=hdbscan_model).fit(docs, embeddings)


The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.


The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.


The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-j

Batches:   0%|          | 0/35 [00:00<?, ?it/s]

In [25]:
# Run the visualization with the original embeddings
topic_model.visualize_documents(docs, embeddings=embeddings)

# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

In [26]:
topic_model.visualize_documents(docs,reduced_embeddings=reduced_embeddings)

In [27]:
topic_model.visualize_topics()

In [28]:
topic_model.visualize_hierarchy()


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead



In [29]:
topic_model.visualize_barchart()

In [30]:
topic_model.visualize_heatmap()

In [31]:
topic_model.visualize_term_rank()

In [32]:
from modules.bertopic_models import compute_coherence

print('cv :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'c_v'))
print('umass :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'u_mass'))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [33]:
topics_per_class = topic_model.topics_per_class(docs, classes=df['tag'].tolist())
topic_model.visualize_topics_per_class(topics_per_class)

In [34]:
from modules.bertopic_models import save_bertopic_model

save_bertopic_model(topic_model, filename = 'bertopic_prepro_', model_dir = "./models")

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

docs = df['lyrics'].values

# apply embeddings to doc
embeddings = sentence_model.encode(docs)

# Train BERTopic
vectorizer_model = CountVectorizer(
    stop_words=list(new_nlp.Defaults.stop_words | new_stop_words))

# Train BERTopic
topic_model = BERTopic(nr_topics = 30,vectorizer_model=vectorizer_model, umap_model=umap_model, hdbscan_model=hdbscan_model).fit(docs, embeddings)

Batches:   0%|          | 0/35 [00:00<?, ?it/s]

In [36]:
# Run the visualization with the original embeddings
topic_model.visualize_documents(docs, embeddings=embeddings)

# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

In [37]:
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

In [38]:
topic_model.visualize_topics()

In [39]:
topic_model.visualize_hierarchy()


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead



In [40]:
topic_model.visualize_barchart()

In [41]:
topic_model.visualize_heatmap()

In [42]:
topic_model.visualize_term_rank()

In [43]:
print('cv :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'c_v'))
print('umass :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'u_mass'))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [44]:
topics_per_class = topic_model.topics_per_class(docs, classes=df['tag'].tolist())
topic_model.visualize_topics_per_class(topics_per_class)

In [45]:
save_bertopic_model(topic_model, filename = 'bertopic_', model_dir = "./models")

In [46]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

le.fit(sorted(df['tag'].unique()))

y = le.transform(df['tag'].tolist())


In [47]:
from bertopic.vectorizers import ClassTfidfTransformer

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

topic_model = BERTopic(nr_topics = 30, verbose=True, ctfidf_model=ctfidf_model,
                       umap_model=umap_model, hdbscan_model=hdbscan_model).fit(docs, y=y, embeddings = embeddings)

2023-06-03 17:18:08,304 - BERTopic - Reduced dimensionality
2023-06-03 17:18:08,332 - BERTopic - Clustered reduced embeddings
2023-06-03 17:18:08,652 - BERTopic - Reduced number of topics from 6 to 6


In [48]:
# Run the visualization with the original embeddings
topic_model.visualize_documents(docs, embeddings=embeddings)

# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

In [49]:
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

In [50]:
topic_model.visualize_hierarchy()


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead



In [51]:
topic_model.visualize_topics()

In [52]:
topic_model.visualize_barchart()

In [53]:
topic_model.visualize_heatmap()

In [54]:
topic_model.visualize_term_rank()

In [55]:
topics_per_class = topic_model.topics_per_class(docs, classes=df['tag'].tolist())
topic_model.visualize_topics_per_class(topics_per_class)

5it [00:00, 21.86it/s]


In [56]:
print('cv :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'c_v'))
print('umass :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'u_mass'))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [57]:
save_bertopic_model(topic_model, filename = 'bertopic_semsup_', model_dir = "./working/models")

# Bertopic with brunokreiner lyrics transformer

In [58]:
from sentence_transformers import SentenceTransformer

lyrics_model = SentenceTransformer('brunokreiner/lyrics-bert')

# apply embeddings to doc
embeddings = lyrics_model.encode(docs)

# Train BERTopic
vectorizer_model = CountVectorizer(
    stop_words=list(new_nlp.Defaults.stop_words | new_stop_words))
topic_model = BERTopic(nr_topics = 30, vectorizer_model=vectorizer_model, umap_model=umap_model, hdbscan_model=hdbscan_model).fit(docs, embeddings)

Batches:   0%|          | 0/35 [00:00<?, ?it/s]

2023-06-03 17:18:51,781 - BERTopic - Reduced dimensionality
2023-06-03 17:18:51,815 - BERTopic - Clustered reduced embeddings
2023-06-03 17:18:52,174 - BERTopic - Reduced number of topics from 3 to 3


In [59]:
# Run the visualization with the original embeddings
topic_model.visualize_documents(docs, embeddings=embeddings)

# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

In [60]:
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

In [61]:
topic_model.visualize_topics()

ValueError: zero-size array to reduction operation maximum which has no identity

In [62]:
topic_model.visualize_barchart()

In [63]:
topics_per_class = topic_model.topics_per_class(docs, classes=df['tag'].tolist())
topic_model.visualize_topics_per_class(topics_per_class)

In [64]:
print('cv :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'c_v'))
print('umass :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'u_mass'))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [65]:
save_bertopic_model(topic_model, filename = 'bertopic_lyricsBert_', model_dir = "./models")