# Install dependencies

In [1]:
%load_ext autoreload
%autoreload 2
import sys
import plotly.io as io
io.templates.default = 'plotly_white'
# # enable modules import
# sys.path.insert(1, '/kaggle/input/modules')

# decade
decade = [2000, 2010]

# Load and preprocess data

In [2]:
from modules.spark_preprocess import SparkSPreprocessor

# SparkPreprocessor read the big csv file and convert it in the more convenient format parquet
#
sp = SparkSPreprocessor("song_lyrics.csv","./data/", "20g")


23/06/03 20:15:44 WARN Utils: Your hostname, thomas-ThinkPad-T490 resolves to a loopback address: 127.0.1.1; using 192.168.31.207 instead (on interface wlp0s20f3)
23/06/03 20:15:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/03 20:15:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [3]:
df = sp.preprocess_data(freq=0.01, seed = 42, sample_by = 'ddecade')

                                                                                

In [4]:
# load data
import pandas as pd

df = pd.read_csv('./data/preprocessed_data.csv')
len(df)

4308

In [5]:
from modules.preprocess_text import clean_lyrics

df = df[df['decade'].isin(decade)]
df = clean_lyrics(df)
len(df)

1078

# Text preprocessing

In [6]:
import spacy
from modules.preprocess_text import ngram_models, ngram_preprocess

# gpu off
print("set gpu: ", spacy.prefer_gpu())

new_nlp = spacy.load('en_core_web_sm')

# get bigram_model
bigram_model, trigram_model = ngram_models(df)

# set personalised stop words
new_stop_words = {
    'like','know','come','get', 'got',
    'go','to','oh','yeah','la', 'lala', 'lalala','ooh','yeah',
    'hey','whoa','woah', 'ohh', 'was', 'mmm',
    'oooh','yah','yeh','mmm', 'hmm','deh','doh','jah','wa',
} 

set gpu:  False


# Grid search based on LDA

In [7]:
from modules.lda_models import LDATopicModeling
import logging

gensim_log = './gensim.log'

with open(gensim_log, 'w'):
    pass

# Remove all handlers associated with the root logger object.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

#initiate log file
logging.basicConfig(
    filename = gensim_log,
    filemode = 'r+',
    format='%(asctime)s:%(levelname)s:%(message)s',
    level=logging.INFO)

ngram_model = LDATopicModeling(
    df,
    gensim_log=gensim_log,
    decade = decade,
    directory = "./models/",
    lang_preprocess = lambda x : ngram_preprocess(
                        x, new_nlp, bigram_model,
                        trigram_model, new_stop_words),
    grid_search = True,
    n_topics=20,
    chunks=2000,
    worker_nodes=4)

total lda computation:  32
coherence cv:0.2897663113836216, coherence umass:-1.9068068350171288
alpha:symmetric
eta:symmetric
topic:5
coherence cv:0.3563558309656766, coherence umass:-1.8021411961196794
alpha:asymmetric
eta:symmetric
topic:5
coherence cv:0.3447801794915762, coherence umass:-2.910705849977259
alpha:symmetric
eta:symmetric
topic:6
coherence cv:0.33535898292023253, coherence umass:-2.667348514284416
alpha:asymmetric
eta:symmetric
topic:6
coherence cv:0.3242092774065381, coherence umass:-2.2566498159804405
alpha:symmetric
eta:symmetric
topic:7
coherence cv:0.3088190968785892, coherence umass:-2.2883956094283517
alpha:asymmetric
eta:symmetric
topic:7
coherence cv:0.31727709972883644, coherence umass:-2.330358657300537
alpha:symmetric
eta:symmetric
topic:8
coherence cv:0.31341632878850867, coherence umass:-2.965881023122453
alpha:asymmetric
eta:symmetric
topic:8
coherence cv:0.3152761768257226, coherence umass:-2.2400118466260306
alpha:symmetric
eta:symmetric
topic:9
coheren

In [8]:
ngram_model.plot_tsne(2)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1078 samples in 0.012s...
[t-SNE] Computed neighbors for 1078 samples in 0.047s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1078
[t-SNE] Computed conditional probabilities for sample 1078 / 1078
[t-SNE] Mean sigma: 0.009663
[t-SNE] KL divergence after 250 iterations with early exaggeration: 57.911465
[t-SNE] KL divergence after 1000 iterations: 0.403649


In [9]:
ngram_model.dashboard_LDAvis()

In [10]:
ngram_model.plot_likelihood(30)

In [11]:
ngram_model.plot_coherence()

In [12]:
ngram_model.get_cv_results

Unnamed: 0,topics,alpha,eta,c_v,u_mass
0,5,symmetric,symmetric,0.289766,-1.906807
1,5,asymmetric,symmetric,0.356356,-1.802141
2,6,symmetric,symmetric,0.34478,-2.910706
3,6,asymmetric,symmetric,0.335359,-2.667349
4,7,symmetric,symmetric,0.324209,-2.25665
5,7,asymmetric,symmetric,0.308819,-2.288396
6,8,symmetric,symmetric,0.317277,-2.330359
7,8,asymmetric,symmetric,0.313416,-2.965881
8,9,symmetric,symmetric,0.315276,-2.240012
9,9,asymmetric,symmetric,0.311518,-2.766915


In [13]:
ngram_model.dashboard()

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1078 samples in 0.001s...
[t-SNE] Computed neighbors for 1078 samples in 0.042s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1078
[t-SNE] Computed conditional probabilities for sample 1078 / 1078
[t-SNE] Mean sigma: 0.009644
[t-SNE] KL divergence after 250 iterations with early exaggeration: 57.762741
[t-SNE] KL divergence after 1000 iterations: 0.401526


In [14]:
ngram_model.save_current_model()

# U_MASS Coherence grid search

In [15]:
ngram_model = LDATopicModeling(
    df,
    gensim_log=gensim_log,
    decade = decade,
    lang_preprocess = lambda x : ngram_preprocess(
                        x, new_nlp, bigram_model,
                        trigram_model, new_stop_words),
    grid_search = True,
    n_topics=20,
    chunks=2000,
    directory = "./models/",
    metric='u_mass',
    worker_nodes=4)

total lda computation:  32


KeyboardInterrupt: 

In [None]:
ngram_model.plot_tsne(2)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1078 samples in 0.001s...
[t-SNE] Computed neighbors for 1078 samples in 0.018s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1078
[t-SNE] Computed conditional probabilities for sample 1078 / 1078
[t-SNE] Mean sigma: 0.001681
[t-SNE] KL divergence after 250 iterations with early exaggeration: 50.063606
[t-SNE] KL divergence after 1000 iterations: 0.225869


In [None]:
ngram_model.dashboard_LDAvis()

In [None]:
ngram_model.plot_likelihood(30)

In [None]:
ngram_model.plot_coherence()

In [None]:
ngram_model.get_cv_results

Unnamed: 0,topics,alpha,eta,c_v,u_mass
0,5,symmetric,symmetric,0.32991,-1.826272
1,5,asymmetric,symmetric,0.334278,-1.770744
2,6,symmetric,symmetric,0.323198,-1.981309
3,6,asymmetric,symmetric,0.289905,-2.137746
4,7,symmetric,symmetric,0.351197,-2.975774
5,7,asymmetric,symmetric,0.300282,-2.009404
6,8,symmetric,symmetric,0.352095,-2.654706
7,8,asymmetric,symmetric,0.320459,-2.193589
8,9,symmetric,symmetric,0.318636,-2.091282
9,9,asymmetric,symmetric,0.344218,-2.814635


In [None]:
ngram_model.dashboard()

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1078 samples in 0.001s...
[t-SNE] Computed neighbors for 1078 samples in 0.019s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1078
[t-SNE] Computed conditional probabilities for sample 1078 / 1078
[t-SNE] Mean sigma: 0.001680
[t-SNE] KL divergence after 250 iterations with early exaggeration: 50.025101
[t-SNE] KL divergence after 1000 iterations: 0.226227


In [None]:
ngram_model.save_current_model()

# Bertopic

Preprocess data before clustering (not advised by documentation).

In [None]:
# lyrics preprocessing
preprocess_lyrics = df['lyrics'] \
    .apply(lambda x : ' '.join(
            ngram_preprocess(
            x, new_nlp, bigram_model, 
            trigram_model, new_stop_words)))

# clean lyrics
docs = preprocess_lyrics.values

In [None]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP

sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs)

umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0)
hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True)

# Train BERTopic
topic_model = BERTopic(nr_topics = 30, umap_model=umap_model, hdbscan_model=hdbscan_model).fit(docs, embeddings)


The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.


The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.


The 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-j

Batches:   0%|          | 0/34 [00:00<?, ?it/s]

In [None]:
# Run the visualization with the original embeddings
topic_model.visualize_documents(docs, embeddings=embeddings)

# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

In [None]:
topic_model.visualize_documents(docs,reduced_embeddings=reduced_embeddings)

In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.visualize_hierarchy()


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead



In [None]:
topic_model.visualize_barchart()

In [None]:
topic_model.visualize_heatmap()

In [None]:
topic_model.visualize_term_rank()

In [None]:
from modules.bertopic_models import compute_coherence

print('cv :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'c_v'))
print('umass :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'u_mass'))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
topics_per_class = topic_model.topics_per_class(docs, classes=df['tag'].tolist())
topic_model.visualize_topics_per_class(topics_per_class)

In [None]:
from modules.bertopic_models import save_bertopic_model

save_bertopic_model(topic_model, filename = 'bertopic_prepro_', model_dir = "./models")

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

docs = df['lyrics'].values

# apply embeddings to doc
embeddings = sentence_model.encode(docs)

# Train BERTopic
vectorizer_model = CountVectorizer(
    stop_words=list(new_nlp.Defaults.stop_words | new_stop_words))

# Train BERTopic
topic_model = BERTopic(nr_topics = 30,vectorizer_model=vectorizer_model, umap_model=umap_model, hdbscan_model=hdbscan_model).fit(docs, embeddings)

Batches:   0%|          | 0/34 [00:00<?, ?it/s]

In [None]:
# Run the visualization with the original embeddings
topic_model.visualize_documents(docs, embeddings=embeddings)

# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

In [None]:
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.visualize_hierarchy()


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead



In [None]:
topic_model.visualize_barchart()

In [None]:
topic_model.visualize_heatmap()

In [None]:
topic_model.visualize_term_rank()

In [None]:
print('cv :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'c_v'))
print('umass :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'u_mass'))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
topics_per_class = topic_model.topics_per_class(docs, classes=df['tag'].tolist())
topic_model.visualize_topics_per_class(topics_per_class)

In [None]:
save_bertopic_model(topic_model, filename = 'bertopic_', model_dir = "./models")

In [None]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

le.fit(sorted(df['tag'].unique()))

y = le.transform(df['tag'].tolist())


In [None]:
from bertopic.vectorizers import ClassTfidfTransformer

ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

topic_model = BERTopic(nr_topics = 30, verbose=True, ctfidf_model=ctfidf_model,
                       umap_model=umap_model, hdbscan_model=hdbscan_model).fit(docs, y=y, embeddings = embeddings)

2023-06-03 17:55:36,291 - BERTopic - Reduced dimensionality
2023-06-03 17:55:36,313 - BERTopic - Clustered reduced embeddings
2023-06-03 17:55:36,641 - BERTopic - Reduced number of topics from 5 to 5


In [None]:
# Run the visualization with the original embeddings
topic_model.visualize_documents(docs, embeddings=embeddings)

# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

In [None]:
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

In [None]:
topic_model.visualize_hierarchy()


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead


scipy.array is deprecated and will be removed in SciPy 2.0.0, use numpy.array instead



In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.visualize_barchart()

In [None]:
topic_model.visualize_heatmap()

In [None]:
topic_model.visualize_term_rank()

In [None]:
topics_per_class = topic_model.topics_per_class(docs, classes=df['tag'].tolist())
topic_model.visualize_topics_per_class(topics_per_class)

5it [00:00, 20.22it/s]


In [None]:
print('cv :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'c_v'))
print('umass :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'u_mass'))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
save_bertopic_model(topic_model, filename = 'bertopic_semsup_', model_dir = "./working/models")

# Bertopic with brunokreiner lyrics transformer

In [None]:
from sentence_transformers import SentenceTransformer

lyrics_model = SentenceTransformer('brunokreiner/lyrics-bert')

# apply embeddings to doc
embeddings = lyrics_model.encode(docs)

# Train BERTopic
vectorizer_model = CountVectorizer(
    stop_words=list(new_nlp.Defaults.stop_words | new_stop_words))
topic_model = BERTopic(nr_topics = 30, vectorizer_model=vectorizer_model, umap_model=umap_model, hdbscan_model=hdbscan_model).fit(docs, embeddings)

Batches:   0%|          | 0/34 [00:00<?, ?it/s]

2023-06-03 17:56:22,838 - BERTopic - Reduced dimensionality
2023-06-03 17:56:22,876 - BERTopic - Clustered reduced embeddings
2023-06-03 17:56:23,222 - BERTopic - Reduced number of topics from 10 to 10


In [None]:
# Run the visualization with the original embeddings
topic_model.visualize_documents(docs, embeddings=embeddings)

# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

In [None]:
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.visualize_barchart()

In [None]:
topics_per_class = topic_model.topics_per_class(docs, classes=df['tag'].tolist())
topic_model.visualize_topics_per_class(topics_per_class)

In [None]:
print('cv :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'c_v'))
print('umass :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'u_mass'))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
save_bertopic_model(topic_model, filename = 'bertopic_lyricsBert_', model_dir = "./models")