# Install dependencies

In [None]:
import sys

# Make the project's helper modules importable by flat name
# (spark_preprocess, preprocess_text, lda_models, bertopic_models are all
# imported that way further down). Inserted at index 1 so the first sys.path
# entry keeps its priority.
sys.path.insert(1, '/kaggle/input/modules')

# Decade values to keep: used below to filter df on its 'decade' column and
# passed to LDATopicModeling.
decade = [1960, 1970]

In [None]:
# install dependencies
!pip install pyspark
!pip install bertopic

# Load and preprocess data

In [None]:
from spark_preprocess import SparkSPreprocessor

# SparkSPreprocessor reads the large source CSV and converts it to parquet,
# a more convenient format. It is given the CSV location, the parquet
# destination (directory + file name) and the Spark driver memory.
sp = SparkSPreprocessor(
    input_path="/kaggle/input/genius-song-lyrics-with-language-information/song_lyrics.csv",
    parquet_path="/kaggle/working/data/",
    parquet_name="song_lyrics.parquet",
    driver_memory="20g",
)

In [None]:
# Run the Spark preprocessing with a fixed seed (42) and keep its result in df.
# NOTE(review): sample_by='ddecade' looks like a typo for 'decade' (the column
# filtered on later in this notebook) — confirm against
# SparkSPreprocessor.preprocess_data before changing it.
# presumably freq is the sampling fraction — TODO confirm.
df = sp.preprocess_data(freq=0.01, seed = 42, sample_by = 'ddecade')

In [None]:
# Load the preprocessed data from CSV.
# This rebinds `df`, replacing the value returned by sp.preprocess_data() in
# the previous cell.
import pandas as pd

# The path is relative, so it resolves against the current working directory.
# NOTE(review): presumably this file is written by sp.preprocess_data() — confirm.
df = pd.read_csv('./data/preprocessed_data.csv')
# Cell output: number of rows loaded.
len(df)

In [None]:
from preprocess_text import clean_lyrics

# Keep only the rows whose 'decade' is one of the selected decades, then run
# the project's lyric cleaning on that subset.
df = clean_lyrics(df.loc[df['decade'].isin(decade)])

# Cell output: number of rows left after filtering and cleaning.
len(df)

# Text preprocessing

In [None]:
import spacy
from preprocess_text import ngram_models, ngram_preprocess

# Ask spaCy to use the GPU when one is available and print the boolean it
# returns (True if the GPU was activated).
print("set gpu: ", spacy.prefer_gpu())

# Small English spaCy pipeline used by the n-gram preprocessing below.
new_nlp = spacy.load('en_core_web_sm')

# Build both n-gram models (bigram and trigram) from the filtered data.
bigram_model, trigram_model = ngram_models(df)

# Extra, corpus-specific stop words: filler verbs and sung interjections.
new_stop_words = {
    'like', 'know', 'come', 'get', 'got', 'go', 'to', 'was',
    'oh', 'ohh', 'ooh', 'oooh', 'yeah', 'yah', 'yeh',
    'la', 'lala', 'lalala',
    'hey', 'whoa', 'woah',
    'mmm', 'hmm',
    'deh', 'doh', 'jah', 'wa',
}

# Grid search based on LDA

In [None]:
from lda_models import LDATopicModeling
import logging

# File that receives the INFO-level log output; its path is also handed to
# LDATopicModeling below.
gensim_log = '/kaggle/working/gensim.log'

# Create (or truncate) the log file up front: the handler configured below
# opens it with mode 'r+', which requires the file to already exist.
with open(gensim_log, 'w'):
    pass

# Detach every handler already installed on the root logger, otherwise
# basicConfig() below would be a no-op. Each handler is also closed so that
# re-running this cell does not leak the previously opened log file handle.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
    handler.close()

# Route root-logger records (INFO and above) to the log file.
logging.basicConfig(
    filename = gensim_log,
    filemode = 'r+',
    format='%(asctime)s:%(levelname)s:%(message)s',
    level=logging.INFO)

# LDA model over the selected decades with grid search enabled.
# Each document is preprocessed through ngram_preprocess using the spaCy
# pipeline, the bigram/trigram models and the custom stop words built above.
# No `metric` is passed here, so LDATopicModeling's default is used
# (presumably c_v, given the separate u_mass run later — TODO confirm).
ngram_model = LDATopicModeling(
    df,
    gensim_log=gensim_log,
    decade=decade,
    lang_preprocess=lambda lyrics: ngram_preprocess(
        lyrics, new_nlp, bigram_model, trigram_model, new_stop_words
    ),
    grid_search=True,
    n_topics=20,
    chunks=2000,
    worker_nodes=4,
)

In [None]:
ngram_model.plot_tsne(2)

In [None]:
ngram_model.dashboard_LDAvis()

In [None]:
ngram_model.plot_likelihood(30)

In [None]:
ngram_model.plot_coherence()

In [None]:
# Display the model's `get_cv_results` (presumably the grid-search results).
# NOTE(review): this is referenced without parentheses, so it only shows
# results if `get_cv_results` is a property/attribute; if it is a method it
# must be called — confirm against lda_models.
ngram_model.get_cv_results

In [None]:
ngram_model.dashboard()

In [None]:
ngram_model.save_current_model()

# U_MASS Coherence grid search

In [None]:
# Same LDA grid search as before, but scored with metric='u_mass'.
# This rebinds `ngram_model`; the previous model was saved with
# save_current_model() in the cell above this section.
ngram_model = LDATopicModeling(
    df,
    gensim_log=gensim_log,
    decade=decade,
    lang_preprocess=lambda lyrics: ngram_preprocess(
        lyrics, new_nlp, bigram_model, trigram_model, new_stop_words
    ),
    grid_search=True,
    n_topics=20,
    chunks=2000,
    metric='u_mass',
    worker_nodes=4,
)

In [None]:
ngram_model.plot_tsne(2)

In [None]:
ngram_model.dashboard_LDAvis()

In [None]:
ngram_model.plot_likelihood(30)

In [None]:
ngram_model.plot_coherence()

In [None]:
# Display the u_mass model's `get_cv_results` (presumably the grid-search results).
# NOTE(review): this is referenced without parentheses, so it only shows
# results if `get_cv_results` is a property/attribute; if it is a method it
# must be called — confirm against lda_models.
ngram_model.get_cv_results

In [None]:
ngram_model.dashboard()

In [None]:
ngram_model.save_current_model()

# Bertopic

Preprocess the lyrics before clustering (note: the BERTopic documentation advises against preprocessing the input text).

In [None]:
# Run every lyric through the n-gram preprocessing and join the result back
# into a single space-separated string per song.
preprocess_lyrics = df['lyrics'].apply(
    lambda lyrics: ' '.join(
        ngram_preprocess(
            lyrics, new_nlp, bigram_model, trigram_model, new_stop_words
        )
    )
)

# Preprocessed documents as an array, in the same row order as df.
docs = preprocess_lyrics.values

In [None]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP

# Embed the documents once up front so the same embeddings can be reused for
# fitting and for the visualisations below.
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs)

# Dimensionality reduction and clustering models; both are reused by the
# later BERTopic fits in this notebook.
# random_state is fixed because UMAP is stochastic: without a seed the topics
# change on every run and the results cannot be reproduced.
umap_model = UMAP(n_components=5, n_neighbors=15, min_dist=0.0, random_state=42)
hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True)

# Train BERTopic on the preprocessed lyrics, passing the precomputed embeddings.
topic_model = BERTopic(nr_topics = 30, umap_model=umap_model, hdbscan_model=hdbscan_model).fit(docs, embeddings)

In [None]:
# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

# Run the visualization with the original embeddings.
# Kept as the last expression of the cell so the returned figure is rendered;
# it used to be followed by an assignment, so the figure was computed and
# then silently discarded.
topic_model.visualize_documents(docs, embeddings=embeddings)

In [None]:
topic_model.visualize_documents(docs,reduced_embeddings=reduced_embeddings)

In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.visualize_hierarchy()

In [None]:
topic_model.visualize_barchart()

In [None]:
topic_model.visualize_heatmap()

In [None]:
topic_model.visualize_term_rank()

In [None]:
# Import by flat module name: sys.path contains '/kaggle/input/modules'
# itself, so `modules.bertopic_models` is not resolvable, while
# `bertopic_models` is (the same form is used for save_bertopic_model and
# for every other project import in this notebook).
from bertopic_models import compute_coherence

# Coherence of the fitted topic model under both metrics.
print('cv :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'c_v'))
print('umass :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'u_mass'))

In [None]:
topics_per_class = topic_model.topics_per_class(docs, classes=df['tag'].tolist())
topic_model.visualize_topics_per_class(topics_per_class)

In [None]:
from bertopic_models import save_bertopic_model

save_bertopic_model(topic_model, filename = 'bertopic_prepro_', model_dir = "/kaggle/working/models")

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Second BERTopic variant: fit on the raw lyrics instead of the preprocessed
# ones. This rebinds `docs`, `embeddings` and `topic_model`.
docs = df['lyrics'].values

# Embed the raw lyrics with the same sentence model as before.
embeddings = sentence_model.encode(docs)

# Stop words (spaCy defaults plus the custom set) are removed only in the
# vectorizer, i.e. from the topic representations, not from the text that
# gets embedded.
vectorizer_model = CountVectorizer(
    stop_words=list(new_nlp.Defaults.stop_words.union(new_stop_words))
)

# Train BERTopic, reusing the UMAP and HDBSCAN models defined earlier.
topic_model = BERTopic(
    nr_topics=30,
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
).fit(docs, embeddings)

In [None]:
# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

# Run the visualization with the original embeddings.
# Kept as the last expression of the cell so the returned figure is rendered;
# it used to be followed by an assignment, so the figure was computed and
# then silently discarded.
topic_model.visualize_documents(docs, embeddings=embeddings)

In [None]:
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.visualize_hierarchy()

In [None]:
topic_model.visualize_barchart()

In [None]:
topic_model.visualize_heatmap()

In [None]:
topic_model.visualize_term_rank()

In [None]:
print('cv :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'c_v'))
print('umass :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'u_mass'))

In [None]:
topics_per_class = topic_model.topics_per_class(docs, classes=df['tag'].tolist())
topic_model.visualize_topics_per_class(topics_per_class)

In [None]:
save_bertopic_model(topic_model, filename = 'bertopic_', model_dir = "/kaggle/working/models")

In [None]:
from sklearn import preprocessing
# Fit a label encoder on the sorted distinct values of the 'tag' column.
le = preprocessing.LabelEncoder().fit(sorted(df['tag'].unique()))

# Integer label for every row of df, in row order; passed as `y` to the
# BERTopic fit in the next cell.
y = le.transform(df['tag'].tolist())


In [None]:
from bertopic.vectorizers import ClassTfidfTransformer

# `ctfidf_model` was passed to BERTopic below without ever being defined in
# this notebook, which raises NameError on a fresh kernel. A default
# ClassTfidfTransformer is what BERTopic builds itself when no c-TF-IDF model
# is given, so this keeps the default behaviour explicit.
# NOTE(review): set its options here if a customised c-TF-IDF was intended.
ctfidf_model = ClassTfidfTransformer()

# BERTopic variant guided by labels: the encoded tags are passed as `y`.
topic_model = BERTopic(nr_topics = 30, verbose=True, ctfidf_model=ctfidf_model,
                       umap_model=umap_model, hdbscan_model=hdbscan_model).fit(docs, y=y, embeddings = embeddings)

In [None]:
# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

# Run the visualization with the original embeddings.
# Kept as the last expression of the cell so the returned figure is rendered;
# it used to be followed by an assignment, so the figure was computed and
# then silently discarded.
topic_model.visualize_documents(docs, embeddings=embeddings)

In [None]:
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

In [None]:
topic_model.visualize_hierarchy()

In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.visualize_barchart()

In [None]:
topic_model.visualize_heatmap()

In [None]:
topic_model.visualize_term_rank()

In [None]:
topics_per_class = topic_model.topics_per_class(docs, classes=df['tag'].tolist())
topic_model.visualize_topics_per_class(topics_per_class)

In [None]:
print('cv :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'c_v'))
print('umass :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'u_mass'))

In [None]:
save_bertopic_model(topic_model, filename = 'bertopic_semsup_', model_dir = "/kaggle/working/models")

# Bertopic with brunokreiner lyrics transformer

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-transformer checkpoint 'brunokreiner/lyrics-bert', used here in
# place of the general-purpose model from the earlier cells.
lyrics_model = SentenceTransformer('brunokreiner/lyrics-bert')

# Embed the current `docs` with the lyrics model (rebinds `embeddings`).
embeddings = lyrics_model.encode(docs)

# Stop words (spaCy defaults plus the custom set) are removed only in the
# vectorizer, i.e. from the topic representations.
vectorizer_model = CountVectorizer(
    stop_words=list(new_nlp.Defaults.stop_words.union(new_stop_words))
)

# Train BERTopic, reusing the UMAP and HDBSCAN models defined earlier.
topic_model = BERTopic(
    nr_topics=30,
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
).fit(docs, embeddings)

In [None]:
# Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

# Run the visualization with the original embeddings.
# Kept as the last expression of the cell so the returned figure is rendered;
# it used to be followed by an assignment, so the figure was computed and
# then silently discarded.
topic_model.visualize_documents(docs, embeddings=embeddings)

In [None]:
topic_model.visualize_documents(docs, reduced_embeddings=reduced_embeddings)

In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.visualize_barchart()

In [None]:
topics_per_class = topic_model.topics_per_class(docs, classes=df['tag'].tolist())
topic_model.visualize_topics_per_class(topics_per_class)

In [None]:
print('cv :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'c_v'))
print('umass :',compute_coherence(topic_model, docs, topic_model.topics_, metric = 'u_mass'))

In [None]:
save_bertopic_model(topic_model, filename = 'bertopic_lyricsBert_', model_dir = "/kaggle/working/models")