# Setup

In [None]:
# TODOs
# (1) Evaluate the topic models ✅️
# (2) Try to calculate sentence similarity
# (3) Create a new model that takes the topic models and sentence similarity to create predictions
# (4) Automate them all

In [None]:
# Setup cuML
"""
import sys
!cp ../input/rapids/rapids.21.06 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
sys.path += ["/opt/conda/envs/rapids/lib/python3.7/site-packages"]
sys.path += ["/opt/conda/envs/rapids/lib/python3.7"]
sys.path += ["/opt/conda/envs/rapids/lib"]
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/
"""

In [None]:
!ls /kaggle/working

In [4]:
!pip install --quiet bertopic
# !pip install --quiet sentence-transformers

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-profiling 3.1.0 requires markupsafe~=2.0.1, but you have markupsafe 2.1.1 which is incompatible.
allennlp 2.10.0 requires protobuf==3.20.0, but you have protobuf 3.19.4 which is incompatible.[0m[31m
[0m

In [None]:
import os
import time

import pandas as pd

In [None]:
from hn_sentence_similarity_utils import load_data, remove_one_word, \
                                         remove_job_postings, remove_links, \
                                         clean_non_stories, clean_last_word_year_and_pdf, \
                                         lemmatize, finalizes

In [None]:
fp = "../input/hackernews-stories-since-2018/hackernews-stories-since-2018.csv"
df_2018 = load_data(fp)
df_2018.info()

In [None]:
# Build the modelling corpus from the unique story titles. Each .pipe() step
# applies one helper imported from hn_sentence_similarity_utils, in the order
# listed below; the final .values unwraps the result into a plain array
# (assumes `finalizes` returns a pandas object — TODO confirm).
corpus_sentences = (df_2018
                    ['title']
                    .drop_duplicates()
                    .pipe(remove_one_word)
                    .pipe(remove_job_postings)
                    .pipe(remove_links)
                    .pipe(clean_non_stories)
                    .pipe(clean_last_word_year_and_pdf)
                    .pipe(lemmatize)
                    .pipe(finalizes)
).values

In [None]:
del df_2018

In [None]:
# Getting the Embeddings from Sentence Transformer by using GPU
# NOTE(review): `util` is imported but not used in the visible cells, and the
# `embeddings` computed here are only pickled afterwards — the later
# partial_fit / fit_transform calls receive the raw documents, not these
# vectors. Confirm whether they were meant to be passed to BERTopic.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(corpus_sentences, show_progress_bar=True, convert_to_numpy=True)

In [None]:
import pickle
fn = "/kaggle/working/sentence-transformer-unnormalized-embeddings.pkl"
with open(fn, "wb") as f:
    pickle.dump({'sentences': corpus_sentences,
                 'embeddings': embeddings},f)

!du -h $fn

In [None]:
# Reload embeddings
# Restores `corpus_sentences` from the pickle cache so the cleaning and
# encoding cells do not have to be re-run. Only the sentences are restored;
# the line that would restore `embeddings` is commented out.
# pickle.load executes arbitrary code from the file — only load caches that
# this notebook wrote itself.
import pickle
fn = "/kaggle/working/sentence-transformer-unnormalized-embeddings.pkl"
with open(fn, "rb") as f:
    cached_data = pickle.load(f)
    corpus_sentences = cached_data['sentences']
#     embeddings = cached_data['embeddings']

---

# Modeling

In [None]:
from bertopic import BERTopic
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import IncrementalPCA
from bertopic.vectorizers import OnlineCountVectorizer

In [None]:
# Batches
# Sub-models that support incremental fitting, passed to BERTopic in place of
# its default dimensionality-reduction, clustering and vectorizer steps.
umap_model = IncrementalPCA(n_components=200)
cluster_model = MiniBatchKMeans(n_clusters=300, random_state=0)
# BERTopic does not apply its own n_gram_range when a custom vectorizer is
# supplied, so the (1, 2) n-gram range has to be configured on the vectorizer
# itself for bigrams to appear in the topic representations.
vectorizer_model = OnlineCountVectorizer(stop_words="english", decay=.01, ngram_range=(1, 2))

In [None]:
# Prepare model
# Wires the three sub-models configured earlier into a single BERTopic
# instance; the `umap_model` / `hdbscan_model` keyword names are BERTopic's
# slot names, not a statement about which algorithm is plugged in.
# NOTE(review): per the BERTopic docs, `n_gram_range` is not used when a
# custom `vectorizer_model` is passed — the n-gram range that takes effect is
# whatever is configured on `vectorizer_model`. Confirm against the installed
# BERTopic version.
topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=cluster_model, 
    vectorizer_model=vectorizer_model, 
    n_gram_range=(1, 2),
    verbose=True
)

In [None]:
# Split list into N equal length
# https://stackoverflow.com/a/2135920/8996974
def split(a, n):
    """Lazily cut ``a`` into ``n`` consecutive, near-equal slices.

    The first ``len(a) % n`` slices are one element longer than the rest.

    Args:
        a: Any sliceable sequence supporting ``len``.
        n: Number of slices to produce.

    Returns:
        A generator of ``(index, slice)`` pairs, in order.
    """
    base, extra = divmod(len(a), n)

    def _offset(idx):
        # Start position of slice ``idx``: every earlier slice contributes
        # ``base`` items, and the first ``extra`` of them one more each.
        return idx * base + min(idx, extra)

    return ((idx, a[_offset(idx):_offset(idx + 1)]) for idx in range(n))

In [None]:
# 100K split is processed in 20 minutes! That's too long!!
# Let's try splitting it to smaller parts instead
N = 8
# split() returns a one-shot generator. Materialise it so the training cell
# can be re-run: iterating an exhausted generator would process nothing and
# then overwrite topic_model.topics_ with an empty list.
doc_chunks = list(split(corpus_sentences, N))

In [None]:
# Train the online model chunk by chunk. After each partial_fit the model's
# topics_ holds the assignments it exposes at that point, so they are
# accumulated here and written back once every chunk has been seen.
topics = []
for chunk_no, docs in doc_chunks:
    print(f"Processing chunk no: {chunk_no}")
    chunk_started = time.time()

    topic_model.partial_fit(docs)
    topics += topic_model.topics_

    elapsed_minutes = (time.time() - chunk_started) / 60
    print(f"--- {elapsed_minutes} minutes ---")
topic_model.topics_ = topics

In [None]:
model_no = '10'
# NOTE(review): the name says "ipca90", but the IncrementalPCA cell in this
# notebook currently uses n_components=200 — confirm which run this label
# belongs to before trusting the saved artifact.
model_name = '10-lemmatized-BERTopic-ipca90-batchkmean300-8N'
# Each model lives in its own numbered sub-directory; create it first so the
# save also works on a fresh /kaggle/working.
model_dir = f'/kaggle/working/{model_no}'
os.makedirs(model_dir, exist_ok=True)
topic_model.save(f'{model_dir}/{model_name}')

---

# Visualizing Results

In [None]:
from bertopic import BERTopic
# Reload the model written by the save cell in the Modeling section. The
# previous '...' placeholder was not a loadable path.
topic_model = BERTopic.load(f'/kaggle/working/{model_no}/{model_name}')

In [None]:
# Per-topic summary table, kept in `freq` for the cells that follow and
# printed as a markdown table.
freq = topic_model.get_topic_info()
print(freq.to_markdown())

In [None]:
(
    freq.sort_values('Count', ascending=False)
        .head(20)
)

In [None]:
# Select the most frequent topic from the info table instead of hard-coding
# its id, which is only valid for one particular trained model. Topic -1 is
# skipped because BERTopic uses it for outliers.
most_frequent_topic = int(
    freq.loc[freq['Topic'] != -1]
        .sort_values('Count', ascending=False)
        ['Topic']
        .iloc[0]
)
topic_model.get_topic(most_frequent_topic)

In [None]:
topic_model.topics_[:10]

In [None]:
topic_model.visualize_topics()

In [None]:
topic_model.visualize_hierarchy(top_n_topics=100)

In [None]:
topic_model.visualize_barchart(top_n_topics=10)

In [None]:
topic_model.visualize_heatmap(n_clusters=100, width=1000, height=1000)

In [None]:
samples = [
    "I would like a job writing Haskell",
    "Hybrid recommender systems to improve recommendations for sparse datasets",
    "How poverty changes your mindset",
    "Rust in 2022",
    "PostgreSQL 14",
    "Improved distributed algorithms for fundamental graph problems (2017)",
    "Ask HN: What bits of fundamental knowledge are productivity multipliers?",
    "A first lesson in meta-rationality",
    "Django Newbie Mistakes",
    "Ask HN: Which are the best Go repositories to read to learn the language?",
    "Postgres full-text search: A search engine in a database (2021)",
    "Citybound – A city building game using actor-based distributed simulation",
    "BERTopic: The Future of Topic Modeling",
    "When to use memory safe languages",
    "Being OK with not being extraordinary",
    "TikTok reveals details of how its algorithm works",
    "A general overview of what happens before main() (2019)",
    "Becoming a Centaur",
    "Query serving systems: An emerging category of data systems",
    "Rust – A hard decision pays off ",
]

In [None]:
example_sentence = "Rust – A hard decision pays off "
similar_topics, similarity = topic_model.find_topics(example_sentence, top_n=5)
print(similar_topics)
topic_model.get_topic(similar_topics[0])

---

# Figuring Out Which Model Makes Sense

In [None]:
!ls /kaggle/working

In [None]:
# 5-lemmatized-BERTopic-ipca30-batchkmean100-8N
# 6-lemmatized-BERTopic-ipca60-batchkmean200-8N
# 9-lemmatized-BERTopic-ipca200-batchkmean300-8N

In [None]:
from bertopic import BERTopic

topic_model = BERTopic.load(f'/kaggle/working/{model_no}/{model_name}')

In [None]:
from collections import defaultdict

def _avg_topics(topics):
    result = {}
    for (topic, rate) in topics:
        if topic in result:
            result[topic] += rate
            result[topic] /= 2
            continue
        result[topic] = rate
    return result

def find_topics(sentence, top_n=1, model=None):
    """Return the ten highest-weighted topic words for ``sentence``.

    The words of the ``top_n`` topics most similar to the sentence are pooled.
    Words from the best match keep their full weight; each following topic is
    scaled down by a shrinking multiplier. Weights of words that appear in
    several topics are then combined by ``_avg_topics``.

    Args:
        sentence: Text to look up.
        top_n: Number of similar topics to pool. The default of 1 keeps the
            previous behaviour.
        model: Object exposing ``find_topics`` and ``get_topic``; defaults to
            the notebook-level ``topic_model``.

    Returns:
        List of up to ten ``(word, weight)`` tuples, highest weight first.
    """
    model = topic_model if model is None else model
    multiplier = 1
    rate = .5
    decay = .1
    weighted_words = []
    similar_topics, _ = model.find_topics(sentence, top_n=top_n)
    for topic_id in similar_topics:
        weighted_words.extend(
            (word, weight * multiplier) for word, weight in model.get_topic(topic_id)
        )
        multiplier *= rate
        # Floor at zero so a large top_n cannot flip weights negative.
        rate = max(rate - decay, 0.0)
    averaged = _avg_topics(weighted_words)
    return sorted(averaged.items(), key=lambda item: item[1], reverse=True)[:10]

In [None]:
for sample in samples[:5]:
    print(sample)
    print(find_topics(sample))
    print('-------------')

---

# Coherence Score

In [None]:
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

In [None]:
topic_model = BERTopic(verbose=True, n_gram_range=(1, 2))
topics, _ = topic_model.fit_transform(corpus_sentences)

In [None]:
# Preprocess Documents
documents = pd.DataFrame({"Document": corpus_sentences,
                          "ID": range(len(corpus_sentences)),
                          "Topic": topics})
documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

In [None]:
# Extract vectorizer and analyzer from BERTopic
vectorizer = topic_model.vectorizer_model
analyzer = vectorizer.build_analyzer()

In [None]:
# Extract features for Topic Coherence evaluation
# (The former `words = vectorizer.get_feature_names()` line is dropped: its
# result was never used, and the method no longer exists in recent
# scikit-learn releases.)
tokens = [analyzer(doc) for doc in cleaned_docs]
dictionary = corpora.Dictionary(tokens)
corpus = [dictionary.doc2bow(token) for token in tokens]
# Use the topic ids that actually occur, skipping only the -1 outlier bucket,
# rather than assuming the ids are exactly 0..n-1 plus an outlier topic.
topic_ids = sorted(topic for topic in set(topics) if topic != -1)
topic_words = [[word for word, _ in topic_model.get_topic(topic)]
               for topic in topic_ids]

In [None]:
!tree "/kaggle/working/"

In [None]:
import pickle

model_no = '10'
fn = f'/kaggle/working/{model_no}/{model_no}-coherence_params.pkl'

with open(fn, "wb") as f:
    pickle.dump({'topic_words': topic_words,
                 'tokens': tokens,
                 'corpus': corpus,
                 'dictionary': dictionary
                }
                ,f)
    !du -h $fn

# Creating Coherence Params (for computing on CPU)

In [None]:
# https://stackoverflow.com/a/27737385/8996974
from functools import wraps
# Import perf_counter rather than `time`: binding the name `time` to a
# function here would shadow the `time` module imported at the top of the
# notebook and break the `time.time()` calls in the training cell on re-run.
from time import perf_counter

def timing(f):
    """Decorator that prints how long each call to ``f`` takes.

    The wrapped function's return value is passed through unchanged, and its
    name and docstring are preserved via ``functools.wraps``.

    Args:
        f: The callable to time.

    Returns:
        A wrapper that calls ``f``, prints its name, arguments and elapsed
        seconds, and returns ``f``'s result.
    """
    @wraps(f)
    def wrap(*args, **kw):
        started = perf_counter()
        result = f(*args, **kw)
        elapsed = perf_counter() - started
        print(f"func: {f.__name__} args: [{args}, {kw}] took: {elapsed:2.4f} sec")
        return result
    return wrap

In [None]:
import pickle

@timing
def save_coherence_params(model, corpus_sentences):
    """Build and pickle the inputs needed to score one saved model's coherence.

    Loads the saved BERTopic model, joins the documents of each topic into one
    text, tokenises those texts with the model's own vectorizer analyzer, and
    stores the resulting gensim dictionary, bag-of-words corpus, tokens and
    per-topic word lists in
    ``/kaggle/working/<model_no>/<model_no>-coherence_params.pkl``.

    Args:
        model: Saved model name such as ``'9-lemmatized-...'``. The text before
            the first ``'-'`` is the model number and names the sub-directory.
        corpus_sentences: The documents, aligned one-to-one with the loaded
            model's ``topics_``.

    Raises:
        ValueError: If the number of documents and topic assignments differ.
    """
    # Split on the first dash rather than taking model[0], so multi-digit
    # numbers ('10-...') resolve to directory '10' and not '1'. This matches
    # how evaluate_coherence_scores derives the same path.
    model_no = model.split('-')[0]
    full_path = f'/kaggle/working/{model_no}/{model}'

    # Load model
    print(f"\nLoading model {model_no}")
    topic_model = BERTopic.load(full_path)

    # Preprocess Documents
    print(f"Preprocessing...")
    topics = topic_model.topics_
    if len(topics) != len(corpus_sentences):
        raise ValueError(
            f"model {model_no} has {len(topics)} topic assignments "
            f"but {len(corpus_sentences)} documents were given"
        )
    documents = pd.DataFrame({"Document": corpus_sentences,
                              "ID": range(len(corpus_sentences)),
                              "Topic": topics})
    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values)

    # Extract vectorizer and analyzer from BERTopic
    print(f"Extracting features...")
    vectorizer = topic_model.vectorizer_model
    analyzer = vectorizer.build_analyzer()

    # Extract features for Topic Coherence evaluation
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    # Iterate over the topic ids that actually occur, skipping only the -1
    # outlier bucket. The old range(len(set(topics)) - 1) assumed an outlier
    # topic always exists and dropped the last real topic when it does not
    # (e.g. with a k-means style clusterer).
    topic_ids = sorted(topic for topic in set(topics) if topic != -1)
    topic_words = [[word for word, _ in topic_model.get_topic(topic)]
                   for topic in topic_ids]

    print(f"Saving the coherence_params...")
    fn = f'/kaggle/working/{model_no}/{model_no}-coherence_params.pkl'
    with open(fn, "wb") as f:
        pickle.dump({'topic_words': topic_words,
                     'tokens': tokens,
                     'corpus': corpus,
                     'dictionary': dictionary
                    }
                    ,f)
    print(f"Done!\n")

In [None]:
models = [
    '5-lemmatized-BERTopic-ipca30-batchkmean100-8N',
    '6-lemmatized-BERTopic-ipca60-batchkmean200-8N',
    '9-lemmatized-BERTopic-ipca200-batchkmean300-8N',
]

In [None]:
for model in models:
    save_coherence_params(model, corpus_sentences)

# Evaluating Using Stored Coherence Params

In [None]:
import pickle

@timing
def evaluate_coherence_scores(model):
    """Compute the c_v coherence score from a model's stored coherence params.

    Args:
        model: Saved model name; the text before the first ``'-'`` is the model
            number used to locate the pickle under ``/kaggle/working``.

    Returns:
        The value returned by gensim's ``CoherenceModel.get_coherence()``.
    """
    model_no = model.partition('-')[0]
    params_path = f'/kaggle/working/{model_no}/{model_no}-coherence_params.pkl'

    print(f"\nLoading coherence params for model {model_no}")
    # The pickle is expected to be one written by this notebook; do not point
    # this at untrusted files.
    with open(params_path, "rb") as fh:
        params = pickle.load(fh)
        scorer_inputs = dict(
            topics=params['topic_words'],
            texts=params['tokens'],
            corpus=params['corpus'],
            dictionary=params['dictionary'],
        )

    print("Computing coherence score...")
    scorer = CoherenceModel(coherence='c_v', **scorer_inputs)
    score = scorer.get_coherence()
    print(f"Coherence score for model {model_no}: {score}")

    return score

In [None]:
coherences = [evaluate_coherence_scores(model) for model in models]

In [None]:
bad_models = [
    "10-lemmatized-BERTopic-ipca90-batchkmean300-8N",
    "7-lemmatized-BERTopic-ipca75-batchkmean250-8N",
    "8-lemmatized-BERTopic-ipca225-batchkmean350-8N",
]

bad_model_coherences = [evaluate_coherence_scores(model) for model in bad_models]