In [17]:
import pandas as pd
import pyLDAvis.gensim_models

from gensim.corpora import Dictionary
from gensim.models import LdaMulticore, CoherenceModel
from pyLDAvis import enable_notebook

from pprint import pprint

In [3]:
# Load the clustered questions exported as JSON
df_with_clusters = pd.read_json('data/clustered_questions.json')

# Map every clustering column (name starting with 'kmeans' or 'dbscan') to
# '<first six characters>_clusters_labels'; other columns are left untouched.
update_col_names = {
    col: f"{col[:6]}_clusters_labels"
    for col in df_with_clusters.columns
    if col.startswith(('kmeans', 'dbscan'))
}
df_with_clusters = df_with_clusters.rename(columns=update_col_names)
df_with_clusters.head()

Unnamed: 0,sentence,tags,sentence_bow,sentence_bow_lem,sentence_dl,kmeans_clusters_labels,dbscan_clusters_labels
1987694,"How do I print the full NumPy array, without t...","[python, arrays, numpy, output-formatting]",print full numpy array truncation,print full numpy array truncation,how do i print the full numpy array without tr...,6,0
1988804,What is memoization and how can I use it in Py...,"[python, memoization]",memoization use python,memoization use python,what is memoization and how can i use it in py...,4,1
1993727,Expanding tuples into arguments,"[python, tuples, parameter-passing]",expanding tuples arguments,expand tuple argument,expanding tuples into arguments,1,-1
1995615,How can I format a decimal to always show 2 de...,"[python, string-formatting]",format decimal always show decimal places,format decimal always show decimal place,how can i format a decimal to always show 2 de...,4,-1
2018026,"What are the differences between the urllib, u...","[python, python-requests, urllib, urllib2, url...",differences urllib urllib2 urllib3 requests mo...,difference urllib urllib2 urllib3 request module,what are the differences between the urllib ur...,5,-1


In [4]:
# Build one GroupBy object per clustering algorithm (DBSCAN first, then KMeans)
group_dbscan, group_kmeans = (
    df_with_clusters.groupby(label_column)
    for label_column in ('dbscan_clusters_labels', 'kmeans_clusters_labels')
)

In [5]:
# Fit one LDA model per DBSCAN cluster, then tag every question of the cluster
# with the top keywords of its dominant topic.
lda_models = {}  # cluster label -> fitted model plus the artefacts used to fit it
tags_per_question = {}  # cluster label -> one list of tags per question, in corpus order
topn = 5  # number of keywords kept as tags for each question
num_topics = topn  # number of LDA topics per cluster; kept equal to `topn` as before

for cluster_label in group_dbscan.groups:
    # Split each question's `sentence_bow_lem` string into whitespace-separated tokens.
    data = group_dbscan.get_group(cluster_label)["sentence_bow_lem"].str.split()
    id2word = Dictionary(data)
    corpus = [id2word.doc2bow(text) for text in data]

    lda_model = LdaMulticore(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        passes=10,
        random_state=42,
    )
    lda_models[cluster_label] = {
        'model': lda_model,
        'dictionary': id2word,
        'questions': data,
        'corpus': corpus
    }

    # Top keywords of every topic, computed once per cluster instead of once per
    # question. `show_topic` yields (word, weight) pairs directly, so there is no
    # need to parse the formatted `print_topic` string -- parsing that string
    # mangles any token that itself contains '*' or ' + '.
    topic_tags = {
        topic_id: [
            word for word, _weight in lda_model.show_topic(topic_id, topn=topn)
        ]
        for topic_id in range(num_topics)
    }

    # Assign tags to questions in the current cluster
    tags_per_question[cluster_label] = []
    for question_bow in corpus:
        topic_distribution = lda_model[question_bow]
        # Dominant topic = the (topic_id, probability) pair with the highest probability.
        dominant_topic = max(topic_distribution, key=lambda item: item[1])[0]
        # Copy the list so each question owns its tags and cannot alias another's.
        tags = list(topic_tags[dominant_topic])
        tags_per_question[cluster_label].append(tags)

In [19]:
# For every cluster: print the c_v coherence of its LDA model, followed by the
# tags assigned to its first three questions.
for cluster_label, lda_model in lda_models.items():
    coherence = CoherenceModel(
        coherence='c_v',
        model=lda_model['model'],
        texts=lda_model['questions'],
        dictionary=lda_model['dictionary'],
    )
    score = round(coherence.get_coherence(), 2)
    print(cluster_label, ': Coherence =', score)
    print("Tags :")
    sample_tags = tags_per_question[cluster_label][:3]
    pprint(sample_tags)
    print('-' * 60)

-1 : Coherence = 0.59
Tags :
[['python', 'request', 'byte', 'error', 'multiple'],
 ['python', 'package', 'install', 'find', 'panda'],
 ['panda', 'numpy', 'module', 'get', 'group']]
------------------------------------------------------------
0 : Coherence = 0.56
Tags :
[['array', 'numpy', 'value', 'remove', 'frequency'],
 ['numpy', 'array', 'value', 'r', 'index'],
 ['array', 'convert', 'difference', 'numpy', 'display']]
------------------------------------------------------------
1 : Coherence = 0.57
Tags :
[['python', 'use', 'class', 'get', 'selenium'],
 ['python', 'use', 'loop', 'django', 'json'],
 ['use', 'python', 'good', 'practice', 'else']]
------------------------------------------------------------
2 : Coherence = 0.56
Tags :
[['python', 'number', 'equivalent', 'find', 'get'],
 ['python', 'none', 'value', 'c', 'exception'],
 ['python', 'get', 'implement', 'variable', 'environment']]
------------------------------------------------------------
3 : Coherence = 0.66
Tags :
[['func

In [18]:
# Interactive pyLDAvis view of the LDA model fitted on cluster 0
lda_model = lda_models[0]
enable_notebook()
# prepare() takes, in order: the topic model, the bag-of-words corpus, the dictionary.
vis_args = [lda_model[key] for key in ('model', 'corpus', 'dictionary')]
pyLDAvis.gensim_models.prepare(*vis_args)