In [1]:
import pandas as pd
import os

In [4]:
# Directory (relative to this notebook) that holds the input and output JSON files.
path_dateset = "../Dataset"

In [3]:
# Clean the issue and question exports and merge them into one "challenges" file:
#   * drop every 'Title: ' / 'Content: ' marker from the content columns
#   * drop a leading 'The user ' from the GPT summaries
#   * move the Issue_* / Question_* columns onto shared, prefix-free names


def _strip_markers(text):
    """Return `text` with all 'Title: ' and 'Content: ' markers removed."""
    return text.replace('Title: ', '').replace('Content: ', '')


def _strip_summary_lead(text):
    """Return `text` without a leading 'The user ', when present."""
    return text.removeprefix('The user ')


# (source column suffix, shared column name, cleaner applied to each value)
_column_plan = (
    ('original_content', 'Original_content', _strip_markers),
    ('original_content_gpt_summary', 'Original_content_gpt_summary', _strip_summary_lead),
    ('preprocessed_content', 'Preprocessed_content', _strip_markers),
)

df_issues = pd.read_json(os.path.join(path_dateset, 'issues_original.json'))
df_questions = pd.read_json(os.path.join(path_dateset, 'questions_original.json'))

for _frame, _prefix in ((df_issues, 'Issue_'), (df_questions, 'Question_')):
    for _suffix, _shared_name, _cleaner in _column_plan:
        # pop() removes the prefixed column; the assignment appends the cleaned
        # values under the shared name, so the shared columns end up last and
        # in the order listed in _column_plan.
        _frame[_shared_name] = _frame.pop(_prefix + _suffix).apply(_cleaner)

# Stack issues on top of questions with a fresh 0..n-1 index and persist.
df_challenges = pd.concat([df_issues, df_questions], ignore_index=True)
_challenges_path = os.path.join(path_dateset, 'challenges_original.json')
df_challenges.to_json(_challenges_path, orient='records', indent=4)


In [5]:
# Fit the best topic model on the issue summaries and inspect its topics.
# The visualisation exports further down are kept commented out; uncomment
# the ones you need after the model has been fitted.

from sklearn.feature_extraction.text import TfidfVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers import SentenceTransformer
from bertopic.representation import KeyBERTInspired
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP

# UMAP is stochastic: without a fixed seed every run produces a different
# projection and therefore different clusters/topics. Pin it so that
# Restart & Run All reproduces the same topic table.
RANDOM_SEED = 42

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-mpnet-base-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=20, n_components=10,
                  metric='manhattan', low_memory=False,
                  random_state=RANDOM_SEED)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=5, max_cluster_size=100)

# Step 4 - Tokenize topics (unigrams and bigrams, English stop words removed)
vectorizer_model = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))

# Step 5 - Extract topic words
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Step 6 - (Optional) Fine-tune topic representations
representation_model = KeyBERTInspired()

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,            # Step 1 - Extract embeddings
    umap_model=umap_model,                      # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,                # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,          # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,                  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic representations
    # verbose=True                              # Step 7 - Track model stages
)

# NOTE(review): the summaries are re-read from disk here, so any in-memory
# cleaning previously applied to `df_issues` is not reflected in `docs`, and
# `df_issues` is rebound to the freshly loaded frame - confirm this is intended.
df_issues = pd.read_json(os.path.join(path_dateset, 'issues_original.json'))
docs = df_issues['Issue_original_content_gpt_summary'].tolist()

topic_model = topic_model.fit(docs)
# NOTE(review): `path_labeling_issue` is not defined in the visible cells;
# define it (or switch to `path_dateset`) before enabling the save below.
# topic_model.save(os.path.join(path_labeling_issue, 'Topic model'))

# fig = topic_model.visualize_topics()
# fig.write_html(os.path.join(path_dateset, 'Topic visualization.html'))

# fig = topic_model.visualize_barchart()
# fig.write_html(os.path.join(path_dateset, 'Term visualization.html'))

# fig = topic_model.visualize_heatmap()
# fig.write_html(os.path.join(path_dateset, 'Topic similarity visualization.html'))

# fig = topic_model.visualize_term_rank()
# fig.write_html(os.path.join(path_dateset, 'Term score decline visualization.html'))

# hierarchical_topics = topic_model.hierarchical_topics(docs)
# fig = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
# fig.write_html(os.path.join(path_dateset, 'Hierarchical clustering visualization.html'))

# embeddings = embedding_model.encode(docs, show_progress_bar=False)
# fig = topic_model.visualize_documents(docs, embeddings=embeddings)
# fig.write_html(os.path.join(path_dateset, 'Document visualization.html'))

# One row per topic (topic id, document count, generated name); -1 is the outlier topic.
info_df = topic_model.get_topic_info()
info_df


Unnamed: 0,Topic,Count,Name
0,-1,140,-1_attributeerror attempting_encountered attri...
1,0,40,0_git_local s3_file git_gitignore file
2,1,32,1_inconsistency logger_challenge logger_inside...
3,2,20,2_issue azure_azure account_subscriptions azur...
4,3,20,3_issue user_interface checkpoints_issue code_...
5,4,19,4_kubeflow_pipelines_kubeflow pipeline_kubeflo...
6,5,19,5_logger pytorch_bug pytorch_causing pytorch_p...
7,6,15,6_pipeline_ml_factory_generated pipeline_ml_fa...
8,7,13,7_kedro cli_failed kedro_kedro init_kedro yml
9,8,10,8_pycaret runs_issue pycaret_challenges pycare...
