In [2]:
import pandas as pd
import os

In [5]:
pd.set_option("display.max_rows", None, "display.max_columns", None)

In [3]:
path_dataset = os.path.join(os.path.dirname(os.getcwd()), 'Dataset')

In [3]:
# remove "title" and "content" from the content
# remove "The user" from the beginning of the summary

df_issues = pd.read_json(os.path.join(path_dataset, 'issues_original.json'))
df_questions = pd.read_json(os.path.join(
    path_dataset, 'questions_original.json'))

df_issues['Issue_original_content'] = df_issues['Issue_original_content'].apply(
    lambda x: x.replace('Title: ', '').replace('; Content:', ''))
df_issues['Issue_original_content_gpt_summary'] = df_issues['Issue_original_content_gpt_summary'].apply(
    lambda x: x.removeprefix('The user '))
df_issues['Issue_preprocessed_content'] = df_issues['Issue_preprocessed_content'].apply(
    lambda x: x.replace('Title: ', '').replace('; Content:', ''))

df_questions['Question_original_content'] = df_questions['Question_original_content'].apply(
    lambda x: x.replace('Title: ', '').replace('; Content:', ''))
df_questions['Question_original_content_gpt_summary'] = df_questions['Question_original_content_gpt_summary'].apply(
    lambda x: x.removeprefix('The user '))
df_questions['Question_preprocessed_content'] = df_questions['Question_preprocessed_content'].apply(
    lambda x: x.replace('Title: ', '').replace('; Content:', ''))

df_issues['Challenge_original_content'] = df_issues['Issue_original_content']
df_issues['Challenge_original_content_gpt_summary'] = df_issues['Issue_original_content_gpt_summary']
df_issues['Challenge_preprocessed_content'] = df_issues['Issue_preprocessed_content']

df_questions['Challenge_original_content'] = df_questions['Question_original_content']
df_questions['Challenge_original_content_gpt_summary'] = df_questions['Question_original_content_gpt_summary']
df_questions['Challenge_preprocessed_content'] = df_questions['Question_preprocessed_content']

df_questions['Solution_original_content'] = df_questions['Answer_original_content']
df_questions['Solution_original_content_gpt_summary'] = df_questions['Answer_original_content_gpt_summary']
df_questions['Solution_preprocessed_content'] = df_questions['Answer_preprocessed_content']

del df_issues['Issue_original_content']
del df_issues['Issue_original_content_gpt_summary']
del df_issues['Issue_preprocessed_content']

del df_questions['Question_original_content']
del df_questions['Question_original_content_gpt_summary']
del df_questions['Question_preprocessed_content']

del df_questions['Answer_original_content']
del df_questions['Answer_original_content_gpt_summary']
del df_questions['Answer_preprocessed_content']

df_all = pd.concat([df_issues, df_questions], ignore_index=True)
df_all.to_json(os.path.join(path_dataset, 'all_original.json'),
               indent=4, orient='records')


In [4]:
df_all = pd.read_json(os.path.join(path_dataset, 'all_original.json'))

# remove issues with uninformed content
for index, row in df_all.iterrows():
    if len(row['Challenge_original_content'].split()) < 4 or len(row['Challenge_original_content']) < 20:
        print(row['Challenge_original_content'])
        df_all.drop(index, inplace=True)
    elif row['Solution_original_content'] and (len(row['Solution_original_content'].split()) < 6 or len(row['Solution_original_content']) < 30):
        print(row['Solution_original_content'])
        df_all.drop(index, inplace=True)

df_all.to_json(os.path.join(path_dataset, 'all_filtered.json'),
               indent=4, orient='records')


error loading
fix import issue
logger none
i solved it, thank you.
azurebug1 point
you can try https://docs.microsoft.com/en-us/azure/machine-learning/how-to-debug-visual-studio-code#debug-and-troubleshoot-deployments
this is now solved. thanks!
try this in postman.
answered on github
yes that looks correct!
thank you! all fixed.


In [8]:
# visualize the best challenge topic model

from sklearn.feature_extraction.text import TfidfVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers import SentenceTransformer
from bertopic.representation import KeyBERTInspired
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-mpnet-base-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_components=5, metric='manhattan',
                  random_state=42, low_memory=False)

# Step 3 - Cluster reduced embeddings
min_samples = int(35 * 0.5)
hdbscan_model = HDBSCAN(min_cluster_size=35, min_samples=min_samples, prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Step 6 - (Optional) Fine-tune topic representation
representation_model = KeyBERTInspired()

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    calculate_probabilities=True
)

df_all = pd.read_json(os.path.join(path_dataset, 'all_filtered.json'))
docs = df_all['Challenge_original_content_gpt_summary'].tolist()

topics, probs = topic_model.fit_transform(docs)
# This uses the soft-clustering as performed by HDBSCAN to find the best matching topic for each outlier document.
new_topics_challenge = topic_model.reduce_outliers(
    docs, topics, probabilities=probs, strategy="probabilities")

# topic_model.save(os.path.join(path_dataset, 'Topic model'))

# fig = topic_model.visualize_topics()
# fig.write_html(os.path.join(path_dataset, 'Topic visualization.html'))

# fig = topic_model.visualize_barchart()
# fig.write_html(os.path.join(path_dataset, 'Term visualization.html'))

# fig = topic_model.visualize_heatmap()
# fig.write_html(os.path.join(path_dataset, 'Topic similarity visualization.html'))

# fig = topic_model.visualize_term_rank()
# fig.write_html(os.path.join(path_dataset, 'Term score decline visualization.html'))

# hierarchical_topics = topic_model.hierarchical_topics(docs)
# fig = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
# fig.write_html(os.path.join(path_dataset, 'Hierarchical clustering visualization.html'))

# embeddings = embedding_model.encode(docs, show_progress_bar=False)
# fig = topic_model.visualize_documents(docs, embeddings=embeddings)
# fig.write_html(os.path.join(path_dataset, 'Document visualization.html'))

info_df = topic_model.get_topic_info()
info_df

Unnamed: 0,Topic,Count,Name
0,-1,4870,-1_error attempting_error message_azure_endpoi...
1,0,332,0_accessing s3_s3 bucket_access s3_s3 buckets
2,1,297,1_git repository_git lfs_git repo_version control
3,2,272,2_line plots_plot user_line chart_bar chart
4,3,261,3_file batch_dataset csv_batch transform_csv f...
5,4,237,4_running pytorch_pytorch_model bin_deploy pyt...
6,5,201,5_modulenotfounderror attempting_encountering ...
7,6,192,6_jupyter notebooks_jupyter notebook_managed j...
8,7,182,7_annotation job_labelling job_labeling job_la...
9,8,182,8_dataset azure_azure file_azure blob_azure data


In [9]:
df_all = pd.read_json(os.path.join(path_dataset, 'all_filtered.json'))
df_all['Challenge_topic'] = ''

for index, row in df_all.iterrows():
  df_all.at[index, 'Challenge_topic'] = new_topics_challenge.pop(0)

df_all.to_json(os.path.join(path_dataset, 'all_topics.json'),
               indent=4, orient='records')

In [10]:
# visualize the best solution topic model

from sklearn.feature_extraction.text import TfidfVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from sentence_transformers import SentenceTransformer
from bertopic.representation import KeyBERTInspired
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-mpnet-base-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_components=5, metric='manhattan',
                  random_state=42, low_memory=False)

# Step 3 - Cluster reduced embeddings
min_samples = int(15 * 0.5)
hdbscan_model = HDBSCAN(min_cluster_size=15, min_samples=min_samples, prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Step 6 - (Optional) Fine-tune topic representation
representation_model = KeyBERTInspired()

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    calculate_probabilities=True
)

df_all = pd.read_json(os.path.join(path_dataset, 'all_filtered.json'))
df_all = df_all[df_all['Solution_original_content'].isnull() == False]
df_all = df_all[df_all['Solution_original_content'] != '']
docs = df_all['Challenge_original_content_gpt_summary'].tolist()

topics, probs = topic_model.fit_transform(docs)
# This uses the soft-clustering as performed by HDBSCAN to find the best matching topic for each outlier document.
new_topics_solution = topic_model.reduce_outliers(
    docs, topics, probabilities=probs, strategy="probabilities")

# topic_model.save(os.path.join(path_dataset, 'Topic model'))

# fig = topic_model.visualize_topics()
# fig.write_html(os.path.join(path_dataset, 'Topic visualization.html'))

# fig = topic_model.visualize_barchart()
# fig.write_html(os.path.join(path_dataset, 'Term visualization.html'))

# fig = topic_model.visualize_heatmap()
# fig.write_html(os.path.join(path_dataset, 'Topic similarity visualization.html'))

# fig = topic_model.visualize_term_rank()
# fig.write_html(os.path.join(path_dataset, 'Term score decline visualization.html'))

# hierarchical_topics = topic_model.hierarchical_topics(docs)
# fig = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
# fig.write_html(os.path.join(path_dataset, 'Hierarchical clustering visualization.html'))

# embeddings = embedding_model.encode(docs, show_progress_bar=False)
# fig = topic_model.visualize_documents(docs, embeddings=embeddings)
# fig.write_html(os.path.join(path_dataset, 'Document visualization.html'))

info_df = topic_model.get_topic_info()
info_df

Unnamed: 0,Topic,Count,Name
0,-1,2076,-1_ai platform_lifecycle configuration_ml_aws
1,0,207,0_bar chart_line chart_plot user_metrics run
2,1,192,1_data git_git lfs_local git_version control
3,2,160,2_accessing s3_access s3_s3 bucket_s3 user
4,3,134,3_hyperparameter sweep_parameters sweep_hyperp...
5,4,99,4_azure blob_azure dataset_dataset azure_azure...
6,5,99,5_azure kubernetes_endpoint deployment_deployi...
7,6,77,6_launching jupyterlabs_jupyter notebooks_run ...
8,7,77,7_web services_web service_webservice_webservi...
9,8,75,8_azure automl_azure auto_automl forecasting_a...


In [15]:
df_all = pd.read_json(os.path.join(path_dataset, 'all_topics.json'))
df_all['Solution_topic'] = -1

for index, row in df_all.iterrows():
  if not row['Solution_original_content_gpt_summary']:
    continue
  df_all.at[index, 'Solution_topic'] = new_topics_solution.pop(0)

df_all.to_json(os.path.join(path_dataset, 'all_topics.json'),
               indent=4, orient='records')