In [1]:
from contextualized_topic_models.evaluation.measures import InvertedRBO, TopicDiversity, CoherenceCV, CoherenceNPMI, CoherenceUMASS, CoherenceUCI
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP

import pandas as pd
import time
import glob


In [2]:
import os

path_dataset = '../Dataset'

path_so = os.path.join(path_dataset, 'Stack Overflow')
path_ts = os.path.join(path_dataset, 'Tool-specific Others')
path_labeling = os.path.join(path_dataset, 'Labeling')

path_so_raw = os.path.join(path_so, 'Raw')
path_ts_raw = os.path.join(path_ts, 'Raw')
path_so_filtered = os.path.join(path_so, 'Filtered')
path_ts_filtered = os.path.join(path_ts, 'Filtered')
    
if not os.path.exists(path_dataset):
    os.makedirs(path_dataset)

if not os.path.isdir(path_so):
    os.mkdir(path_so)

if not os.path.isdir(path_ts):
    os.mkdir(path_ts)

if not os.path.isdir(path_labeling):
    os.mkdir(path_labeling)

if not os.path.isdir(path_so_raw):
    os.mkdir(path_so_raw)

if not os.path.isdir(path_ts_raw):
    os.mkdir(path_ts_raw)

if not os.path.isdir(path_so_filtered):
    os.mkdir(path_so_filtered)

if not os.path.isdir(path_ts_filtered):
    os.mkdir(path_ts_filtered)


In [4]:
tool2tag = {
    'Amazon SageMaker': {'amazon-sagemaker', 'amazon-sagemaker-experiments', 'amazon-sagemaker-studio'},
    'Azure Machine Learning': {'azure-machine-learning-service', 'azure-machine-learning-studio', 'azure-machine-learning-workbench'},
    'ClearML': {'clearml'},
    'Comet': {'comet-ml'},
    'DVC': {'dvc'},
    'Kedro': {'kedro'},
    'MLflow': {'mlflow'},
    'MLRun': {'mlrun'},
    'Neptune': {'neptune'},
    'Sacred': {'python-sacred'},
    'Vertex AI': {'google-cloud-vertex-ai'},
    'Weights & Biases': {'wandb'}
}

tools_keywords = {
    'Amazon SageMaker': ['amazon sagemaker', 'aws sagemaker', 'sagemaker'],
    'Azure Machine Learning': ['azureml', 'azure ml', 'azure machine learning'],
    'ClearML': ['clearml'],
    'Comet': ['comet'],
    'Domino': ['domino'],
    'DVC': ['dvc'],
    'Guild AI': ['guild ai'],
    'Kedro': ['kedro'],
    'MLflow': ['mlflow'],
    'Neptune': ['neptune'],
    'Polyaxon': ['polyaxon'],
    'Sacred': ['sacred'],
    'SigOpt': ['sigopt'],
    'Vertex AI': ['vertex ai'],
    'Weights & Biases': ['weights & biases', 'weights and biases', 'wandb']
}

ignore_tools = {
    
}


In [3]:
import openai

openai.api_key = os.getenv('OPENAI_API_KEY')

In [3]:
import wandb

wandb.login(key=os.getenv('WANDB_API_KEY'))

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mzhiminy[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
import requests
import json

# function to scrape the posts from the tool-specific discussion fora

def scrape_post(base_url, page_suffix, file_name):
    page = -1
    post_list = []
    
    while True:
        page = page + 1
        page_url = base_url + page_suffix + str(page)
        topic_list = requests.get(page_url).json()['topic_list']

        for topic in topic_list['topics']:
            post_url = base_url + 't/' + \
            topic['slug'] + '/' + str(topic['id'])

            post = {}
            post['Question_title'] = topic['title']
            post['Question_link'] = post_url
            post['Question_creation_time'] = topic['created_at']
            post['Question_answer_count'] = topic['posts_count'] - 1
            post['Question_score'] = topic['like_count']
            post['Question_view_count'] = topic['views']
            post['Question_has_accepted_answer'] = topic['has_accepted_answer']
            comments = requests.get(post_url + '.json').json()['post_stream']['posts']
            post['Question_body'] = comments[0]['cooked']
            post['Answer_list'] = comments[1:]
            post_list.append(post)

            time.sleep(2)

        if 'more_topics_url' not in topic_list.keys():
            break
        
    with open(os.path.join(path_ts_raw, file_name), 'w') as outfile:
        json_post_list = json.dumps(post_list, indent='\t')
        outfile.write(json_post_list)

In [8]:
# scrape posts from Guild AI
base_url = 'https://my.guild.ai/'
page_suffix = 'c/troubleshooting/6.json?page='
file_name = 'Guild AI.json'
post_list = scrape_post(base_url, page_suffix, file_name)

In [7]:
# scrape posts from Weights & Biases
base_url = 'https://community.wandb.ai/'
page_suffix = 'c/w-b-support/36.json?page='
file_name = 'Weights & Biases.json'
post_list = scrape_post(base_url, page_suffix, file_name)

In [5]:
# scrape posts from SigOpt
base_url = 'https://community.sigopt.com/'
page_suffix = 'c/general-discussion/9.json?page='
file_name = 'SigOpt.json'
post_list = scrape_post(base_url, page_suffix, file_name)

In [5]:
# scrape posts from DVC
base_url = 'https://discuss.dvc.org/'
page_suffix = 'c/questions/9.json?page='
file_name = 'DVC.json'
post_list = scrape_post(base_url, page_suffix, file_name)

In [28]:
df_ts_questions = pd.DataFrame()

# exclude tool-specific posts with negative upvote count
for file_name in glob.glob(os.path.join(path_ts_raw, '*.json')):
    repos = pd.read_json(file_name)
    if 'Question_score' in repos.columns:
        repos = repos[repos['Question_score'] > -1]
    repos['Tool'] = os.path.split(file_name)[1].split('.')[0]
    df_ts_questions = pd.concat([df_ts_questions, repos], ignore_index=True)
    
df_ts_answers = df_ts_questions[df_ts_questions['Question_answer_count'] > 0]
for tool in df_ts_answers['Tool'].unique().tolist():
    number_accepted_answer = df_ts_answers[df_ts_answers['Tool']
                                            == tool]['Question_has_accepted_answer'].sum()
    if number_accepted_answer > 0:
        df_ts_answers = df_ts_answers.drop(df_ts_answers[(df_ts_answers['Tool'] == tool) & (
            df_ts_answers['Question_has_accepted_answer'] == False)].index)

df_ts_questions.to_json(os.path.join(path_ts_filtered,
              'questions.json'), orient='records', indent=4)

In [94]:
# keep only posts with at least one answer
df_ts_question_summary = df_ts_questions.groupby(
    'Tool').count()['Question_title'].reset_index()
df_ts_answer_summary = df_ts_answers.groupby(
    'Tool').count()['Question_title'].reset_index()

df_ts_question_summary.columns = ['Tool', '#Question']
df_ts_answer_summary.columns = ['Tool', '#Answered']

df_summary = pd.merge(df_ts_question_summary, df_ts_answer_summary, on='Tool')
df_summary


Unnamed: 0,Tool,#Question,#Answered
0,Amazon SageMaker,528,167
1,Azure Machine Learning,1435,343
2,DVC,315,300
3,Domino,13,4
4,Guild AI,115,108
5,MLFlow,280,143
6,Polyaxon,43,34
7,SigOpt,15,7
8,Vertex AI,297,32
9,Weights & Biases,583,92


In [10]:
# # After having the population for each tool and discussion channel, we then find out the minimum number of necessary samples with the [calculator](https://www.calculator.net/sample-size-calculator.html).
# df_summary = pd.read_csv(os.path.join(path_ts, 'summary.csv'))
# df_summary

Unnamed: 0,Tool,#Question,#Answered,#Sample Question,#Sample Answered
0,Amazon SageMaker,528,167,223,117
1,Azure Machine Learning,1435,343,304,182
2,DVC,315,300,174,169
3,Domino,13,4,13,4
4,Guild AI,115,108,89,85
5,MLFlow,280,143,163,105
6,Polyaxon,43,34,39,32
7,SigOpt,15,7,15,7
8,Vertex AI,297,32,168,30
9,Weights & Biases,583,92,232,75


In [17]:
# df_ts = pd.read_json(os.path.join(
#     path_ts_filtered, 'non_negative_scored.json'))
# df_ts_answered = pd.read_json(os.path.join(
#     path_ts_filtered, 'completed_non_negative_scored.json'))

In [18]:
# # sample tool-specific posts accordingly
# df_question_samples = pd.DataFrame()
# df_answer_samples = pd.DataFrame()

# for index, row in df_summary.iterrows():
#     df_question_sample = df_ts[df_ts['Tool'] == row['Tool']].sample(
#         n=row['#Sample Question'], random_state=0)
#     df_answer_sample = df_ts_answered[df_ts_answered['Tool'] == row['Tool']].sample(
#         n=row['#Sample Answered'], random_state=0)
#     df_question_samples = pd.concat(
#         [df_question_samples, df_question_sample], ignore_index=True)
#     df_answer_samples = pd.concat(
#         [df_answer_samples, df_answer_sample], ignore_index=True)

# df_question_samples.to_json(os.path.join(
#     path_ts_sampled, 'questions.json'), indent=4, orient='records')
# df_answer_samples.to_json(os.path.join(
#     path_ts_sampled, 'answers.json'), indent=4, orient='records')

In [21]:
# # concatenate tool-specific answered and unanswered posts
# df_question_samples = pd.read_json(os.path.join(path_ts_sampled, 'questions.json'))
# df_answer_samples = pd.read_json(os.path.join(path_ts_sampled, 'answers.json'))

# df_question_samples.drop(['Question_topic', 'Question_tag', 'Answers'], axis=1, inplace=True)
# df_answer_samples.drop(['Question_topic', 'Question_tag', 'Answers'], axis=1, inplace=True)

# df_question_samples_ts = pd.merge(df_question_samples, df_answer_samples, on=df_question_samples.columns.tolist(), how='outer')
# df_question_samples_ts.to_json(os.path.join(path_labeling, 'questions_ts.json'), indent=4, orient='records')

In [16]:
df = pd.read_json(os.path.join(
    path_so_raw, 'bq-results-20230201-032754-1675222092237.json'), lines=True)
df['Question_tags'] = df['Question_tags'].str.split('|')
df.head()

Unnamed: 0,Question_id,Question_title,Question_body,Question_answer_count,Question_comment_count,Question_creation_time,Question_favorite_count,Question_score,Question_tags,Question_view_count,...,Owner_up_votes,Owner_down_votes,Owner_views,Answer_body,Answer_comment_count,Answer_creation_time,Answer_score,Owner_location,Question_last_edit_time,Answer_last_edit_time
0,70098779,How to connect to MLFlow tracking server that ...,<p>I want to connect to remote tracking server...,1,0,2021-11-24 15:30:11.310000+00:00,1.0,1,"[authorization, tracking, mlflow]",2102,...,0,0,11,"<p><a href=""https://mlflow.org/docs/latest/tra...",2.0,2021-11-24 17:01:13.483000+00:00,2.0,,NaT,NaT
1,38927230,"Panda AssertionError columns passed, passed da...",<p>I am working on Azure ML implementation on ...,1,0,2016-08-12 22:23:17.197000+00:00,3.0,7,"[python, pandas, dataframe, nltk, azure-machin...",48200,...,136,55,339,<p>Try this:</p>\n\n<pre><code>dataframe_outpu...,0.0,2016-08-12 22:26:09.603000+00:00,13.0,"Toronto, ON, Canada",NaT,NaT
2,68773463,AccessDeniedException on sagemaker:CreateDomai...,<p>I am trying to use the AWS SageMaker Studio...,1,0,2021-08-13 13:49:08.683000+00:00,,0,"[amazon-web-services, amazon-iam, amazon-sagem...",366,...,0,0,11,,,NaT,,,NaT,NaT
3,67701971,How to label a text with multiple paragraphs i...,<p>I was trying setup a single label labeling ...,0,2,2021-05-26 09:16:33.420000+00:00,,1,"[amazon-web-services, text, amazon-sagemaker, ...",161,...,75,10,147,,,NaT,,"Zürich, Suïssa",2021-05-26 11:54:00.030000+00:00,NaT
4,48398509,How to Invoke AWS Sagemaker API with c# .NET?,<p>I have trained and deployed a model in AWS ...,1,0,2018-01-23 09:42:48.607000+00:00,,0,"[c#, asp.net, amazon-web-services, aws-sdk, am...",743,...,34,1,124,,,NaT,,Pune India,NaT,NaT


In [25]:
# create tag collection
tags = set()
for key, value in tool2tag.items():
    tags = tags.union(value)

In [26]:
# split tags
df['Question_valid_tags'] = [[] for _ in range(len(df))]
for index, row in df.iterrows():
    df.at[index, 'Question_valid_tags'] = list(tags.intersection(set(row['Question_tags'])))

In [27]:
# count post number with different tags
arity = 0
while True:
    post_number = df[df['Question_valid_tags'].map(len) > arity].shape[0]
    if post_number < 1:
        break
    arity = arity + 1
    print(f'Posts with at least {arity} tags has {post_number} in total.')


Posts with at least 1 tags has 5130 in total.
Posts with at least 2 tags has 220 in total.
Posts with at least 3 tags has 18 in total.


In [28]:
# exclude Stack Overflow posts with unrelated tags
df_valid = df[df['Question_valid_tags'].map(len) > 0]
df_valid['Question_link'] = df_valid['Question_id'].apply(
    lambda x: f'https://stackoverflow.com/questions/{x}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_valid['Question_link'] = df_valid['Question_id'].apply(


In [29]:
# exclude Stack Overflow posts with negative upvote count
df_qualified = df_valid[df_valid['Question_score'] > -1]

In [30]:
# create a map from tag to tool
tag2tool = dict()
for key, value in tool2tag.items():
    for elem in value:
        tag2tool.setdefault(elem, key)

In [31]:
# extract Stack Overflow post collection with multiple tags based on the tool map
for index, row in df_qualified.iterrows():
    tags = set()
    for tag in row['Question_valid_tags']:
        tags.add(tag2tool[tag])
    df_qualified.at[index, 'Question_valid_tags'] = sorted(list(tags))

In [32]:
# check how the posts with more than one tags look like
df_multiply_tagged = df_qualified[df_qualified['Question_valid_tags'].map(
    len) > 1]
df_multiply_tagged['Question_valid_tags'].value_counts()

[Amazon SageMaker, MLFlow]                 16
[Azure Machine Learning, MLFlow]           11
[Kedro, MLFlow]                             4
[Azure Machine Learning, Kedro, MLFlow]     2
[DVC, MLFlow]                               1
[MLFlow, Sacred]                            1
[Kedro, Neptune]                            1
Name: Question_valid_tags, dtype: int64

In [33]:
# create Stack Overflow post collection with exclusive tags
multiply_tagged_posts_split = []
df_qualified.assign(Tool='')

for index, row in df_qualified.iterrows():
    tags = row['Question_valid_tags']
    df_qualified.at[index, 'Tool'] = tags[0]
    if len(tags) > 1:
        for tag in tags[1:]:
            series = row.copy()
            series['Tool'] = tag
            multiply_tagged_posts_split.append(series)

df_multiply_tagged_posts_split = pd.DataFrame(multiply_tagged_posts_split)
df_qualified_exclusive_tagged = pd.concat(
    [df_qualified, df_multiply_tagged_posts_split], ignore_index=True)
del df_qualified_exclusive_tagged['Question_valid_tags']

# keep Stack Overflow posts with accepted answers
df_qualified_exclusive_tagged_completed = df_qualified_exclusive_tagged.dropna(
    subset=['Answer_body'])

df_qualified_exclusive_tagged.to_json(os.path.join(
    path_so_filtered, 'questions.json'), indent=4, orient='records')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_qualified.at[index, 'Tool'] = tags[0]


In [34]:
df_so_question_summary = df_qualified_exclusive_tagged.groupby(
    'Tool').count()['Question_id'].reset_index()
df_so_answer_summary = df_qualified_exclusive_tagged_completed.groupby(
    'Tool').count()['Question_id'].reset_index()

df_so_question_summary.columns = ['Tool', '#Question']
df_so_answer_summary.columns = ['Tool', '#Answered']

df_summary = pd.merge(df_so_question_summary, df_so_answer_summary, on='Tool')
df_summary


Unnamed: 0,Tool,#Question,#Answered
0,Amazon SageMaker,2233,737
1,Azure Machine Learning,1530,586
2,ClearML,40,20
3,Comet,10,4
4,DVC,91,49
5,Kedro,149,60
6,MLFlow,551,129
7,Neptune,8,3
8,Sacred,10,7
9,Vertex AI,341,112


In [17]:
# # After having the population for each tool and discussion channel, we then find out the minimum number of necessary samples with the [calculator](https://www.calculator.net/sample-size-calculator.html).
# df_summary = pd.read_csv(os.path.join(path_so, 'summary.csv'))
# df_summary

Unnamed: 0,Tool,#Question,#Answered,#Sample Question,#Sample Answered
0,Amazon SageMaker,2233,737,328,253
1,Azure Machine Learning,1530,586,308,233
2,ClearML,40,20,37,20
3,Comet,10,4,10,4
4,DVC,91,49,74,44
5,Kedro,149,60,108,53
6,MLFlow,551,129,227,97
7,Neptune,8,3,8,3
8,Sacred,10,7,10,7
9,Vertex AI,341,112,181,87


In [18]:
# df_qualified_exclusive_tagged = pd.read_json(os.path.join(
#     path_so_filtered, 'questions.json'))
# df_qualified_exclusive_tagged_completed = pd.read_json(os.path.join(
#     path_so_filtered, 'answers.json'))

In [15]:
# # sample Stack Overflow posts accordingly
# df_question_samples = pd.DataFrame()
# df_answer_samples = pd.DataFrame()

# for index, row in df_summary.iterrows():
#     df_question_sample = df_qualified_exclusive_tagged[df_qualified_exclusive_tagged['Tool'] == row['Tool']].sample(
#         n=row['#Sample Question'], random_state=0)
#     df_answer_sample = df_qualified_exclusive_tagged_completed[df_qualified_exclusive_tagged_completed['Tool'] == row['Tool']].sample(
#         n=row['#Sample Answered'], random_state=0)
#     df_question_samples = pd.concat(
#         [df_question_samples, df_question_sample], ignore_index=True)
#     df_answer_samples = pd.concat(
#         [df_answer_samples, df_answer_sample], ignore_index=True)

# df_question_samples.to_json(os.path.join(
#     path_so_sampled, 'questions.json'), indent=4, orient='records')
# df_answer_samples.to_json(os.path.join(
#     path_so_sampled, 'answers.json'), indent=4, orient='records')

In [22]:
# # concatenate Stack Overflow answered and unanswered posts
# df_question_samples = pd.read_json(os.path.join(path_so_sampled, 'questions.json'))
# df_answer_samples = pd.read_json(os.path.join(path_so_sampled, 'answers.json'))
# df_question_samples_so = pd.merge(df_question_samples, df_answer_samples, on=df_question_samples.columns.tolist(), how='outer')
# df_question_samples_so.to_json(os.path.join(path_labeling, 'questions_so.json'), indent=4, orient='records')

In [25]:
# # concatenate Stack Overflow and tool-specific posts
# df_question_samples_ts = pd.read_json(os.path.join(path_labeling, 'questions_ts.json'))
# df_question_samples_so = pd.read_json(os.path.join(path_labeling, 'questions_so.json'))

# df_answer_samples_ts = pd.read_json(os.path.join(path_ts_sampled, 'answers.json'))
# df_answer_samples_so = pd.read_json(os.path.join(path_so_sampled, 'answers.json'))

# df_question_samples_all = pd.concat([df_question_samples_ts, df_question_samples_so], ignore_index=True)
# df_answer_samples_all = pd.concat([df_answer_samples_ts, df_answer_samples_so], ignore_index=True)

# df_question_samples_all.to_json(os.path.join(path_labeling, 'questions_all.json'), indent=4, orient='records')
# df_answer_samples_all.to_json(os.path.join(path_labeling, 'answers_all.json'), indent=4, orient='records')

In [43]:
# combine Stack Overflow and tool-specific fora posts
df_question_so = pd.read_json(os.path.join(path_so_filtered, 'questions.json'))
df_question_ts = pd.read_json(os.path.join(path_ts_filtered, 'questions.json'))

df_question_so['Forum'] = 'Stack Overflow'
df_question_ts['Forum'] = 'Tool-specific'

df_questions = pd.concat([df_question_so, df_question_ts], ignore_index=True)
df_questions.to_json(os.path.join(path_labeling, 'questions_original.json'), indent=4, orient='records')

In [44]:
# add potential field to questions for later filling
df_questions = pd.read_json(os.path.join(path_labeling, 'questions_original.json'))

# Experiment 1: feed the original content to BerTopic
df_questions['Question_original_content_preprocessed_text'] = ''

# Experiment 2: feed the original content to text-davinci-003 model and get the generated summary, then feed the summary to BerTopic
df_questions['Question_original_content_gpt_summary'] = ''

# Experiment 3: feed the preprocessed content to BerTopic
df_questions['Question_preprocessed_content'] = ''

# Experiment 4: feed the preprocessed content to text-davinci-003 model and get the generated summary, then feed the summary to BerTopic
df_questions['Question_preprocessed_content_gpt_summary'] = ''

df_questions.to_json(os.path.join(path_labeling, 'questions_topic_modeling.json'), indent=4, orient='records')

In [5]:
# # create summary field for all issues
# question_prefill_so = pd.read_json(os.path.join(path_so_filtered, 'questions.json'))
# question_prefill_ts = pd.read_json(os.path.join(path_ts_filtered, 'questions.json'))
# question_prefill_so['Question_summary'] = ''
# question_prefill_ts['Question_summary'] = ''
# question_prefill_so.to_json(os.path.join(path_labeling, 'question_prefill_so.json'), indent=4, orient='records')
# question_prefill_ts.to_json(os.path.join(path_labeling, 'question_prefill_ts.json'), indent=4, orient='records')

In [15]:
# # manually preprocess the content of the issues and feed them into "text-davinci-003" for summary generation
# for index, row in question_prefill_so.iterrows():
#     if (len(str(row['Question_body'])) > 10000):
#         print(row['Question_link'])

https://stackoverflow.com/questions/61285248
https://stackoverflow.com/questions/62993397
https://stackoverflow.com/questions/68955197
https://stackoverflow.com/questions/40907303
https://stackoverflow.com/questions/73064569
https://stackoverflow.com/questions/56046428
https://stackoverflow.com/questions/63339703
https://stackoverflow.com/questions/65577286
https://stackoverflow.com/questions/51064366
https://stackoverflow.com/questions/60957084
https://stackoverflow.com/questions/68150444
https://stackoverflow.com/questions/59762829
https://stackoverflow.com/questions/62813017
https://stackoverflow.com/questions/62836278
https://stackoverflow.com/questions/69466354
https://stackoverflow.com/questions/68489311
https://stackoverflow.com/questions/67599026
https://stackoverflow.com/questions/69721067
https://stackoverflow.com/questions/68192602
https://stackoverflow.com/questions/73085199
https://stackoverflow.com/questions/62569747
https://stackoverflow.com/questions/51088145
https://st

In [14]:
# # manually preprocess the content of the issues and feed them into "text-davinci-003" for summary generation
# for index, row in question_prefill_ts.iterrows():
#     if (len(str(row['Question_body'])) > 10000):
#         print(row['Question_link'])

https://repost.aws/questions/QU6Ahf5zWZRZq63k1TOQ_48w/using-hugging-face-in-sagemaker-studio-as-part-of-a-project
https://repost.aws/questions/QUVfbc_AsXRzaxAl69MMFlsQ/error-for-training-job-catboost-classification-model-error-message-type-error-cannot-convert-xxx-to-float
https://repost.aws/questions/QU8UWsqxW8RbejgzLHYIFduA/inconsistent-keras-model-summary-output-shapes-on-aws-sage-maker-and-ec-2
https://repost.aws/questions/QULAis68RtShua1Wg8A5EFXg/deploy-yol-ov-5-in-sagemaker-model-error-invoke-endpoint-operation-received-server-error-0
https://repost.aws/questions/QUT86tGF5tRB2hC168_n5MAQ/failed-ping-healthcheck-after-deploying-tf-2-1-model-with-tf-serving-contain
https://learn.microsoft.com/answers/questions/893526/acideploymentfailed.html
https://learn.microsoft.com/answers/questions/981325/azureml-core-1440-fails-to-deploy-model-to-webserv.html
https://learn.microsoft.com/answers/questions/925083/az-ml-designer-swagger-file-missing-on-deployment.html
https://learn.microsoft.com

In [17]:
# question_so = pd.read_json(os.path.join(path_labeling, 'question_prefill_so.json'))
# question_ts = pd.read_json(os.path.join(path_labeling, 'question_prefill_ts.json'))
# question_todo_so = question_so[question_so['Question_summary'] == '']
# question_todo_ts = question_ts[question_ts['Question_summary'] == '']
# question_done_so = question_so[question_so['Question_summary'] != '']
# question_done_ts = question_ts[question_ts['Question_summary'] != '']
# question_todo_so.shape[0], question_todo_ts.shape[0]

(4967, 3595)

In [46]:
# Experiment 1

df_questions = pd.read_json(os.path.join(path_labeling, 'questions_topic_modeling.json'))

for index, row in df_questions.iterrows():
    title = row['Question_title']
    body = str(row['Question_body'])    
    content = ('Title: ' + title + '; Content:' + body if len(body) else title).lower()
    
    for tool_keyword in tools_keywords[row['Tool']]:
        if tool_keyword in content:
            content = content.replace(tool_keyword, '')
    
    df_questions.at[index, 'Question_original_content_preprocessed_text'] = content
    
df_questions.to_json(os.path.join(path_labeling, 'questions_topic_modeling.json'), indent=4, orient='records')

In [None]:
# Experiment 1

sweep_configuration = {
    "name": "question-experiment-1",
    "metric": {
        'name': 'CoherenceCV',
        'goal': 'maximize'
    },
    "method": "grid",
    "parameters": {
        'n_neighbors': {
            'values': list(range(10, 22, 2))
        },
        'n_components': {
            'values': list(range(4, 12, 2))
        }
    }
}

# set default values for hyperparameters
config_defaults = {
    'model_name': 'all-mpnet-base-v2',
    'metric_distane': 'manhattan',
    'low_memory': False,
    'stop_words': 'english',
    'ngram_range': (1, 5),
    'reduce_frequent_words': True
}

df_issues = pd.read_json(os.path.join(
    path_labeling, 'questions_topic_modeling.json'))
docs = df_issues['Question_original_content_preprocessed_text'].tolist()

def train(config=None):
    # Initialize a new wandb run
    with wandb.init(config=config) as run:
        # update any values not set by sweep
        run.config.setdefaults(config_defaults)

        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config

        # Step 1 - Extract embeddings
        embedding_model = SentenceTransformer(run.config.model_name)

        # Step 2 - Reduce dimensionality
        umap_model = UMAP(n_neighbors=config.n_neighbors, n_components=config.n_components,
                          metric=run.config.metric_distane, low_memory=run.config.low_memory)

        # Step 3 - Cluster reduced embeddings
        hdbscan_model = HDBSCAN()

        # Step 4 - Tokenize topics
        vectorizer_model = TfidfVectorizer(
            stop_words=run.config.stop_words, ngram_range=run.config.ngram_range)

        # Step 5 - Create topic representation
        ctfidf_model = ClassTfidfTransformer(
            reduce_frequent_words=run.config.reduce_frequent_words)

        # Step 6 - (Optional) Fine-tune topic representations with a `bertopic.representation` model
        representation_model = KeyBERTInspired()

        # All steps together
        topic_model = BERTopic(
            embedding_model=embedding_model,            # Step 1 - Extract embeddings
            umap_model=umap_model,                      # Step 2 - Reduce dimensionality
            hdbscan_model=hdbscan_model,                # Step 3 - Cluster reduced embeddings
            vectorizer_model=vectorizer_model,          # Step 4 - Tokenize topics
            ctfidf_model=ctfidf_model,                  # Step 5 - Extract topic words
            representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
            # verbose=True                              # Step 7 - Track model stages
        )

        topic_model = topic_model.fit(docs)

        tokenized_docs = [doc.tolist() for doc in vectorizer_model.inverse_transform(
            vectorizer_model.transform(docs))]
        topic_words = [[words for words, _ in topic_model.get_topic(
            topic)] for topic in range(len(topic_model.get_topics())-1)]

        wandb.log({'CoherenceCV': CoherenceCV(
            topics=topic_words, texts=tokenized_docs).score()})
        wandb.log({'CoherenceNPMI': CoherenceNPMI(
            topics=topic_words, texts=tokenized_docs).score()})
        wandb.log({'CoherenceUMASS': CoherenceUMASS(
            topics=topic_words, texts=tokenized_docs).score()})
        wandb.log({'CoherenceUCI': CoherenceUCI(
            topics=topic_words, texts=tokenized_docs).score()})
        wandb.log({'InvertedRBO': InvertedRBO(topics=topic_words).score()})
        wandb.log({'TopicDiversity': TopicDiversity(
            topics=topic_words).score(10)})


sweep_id = wandb.sweep(sweep_configuration, project='asset-management-project')
# Create sweep with ID: 232p6rp5
wandb.agent(sweep_id=sweep_id, function=train)


In [4]:
# Experiment 2 & 4

import random

prompt = 'Please use 1 to 2 sentences to summarize the following issues beginning with "The user".\n"""'

def retry_with_backoff(fn, retries=2, backoff_in_seconds=1, *args, **kwargs):
    x = 0

    if args is None:
        args = []
    if kwargs is None:
        kwargs = {}

    while True:
        try:
            return fn(*args, **kwargs)
        except:
            if x == retries:
                raise

            sleep = backoff_in_seconds * 2 ** x + random.uniform(0, 1)
            time.sleep(sleep)
            x += 1


In [None]:
# Experiment 2

df_questions = pd.read_json(os.path.join(
    path_labeling, 'questions_topic_modeling.json'))

for index, row in df_questions.iterrows():
    if row['Question_original_content_gpt_summary']:
        continue
    
    try:
        response = retry_with_backoff(
            openai.Completion.create,
            model='text-davinci-003',
            prompt=prompt +
            row['Question_original_content_preprocessed_text'] + '"""\n',
            temperature=0,
            max_tokens=200,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            timeout=10,
            stream=False
        )
        df_questions.at[index, 'Question_original_content_gpt_summary'] = response['choices'][0]['text'].strip()
    except Exception as e:
        print(f'{e} on question {index}')
        
    if index % 50 == 0:
        print(f'persisting on question {index}')
        df_questions.to_json(os.path.join(
            path_labeling, 'questions_topic_modeling.json'), indent=4, orient='records')
        
    time.sleep(5)

df_questions.to_json(os.path.join(
    path_labeling, 'questions_topic_modeling.json'), indent=4, orient='records')


In [6]:
# Experiment 2

# output unsuccesful summary requests
for index, row in df_questions.iterrows():
    if not row['Question_original_content_gpt_summary']:
        print(row['Question_link'])

https://stackoverflow.com/questions/40907303
https://stackoverflow.com/questions/63339703
https://stackoverflow.com/questions/65577286
https://stackoverflow.com/questions/51064366
https://stackoverflow.com/questions/68150444
https://stackoverflow.com/questions/59762829
https://stackoverflow.com/questions/62813017
https://stackoverflow.com/questions/62836278
https://stackoverflow.com/questions/69466354
https://stackoverflow.com/questions/68489311
https://stackoverflow.com/questions/67599026
https://stackoverflow.com/questions/69721067
https://stackoverflow.com/questions/73085199
https://stackoverflow.com/questions/62569747
https://stackoverflow.com/questions/70567307
https://stackoverflow.com/questions/73462205
https://stackoverflow.com/questions/73812159
https://stackoverflow.com/questions/68397384
https://stackoverflow.com/questions/70968412
https://stackoverflow.com/questions/63204081
https://stackoverflow.com/questions/63518174
https://stackoverflow.com/questions/73650387
https://st

In [None]:
# Experiment 2

sweep_configuration = {
    "name": "question-experiment-2",
    "metric": {
        'name': 'CoherenceCV',
        'goal': 'maximize'
    },
    "method": "grid",
    "parameters": {
        'n_neighbors': {
            'values': list(range(10, 22, 2))
        },
        'n_components': {
            'values': list(range(4, 12, 2))
        }
    }
}

# set default values for hyperparameters
config_defaults = {
    'model_name': 'all-mpnet-base-v2',
    'metric_distane': 'manhattan',
    'low_memory': False,
    'stop_words': 'english',
    'ngram_range': (1, 5),
    'reduce_frequent_words': True
}

df_issues = pd.read_json(os.path.join(
    path_labeling, 'questions_topic_modeling.json'))
docs = df_issues['Question_original_content_gpt_summary'].tolist()


def train(config=None):
    # Initialize a new wandb run
    with wandb.init(config=config) as run:
        # update any values not set by sweep
        run.config.setdefaults(config_defaults)

        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config

        # Step 1 - Extract embeddings
        embedding_model = SentenceTransformer(run.config.model_name)

        # Step 2 - Reduce dimensionality
        umap_model = UMAP(n_neighbors=config.n_neighbors, n_components=config.n_components,
                          metric=run.config.metric_distane, low_memory=run.config.low_memory)

        # Step 3 - Cluster reduced embeddings
        hdbscan_model = HDBSCAN()

        # Step 4 - Tokenize topics
        vectorizer_model = TfidfVectorizer(
            stop_words=run.config.stop_words, ngram_range=run.config.ngram_range)

        # Step 5 - Create topic representation
        ctfidf_model = ClassTfidfTransformer(
            reduce_frequent_words=run.config.reduce_frequent_words)

        # Step 6 - (Optional) Fine-tune topic representations with a `bertopic.representation` model
        representation_model = KeyBERTInspired()

        # All steps together
        topic_model = BERTopic(
            embedding_model=embedding_model,            # Step 1 - Extract embeddings
            umap_model=umap_model,                      # Step 2 - Reduce dimensionality
            hdbscan_model=hdbscan_model,                # Step 3 - Cluster reduced embeddings
            vectorizer_model=vectorizer_model,          # Step 4 - Tokenize topics
            ctfidf_model=ctfidf_model,                  # Step 5 - Extract topic words
            representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic represenations
            # verbose=True                              # Step 7 - Track model stages
        )
        
        topic_model = topic_model.fit(docs)

        tokenized_docs = [doc.tolist() for doc in vectorizer_model.inverse_transform(
            vectorizer_model.transform(docs))]
        topic_words = [[words for words, _ in topic_model.get_topic(
            topic)] for topic in range(len(topic_model.get_topics())-1)]

        wandb.log({'CoherenceCV': CoherenceCV(
            topics=topic_words, texts=tokenized_docs).score()})
        wandb.log({'CoherenceNPMI': CoherenceNPMI(
            topics=topic_words, texts=tokenized_docs).score()})
        wandb.log({'CoherenceUMASS': CoherenceUMASS(
            topics=topic_words, texts=tokenized_docs).score()})
        wandb.log({'CoherenceUCI': CoherenceUCI(
            topics=topic_words, texts=tokenized_docs).score()})
        wandb.log({'InvertedRBO': InvertedRBO(topics=topic_words).score()})
        wandb.log({'TopicDiversity': TopicDiversity(
            topics=topic_words).score(10)})


sweep_id = wandb.sweep(sweep_configuration, project='asset-management-project')
# Create sweep with ID: 232p6rp5
wandb.agent(sweep_id=sweep_id, function=train)


In [3]:
# from revChatGPT.V1 import Chatbot

# prompt = 'Please use 1 to 2 sentences to summarize the following issues beginning with "The user".\n"""'

# chatbot = Chatbot(
#     config={
#         'email': os.getenv('CHATGPT_EMAIL'),
#         'password': os.getenv('CHATGPT_PASSWORD'),
#         'paid': True
#     }
# )


In [None]:
# # Experiment 2

# df_questions = pd.read_json(os.path.join(
#     path_labeling, 'questions_topic_modeling.json'))
# last_summary = summary = ''

# for index, row in df_questions.iterrows():
#     if row['Question_original_content_gpt_summary']:
#         continue
        
#     question = prompt + \
#         row['Question_original_content_preprocessed_text'] + '"""\n'
    
#     try:
#         for data in chatbot.ask(question):
#             summary = data['message']
#         if last_summary == summary:
#             raise Exception('no response')        
#         last_summary = summary
#         df_questions.at[index, 'Question_original_content_gpt_summary'] = summary        
#     except Exception as e:
#         print(f'{e} on question {index}')
        
#     if index % 50 == 0:
#         print(f'persistence on question {index}')
#         df_questions.to_json(os.path.join(
#             path_labeling, 'questions_topic_modeling.json'), indent=4, orient='records')
        
#     time.sleep(100)

# df_questions.to_json(os.path.join(
#     path_labeling, 'questions_topic_modeling.json'), indent=4, orient='records')


In [6]:
# output the number of asset-management-related discussion posts
df_questions.shape[0]

8664

In [7]:
# sample size is based on the recommendation from https://www.calculator.net/sample-size-calculator.html

sample_size = 368

df_questions = pd.read_json(os.path.join(
    path_labeling, 'questions_topic_modeling.json'))

df_sample = df_questions.sample(n=sample_size, random_state=42)

df_sample.to_json(os.path.join(
    path_labeling, 'questions_sample.json'), indent=4, orient='records')


In [None]:
import re

# content filtering patterns
regex_filter = r"(<.+?>)|(\`{3}.+?\`{3})|(\`{2}.+?\`{2})|(\`{1}.+?\`{1})|({.*?})|(\\u[^\s]+)|((!)?\[.*?\])|(\(.+?\))|([^\s]*[<=>]=[^\s]+)|(@[^\s]+)|((https?:\/)?\/[^\s]+)|([^\s]*\\[^\s]+)|([^\s]+\/[^\s]+)|([^\s]+\.[^\s]+)|([^\s]+_[^\s]+)|(_+[^\s]+_*)|(_*[^\s]+_+)|([0-9\|\-\r\n\t\"\-#*=~:{}\(\)\[\]]+)|(info(rmation)?)"
