In [2]:
from GHMiner import GitHubMiner
from GLMiner import GitLabMiner
import pandas as pd
import numpy as np
import openai
import glob
import json
import time
import os


In [3]:
tools_repo = {
    'Aim': 'aimhubio/aim',
    'Amazon SageMaker': 'aws/sagemaker-python-sdk',
    'Azure Machine Learning': 'Azure/azure-sdk-for-python',
    'ClearML': 'allegroai/clearml',
    'Codalab': 'codalab/codalab-worksheets',
    'DVC': 'iterative/dvc',
    'Determined': 'determined-ai/determined',
    'Domino': 'dominodatalab/python-domino',
    'Guild AI': 'guildai/guildai',
    'Kedro': 'kedro-org/kedro',
    'MLflow': 'mlflow/mlflow',
    'MLRun': 'mlrun/mlrun',
    'ModelDB': 'VertaAI/modeldb',
    'Neptune': 'neptune-ai/neptune-client',
    'Polyaxon': 'polyaxon/polyaxon',
    'Sacred': 'IDSIA/sacred',
    'Valohai': 'valohai/valohai-cli',
    'Weights & Biases': 'wandb/wandb'
}

tools_release_date = {
    'Amazon SageMaker': '2017-11-19',
    'Azure Machine Learning': '2015-02-18',
    'cnvrg.io': '2020-03-31',
    'Comet': '2017-01-01',
    'Iterative Studio': '2021-05-12',
    'Polyaxon': '2018-10-16',
    'SigOpt': '2014-11-01',
    'Vertex AI': '2019-03-01'
}

tools_link = {
    'cnvrg.io': 'https://github.com/cnvrg',
    'Comet': 'https://github.com/comet-ml',
    'Iterative Studio': 'https://studio.iterative.ai',
    'SigOpt': 'https://github.com/sigopt',
    'Vertex AI': 'https://cloud.google.com/vertex-ai'
}

tools_keywords = {
    'Aim': ['aim'],
    'Amazon SageMaker': ['sagemaker', 'amazon sagemaker'],
    'Azure Machine Learning': ['azureml', 'azure machine learning'],
    'ClearML': ['clearml'],
    'cnvrg.io': ['cnvrg'],
    'Codalab': ['codalab'],
    'Comet': ['comet'],
    'Determined': ['determined'],
    'Domino': ['domino'],
    'DVC': ['dvc'],
    'Guild AI': ['guild ai'],
    'Kedro': ['kedro'],
    'MLflow': ['mlflow'],
    'MLRun': ['mlrun'],
    'ModelDB': ['modeldb'],
    'Neptune': ['neptune'],
    'Polyaxon': ['polyaxon'],
    'Sacred': ['sacred'],
    'SigOpt': ['sigopt'],
    'Valohai': ['valohai'],
    'Vertex AI': ['vertex ai'],
    'Weights & Biases': ['wandb', 'weights & biases', 'weights and biases']
}

ignore_tools = {

}

issue_labels = {
    'bug',
    'error',
    'invalid',
    'looking into it',
    'waiting feedback',
}


In [4]:
path_dataset = '../Dataset'

path_github = os.path.join(path_dataset, 'GitHub')
path_gitlab = os.path.join(path_dataset, 'GitLab')
path_labeling = os.path.join(path_dataset, 'Labeling')

path_github_repo = os.path.join(path_github, 'Repo')
path_gitlab_repo = os.path.join(path_gitlab, 'Repo')
path_github_repo_raw = os.path.join(path_github_repo, 'Raw')
path_gitlab_repo_raw = os.path.join(path_gitlab_repo, 'Raw')
path_github_repo_scraped = os.path.join(path_github_repo, 'Scraped')
path_gitlab_repo_scraped = os.path.join(path_gitlab_repo, 'Scraped')

path_github_issue = os.path.join(path_github, 'Issue')
path_gitlab_issue = os.path.join(path_gitlab, 'Issue')
path_github_issue_raw = os.path.join(path_github_issue, 'Raw')
path_gitlab_issue_raw = os.path.join(path_gitlab_issue, 'Raw')
path_github_issue_filtered = os.path.join(path_github_issue, 'Filtered')
path_gitlab_issue_filtered = os.path.join(path_gitlab_issue, 'Filtered')

path_labeling_issue = os.path.join(path_labeling, 'Issue')
path_labeling_issue_gpt = os.path.join(path_labeling_issue, 'GPT')
path_labeling_issue_gpt_text = os.path.join(path_labeling_issue_gpt, 'Text')
path_labeling_issue_gpt_code = os.path.join(path_labeling_issue_gpt, 'Code')
path_labeling_issue_native = os.path.join(path_labeling_issue, 'Native')
path_labeling_issue_native_text = os.path.join(path_labeling_issue_native, 'Text')
path_labeling_issue_native_code = os.path.join(path_labeling_issue_native, 'Code')

path_labeling_fix = os.path.join(path_labeling, 'Fix')

if not os.path.exists(path_github):
    os.makedirs(path_github)

if not os.path.exists(path_gitlab):
    os.makedirs(path_gitlab)

if not os.path.exists(path_labeling):
    os.makedirs(path_labeling)

if not os.path.exists(path_github_repo):
    os.makedirs(path_github_repo)

if not os.path.exists(path_gitlab_repo):
    os.makedirs(path_gitlab_repo)

if not os.path.exists(path_github_issue):
    os.makedirs(path_github_issue)

if not os.path.exists(path_gitlab_issue):
    os.makedirs(path_gitlab_issue)

if not os.path.exists(path_github_repo_raw):
    os.makedirs(path_github_repo_raw)

if not os.path.exists(path_gitlab_repo_raw):
    os.makedirs(path_gitlab_repo_raw)

if not os.path.exists(path_github_issue_raw):
    os.makedirs(path_github_issue_raw)

if not os.path.exists(path_gitlab_issue_raw):
    os.makedirs(path_gitlab_issue_raw)

if not os.path.exists(path_github_issue_filtered):
    os.makedirs(path_github_issue_filtered)

if not os.path.exists(path_gitlab_issue_filtered):
    os.makedirs(path_gitlab_issue_filtered)

if not os.path.exists(path_github_repo_scraped):
    os.makedirs(path_github_repo_scraped)

if not os.path.exists(path_gitlab_repo_scraped):
    os.makedirs(path_gitlab_repo_scraped)

if not os.path.exists(path_labeling_issue):
    os.makedirs(path_labeling_issue)

if not os.path.exists(path_labeling_fix):
    os.makedirs(path_labeling_fix)

if not os.path.exists(path_labeling_issue_gpt):
    os.makedirs(path_labeling_issue_gpt)

if not os.path.exists(path_labeling_issue_native):
    os.makedirs(path_labeling_issue_native)

if not os.path.exists(path_labeling_issue_gpt_text):
    os.makedirs(path_labeling_issue_gpt_text)

if not os.path.exists(path_labeling_issue_gpt_code):
    os.makedirs(path_labeling_issue_gpt_code)

if not os.path.exists(path_labeling_issue_native_text):
    os.makedirs(path_labeling_issue_native_text)

if not os.path.exists(path_labeling_issue_native_code):
    os.makedirs(path_labeling_issue_native_code)

In [5]:
openai.api_key = os.getenv('OPENAI_API_KEY')

GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')
GITLAB_TOKEN = os.getenv('GITLAB_TOKEN')

github_miner = GitHubMiner(private_token=GITHUB_TOKEN)
gitlab_miner = GitLabMiner(private_token=GITLAB_TOKEN)

In [6]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'


In [6]:
tools_data = pd.DataFrame()

# scrape open-source asset-management tools
for tool_name, tool_repo in tools_repo.items():
    if tool_name in tools_release_date:
        tool_data, error_data = github_miner.scrape_repo(
            repo_name=tool_repo, real_name=tool_name, release_date=pd.to_datetime(tools_release_date[tool_name]))
    else:
        tool_data, error_data = github_miner.scrape_repo(
            repo_name=tool_repo, real_name=tool_name)

    if not tool_data.empty:
        tools_data = pd.concat([tools_data, tool_data], ignore_index=True)
    else:
        print(error_data)

# add closed-source asset-management tools
for tool_name in tools_link.keys():
    tool_data = {
        'Name': tool_name,
        'Link': tools_link[tool_name],
        'First Release Date': pd.to_datetime(tools_release_date[tool_name])
    }
    tool_data = pd.DataFrame([tool_data])
    tools_data = pd.concat([tools_data, tool_data], ignore_index=True)

tools_data.to_json(os.path.join(path_dataset, 'Tools.json'),
                   indent=4, orient='records')
tools_data


Unnamed: 0,Repo,Link,Repo Creation Date,Last Commit Date,Topics,Language,Size,#Star,#Watch,#Fork,#Contributors,#Branches,#Releases,#Commits,#Pull Requests,#Pull Requests (Open),#Issues,#Issues (Open),Name,First Release Date
0,aimhubio/aim,https://github.com/aimhubio/aim,2019-05-31 18:25:07,2023-01-31 12:13:08,"[python, ai, data-science, data-visualization,...",TypeScript,59947.0,3065.0,36.0,191.0,50.0,77.0,46.0,2009.0,1708.0,21.0,2491.0,223.0,Aim,2022-01-22 13:45:58
1,aws/sagemaker-python-sdk,https://github.com/aws/sagemaker-python-sdk,2017-11-14 01:03:33,2023-01-31 01:53:20,"[aws, mxnet, tensorflow, machine-learning, pyt...",Python,108723.0,1770.0,132.0,926.0,309.0,15.0,479.0,2850.0,2294.0,51.0,3458.0,455.0,Amazon SageMaker,2017-11-19 00:00:00
2,Azure/azure-sdk-for-python,https://github.com/Azure/azure-sdk-for-python,2012-04-24 16:46:12,2023-02-01 02:54:36,"[python, azure, azure-sdk, hacktoberfest]",Python,537971.0,3473.0,362.0,2186.0,398.0,583.0,2697.0,13516.0,20859.0,145.0,28484.0,927.0,Azure Machine Learning,2015-02-18 00:00:00
3,allegroai/clearml,https://github.com/allegroai/clearml,2019-06-10 08:18:32,2023-01-26 17:11:47,"[version-control, experiment-manager, version,...",Python,42437.0,4027.0,83.0,542.0,62.0,3.0,75.0,1979.0,180.0,2.0,890.0,333.0,ClearML,2019-06-11 17:27:11
4,codalab/codalab-worksheets,https://github.com/codalab/codalab-worksheets,2014-11-30 22:33:18,2023-01-29 22:07:50,[],Python,28204.0,135.0,18.0,79.0,54.0,133.0,114.0,4541.0,2242.0,30.0,4366.0,382.0,Codalab,2017-05-14 00:32:55
5,iterative/dvc,https://github.com/iterative/dvc,2017-03-04 08:16:33,2023-02-01 03:25:50,"[data-science, machine-learning, reproducibili...",Python,17626.0,11008.0,136.0,1024.0,254.0,10.0,415.0,8365.0,4544.0,8.0,8652.0,626.0,DVC,2017-05-04 08:03:08
6,determined-ai/determined,https://github.com/determined-ai/determined,2020-04-07 16:12:29,2023-02-01 03:19:51,"[deep-learning, machine-learning, ml-platform,...",Python,108561.0,2024.0,62.0,276.0,73.0,182.0,76.0,4923.0,5637.0,65.0,5891.0,86.0,Determined,2020-04-08 20:01:20
7,dominodatalab/python-domino,https://github.com/dominodatalab/python-domino,2016-05-16 22:58:02,2023-01-17 21:37:32,[],Python,488.0,51.0,28.0,50.0,32.0,52.0,14.0,201.0,132.0,4.0,167.0,15.0,Domino,2020-08-05 05:16:39
8,guildai/guildai,https://github.com/guildai/guildai,2017-09-27 18:57:50,2023-01-25 14:47:47,[],Python,16971.0,771.0,13.0,71.0,20.0,65.0,2.0,5382.0,69.0,1.0,467.0,185.0,Guild AI,2022-04-28 14:31:07
9,kedro-org/kedro,https://github.com/kedro-org/kedro,2019-04-18 10:29:56,2023-01-30 10:11:12,"[pipeline, kedro, hacktoberfest, mlops, experi...",Python,165970.0,8041.0,102.0,760.0,169.0,32.0,34.0,2148.0,972.0,16.0,2064.0,245.0,Kedro,2019-06-03 16:15:43


In [None]:
dependents = pd.DataFrame()

# collect dependents for tools with coding patterns
for tool_name in tools_keywords.keys():
    github_dependents = []
    gitlab_dependents = []

    # collect Github dependents
    file_name = os.path.join(path_github_repo_raw, tool_name + '.json')
    if os.path.exists(file_name):
        with open(file_name, encoding='utf8') as json_file:
            json_data = json.load(json_file)
            # either search by sourcegraph
            if 'Results' in json_data:
                for repo_file in json_data['Results']:
                    # file name match pattern
                    if 'FileMatch' == repo_file['__typename'] and repo_file['repository']['name'].startswith('github'):
                        repo_name = repo_file['repository']['name'].removeprefix(
                            'github.com/')
                        github_dependents.append(repo_name)
                    # code usage match pattern
                    elif 'Repository' == repo_file['__typename'] and repo_file['name'].startswith('github'):
                        repo_name = repo_file['name'].removeprefix(
                            'github.com/')
                        github_dependents.append(repo_name)
            # or search by dependent graph
            elif 'all_public_dependent_repos' in json_data:
                for repo_file in json_data['all_public_dependent_repos']:
                    github_dependents.append(repo_file['name'])

    # collect Gitlab dependents
    file_name = os.path.join(path_gitlab_repo_raw, tool_name + '.json')
    if os.path.exists(file_name):
        with open(file_name, encoding='utf8') as json_file:
            json_data = json.load(json_file)
            # search by sourcegraph exclusively
            for repo_file in json_data['Results']:
                # file name match pattern
                if 'FileMatch' == repo_file['__typename'] and repo_file['repository']['name'].startswith('gitlab'):
                    repo_name = repo_file['repository']['name'].removeprefix(
                        'gitlab.com/')
                    gitlab_dependents.append(repo_name)
                # code usage match pattern
                elif 'Repository' == repo_file['__typename'] and repo_file['name'].startswith('gitlab'):
                    repo_name = repo_file['name'].removeprefix('gitlab.com/')
                    gitlab_dependents.append(repo_name)

    # remove tool repo from dependents if any
    if tool_name in tools_repo and tools_repo[tool_name] in github_dependents:
        github_dependents.remove(tools_repo[tool_name])

    # no need to add tools without dependents
    if not len(github_dependents) and not len(gitlab_dependents):
        continue

    dependent = {
        'Tool': tool_name,
        'GitHub Dependents': github_dependents,
        'GitLab Dependents': gitlab_dependents
    }

    dependents = pd.concat(
        [dependents, pd.DataFrame([dependent])], ignore_index=True)

dependents.to_json(os.path.join(
    path_dataset, 'Dependents.json'), indent=4, orient='records')


In [10]:
dependents_summary = pd.DataFrame(
    columns=['Tool', '#GitHub Dependents', '#GitLab Dependents'])
for index, row in dependents.iterrows():
    dependent_data = {
        'Tool': row['Tool'],
        '#GitHub Dependents': len(row['GitHub Dependents']),
        '#GitLab Dependents': len(row['GitLab Dependents'])
    }
    dependent_data = pd.DataFrame([dependent_data])
    dependents_summary = pd.concat(
        [dependents_summary, dependent_data], ignore_index=True)
# dependents_summary.sort_values(by=['#GitHub Dependents', '#GitLab Dependents'], ascending=False, inplace=True)
dependents_summary

Unnamed: 0,Tool,#GitHub Dependents,#GitLab Dependents
0,Aim,92,1
1,Amazon SageMaker,931,3
2,Azure Machine Learning,689,0
3,ClearML,303,0
4,Codalab,30,0
5,Comet,480,0
6,Determined,44,0
7,Domino,2,0
8,DVC,4229,0
9,Guild AI,53,4


In [5]:
df_dependents = pd.read_json(os.path.join(path_dataset, 'Dependents.json'))
df_tools = pd.read_json(os.path.join(path_dataset, 'Tools.json'))


In [None]:
# scrape Gitlab dependents general information for each tool
for index, row in df_dependents.iterrows():
    print(f'{index}: {row["Tool"]}')
    repos_data, errors_data = gitlab_miner.scrape_repo_list(
        row['GitLab Dependents'])

    if not repos_data.empty:
        repos_data = repos_data.sort_values(by='#Issues', ascending=False)
        repos_data.to_json(os.path.join(
            path_gitlab_repo_scraped, f'{row["Tool"]}.json'), indent=4, orient='records')

    if not errors_data.empty:
        errors_data.to_json(os.path.join(path_gitlab_repo_scraped,
                            f'Discarded.{row["Tool"]}.json'), indent=4, orient='records')

In [None]:
# scrape issues of Gitlab dependents for each tool
for index, row in df_tools.iterrows():
    file_name = os.path.join(path_gitlab_repo_scraped, f'{row["Name"]}.json')
    if os.path.exists(file_name):
        repos = pd.read_json(file_name)
        # filter out repos without any issues
        repos = repos[repos['#Issues'] > 0]
        # filter out repos created before the tool's first release date
        repos = repos[repos['Repo Creation Date'] > row['First Release Date']]
        print(f'{row["Name"]}: {repos["#Issues"].sum()}')
        # scrape issues for the current tool
        issues = gitlab_miner.scrape_issue_list(repos['Repo'].tolist())
        if not issues.empty:
            issues.to_json(os.path.join(path_gitlab_issue_raw,
                                        f'{row["Name"]}.json'), indent=4, orient='records')


In [14]:
# exclude Gitlab issues that are not related to each tool
valid_issues_all = pd.DataFrame()
valid_fixes_all = pd.DataFrame()

for file_name in glob.glob(os.path.join(path_gitlab_issue_raw, '*.json')):
    issues = pd.read_json(file_name)
    valid_issues = pd.DataFrame()
    valid_fixes = pd.DataFrame()
    tool_name = os.path.split(file_name)[1].split('.')[0]
    
    for index, issue in issues.iterrows():
        for keyword in tools_keywords[tool_name]:
            if 'amazon' in keyword:
                continue
            if keyword in issue['Issue_title'].lower():
                valid_issue = pd.DataFrame([issue])
                valid_issues = pd.concat(
                    [valid_issues, valid_issue], ignore_index=True)
                if not pd.isnull(issue['Issue_closed_time']):
                    valid_fixes = pd.concat(
                        [valid_fixes, valid_issue], ignore_index=True)
                break

    if not valid_issues.empty:
        valid_issues['Tool'] = tool_name
        valid_issues_all = pd.concat(
            [valid_issues_all, valid_issues], ignore_index=True)
        if not valid_fixes.empty:
            valid_fixes['Tool'] = tool_name
            valid_fixes_all = pd.concat(
                [valid_fixes_all, valid_fixes], ignore_index=True)

valid_issues_all = valid_issues_all[~valid_issues_all['Tool'].isin(
    ignore_tools)]
valid_fixes_all = valid_fixes_all[~valid_fixes_all['Tool'].isin(ignore_tools)]
valid_issues_all.head()


Unnamed: 0,Issue_link,Issue_title,Issue_label,Issue_creation_time,Issue_closed_time,Issue_upvote_count,Issue_downvote_count,Issue_body,Tool
0,https://gitlab.com/fluidattacks/universe/-/iss...,[Sorts] Add sagemaker dependencies,"[arena::security, product::sorts, type::bug]",2022-12-19 20:28:18.985,NaT,0,0,"<!-- Issues are public, they should not contai...",Amazon SageMaker
1,https://gitlab.com/fluidattacks/universe/-/iss...,Enable sagemaker,[],2020-04-21 18:40:24.230,2020-05-07 21:34:48.408,0,0,https://docs.aws.amazon.com/sagemaker/,Amazon SageMaker
2,https://gitlab.com/librespacefoundation/polari...,Saving behave logs in MLflow,[],2021-07-06 19:33:14.309,NaT,0,0,"Like we do in learn, we should also save the p...",MLflow
3,https://gitlab.com/librespacefoundation/polari...,ML Database vs. MLflow,[learn],2020-04-25 17:51:29.061,NaT,1,0,I am mainly working on the feature selection p...,MLflow
4,https://gitlab.com/librespacefoundation/polari...,Namespacing polaris runs for logging purposes ...,"[improvement, learn]",2020-01-31 21:57:59.518,NaT,0,0,Background\n==========\n\nEverytime analysis i...,MLflow


In [15]:
final = set()
for _, row in valid_issues_all['Issue_label'].map(set).items():
    final = final.union(row)
final

{'arena::security', 'improvement', 'learn', 'product::sorts', 'type::bug'}

In [20]:
# filter out issues that are not related to challenges
valid_issues_filtered = []
valid_fixes_filtered = []

for index, row in valid_issues_all.iterrows():
    if not row['Issue_title'].isascii():
        continue
       
    break_sign = False
    for label_repo in row['Issue_label']:
        for label_question in issue_labels:
            if not break_sign and label_question in label_repo.lower():
                valid_issues_filtered.append(row)
                break_sign = True

for index, row in valid_fixes_all.iterrows():
    if not row['Issue_title'].isascii():
        continue
       
    break_sign = False
    for label_repo in row['Issue_label']:
        for label_question in issue_labels:
            if not break_sign and label_question in label_repo.lower():
                valid_fixes_filtered.append(row)
                break_sign = True

valid_issues_filtered = pd.concat(valid_issues_filtered, axis=1).T
valid_issues_filtered.to_json(os.path.join(
    path_gitlab_issue_filtered, 'issues.json'), indent=4, orient='records')

if valid_fixes_filtered:
    valid_fixes_filtered = pd.concat(valid_fixes_filtered, axis=1).T
    valid_fixes_filtered.to_json(os.path.join(
        path_gitlab_issue_filtered, 'fixes.json'), indent=4, orient='records')


In [22]:
summary_issues = valid_issues_filtered.groupby(
    'Tool').count()['Issue_title'].reset_index()
summary_issues.rename(columns={'Issue_title': '#Issue'}, inplace=True)
summary_issues = summary_issues.astype({'#Issue': 'int32'})
summary_issues.to_csv(os.path.join(
    path_gitlab_issue, 'summary.csv'), index=False)
summary_issues


Unnamed: 0,Tool,#Issue
0,Amazon SageMaker,1


In [None]:
# scrape Github dependents general information for each tool
for index, row in df_dependents.iterrows():
    print(f'{index}: {row["Tool"]}')
    repos_data, errors_data = github_miner.scrape_repo_list(
        row['GitHub Dependents'])

    if not repos_data.empty:
        repos_data = repos_data.sort_values(by='#Issues', ascending=False)
        repos_data.to_json(os.path.join(
            path_github_repo_scraped, f'{row["Tool"]}.json'), indent=4, orient='records')

    if not errors_data.empty:
        errors_data.to_json(os.path.join(path_github_repo_scraped,
                            f'Discarded.{row["Tool"]}.json'), indent=4, orient='records')

In [None]:
# scrape issues of Github dependents for each tool
for index, row in df_tools.iterrows():
    file_name = os.path.join(path_github_repo_scraped, f'{row["Name"]}.json')
    if os.path.exists(file_name):
        repos = pd.read_json(file_name)
        # filter out repos with only pr-based issues
        repos = repos[repos['#Issues'] > repos['#Pull Requests']]
        # filter out repos created before the tool's first release date
        repos = repos[repos['Repo Creation Date'] > row['First Release Date']]
        print(
            f'{row["Name"]}: {repos["#Issues"].sum() - repos["#Pull Requests"].sum()}')
        # scrape issues for the current tool
        issues = github_miner.scrape_issue_list(repos['Repo'].tolist())
        if not issues.empty:
            issues.to_json(os.path.join(path_github_issue_raw,
                           f'{row["Name"]}.json'), indent=4, orient='records')


In [57]:
# exclude Github issues that are not related to each tool
valid_issues_all = pd.DataFrame()
valid_fixes_all = pd.DataFrame()

for file_name in glob.glob(os.path.join(path_github_issue_raw, '*.json')):
    issues = pd.read_json(file_name)
    valid_issues = pd.DataFrame()
    valid_fixes = pd.DataFrame()
    tool_name = os.path.split(file_name)[1].split('.')[0]

    for index, issue in issues.iterrows():
        for keyword in tools_keywords[tool_name]:
            if 'amazon' in keyword:
                continue
            if keyword in issue['Issue_title'].lower():
                valid_issue = pd.DataFrame([issue])
                valid_issues = pd.concat(
                    [valid_issues, valid_issue], ignore_index=True)
                if not pd.isnull(issue['Issue_closed_time']):
                    valid_fixes = pd.concat(
                        [valid_fixes, valid_issue], ignore_index=True)
                break

    if not valid_issues.empty:
        valid_issues['Tool'] = tool_name
        valid_issues_all = pd.concat(
            [valid_issues_all, valid_issues], ignore_index=True)
        if not valid_fixes.empty:
            valid_fixes['Tool'] = tool_name
            valid_fixes_all = pd.concat(
                [valid_fixes_all, valid_fixes], ignore_index=True)

valid_issues_all = valid_issues_all[~valid_issues_all['Tool'].isin(
    ignore_tools)]
valid_fixes_all = valid_fixes_all[~valid_fixes_all['Tool'].isin(ignore_tools)]
valid_issues_all.head()


Unnamed: 0,Issue_link,Issue_title,Issue_label,Issue_creation_time,Issue_closed_time,Issue_upvote_count,Issue_downvote_count,Issue_comment_count,Issue_body,Tool
0,https://github.com/h2oai/dai-deployment-templa...,[Feature Request] Propagate mojo scorer images...,[WIP - Susankha],2021-11-10 19:28:00,NaT,0,0,2,### Request:\r\nThis request is a follow up to...,Amazon SageMaker
1,https://github.com/h2oai/dai-deployment-templa...,[AWS/Sagemaker] Reduce Code Base,[],2020-02-13 21:07:27,NaT,0,0,0,After research stemming from gcp deployment me...,Amazon SageMaker
2,https://github.com/utterworks/fast-bert/issues...,Issue with finetuning pretraining Language mo...,[],2020-10-21 14:28:14,NaT,0,0,0,Hi Iam using the container_lm code base to fin...,Amazon SageMaker
3,https://github.com/utterworks/fast-bert/issues...,Error for training job failed. reason: algorit...,[],2020-10-03 13:37:14,2020-10-09 17:13:38,0,0,1,"Hello, \r\n\r\nI was training a DistilBERT mod...",Amazon SageMaker
4,https://github.com/utterworks/fast-bert/issues...,Using multiple training instances in AWS Sagem...,[],2020-04-23 12:22:33,NaT,0,0,4,Is it possible to speedup BERT training by usi...,Amazon SageMaker


In [58]:
final = set()
for _, row in valid_issues_all['Issue_label'].map(set).items():
    final = final.union(row)
final

{'"experiments"',
 '0.4.6',
 '1.1',
 '1.4',
 '1.6',
 '1.7',
 '3 - Quality of Life',
 '3rd party',
 '3rd party update',
 ':bridge_at_night:  Bridge',
 ':bug: bug',
 ':rotating_light:',
 '? - Needs Triage',
 'A: example-dvc-experiments',
 'A: example-get-started',
 'ADO',
 'AI\u202fFrameworks/ONNX',
 'AML Compute Instance',
 'API',
 'API & Doc',
 'Auto\u202fML',
 'Cloud',
 'Community',
 'Compute',
 'Core UI',
 'DRL',
 'Data4ML',
 'Data\u202fDrift',
 'Data\u202fPrep\u202fServices',
 'Documentation',
 'ERRATA_CANDIDATE',
 'Enhancement',
 'Environments',
 'Evaluation',
 'Experimentation UI',
 'FAQ',
 'Feature - Medium Priority',
 'HIGH',
 'HPO',
 'Hyperdrive',
 'Important',
 'In the roadmap',
 'Inf1',
 'Inference',
 'Ingestion',
 'Issue: Bug Report 🐞',
 'Issue: Feature Request',
 'L',
 'LOE: S',
 'Localized',
 'MLOps',
 'NLP',
 'NUM',
 'Needs Triage',
 'Not related to PyCaret',
 'Notebook',
 'Optional',
 'P0',
 'P1',
 'P2',
 'Pipelines',
 'Priority 1',
 'Reinforcement Learning',
 'RepoOffic

In [60]:
# filter out issues that are not related to challenges
valid_issues_filtered = []
valid_fixes_filtered = []

for index, row in valid_issues_all.iterrows():
    if not row['Issue_title'].isascii():
        continue
    
    break_sign = False
    for label_repo in row['Issue_label']:
        for label_question in issue_labels:
            if not break_sign and label_question in label_repo.lower():
                valid_issues_filtered.append(row)
                break_sign = True

for index, row in valid_fixes_all.iterrows():
    if not row['Issue_title'].isascii():
        continue
       
    break_sign = False
    for label_repo in row['Issue_label']:
        for label_question in issue_labels:
            if not break_sign and label_question in label_repo.lower():
                valid_fixes_filtered.append(row)
                break_sign = True

valid_issues_filtered = pd.concat(valid_issues_filtered, axis=1).T
valid_fixes_filtered = pd.concat(valid_fixes_filtered, axis=1).T

valid_issues_filtered.to_json(os.path.join(
    path_github_issue_filtered, 'issues.json'), indent=4, orient='records')
valid_fixes_filtered.to_json(os.path.join(
    path_github_issue_filtered, 'fixes.json'), indent=4, orient='records')


In [61]:
summary_issues = valid_issues_filtered.groupby(
    'Tool').count()['Issue_title'].reset_index()
summary_fixes = valid_fixes_filtered.groupby(
    'Tool').count()['Issue_title'].reset_index()
summary_issues.rename(columns={'Issue_title': '#Issue'}, inplace=True)
summary_fixes.rename(columns={'Issue_title': '#Closed'}, inplace=True)
summary_github = summary_issues.merge(
    summary_fixes, on='Tool', how='outer').fillna(0)
summary_github = summary_github.astype({'#Issue': 'int32', '#Closed': 'int32'})
summary_github


Unnamed: 0,Tool,#Issue,#Closed
0,Amazon SageMaker,69,57
1,Azure Machine Learning,36,19
2,ClearML,3,2
3,Comet,22,20
4,DVC,29,22
5,Kedro,18,14
6,MLflow,105,89
7,Neptune,12,10
8,SigOpt,4,3
9,Vertex AI,5,4


In [64]:
# concatenate Github and Gitlab issues
df_issue_github = pd.read_json(os.path.join(path_github_issue_filtered, 'issues.json'))
df_issue_gitlab = pd.read_json(os.path.join(path_gitlab_issue_filtered, 'issues.json'))

df_issue_github['Platform'] = 'Github'
df_issue_gitlab['Platform'] = 'Gitlab'

df_issue_all = pd.concat([df_issue_github, df_issue_gitlab], ignore_index=True)
df_issue_all.to_json(os.path.join(path_labeling_issue_native, 'issues.json'), indent=4, orient='records')

df_issue_all['Issue_summary'] = ''
df_issue_all.to_json(os.path.join(path_labeling_issue_gpt, 'issues.json'), indent=4, orient='records')

# concatenate Github and Gitlab fixes
df_fix_github = pd.read_json(os.path.join(path_github_issue_filtered, 'fixes.json'))
df_fix_github['Platform'] = 'Github'
df_fix_github['Fix_summary'] = ''
df_fix_github.to_json(os.path.join(path_labeling_fix, 'fixes_prefill.json'), indent=4, orient='records')

In [8]:
# prepare tool-related keywords that would be removed from the issue content for better topic modeling 
tool_keyword_list = []
for tool_keyword in tools_keywords.values():
    tool_keyword_list.extend(tool_keyword)
# handle special cases on amazon sagemaker
tool_keyword_list.remove('sagemaker')
tool_keyword_list.append('sagemaker')
tool_keyword_list

['aim',
 'amazon sagemaker',
 'azureml',
 'azure machine learning',
 'clearml',
 'cnvrg',
 'codalab',
 'comet',
 'determined',
 'domino',
 'dvc',
 'guild ai',
 'kedro',
 'mlflow',
 'mlrun',
 'modeldb',
 'neptune',
 'polyaxon',
 'sacred',
 'sigopt',
 'valohai',
 'vertex ai',
 'wandb',
 'weights & biases',
 'weights and biases',
 'sagemaker']

In [66]:
# manually preprocess the content of the issues and then save the results back to the json files
df_issues = pd.read_json(os.path.join(path_labeling_issue_gpt, 'issues.json'))
for index, row in df_issues.iterrows():
    if (len(str(row['Issue_body'])) > 10000):
        print(row['Issue_link'])

https://github.com/autogluon/autogluon/issues/268
https://github.com/awslabs/gluonts/issues/426
https://github.com/kedro-org/kedro/issues/308
https://github.com/zenml-io/zenml/issues/767
https://github.com/microsoft/computervision-recipes/issues/332
https://github.com/microsoft/computervision-recipes/issues/320
https://github.com/microsoft/nni/issues/3518
https://github.com/Lightning-AI/lightning/issues/9879
https://github.com/databrickslabs/dbx/issues/548
https://github.com/nv-morpheus/Morpheus/issues/512
https://github.com/prinz-nussknacker/prinz/issues/78
https://github.com/pycaret/pycaret/issues/2838
https://github.com/pycaret/pycaret/issues/931
https://github.com/microsoft/qlib/issues/1035
https://github.com/graphnet-team/graphnet/issues/316


In [71]:
# Example No.1: feed the issue content (with code) to the text-davinci-003 model and get the summary, then feed the summary to topic model and get the topics
df_issues = pd.read_json(os.path.join(path_labeling_issue_gpt, 'issues.json'))
df_issues_todo = df_issues[df_issues['Issue_summary'] == '']
df_issues_done = df_issues[df_issues['Issue_summary'] != '']
df_issues_todo.shape[0], df_issues_done.shape[0]

(0, 345)

In [None]:
error_requests = []

for index, row in df_issues_todo.iterrows():
    print(f'working on issue {index}')
    try:
        response = openai.Completion.create(
            model='text-davinci-003',
            prompt='Use 1 to 2 sentences to summarize the following issue.\nText: """' + 'Title: ' +
            row['Issue_title'].strip() + '; Content: ' +
            str(row['Issue_body']).strip() + '""""\n',
            temperature=0,
            max_tokens=200,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            timeout=10,
            stream=False
        )
        df_issues_todo.at[index,
                          'Issue_summary'] = response['choices'][0]['text'].strip()
    except Exception as e:
        print(e)
        error_requests.append(index)
    time.sleep(5)

error_requests

df_issues = pd.concat(
    [df_issues_todo, df_issues_done], ignore_index=True)
df_issues.to_json(os.path.join(
    path_labeling_issue_gpt_code, 'issues_summary.json'), indent=4, orient='records')


In [296]:
# preprocess the summary of the issues
df_issues = pd.read_json(os.path.join(path_labeling_issue_gpt_code, 'issues_summary.json'))
df_issues['Issue_summary_preprocessed'] = ''

for index, row in df_issues.iterrows():
    summary = row['Issue_summary'].lower()
    for tool_keyword in tool_keyword_list:
        if tool_keyword in summary:
            summary = summary.replace(tool_keyword, 'tool')
    df_issues.at[index, 'Issue_summary_preprocessed'] = summary
df_issues.to_json(os.path.join(path_labeling_issue_gpt_code, 'issues_summary_preprocessed.json'), indent=4, orient='records')

In [None]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic import BERTopic

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-mpnet-base-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = TfidfVectorizer(max_df=0.7, min_df=2, stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# All steps together
topic_model = BERTopic(
  embedding_model=embedding_model,    # Step 1 - Extract embeddings
  umap_model=umap_model,              # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
  diversity=0.5,                      # Step 6 - Diversify topic words
  verbose=True
)

df_issues = pd.read_json(os.path.join(path_labeling_issue_gpt_code, 'issues_summary_preprocessed.json'))
topic_model = topic_model.fit(df_issues['Issue_summary_preprocessed'].tolist())
topic_model.get_topic_info()
topic_model.visualize_topics()
topic_model.visualize_documents(df_issues['Issue_summary_preprocessed'].tolist())
hierarchical_topics = topic_model.hierarchical_topics(df_issues['Issue_summary_preprocessed'].tolist())
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
topic_model.visualize_barchart()
topic_model.visualize_heatmap()
topic_model.visualize_term_rank(log_scale=True)
# topic_model.save(os.path.join(path_labeling_issue_gpt_code, 'topic_model'))

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

2023-02-11 09:23:41,506 - BERTopic - Transformed documents to Embeddings
2023-02-11 09:23:44,682 - BERTopic - Reduced dimensionality
2023-02-11 09:23:44,704 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name
0,-1,106,-1_logger_config_bug_yaml
1,0,47,0_toollogger_metrics_training_experiment
2,1,46,1_ui_configuration_does_artifact
3,2,29,2_bucket_tool_16_instances
4,3,27,3_train_error_caused_callback
5,4,18,4_model_saving_loaded_dataset
6,5,13,5_deployment_ml_sdk_yml
7,6,10,6_workspace_cloning_bug_instance
8,7,10,7_run_failing_ui_job
9,8,10,8_cluster_updated_github_pull


100%|██████████| 12/12 [00:04<00:00,  2.69it/s]


In [299]:
# Example No.2: feed the issue content (with code) to topic model and get the topics
df_issues = pd.read_json(os.path.join(path_labeling_issue, 'issues.json'))
df_issues['Issue_content_preprocessed'] = ''

for index, row in df_issues.iterrows():
    content = 'Title: ' + row['Issue_title'].lower() + '; Content:' + str(row['Issue_body']).lower()
    for tool_keyword in tool_keyword_list:
        if tool_keyword in content:
            content = content.replace(tool_keyword, 'tool')
    df_issues.at[index, 'Issue_content_preprocessed'] = content
df_issues.to_json(os.path.join(path_labeling_issue, 'issues_preprocessed_with_code.json'), indent=4, orient='records')

In [13]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic import BERTopic

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-mpnet-base-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = TfidfVectorizer(max_df=0.7, min_df=2, stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# All steps together
topic_model = BERTopic(
  embedding_model=embedding_model,    # Step 1 - Extract embeddings
  umap_model=umap_model,              # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
  diversity=0.5,                      # Step 6 - Diversify topic words
  verbose=True
)

df_issues = pd.read_json(os.path.join(path_labeling_issue, 'issues_preprocessed_with_code.json'))
topic_model = topic_model.fit(df_issues['Issue_content_preprocessed'].tolist())
topic_model.get_topic_info()
topic_model.visualize_topics()
topic_model.visualize_documents(df_issues['Issue_content_preprocessed'].tolist())
hierarchical_topics = topic_model.hierarchical_topics(df_issues['Issue_content_preprocessed'].tolist())
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
topic_model.visualize_barchart()
topic_model.visualize_heatmap()
topic_model.visualize_term_rank(log_scale=True)
# topic_model.save(os.path.join(path_labeling_issue_native_code, 'topic_model'))

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

2023-02-11 10:14:43,228 - BERTopic - Transformed documents to Embeddings
2023-02-11 10:14:45,719 - BERTopic - Reduced dimensionality
2023-02-11 10:14:45,739 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name
0,-1,5,-1_run_tool_failed_ui
1,0,332,0_file_python3_tool_self
2,1,8,1_workspace_connect_notebook_bug


ValueError: zero-size array to reduction operation maximum which has no identity

In [9]:
# Example No.3: feed the issue content (without code) to the text-davinci-003 model and get the summary, then feed the summary to topic model and get the topics
import re

regex = r"(<.+?>)|(\`{3}.+?\`{3})|(\`{2}.+?\`{2})|(\`{1}.+?\`{1})|({.*?})|(\\u[0-9a-z]+)|((!)?\[.*?\])|(\(.+?\))|([a-z0-9-\.]+[<=>]=[a-z0-9\.]+)|(@[a-z0-9]+)|((https?:\/)?\/[^\s]+)|(\\[^\s]+)|([^\s]+\\[^\s]+)|([^\s]+\.[^\s]+)|([a-z]+_[a-z]+)|(_+[a-z]+_*)|(_*[a-z]+_+)|([0-9\|\-\r\n\t\"\*#*=~-]+)"

# preprocess the content of the issues
df_issues = pd.read_json(os.path.join(path_labeling_issue, 'issues.json'))
df_issues['Issue_content_preprocessed'] = ''

for index, row in df_issues.iterrows():
    content = 'Title: ' + row['Issue_title'].lower() + '; Content:' + str(row['Issue_body']).lower()
    content = content.encode('ascii', errors='ignore').decode('ascii')
    content = re.sub(regex, ' ', content, flags=re.S)
    for tool_keyword in tool_keyword_list:
        if tool_keyword in content:
            content = content.replace(tool_keyword, 'tool')
    content = ' '.join([w for w in content.split() if len(w)>1])
    df_issues.at[index, 'Issue_content_preprocessed'] = content

df_issues.to_json(os.path.join(path_labeling_issue, 'issues_preprocessed_without_code.json'), indent=4, orient='records')

In [291]:
error_requests = []

for index, row in df_issues.iterrows():
    print(f'working on issue {index}')
    try:
        response = openai.Completion.create(
            model='text-davinci-003',
            prompt='Use 1 to 2 sentences to summarize the following issue.\nText: """' +
            row['Issue_content_preprocessed'] + '""""\n',
            temperature=0,
            max_tokens=200,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            timeout=10,
            stream=False
        )
        df_issues.at[index,
                     'Issue_summary'] = response['choices'][0]['text'].strip()
    except Exception as e:
        print(e)
        error_requests.append(index)
    time.sleep(5)

error_requests

df_issues.to_json(os.path.join(
    path_labeling_issue_gpt_text, 'issues_summary.json'), indent=4, orient='records')

working on issue 0
working on issue 1
working on issue 2
working on issue 3
working on issue 4
working on issue 5
working on issue 6
working on issue 7
working on issue 8
working on issue 9
working on issue 10
working on issue 11
working on issue 12
working on issue 13
working on issue 14
working on issue 15
working on issue 16
working on issue 17
working on issue 18
working on issue 19
working on issue 20
working on issue 21
working on issue 22
working on issue 23
working on issue 24
working on issue 25
working on issue 26
working on issue 27
working on issue 28
working on issue 29
working on issue 30
working on issue 31
working on issue 32
working on issue 33
working on issue 34
working on issue 35
working on issue 36
working on issue 37
working on issue 38
working on issue 39
working on issue 40
working on issue 41
working on issue 42
working on issue 43
working on issue 44
working on issue 45
working on issue 46
working on issue 47
working on issue 48
working on issue 49
working on

[]

In [12]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic import BERTopic

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-mpnet-base-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = TfidfVectorizer(max_df=0.7, min_df=2, stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# All steps together
topic_model = BERTopic(
  embedding_model=embedding_model,    # Step 1 - Extract embeddings
  umap_model=umap_model,              # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
  diversity=0.5,                      # Step 6 - Diversify topic words
  verbose=True
)

df_issues = pd.read_json(os.path.join(path_labeling_issue_gpt_text, 'issues_summary.json'))
topic_model = topic_model.fit(df_issues['Issue_summary'].tolist())
topic_model.get_topic_info()
topic_model.visualize_topics()
topic_model.visualize_documents(df_issues['Issue_summary'].tolist())
hierarchical_topics = topic_model.hierarchical_topics(df_issues['Issue_summary'].tolist())
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
topic_model.visualize_barchart()
topic_model.visualize_heatmap()
topic_model.visualize_term_rank(log_scale=True)
# topic_model.save(os.path.join(path_labeling_issue_gpt_text, 'topic_model'))

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

2023-02-11 10:10:52,942 - BERTopic - Transformed documents to Embeddings
2023-02-11 10:10:55,632 - BERTopic - Reduced dimensionality
2023-02-11 10:10:55,657 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name
0,-1,97,-1_command_issue_notebook_github
1,0,29,0_chart_helm_job_kubeflow
2,1,25,1_cased_xlnet_matrix_predictor
3,2,23,2_logger_parameters_hydra_errors
4,3,17,3_trainer_logging_rate_callback
5,4,16,4_benchmark_xcom_backtest_async
6,5,16,5_workspace_notebook_hong_intermittent
7,6,12,6_pytorch_lightning_stderr_tensorboard
8,7,12,7_user_filenotfounderror_mlflow_scripts
9,8,12,8_pipelinemodel_deepcopy_viz_datasets


100%|██████████| 19/19 [00:07<00:00,  2.48it/s]


In [11]:
# Example No.4: feed the issue content (without code) to topic model and get the topics
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic import BERTopic

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-mpnet-base-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = TfidfVectorizer(max_df=0.7, min_df=2, stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# All steps together
topic_model = BERTopic(
  embedding_model=embedding_model,    # Step 1 - Extract embeddings
  umap_model=umap_model,              # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
  diversity=0.5,                      # Step 6 - Diversify topic words
  verbose=True
)

# preprocess the content of the issues
df_issues = pd.read_json(os.path.join(path_labeling_issue, 'issues_preprocessed_without_code.json'))
topic_model = topic_model.fit(df_issues['Issue_content_preprocessed'].tolist())
topic_model.get_topic_info()
topic_model.visualize_topics()
topic_model.visualize_documents(df_issues['Issue_content_preprocessed'].tolist())
hierarchical_topics = topic_model.hierarchical_topics(df_issues['Issue_content_preprocessed'].tolist())
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
topic_model.visualize_barchart()
topic_model.visualize_heatmap()
topic_model.visualize_term_rank(log_scale=True)
# topic_model.save(os.path.join(path_labeling_issue_native_text, 'topic_model'))

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

2023-02-11 10:08:39,388 - BERTopic - Transformed documents to Embeddings
2023-02-11 10:08:41,958 - BERTopic - Reduced dimensionality
2023-02-11 10:08:41,984 - BERTopic - Clustered reduced embeddings


Unnamed: 0,Topic,Count,Name
0,-1,97,-1_stdout_tool_model_toolpipelinemodel
1,0,59,0_pytorch_logging_loss_training
2,1,38,1_content_zenml_fetch_oauthtoken
3,2,22,2_tool_result_projects_cli
4,3,16,3_experiment_false_resume_toolexception
5,4,16,4_aws_arn_partition_regions
6,5,14,5_prefix_ui_overrides_async
7,6,13,6_line_pip_package_installing
8,7,13,7_connect_browser_workspaces_ups
9,8,10,8_types_deploy_pipelines_endpoint


100%|██████████| 14/14 [00:04<00:00,  2.87it/s]
