In [1]:
import pandas as pd
import numpy as np

import glob
import time
import os

In [2]:
path_dataset = '../Dataset'

path_github = os.path.join(path_dataset, 'GitHub')
path_gitlab = os.path.join(path_dataset, 'GitLab')
path_labeling = os.path.join(path_dataset, 'Labeling')

path_github_repo = os.path.join(path_github, 'Repo')
path_gitlab_repo = os.path.join(path_gitlab, 'Repo')
path_github_issue = os.path.join(path_github, 'Issue')
path_gitlab_issue = os.path.join(path_gitlab, 'Issue')
path_github_issue_raw = os.path.join(path_github_issue, 'Raw')
path_gitlab_issue_raw = os.path.join(path_gitlab_issue, 'Raw')
path_github_issue_filtered = os.path.join(path_github_issue, 'Filtered')
path_gitlab_issue_filtered = os.path.join(path_gitlab_issue, 'Filtered')

if not os.path.exists(path_github):
    os.makedirs(path_github)

if not os.path.exists(path_gitlab):
    os.makedirs(path_gitlab)

if not os.path.exists(path_labeling):
    os.makedirs(path_labeling)

if not os.path.exists(path_github_repo):
    os.makedirs(path_github_repo)

if not os.path.exists(path_gitlab_repo):
    os.makedirs(path_gitlab_repo)

if not os.path.exists(path_github_issue):
    os.makedirs(path_github_issue)

if not os.path.exists(path_gitlab_issue):
    os.makedirs(path_gitlab_issue)

if not os.path.exists(path_github_issue_raw):
    os.makedirs(path_github_issue_raw)

if not os.path.exists(path_gitlab_issue_raw):
    os.makedirs(path_gitlab_issue_raw)

if not os.path.exists(path_github_issue_filtered):
    os.makedirs(path_github_issue_filtered)

if not os.path.exists(path_gitlab_issue_filtered):
    os.makedirs(path_gitlab_issue_filtered)

In [3]:
tools_repo = {
    'Aim': 'aimhubio/aim',
    'Amazon SageMaker': 'aws/sagemaker-python-sdk',
    'Azure Machine Learning': 'Azure/azure-sdk-for-python',
    'ClearML': 'allegroai/clearml',
    'Codalab': 'codalab/codalab-worksheets',
    'DVC': 'iterative/dvc',
    'Determined': 'determined-ai/determined',
    'Domino': 'dominodatalab/python-domino',
    'Guild AI': 'guildai/guildai',
    'Kedro': 'kedro-org/kedro',
    'MLflow': 'mlflow/mlflow',
    'MLRun': 'mlrun/mlrun',
    'ModelDB': 'VertaAI/modeldb',
    'Neptune': 'neptune-ai/neptune-client',
    'Optuna': 'optuna/optuna',
    'Polyaxon': 'polyaxon/polyaxon',
    'Sacred': 'IDSIA/sacred',
    'Valohai': 'valohai/valohai-cli',
    'Weights & Biases': 'wandb/wandb'
}

tools_release_date = {
    'Amazon SageMaker': '2017-11-19',
    'Azure Machine Learning': '2015-02-18',
    'cnvrg.io': '2020-03-31',
    'Comet': '2017-01-01',
    'Iterative Studio': '2021-05-12',
    'Polyaxon': '2018-10-16',
    'SigOpt': '2014-11-01',
    'Vertex AI': '2019-03-01'
}

tools_link = {
    'cnvrg.io': 'https://github.com/cnvrg',
    'Comet': 'https://github.com/comet-ml',
    'Iterative Studio': 'https://studio.iterative.ai',
    'SigOpt': 'https://github.com/sigopt',
    'Vertex AI': 'https://cloud.google.com/vertex-ai'
}

tools_keywords = {
    'Aim': ['aim'],
    'Amazon SageMaker': ['amazon sagemaker', 'aws sagemaker', 'sagemaker'],
    'Azure Machine Learning': ['microsoft azure machine learning', 'azure machine learning', 'microsoft azure ml', 'microsoft azureml', 'azure ml', 'azureml'],
    'ClearML': ['clearml'],
    'cnvrg.io': ['cnvrg'],
    'Codalab': ['codalab'],
    'Comet': ['comet'],
    'Determined': ['determined'],
    'Domino': ['domino'],
    'DVC': ['dvc'],
    'Guild AI': ['guild'],
    'Kedro': ['kedro'],
    'MLflow': ['mlflow'],
    'MLRun': ['mlrun'],
    'ModelDB': ['modeldb'],
    'Neptune': ['neptune'],
    'Optuna': ['optuna'],
    'Polyaxon': ['polyaxon'],
    'Sacred': ['sacred'],
    'SigOpt': ['sigopt'],
    'Valohai': ['valohai'],
    'Vertex AI': ['google vertex ai', 'vertex ai'],
    'Weights & Biases': ['weights & biases', 'weights&biases', 'W & B', 'W&B', 'weights and biases', 'wandb']
}

issue_labels = {
    'bug',
    'error',
    'invalid',
    'looking into it',
    'waiting feedback',
}


In [5]:
import openai

openai.api_key = os.getenv('OPENAI_API_KEY')

In [4]:
from Scrape.GHMiner import GitHubMiner
from Scrape.GLMiner import GitLabMiner

github_miner = GitHubMiner(private_token=os.getenv('GITHUB_TOKEN'))
gitlab_miner = GitLabMiner(private_token=os.getenv('GITLAB_TOKEN'))

In [17]:
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = 'all'

In [7]:
tools_data = pd.DataFrame()

# scrape open-source asset-management tools
for tool_name, tool_repo in tools_repo.items():
    if tool_name in tools_release_date:
        tool_data = github_miner.scrape_repo(repo_name=tool_repo, real_name=tool_name, release_time=pd.to_datetime(tools_release_date[tool_name]))
    else:
        tool_data = github_miner.scrape_repo(repo_name=tool_repo, real_name=tool_name)

    if not tool_data.empty:
        tools_data = pd.concat([tools_data, tool_data], ignore_index=True)

# add closed-source asset-management tools
for tool_name in tools_link.keys():
    tool_data = {
        'Name': tool_name,
        'Link': tools_link[tool_name],
        'First Release Date': pd.to_datetime(tools_release_date[tool_name])
    }
    tool_data = pd.DataFrame([tool_data])
    tools_data = pd.concat([tools_data, tool_data], ignore_index=True)

tools_data.to_json(os.path.join(path_dataset, 'Tools.json'),
                   indent=4, orient='records')


In [11]:
import json

dependents = pd.DataFrame()

# collect dependents for tools with coding patterns
for tool_name in tools_keywords.keys():
    github_dependents = []
    gitlab_dependents = []

    # collect Github dependents
    file_name = os.path.join(path_github_repo, tool_name + '.json')
    if os.path.exists(file_name):
        with open(file_name, encoding='utf8') as json_file:
            json_data = json.load(json_file)
            # either search by sourcegraph
            if 'Results' in json_data:
                for repo_file in json_data['Results']:
                    # file name match pattern
                    if 'FileMatch' == repo_file['__typename'] and repo_file['repository']['name'].startswith('github'):
                        repo_name = repo_file['repository']['name'].removeprefix(
                            'github.com/')
                        github_dependents.append(repo_name)
                    # code usage match pattern
                    elif 'Repository' == repo_file['__typename'] and repo_file['name'].startswith('github'):
                        repo_name = repo_file['name'].removeprefix(
                            'github.com/')
                        github_dependents.append(repo_name)
            # or search by dependent graph
            elif 'all_public_dependent_repos' in json_data:
                for repo_file in json_data['all_public_dependent_repos']:
                    github_dependents.append(repo_file['name'])

    # collect Gitlab dependents
    file_name = os.path.join(path_gitlab_repo, tool_name + '.json')
    if os.path.exists(file_name):
        with open(file_name, encoding='utf8') as json_file:
            json_data = json.load(json_file)
            # search by sourcegraph exclusively
            for repo_file in json_data['Results']:
                # file name match pattern
                if 'FileMatch' == repo_file['__typename'] and repo_file['repository']['name'].startswith('gitlab'):
                    repo_name = repo_file['repository']['name'].removeprefix(
                        'gitlab.com/')
                    gitlab_dependents.append(repo_name)
                # code usage match pattern
                elif 'Repository' == repo_file['__typename'] and repo_file['name'].startswith('gitlab'):
                    repo_name = repo_file['name'].removeprefix('gitlab.com/')
                    gitlab_dependents.append(repo_name)

    # remove tool repo from dependents if any
    if tool_name in tools_repo and tools_repo[tool_name] in github_dependents:
        github_dependents.remove(tools_repo[tool_name])

    # no need to add tools without dependents
    if not len(github_dependents) and not len(gitlab_dependents):
        continue

    dependent = {
        'Tool': tool_name,
        'GitHub Dependents': github_dependents,
        'GitLab Dependents': gitlab_dependents
    }

    dependents = pd.concat(
        [dependents, pd.DataFrame([dependent])], ignore_index=True)

dependents.to_json(os.path.join(
    path_dataset, 'Dependents.json'), indent=4, orient='records')


In [7]:
github_repos = set()
gitlab_repos = set()

for index, row in dependents.iterrows():
    github_repos = github_repos.union(row['GitHub Dependents'])
    gitlab_repos = gitlab_repos.union(row['GitLab Dependents'])

print(len(github_repos))
print(len(gitlab_repos))

22552
9


In [5]:
dependents_summary = pd.DataFrame(
    columns=['Tool', '#GitHub Dependents', '#GitLab Dependents'])
for index, row in dependents.iterrows():
    dependent_data = {
        'Tool': row['Tool'],
        '#GitHub Dependents': len(row['GitHub Dependents']),
        '#GitLab Dependents': len(row['GitLab Dependents'])
    }
    dependent_data = pd.DataFrame([dependent_data])
    dependents_summary = pd.concat(
        [dependents_summary, dependent_data], ignore_index=True)
dependents_summary

Unnamed: 0,Tool,#GitHub Dependents,#GitLab Dependents
0,Aim,92,1
1,Amazon SageMaker,931,3
2,Azure Machine Learning,689,0
3,ClearML,303,0
4,Codalab,30,0
5,Comet,480,0
6,Determined,44,0
7,Domino,2,0
8,DVC,4229,0
9,Guild AI,53,4


In [6]:
df_dependents = pd.read_json(os.path.join(path_dataset, 'Dependents.json'))

In [None]:
# scrape issues of Gitlab dependents for each tool

for index, row in df_dependents.iterrows():
    print(f'{index}: {row["Tool"]}')
    repos = []
    for repo in row['GitLab Dependents'].tolist():
        repo_data, error_data = gitlab_miner.scrape_repo(repo)
        if (not repo_data.empty) and (repo_data['Repo Created Date'].values[0] > repo_data['First Release Date'].values[0]) and repo_data['#Issue'].values[0]:
            repos.append(repo)
    issues = gitlab_miner.scrape_issue_list(repos)
    if not issues.empty:
        issues.to_json(os.path.join(path_gitlab_issue_raw, f'{row["Tool"]}.json'), indent=4, orient='records')

In [40]:
# exclude Gitlab issues that are not related to each tool

valid_issues_all = pd.DataFrame()

for file_name in glob.glob(os.path.join(path_gitlab_issue_raw, '*.json')):
    issues = pd.read_json(file_name)
    valid_issues = pd.DataFrame()
    tool_name = os.path.split(file_name)[1].split('.')[0]

    for index, issue in issues.iterrows():
        for keyword in tools_keywords[tool_name]:
            if keyword in issue['Issue_title'].lower():
                valid_issue = pd.DataFrame([issue])
                valid_issues = pd.concat(
                    [valid_issues, valid_issue], ignore_index=True)
                break
    if not valid_issues.empty:
        valid_issues['Tool'] = tool_name
        valid_issues_all = pd.concat(
            [valid_issues_all, valid_issues], ignore_index=True)
        
len(valid_issues_all)

5

In [41]:
final = set()
for _, row in valid_issues_all['Issue_label'].map(set).items():
    final = final.union(row)
final

{'arena::security', 'improvement', 'learn', 'product::sorts', 'type::bug'}

In [42]:
# filter out issues that are not related to challenges

valid_issues_filtered = []

for index, row in valid_issues_all.iterrows():
    if not row['Issue_title'].isascii():
        continue

    break_sign = False
    title = row['Issue_title'].lower()
    for label_repo in row['Issue_label']:
        for issue_label in issue_labels:
            if not break_sign and issue_label in label_repo.lower():
                valid_issues_filtered.append(row)
                break_sign = True
            elif not break_sign and issue_label in title:
                valid_issues_filtered.append(row)
                break_sign = True

valid_issues_filtered = pd.concat(valid_issues_filtered, axis=1).T
valid_issues_filtered.to_json(os.path.join(
    path_gitlab_issue_filtered, 'issues.json'), indent=4, orient='records')
len(valid_issues_filtered)

1

In [None]:
# scrape issues of Github dependents for each tool

for index, row in df_dependents.iterrows():
    print(f'{index}: {row["Tool"]}')
    repos = []
    for repo in row['GitHub Dependents'].tolist():
        repo_data, error_data = github_miner.scrape_repo(repo)
        if (not repo_data.empty) and (repo_data['Repo Created Date'].values[0] > repo_data['First Release Date'].values[0]) and repo_data['#Issue'].values[0]:
            repos.append(repo)
    issues = github_miner.scrape_issue_list(repos)
    if not issues.empty:
        issues.to_json(os.path.join(path_github_issue_raw, f'{row["Tool"]}.json'), indent=4, orient='records')

In [43]:
# exclude Github issues that are not related to each tool
valid_issues_all = pd.DataFrame()

for file_name in glob.glob(os.path.join(path_github_issue_raw, '*.json')):
    issues = pd.read_json(file_name)
    valid_issues = pd.DataFrame()
    tool_name = os.path.split(file_name)[1].split('.')[0]

    for index, issue in issues.iterrows():
        for keyword in tools_keywords[tool_name]:
            if keyword in issue['Issue_title'].lower():
                valid_issue = pd.DataFrame([issue])
                valid_issues = pd.concat(
                    [valid_issues, valid_issue], ignore_index=True)
                break
            
    if not valid_issues.empty:
        valid_issues['Tool'] = tool_name
        valid_issues_all = pd.concat(
            [valid_issues_all, valid_issues], ignore_index=True)
        
len(valid_issues_all)

2714

In [45]:
final = set()
for _, row in valid_issues_all['Issue_label'].map(set).items():
    final = final.union(row)
final

{'"experiments"',
 '0.4.6',
 '1.1',
 '1.4',
 '1.6',
 '1.7',
 '2.0',
 '3 - Quality of Life',
 '3rd party',
 '3rd party update',
 ':bridge_at_night:  Bridge',
 ':bug: bug',
 ':rotating_light:',
 '? - Needs Triage',
 'A: example-dvc-experiments',
 'A: example-get-started',
 'ADO',
 'AI\u202fFrameworks/ONNX',
 'AML Compute Instance',
 'API',
 'API & Doc',
 'Auto\u202fML',
 'BF',
 'Cloud',
 'Community',
 'Compute',
 'Core UI',
 'DRL',
 'Data Labeling',
 'Data4ML',
 'Data\u202fDrift',
 'Data\u202fPrep\u202fServices',
 'Documentation',
 'ERRATA_CANDIDATE',
 'Enhancement',
 'Environments',
 'Evaluation',
 'Experimentation UI',
 'FAQ',
 'Feature - Medium Priority',
 'HIGH',
 'HPO',
 'Hyperdrive',
 'Important',
 'In the roadmap',
 'Inf1',
 'Inference',
 'Ingestion',
 'Issue: Bug Report 🐞',
 'Issue: Feature Request',
 'L',
 'LOE: S',
 'Localized',
 'MLOps',
 'NLP',
 'NUM',
 'Needs Triage',
 'Not related to PyCaret',
 'Notebook',
 'Optional',
 'P0',
 'P1',
 'P2',
 'Pipelines',
 'Priority 1',
 'Rei

In [46]:
# filter out issues that are not related to challenges
valid_issues_filtered = []

for index, row in valid_issues_all.iterrows():
    if not row['Issue_title'].isascii():
        continue

    break_sign = False
    title = row['Issue_title'].lower()
    for label_repo in row['Issue_label']:
        for issue_label in issue_labels:
            if not break_sign and issue_label in label_repo.lower():
                valid_issues_filtered.append(row)
                break_sign = True
            elif not break_sign and issue_label in title:
                valid_issues_filtered.append(row)
                break_sign = True

valid_issues_filtered = pd.concat(valid_issues_filtered, axis=1).T
valid_issues_filtered.to_json(os.path.join(
    path_github_issue_filtered, 'issues.json'), indent=4, orient='records')
len(valid_issues_filtered)

407

In [47]:
# combine Github and Gitlab issues

df_issue_github = pd.read_json(os.path.join(
    path_github_issue_filtered, 'issues.json'))
df_issue_gitlab = pd.read_json(os.path.join(
    path_gitlab_issue_filtered, 'issues.json'))

df_issue_github['Platform'] = 'Github'
df_issue_gitlab['Platform'] = 'Gitlab'

df_issues = pd.concat([df_issue_github, df_issue_gitlab], ignore_index=True)

del df_issues['Issue_label']

df_issues.to_json(os.path.join(
    path_labeling, 'original.json'), indent=4, orient='records')


In [50]:
# add potential field to issues for later filling

df_issues = pd.read_json(os.path.join(path_labeling, 'original.json'))

df_issues['Issue_original_content'] = np.nan
df_issues['Issue_preprocessed_content'] = np.nan
df_issues['Issue_gpt_summary_original'] = np.nan
df_issues['Issue_gpt_summary'] = np.nan

df_issues['Issue_score_count'] = df_issues['Issue_upvote_count'] - df_issues['Issue_downvote_count']
df_issues['Issue_body'] = df_issues['Issue_body'].fillna('')

del df_issues['Issue_upvote_count']
del df_issues['Issue_downvote_count']

df_issues.to_json(os.path.join(path_labeling, 'issues_new.json'), indent=4, orient='records')


In [5]:
# content preprocessing patterns

from gensim.parsing.preprocessing import preprocess_string
import re
    
regex = r"(ftp|https?:\/\/[^\s]+)|([a-z]*([a-z])\3[a-z]*)|(<.*?>)|({.*?})|((!)?\[.*?\])|(\(.*?\))|(\`{3}.+?\`{3})|(\`{2}.+?\`{2})|(\`{1}.+?\`{1})|([^\s]*[<=>]=[^\s]+)|(@[^\s]+)|([^\s]*\\[^\s]+)|([^\s]+\/[^\s]+)|([^\s]+\.[^\s]+)|([^\s]+-[^\s]+)|([^\s]+_[^\s]+)|(_+[^\s]+_*)|(_*[^\s]+_+)|([0-9\|\-\r\n\t\"\-#*=~:{}\(\)\[\]<>]+)"

def preprocess_text(text, remove_code=False):
    text = text.lower().encode('ascii', errors='ignore').decode('ascii')

    for tool_keywords in tools_keywords.values():
        for tool_keyword in tool_keywords:
            if tool_keyword in text:
                text = text.replace(tool_keyword, '')

    text = re.sub(regex, ' ', text, 0, re.DOTALL) if remove_code else text
            
    text = preprocess_string(text)
    text = ' '.join(text)
    return text

In [55]:
# create prompt for gpt model

import random

prompt_issue = 'Your task is to provide a brief and accurate summary of the issue post that the user has encountered based on the given text. Your summary should be concise, highlighting only the most important details related to the challenges faced by the user.\n###'
# prompt_fix = 'Given a challenge-discussion pair, please extract any possible solutions mentioned in the discussion and provide a brief summary of them. If no solution is mentioned, please indicate that there are no solutions provided.\n###'

def retry_with_backoff(fn, retries=2, backoff_in_seconds=1, *args, **kwargs):
    x = 0

    if args is None:
        args = []
    if kwargs is None:
        kwargs = {}

    while True:
        try:
            return fn(*args, **kwargs)
        except:
            if x == retries:
                raise

            sleep = backoff_in_seconds * 2 ** x + random.uniform(0, 1)
            time.sleep(sleep)
            x += 1

In [6]:
# Experiment 1

df_issues = pd.read_json(os.path.join(
    path_labeling, 'issues.json'))

for index, row in df_issues.iterrows():
    issue = preprocess_text(row['Issue_title']) + ' ' + preprocess_text(str(row['Issue_body']))
    if len(issue.split()) < 6:
        df_issues.drop(index, inplace=True)
        print(issue)
    else:
        df_issues.at[index, 'Issue_original_content'] = issue

df_issues.to_json(os.path.join(path_labeling,
                  'issues.json'), indent=4, orient='records')

In [57]:
df_issues = pd.read_json(os.path.join(path_labeling, 'issues.json'))
df_issues.groupby('Platform').count()['Issue_title'].reset_index()

Unnamed: 0,Platform,Issue_title
0,Github,400
1,Gitlab,1


In [58]:
df_issues = pd.read_json(os.path.join(path_labeling, 'issues.json'))
df_issues.groupby('Tool').count()['Issue_title'].reset_index()

Unnamed: 0,Tool,Issue_title
0,Amazon SageMaker,78
1,Azure Machine Learning,74
2,ClearML,3
3,Comet,22
4,DVC,29
5,Kedro,19
6,MLflow,111
7,Neptune,14
8,Optuna,1
9,SigOpt,4


In [61]:
# Experiment 2

df_issues = pd.read_json(os.path.join(
    path_labeling, 'issues.json'))

for index, row in df_issues.iterrows():
    if index % 50 == 49:
        print(f'persisting on question {index}')
        df_issues.to_json(os.path.join(
            path_labeling, 'issues.json'), indent=4, orient='records')

    if pd.notna(row['Issue_gpt_summary_original']):
        continue

    try:
        prompt = prompt_issue + 'Title: ' + row['Issue_title'] + ' Body: ' + row['Issue_body'] + '###\n'
        response = retry_with_backoff(
            openai.ChatCompletion.create,
            model='gpt-4-32k',
            messages=[
                {"role": "system", "content": "You are an accurate summarizer."},
                {"role": "user", "content": prompt},
            ],
            temperature=0,
            max_tokens=150,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            timeout=10,
            stream=False
        )
        df_issues.at[index, 'Issue_gpt_summary_original'] = response['choices'][0]['message']['content'].strip()
        
    except Exception as e:
        # output unsuccesful requests
        print(f'{e} on issue {row["Issue_link"]}')

    time.sleep(1)

df_issues.to_json(os.path.join(
    path_labeling, 'issues.json'), indent=4, orient='records')


In [62]:
assert df_issues.shape[0] == df_issues[df_issues['Issue_gpt_summary_original'].str.len() > 0].shape[0]


In [7]:
# Experiment 2

df_issues = pd.read_json(os.path.join(
    path_labeling, 'issues.json'))

for index, row in df_issues.iterrows():
    df_issues.at[index, 'Issue_gpt_summary'] = preprocess_text(row['Issue_gpt_summary_original'])
    
df_issues.to_json(os.path.join(
    path_labeling, 'issues.json'), indent=4, orient='records')

In [8]:
# Experiment 3

df_issues = pd.read_json(os.path.join(
    path_labeling, 'issues.json'))

for index, row in df_issues.iterrows():
    content = preprocess_text(row['Issue_title'], remove_code=True) + ' ' + preprocess_text(str(row['Issue_body']), remove_code=True)
    df_issues.at[index, 'Issue_preprocessed_content'] = content

df_issues.to_json(os.path.join(
    path_labeling, 'issues.json'), indent=4, orient='records')

In [15]:
# # Experiment 4

# df_issues = pd.read_json(os.path.join(
#     path_labeling, 'issues.json'))

# for index, row in df_issues.iterrows():
#     if pd.notna(row['Issue_closed_time']) and row['Comment_body']:
#         df_issues.at[index, 'Answer_original_content'] = preprocess_text(row['Comment_body'])

# df_issues.to_json(os.path.join(path_labeling,
#                   'issues.json'), indent=4, orient='records')

In [23]:
# # Experiment 5

# df_issues = pd.read_json(os.path.join(
#     path_labeling, 'issues.json'))

# for index, row in df_issues.iterrows():
#     if index % 50 == 49:
#         print(f'persisting on question {index}')
#         df_issues.to_json(os.path.join(
#             path_labeling, 'issues.json'), indent=4, orient='records')

#     if pd.isna(row['Issue_closed_time']) or not row['Comment_body'] or pd.notna(row['Answer_gpt_summary_original']):
#         continue

#     try:
#         prompt = prompt_fix + 'Challenge: ' + row['Issue_gpt_summary_original'] + ' Discussion: ' + row['Comment_body'] + '###\n'
#         response = retry_with_backoff(
#             openai.ChatCompletion.create,
#             model='gpt-4-32k',
#             messages=[
#                 {"role": "system", "content": "You are an accurate summarizer."},
#                 {"role": "user", "content": prompt},
#             ],
#             temperature=0,
#             max_tokens=150,
#             top_p=1,
#             frequency_penalty=0,
#             presence_penalty=0,
#             timeout=10,
#             stream=False
#         )
#         content = response['choices'][0]['message']['content'].strip()
#         df_issues.at[index, 'Answer_gpt_summary_original'] = content
#         df_issues.at[index, 'Answer_gpt_summary'] = preprocess_text(content)
#     except Exception as e:
#         # output unsuccesful requests
#         print(f'{e} on issue {row["Issue_link"]}')

#     time.sleep(1)

# df_issues.to_json(os.path.join(
#     path_labeling, 'issues.json'), indent=4, orient='records')


persisting on question 49
persisting on question 99
persisting on question 149
persisting on question 199
persisting on question 249
persisting on question 299
persisting on question 349
persisting on question 399


In [19]:
# # Experiment 6

# df_issues = pd.read_json(os.path.join(
#     path_labeling, 'issues.json'))

# for index, row in df_issues.iterrows():
#     if pd.notna(row['Issue_closed_time']) and row['Comment_body']:
#         df_issues.at[index, 'Answer_preprocessed_content'] = preprocess_text(row['Comment_body'], remove_code=True)

# df_issues.to_json(os.path.join(
#     path_labeling, 'issues.json'), indent=4, orient='records')

In [15]:
# sample size is based on the recommendation from https://www.calculator.net/sample-size-calculator.html

sample_size = 197

df_issues = pd.read_json(os.path.join(
    path_labeling, 'issues.json'))

df_sample = df_issues[df_issues['Issue_closed_time'].notna()].sample(n=sample_size, random_state=42)

df_sample.to_json(os.path.join(
    path_labeling, 'sample.json'), indent=4, orient='records')


In [None]:
# from github import Github
# import time

# token = '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'
# g = Github(login_or_token=token)

# df = pd.read_json(os.path.join(path_labeling, 'issues+.json'))

# df['Issue_self_closed'] = np.nan

# for index, row in df.iterrows():
#     if row['Platform'] == 'Github' and pd.notna(row['Issue_closed_time']):
#         try:
#             link = row['Issue_link']
#             components = link.split('/')
#             repo = g.get_repo(f'{components[3]}/{components[4]}')
#             issue_number = int(components[-1])
#             issue = repo.get_issue(number=issue_number)
#             df.at[index, 'Issue_self_closed'] = issue.closed_by.id == issue.user.id
#             # time.sleep(3)
#         except Exception as e:
#             print(f'{e} on issue {row["Issue_link"]}')

# df.to_json(os.path.join(path_labeling, 'issues+.json'), indent=4, orient='records')

In [7]:
# from gitlab import Gitlab

# token = 'glpat-SvwyWD6pbPNvbsBSvxdy'
# g = Gitlab(private_token=token)

# df = pd.read_json(os.path.join(path_labeling, 'issues.json'))

# for index, row in df.iterrows():
#     if row['Platform'] == 'Gitlab' and pd.notna(row['Issue_closed_time']):
#         link = row['Issue_link']
#         components = link.split('/')
#         repo = g.projects.get(id=f'{components[3]}/{components[4]}')
#         issue_number = int(components[-1])
#         issue = repo.issues.get(issue_number)
#         Issue_self_closed = issue.__getattr__("closed_by")["id"] == issue.author["id"]
