In [1]:
import pandas as pd
import numpy as np

import time
import os

In [2]:
path_dataset = '../Dataset'

path_github = os.path.join(path_dataset, 'GitHub')
path_gitlab = os.path.join(path_dataset, 'GitLab')
path_labeling = os.path.join(path_dataset, 'Labeling')

path_github_repo = os.path.join(path_github, 'Repo')
path_gitlab_repo = os.path.join(path_gitlab, 'Repo')
path_github_issue = os.path.join(path_github, 'Issue')
path_gitlab_issue = os.path.join(path_gitlab, 'Issue')

if not os.path.exists(path_github):
    os.makedirs(path_github)

if not os.path.exists(path_gitlab):
    os.makedirs(path_gitlab)

if not os.path.exists(path_labeling):
    os.makedirs(path_labeling)

if not os.path.exists(path_github_repo):
    os.makedirs(path_github_repo)

if not os.path.exists(path_gitlab_repo):
    os.makedirs(path_gitlab_repo)

if not os.path.exists(path_github_issue):
    os.makedirs(path_github_issue)

if not os.path.exists(path_gitlab_issue):
    os.makedirs(path_gitlab_issue)

In [96]:
# from subprocess import call

# call(["./Dependents_Lookup.sh"], cwd=path_github_repo, shell=True)

1

In [3]:
tools_repo = {
    'Aim': 'aimhubio/aim',
    'Amazon SageMaker': 'aws/sagemaker-python-sdk',
    'Azure Machine Learning': 'Azure/azure-sdk-for-python',
    'ClearML': 'allegroai/clearml',
    'Codalab': 'codalab/codalab-worksheets',
    'DVC': 'iterative/dvc',
    'Determined': 'determined-ai/determined',
    'Domino': 'dominodatalab/python-domino',
    'Guild AI': 'guildai/guildai',
    'Kedro': 'kedro-org/kedro',
    'MLflow': 'mlflow/mlflow',
    'MLRun': 'mlrun/mlrun',
    'ModelDB': 'VertaAI/modeldb',
    'Neptune': 'neptune-ai/neptune-client',
    'Optuna': 'optuna/optuna',
    'Polyaxon': 'polyaxon/polyaxon',
    'Sacred': 'IDSIA/sacred',
    'Valohai': 'valohai/valohai-cli',
    'Weights & Biases': 'wandb/wandb'
}

tools_release_date = {
    'Amazon SageMaker': '2017-11-19',
    'Azure Machine Learning': '2015-02-18',
    'cnvrg.io': '2020-03-31',
    'Comet': '2017-01-01',
    'Iterative Studio': '2021-05-12',
    'Polyaxon': '2018-10-16',
    'SigOpt': '2014-11-01',
    'Vertex AI': '2019-03-01'
}

tools_link = {
    'cnvrg.io': 'https://github.com/cnvrg',
    'Comet': 'https://github.com/comet-ml',
    'Iterative Studio': 'https://studio.iterative.ai',
    'SigOpt': 'https://github.com/sigopt',
    'Vertex AI': 'https://cloud.google.com/vertex-ai'
}

tools_keywords = {
    'Aim': ['aim'],
    'Amazon SageMaker': ['amazon sagemaker', 'aws sagemaker', 'sagemaker'],
    'Azure Machine Learning': ['microsoft azure machine learning', 'azure machine learning', 'microsoft azure ml', 'microsoft azureml', 'azure ml', 'azureml'],
    'ClearML': ['clearml'],
    'cnvrg.io': ['cnvrg'],
    'Codalab': ['codalab'],
    'Comet': ['comet'],
    'Determined': ['determined'],
    'Domino': ['domino'],
    'DVC': ['dvc'],
    'Guild AI': ['guild'],
    'Kedro': ['kedro'],
    'MLflow': ['mlflow'],
    'MLRun': ['mlrun'],
    'ModelDB': ['modeldb'],
    'Neptune': ['neptune'],
    'Optuna': ['optuna'],
    'Polyaxon': ['polyaxon'],
    'Sacred': ['sacred'],
    'SigOpt': ['sigopt'],
    'Valohai': ['valohai'],
    'Vertex AI': ['google vertex ai', 'vertex ai'],
    'Weights & Biases': ['weights and biases', 'weights & biases', 'weights&biases', 'wandb', 'W & B', 'W&B']
}

issue_labels = {
    'bug',
    'error',
    'invalid',
}


In [65]:
import openai

openai.api_key = os.getenv('OPENAI_API_KEY')

In [4]:
from Scrape.GHMiner import GitHubMiner
from Scrape.GLMiner import GitLabMiner

github_miner = GitHubMiner(private_token=os.getenv('GITHUB_TOKEN'))
gitlab_miner = GitLabMiner(private_token=os.getenv('GITLAB_TOKEN'))

In [17]:
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = 'all'

In [99]:
tools_data = pd.DataFrame()

# scrape open-source asset-management tools
for tool_name, tool_repo in tools_repo.items():
    if tool_name in tools_release_date:
        tool_data = github_miner.scrape_repo(repo_name=tool_repo, real_name=tool_name, release_time=pd.to_datetime(tools_release_date[tool_name]))
    else:
        tool_data = github_miner.scrape_repo(repo_name=tool_repo, real_name=tool_name)

    if not tool_data.empty:
        tools_data = pd.concat([tools_data, tool_data], ignore_index=True)

# add closed-source asset-management tools
for tool_name in tools_link.keys():
    tool_data = {
        'Name': tool_name,
        'Link': tools_link[tool_name],
        'First Release Date': pd.to_datetime(tools_release_date[tool_name])
    }
    tool_data = pd.DataFrame([tool_data])
    tools_data = pd.concat([tools_data, tool_data], ignore_index=True)

tools_data.to_json(os.path.join(path_dataset, 'Tools.json'),
                   indent=4, orient='records')


In [100]:
import json

dependents = pd.DataFrame()

# collect dependents for tools with coding patterns
for tool_name in tools_keywords.keys():
    github_dependents = []
    gitlab_dependents = []

    # collect Github dependents
    file_name = os.path.join(path_github_repo, tool_name + '.json')
    if os.path.exists(file_name):
        with open(file_name, encoding='utf8') as json_file:
            json_data = json.load(json_file)
            # either search by sourcegraph
            if 'Results' in json_data:
                for repo_file in json_data['Results']:
                    # file name match pattern
                    if 'FileMatch' == repo_file['__typename'] and repo_file['repository']['name'].startswith('github'):
                        repo_name = repo_file['repository']['name'].removeprefix(
                            'github.com/')
                        github_dependents.append(repo_name)
                    # code usage match pattern
                    elif 'Repository' == repo_file['__typename'] and repo_file['name'].startswith('github'):
                        repo_name = repo_file['name'].removeprefix(
                            'github.com/')
                        github_dependents.append(repo_name)
            # or search by dependent graph
            elif 'all_public_dependent_repos' in json_data:
                for repo_file in json_data['all_public_dependent_repos']:
                    github_dependents.append(repo_file['name'])

    # collect Gitlab dependents
    file_name = os.path.join(path_gitlab_repo, tool_name + '.json')
    if os.path.exists(file_name):
        with open(file_name, encoding='utf8') as json_file:
            json_data = json.load(json_file)
            # search by sourcegraph exclusively
            for repo_file in json_data['Results']:
                # file name match pattern
                if 'FileMatch' == repo_file['__typename'] and repo_file['repository']['name'].startswith('gitlab'):
                    repo_name = repo_file['repository']['name'].removeprefix(
                        'gitlab.com/')
                    gitlab_dependents.append(repo_name)
                # code usage match pattern
                elif 'Repository' == repo_file['__typename'] and repo_file['name'].startswith('gitlab'):
                    repo_name = repo_file['name'].removeprefix('gitlab.com/')
                    gitlab_dependents.append(repo_name)

    # remove tool repo from dependents if any
    if tool_name in tools_repo and tools_repo[tool_name] in github_dependents:
        github_dependents.remove(tools_repo[tool_name])

    # no need to add tools without dependents
    if not len(github_dependents) and not len(gitlab_dependents):
        continue

    dependent = {
        'Tool': tool_name,
        'GitHub Dependents': github_dependents,
        'GitLab Dependents': gitlab_dependents
    }

    dependents = pd.concat(
        [dependents, pd.DataFrame([dependent])], ignore_index=True)

dependents.to_json(os.path.join(
    path_dataset, 'Dependents.json'), indent=4, orient='records')


In [101]:
github_repos = set()
gitlab_repos = set()

for index, row in dependents.iterrows():
    github_repos = github_repos.union(row['GitHub Dependents'])
    gitlab_repos = gitlab_repos.union(row['GitLab Dependents'])

print(len(github_repos))
print(len(gitlab_repos))

37786
10


In [104]:
dependents_summary = pd.DataFrame(
    columns=['Tool', '#GitHub Dependents', '#GitLab Dependents'])
for index, row in dependents.iterrows():
    dependent_data = {
        'Tool': row['Tool'],
        '#GitHub Dependents': len(row['GitHub Dependents']),
        '#GitLab Dependents': len(row['GitLab Dependents'])
    }
    dependent_data = pd.DataFrame([dependent_data])
    dependents_summary = pd.concat(
        [dependents_summary, dependent_data], ignore_index=True)
dependents_summary

Unnamed: 0,Tool,#GitHub Dependents,#GitLab Dependents
0,Aim,189,1
1,Amazon SageMaker,1114,3
2,Azure Machine Learning,826,0
3,ClearML,498,0
4,Codalab,40,0
5,Comet,678,0
6,Determined,36,0
7,Domino,1,0
8,DVC,6099,0
9,Guild AI,67,4


In [29]:
df_dependents = pd.read_json(os.path.join(path_dataset, 'Dependents.json'))
df_tool = pd.read_json(os.path.join(path_dataset, 'Tools.json'))
tool_release_dates = pd.Series(pd.to_datetime(df_tool['First Release Date'].values), index=df_tool['Name']).to_dict()

In [32]:
# scrape issues of Gitlab dependents for each tool

issues_raw = pd.DataFrame()

for index, row in df_dependents.iterrows():
    print(row['Tool'])
    for repo in row['GitLab Dependents']:
        print(repo)
        repo_data = gitlab_miner.scrape_repo(repo)
        if (not repo_data.empty) and (repo_data['Repo Created Date'].values[0] > tool_release_dates[row['Tool']]) and repo_data['#Issue'].values[0]:
            issues = gitlab_miner.scrape_issue(repo)
            issues['Tool'] = row['Tool']
            issues_raw = pd.concat([issues_raw, issues], ignore_index=True)
            issues_raw.to_json(os.path.join(path_gitlab_issue, 'raw.json'), indent=4, orient='records')
    
issues_raw.to_json(os.path.join(path_gitlab_issue, 'raw.json'), indent=4, orient='records')

Aim
lu-ci/sigma/apex-sigma
Amazon SageMaker
fluidattacks/universe


TypeError: 'NoneType' object is not subscriptable

In [None]:
# scrape issues of Github dependents for each tool

issues_raw = pd.DataFrame()

for index, row in df_dependents.iterrows():
    print(row['Tool'])
    for repo in row['GitHub Dependents']:
        print(repo)
        repo_data = gitlab_miner.scrape_repo(repo)
        if (not repo_data.empty) and (repo_data['Repo Created Date'].values[0] > tool_release_dates[row['Tool']]) and repo_data['#Issue'].values[0]:
            issues = gitlab_miner.scrape_issue(repo)
            issues['Tool'] = row['Tool']
            issues_raw = pd.concat([issues_raw, issues], ignore_index=True)
            issues_raw.to_json(os.path.join(path_github_issue, 'raw.json'), indent=4, orient='records')
    
issues_raw.to_json(os.path.join(path_github_issue, 'raw.json'), indent=4, orient='records')

In [None]:
# exclude issues that are not related to each tool

issues_gitlab = pd.read_json(os.path.join(path_gitlab_issue, 'raw.json'))
issues_github = pd.read_json(os.path.join(path_github_issue, 'raw.json'))

for index, row in issues_gitlab.iterrows():
    for keyword in tools_keywords[row['Tool']]:
        if keyword in row['Issue_title'].lower():
            break
    else:
        issues_gitlab.drop(index, inplace=True)

for index, row in issues_github.iterrows():
    for keyword in tools_keywords[row['Tool']]:
        if keyword in row['Issue_title'].lower():
            break
    else:
        issues_github.drop(index, inplace=True)

issues_github['Platform'] = 'Github'
issues_gitlab['Platform'] = 'Gitlab'

issues_raw = pd.concat([issues_gitlab, issues_github], ignore_index=True)        
len(issues_raw)

2719

In [None]:
final = set()
for _, row in issues_raw['Issue_label'].map(set).items():
    final = final.union(row)
final

{'"experiments"',
 '0.4.6',
 '1.1',
 '1.4',
 '1.6',
 '1.7',
 '2.0',
 '3 - Quality of Life',
 '3rd party',
 '3rd party update',
 ':bridge_at_night:  Bridge',
 ':bug: bug',
 ':rotating_light:',
 '? - Needs Triage',
 'A: example-dvc-experiments',
 'A: example-get-started',
 'ADO',
 'AI\u202fFrameworks/ONNX',
 'AML Compute Instance',
 'API',
 'API & Doc',
 'Auto\u202fML',
 'BF',
 'Cloud',
 'Community',
 'Compute',
 'Core UI',
 'DRL',
 'Data Labeling',
 'Data4ML',
 'Data\u202fDrift',
 'Data\u202fPrep\u202fServices',
 'Documentation',
 'ERRATA_CANDIDATE',
 'Enhancement',
 'Environments',
 'Evaluation',
 'Experimentation UI',
 'FAQ',
 'Feature - Medium Priority',
 'HIGH',
 'HPO',
 'Hyperdrive',
 'Important',
 'In the roadmap',
 'Inf1',
 'Inference',
 'Ingestion',
 'Issue: Bug Report üêû',
 'Issue: Feature Request',
 'L',
 'LOE: S',
 'Localized',
 'MLOps',
 'NLP',
 'NUM',
 'Needs Triage',
 'Not related to PyCaret',
 'Notebook',
 'Optional',
 'P0',
 'P1',
 'P2',
 'Pipelines',
 'Priority 1',
 '

In [None]:
Áúã‰∏Ä‰∏ãissue_labelÈúÄ‰∏çÈúÄË¶ÅÊñ∞Â¢ûÔºÅÔºü

In [41]:
# filter out issues that are not related to challenges

for index, row in issues_raw.iterrows():
    if not row['Issue_title'].isascii():
        issues_raw.drop(index, inplace=True)
        continue
    
    if not row['Issue_label']:
        continue
    else:
        for label in row['Issue_label']:
            for issue_label in issue_labels:
                if issue_label in label.lower():
                    break
            else:
                continue
            break
        else:
            issues_raw.drop(index, inplace=True)

del issues_raw['Issue_label']

issues_raw.to_json(os.path.join(path_labeling, 'original.json'), indent=4, orient='records')
len(issues_raw)

1657

In [55]:
# add potential field to issues for later filling

df_issues = pd.read_json(os.path.join(path_labeling, 'original.json'))

df_issues['Issue_original_content'] = np.nan
df_issues['Issue_preprocessed_content'] = np.nan
df_issues['Issue_gpt_summary_original'] = np.nan
df_issues['Issue_gpt_summary'] = np.nan

df_issues['Issue_score_count'] = df_issues['Issue_upvote_count'] - df_issues['Issue_downvote_count']
df_issues['Comment_score_count'] = df_issues['Comment_upvote_count'] - df_issues['Comment_downvote_count']
df_issues['Issue_body'] = df_issues['Issue_body'].fillna('')

del df_issues['Issue_upvote_count']
del df_issues['Issue_downvote_count']
del df_issues['Comment_upvote_count']
del df_issues['Comment_downvote_count']

df_issues = df_issues.reindex(sorted(df_issues.columns), axis=1)
df_issues.to_json(os.path.join(path_labeling, 'issues.json'), indent=4, orient='records')


In [56]:

df_old = pd.read_json(os.path.join(path_labeling, 'issues.json'))
df_issues = pd.read_json(os.path.join(path_labeling, 'issues+.json'))

for index, row in df_issues.iterrows():
    for i2, r2 in df_old.iterrows():
        if row['Issue_link'] == r2['Issue_link']:
            df_issues.at[index, 'Issue_original_content'] = r2['Issue_original_content']
            df_issues.at[index, 'Issue_preprocessed_content'] = r2['Issue_preprocessed_content']
            df_issues.at[index, 'Issue_gpt_summary_original'] = r2['Issue_gpt_summary_original']
            df_issues.at[index, 'Issue_gpt_summary'] = r2['Issue_gpt_summary']
            break

df_issues.to_json(os.path.join(path_labeling, 'issues.json'), indent=4, orient='records')

In [57]:
# content preprocessing patterns

from gensim.parsing.preprocessing import preprocess_string
import re
    
regex = r'''([a-z]*([a-z])\2[a-z]*)|(<.*?>)|({.*?})|((!)?\[.*?\])|(\(.*?\))|(\`{3}.+?\`{3})|(\`{2}.+?\`{2})|(\`{1}.+?\`{1})|([^\s]*[<=>]=[^\s]+)|(@[^\s]+)|([^\s]*\\[^\s]+)|([^\s]+\/[^\s]+)|([^\s]+\.[^\s]+)|([^\s]+-[^\s]+)|([^\s]+_[^\s]+)|(_+[^\s]+_*)|(_*[^\s]+_+)|([0-9\|\-\r\n\t\"\-#*=~:{}\(\)\[\]<>]+)'''

def preprocess_text(text, remove_code=False):
    text = text.lower().encode('ascii', errors='ignore').decode('ascii')

    for tool_keywords in tools_keywords.values():
        for tool_keyword in tool_keywords:
            if tool_keyword in text:
                text = text.replace(tool_keyword, '')

    text = re.sub(regex, ' ', text, 0, re.DOTALL) if remove_code else text
            
    text = preprocess_string(text)
    text = ' '.join(text)
    return text

In [58]:
# Experiment 1

df_issues = pd.read_json(os.path.join(
    path_labeling, 'issues.json'))

for index, row in df_issues.iterrows():
    issue = preprocess_text(row['Issue_title']) + ' ' + preprocess_text(str(row['Issue_body']))
    if len(issue.split()) < 6:
        df_issues.drop(index, inplace=True)
        print(issue)
    else:
        df_issues.at[index, 'Issue_original_content'] = issue

df_issues.to_json(os.path.join(path_labeling,
                  'issues.json'), indent=4, orient='records')

support intel optim librari 
bert requesthandl responsehandl insid endpoint 
mnmg xgboost 
deploi 
refactor 
investig remov trialcatalog 
endpoint invoc return bodi 
remot uri 
rearchitect remot extens 
investig extens work vscode dev 
log cli 
telemetri cli 
test cli servic 
actual resourc class 
servic class entri point sdk 
rest servic resourc 
http servic interact rest servic 
templat servic manag yaml templat 
select servic resourc 
cli 
add sourc label yaml diagnost 
updat yaml accordingli 
chang load label extens load 
insid program extens 
chang comput label aml 
pariti new portal 
feedback command work linux 
run creat lot warn cloudpickl 
datastor configur present hardcod inneryedataset
accuraci show set code studio 
investig experi track 
setup server 
integr 
startup expect ostyp 
import import solv problem thank
combin param param work 
remov improv interfac 
node param write test
us pytest virtualenv instal 
run ceph 
entri ensembl protein transform output 
implement repr

In [60]:
df_issues = pd.read_json(os.path.join(path_labeling, 'issues.json'))
df_issues.groupby('Platform').count()['Issue_title'].reset_index()

Unnamed: 0,Platform,Issue_title
0,Github,1497
1,Gitlab,3


In [61]:
df_issues = pd.read_json(os.path.join(path_labeling, 'issues.json'))
df_issues.groupby('Tool').count()['Issue_title'].reset_index()

Unnamed: 0,Tool,Issue_title
0,Amazon SageMaker,334
1,Azure Machine Learning,299
2,ClearML,17
3,Comet,39
4,DVC,128
5,Determined,1
6,Kedro,98
7,MLflow,309
8,Neptune,37
9,Optuna,4


In [19]:
# create prompt for gpt model

import random

prompt_issue = 'Your task is to provide a brief and accurate summary of the issue post. Your summary should be concise, highlighting only the most important aspects regarding the challenges faced by the user.\n###'
# prompt_fix = 'Given a challenge-discussion pair, please extract any possible solutions mentioned in the discussion and provide a brief summary of them. If no solution is mentioned, please indicate that there are no solutions provided.\n###'

def retry_with_backoff(fn, retries=2, backoff_in_seconds=1, *args, **kwargs):
    x = 0

    if args is None:
        args = []
    if kwargs is None:
        kwargs = {}

    while True:
        try:
            return fn(*args, **kwargs)
        except:
            if x == retries:
                raise

            sleep = backoff_in_seconds * 2 ** x + random.uniform(0, 1)
            time.sleep(sleep)
            x += 1

In [30]:
# Experiment 2

df_issues = pd.read_json(os.path.join(
    path_labeling, 'issues.json'))

for index, row in df_issues.iterrows():
    if index % 50 == 49:
        print(f'persisting on question {index}')
        df_issues.to_json(os.path.join(
            path_labeling, 'issues.json'), indent=4, orient='records')

    if pd.notna(row['Issue_gpt_summary_original']):
        continue

    try:
        prompt = prompt_issue + 'Title: ' + row['Issue_title'] + ' Body: ' + row['Issue_body'] + '###\n'
        response = retry_with_backoff(
            openai.ChatCompletion.create,
            model='gpt-3.5-turbo-16k',
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=150,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            timeout=50,
            stream=False
        )
        df_issues.at[index, 'Issue_gpt_summary_original'] = response['choices'][0]['message']['content']
        
    except Exception as e:
        # output unsuccesful requests
        print(f'{e} on issue {row["Issue_link"]}')

    time.sleep(1)

df_issues.to_json(os.path.join(
    path_labeling, 'issues.json'), indent=4, orient='records')


persisting on question 49
persisting on question 99
persisting on question 149
persisting on question 199
persisting on question 249
persisting on question 299
persisting on question 349
This model's maximum context length is 16385 tokens. However, your messages resulted in 42302 tokens. Please reduce the length of the messages. on issue https://github.com/Azure/MachineLearningNotebooks/issues/1668
persisting on question 399
persisting on question 449
persisting on question 499
persisting on question 549
persisting on question 599
persisting on question 649
persisting on question 699
persisting on question 749
persisting on question 799
persisting on question 849
persisting on question 899
persisting on question 949
persisting on question 999
persisting on question 1049
persisting on question 1099
persisting on question 1149
persisting on question 1199
persisting on question 1249
persisting on question 1299
persisting on question 1349
persisting on question 1399
persisting on question 

In [34]:
assert df_issues.shape[0] == df_issues[df_issues['Issue_gpt_summary_original'].str.len() > 0].shape[0]


In [35]:
# Experiment 2

df_issues = pd.read_json(os.path.join(
    path_labeling, 'issues.json'))

for index, row in df_issues.iterrows():
    df_issues.at[index, 'Issue_gpt_summary'] = preprocess_text(row['Issue_gpt_summary_original'])
    
df_issues.to_json(os.path.join(
    path_labeling, 'issues.json'), indent=4, orient='records')

In [28]:
# Experiment 3

df_issues = pd.read_json(os.path.join(
    path_labeling, 'issues.json'))

for index, row in df_issues.iterrows():
    try:
        content = preprocess_text(row['Issue_title'], remove_code=True) + ' ' + preprocess_text(str(row['Issue_body']), remove_code=True)
        df_issues.at[index, 'Issue_preprocessed_content'] = content
    except Exception as e:
        print(e)

df_issues.to_json(os.path.join(
    path_labeling, 'issues.json'), indent=4, orient='records')

In [15]:
# # Experiment 4

# df_issues = pd.read_json(os.path.join(
#     path_labeling, 'issues.json'))

# for index, row in df_issues.iterrows():
#     if pd.notna(row['Issue_closed_time']) and row['Comment_body']:
#         df_issues.at[index, 'Comment_original_content'] = preprocess_text(row['Comment_body'])

# df_issues.to_json(os.path.join(path_labeling,
#                   'issues.json'), indent=4, orient='records')

In [23]:
# # Experiment 5

# df_issues = pd.read_json(os.path.join(
#     path_labeling, 'issues.json'))

# for index, row in df_issues.iterrows():
#     if index % 50 == 49:
#         print(f'persisting on question {index}')
#         df_issues.to_json(os.path.join(
#             path_labeling, 'issues.json'), indent=4, orient='records')

#     if pd.isna(row['Issue_closed_time']) or not row['Comment_body'] or pd.notna(row['Comment_gpt_summary_original']):
#         continue

#     try:
#         prompt = prompt_fix + 'Challenge: ' + row['Issue_gpt_summary_original'] + ' Discussion: ' + row['Comment_body'] + '###\n'
#         response = retry_with_backoff(
#             openai.ChatCompletion.create,
#             model='gpt-4-32k',
#             messages=[
#                 {"role": "system", "content": "You are an accurate summarizer."},
#                 {"role": "user", "content": prompt},
#             ],
#             temperature=0,
#             max_tokens=150,
#             top_p=1,
#             frequency_penalty=0,
#             presence_penalty=0,
#             timeout=10,
#             stream=False
#         )
#         content = response['choices'][0]['message']['content'].strip()
#         df_issues.at[index, 'Comment_gpt_summary_original'] = content
#         df_issues.at[index, 'Comment_gpt_summary'] = preprocess_text(content)
#     except Exception as e:
#         # output unsuccesful requests
#         print(f'{e} on issue {row["Issue_link"]}')

#     time.sleep(1)

# df_issues.to_json(os.path.join(
#     path_labeling, 'issues.json'), indent=4, orient='records')


persisting on question 49
persisting on question 99
persisting on question 149
persisting on question 199
persisting on question 249
persisting on question 299
persisting on question 349
persisting on question 399


In [19]:
# # Experiment 6

# df_issues = pd.read_json(os.path.join(
#     path_labeling, 'issues.json'))

# for index, row in df_issues.iterrows():
#     if pd.notna(row['Issue_closed_time']) and row['Comment_body']:
#         df_issues.at[index, 'Comment_preprocessed_content'] = preprocess_text(row['Comment_body'], remove_code=True)

# df_issues.to_json(os.path.join(
#     path_labeling, 'issues.json'), indent=4, orient='records')

In [62]:
# sample size is based on the recommendation from https://www.calculator.net/sample-size-calculator.html

sample_size = 306

df_issues = pd.read_json(os.path.join(
    path_labeling, 'issues.json'))

df_sample = df_issues[df_issues['Issue_closed_time'].notna()].sample(n=sample_size, random_state=42)

df_sample.to_json(os.path.join(
    path_labeling, 'sample.json'), indent=4, orient='records')


In [None]:
# from github import Github
# import time

# token = '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'
# g = Github(login_or_token=token)

# df = pd.read_json(os.path.join(path_labeling, 'original.json'))

# repo_name_last = ''

# for index, row in df.iterrows():
#     if row['Platform'] == 'Github':
#         try:
#             link = row['Issue_link']
#             components = link.split('/')
#             repo_name = f'{components[3]}/{components[4]}'
            
#             if repo_name != repo_name_last:
#                 repo = g.get_repo(repo_name)
#                 n_contributors = repo.get_contributors().totalCount
#                 issues = repo.get_issues(state='all')
                
#             issue_number = int(components[-1])
#             issue = repo.get_issue(number=issue_number)
            
#             df.at[index, 'Issue_title'] = issue.title
#             df.at[index, 'Issue_label'] = [label.name for label in issue.labels]
#             df.at[index, 'Issue_created_time'] = issue.created_at
#             df.at[index, 'Issue_closed_time'] = issue.closed_at
#             reactions = issue.get_reactions()
#             df.at[index, 'Issue_upvote_count'] = sum(reaction.content == '+1' for reaction in reactions)
#             df.at[index, 'Issue_downvote_count'] = sum(reaction.content == '-1' for reaction in reactions)
#             df.at[index, 'Issue_body'] = issue.body
#             df.at[index, 'Issue_comment_count'] = issue.comments
#             df.at[index, 'Issue_repo_issue_count'] = issues.totalCount
#             df.at[index, 'Issue_repo_watch_count'] = repo.subscribers_count
#             df.at[index, 'Issue_repo_star_count'] = repo.stargazers_count
#             df.at[index, 'Issue_repo_fork_count'] = repo.forks
#             df.at[index, 'Issue_repo_contributor_count'] = n_contributors
#             df.at[index, 'Issue_self_closed'] = np.nan
            
#             if(pd.notna(issue.closed_at)):
#                 df.at[index, 'Issue_self_closed'] = issue.closed_by.id == issue.user.id
#                 comments = []
#                 upvotes = []
#                 downvotes = []
#                 for comment in issue.get_comments():
#                     comments.append(comment.body)
#                     reactions = comment.get_reactions()
#                     upvote = sum(reaction.content == '+1' for reaction in reactions)
#                     downvote = sum(reaction.content == '-1' for reaction in reactions)
#                     upvotes.append(upvote)
#                     downvotes.append(downvote)
#                 df.at[index, 'Comment_body'] = ' '.join(comments)
#                 df.at[index, 'Comment_upvote_count'] = sum(upvotes)
#                 df.at[index, 'Comment_downvote_count'] = sum(downvotes)
                
#             repo_name_last = repo_name
                
#             # time.sleep(5)
#         except Exception as e:
#             print(f'{e} on issue {row["Issue_link"]}')

# df.to_json(os.path.join(path_labeling, 'original.json'), indent=4, orient='records')

In [7]:
# from gitlab import Gitlab

# token = 'glpat-SvwyWD6pbPNvbsBSvxdy'
# g = Gitlab(private_token=token)

# df = pd.read_json(os.path.join(path_labeling, 'issues.json'))

# for index, row in df.iterrows():
#     if row['Platform'] == 'Gitlab' and pd.notna(row['Issue_closed_time']):
#         link = row['Issue_link']
#         components = link.split('/')
#         repo = g.projects.get(id=f'{components[3]}/{components[4]}')
#         issue_number = int(components[-1])
#         issue = repo.issues.get(issue_number)
#         Issue_self_closed = issue.__getattr__("closed_by")["id"] == issue.author["id"]
