In [12]:
import os
import re
import time
import openai
import random
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import remove_stopwords

In [13]:
# Fixed seed for reproducibility; presumably consumed by later cells -- none
# of the cells visible here reference it.
random_state = 42

# The dataset directory is a sibling of the notebook's working directory.
path_dataset = os.path.join(os.path.dirname(os.getcwd()), 'Dataset')

# Lift pandas' display truncation so full rows / columns / cell text are shown.
pd.set_option("display.max_rows", None, "display.max_columns", None, 'display.max_colwidth', None)

# SECURITY: the API key is read from the environment only. A secret must never
# be embedded in the notebook as a fallback value; any key that was previously
# committed here has to be treated as leaked and revoked.
# When OPENAI_API_KEY is unset this is None and the OpenAI calls will fail
# with an authentication error instead of silently using a shared key.
openai.api_key = os.getenv('OPENAI_API_KEY')

# Tools whose 'Tool-specific' posts are routed through preprocess_style_post
# (the HTML-parsing path) rather than preprocess_normal_post.
style_post_tools = ['Azure Machine Learning', 'DVC', 'Guild AI', 'SigOpt', 'Weights & Biases']

# Pattern of fragments blanked out when remove_code=True. Alternatives, in order:
# lowercase words containing a doubled letter, <...> tags, {...} spans,
# [...] / ![...] spans, (...) spans, back-tick quoted spans (3, 2, 1 ticks),
# tokens with a comparison operator, @mentions, tokens containing a backslash,
# slash, dot, hyphen or underscore, and runs of digits / punctuation.
# NOTE(review): the first alternative removes every word with a doubled letter
# (e.g. "issue") -- confirm this is intentional.
regex = r'''([a-z]*([a-z])\2[a-z]*)|(<.*?>)|({.*?})|((!)?\[.*?\])|(\(.*?\))|(\`{3}.+?\`{3})|(\`{2}.+?\`{2})|(\`{1}.+?\`{1})|([^\s]*[<=>]=[^\s]+)|(@[^\s]+)|([^\s]*\\[^\s]+)|([^\s]+\/[^\s]+)|([^\s]+\.[^\s]+)|([^\s]+-[^\s]+)|([^\s]+_[^\s]+)|(_+[^\s]+_*)|(_*[^\s]+_+)|([0-9\|\-\r\n\t\"\-#*=~:{}\(\)\[\]<>]+)'''

# Instruction prepended to every post sent to the chat model for summarisation.
prompt_summary = 'Your task is to provide an accurate summary of the text within 100 words. Your summary should highlight only the most important aspects regarding the anomaly or inquiries.\n###'

# Maps each tool's display name to the lowercase keyword variants that the
# preprocess_* functions delete from (already lowercased) post text.
# Removal is a plain substring replacement applied in list order, so within a
# list the longer variants must stay ahead of their substrings
# (e.g. 'azure machine learning' before 'azure').
tools_keywords = {
    'Aim': ['aim'],
    'Amazon SageMaker': ['sage maker', 'sagemaker', 'aws', 'amazon'],
    'Azure Machine Learning': ['azure machine learning', 'azure ml', 'azureml', 'azure', 'microsoft'],
    'ClearML': ['clearml'],
    'cnvrg.io': ['cnvrg'],
    'Codalab': ['codalab'],
    'Comet': ['comet'],
    'Determined': ['determined'],
    'Domino': ['domino'],
    'DVC': ['dvc'],
    'Guild AI': ['guild ai', 'guildai'],
    'Kedro': ['kedro'],
    'MLflow': ['mlflow', 'databricks'],
    'MLRun': ['mlrun'],
    'ModelDB': ['modeldb'],
    'Neptune': ['neptune'],
    'Optuna': ['optuna'],
    'Polyaxon': ['polyaxon'],
    'Sacred': ['sacred'],
    'SigOpt': ['sigopt'],
    'Valohai': ['valohai'],
    'Vertex AI': ['vertex ai', 'vertexai', 'google'],
    'Weights & Biases': ['weights and biases', 'wandb', 'weights & biases', 'weights&biases', 'w & b', 'w&b']
}


In [14]:
# content preprocessing patterns

def preprocess_normal_post(text, remove_code=False):
    """Normalise a plain-text post into a single space-joined token string.

    The text is lowercased, stripped of non-ASCII characters, cleared of every
    keyword listed in ``tools_keywords``, optionally cleared of the fragments
    matched by ``regex``, and finally passed through gensim's
    ``preprocess_string``; the resulting tokens are joined with spaces.

    Args:
        text: Raw post text. ``None`` and NaN (what pandas yields for missing
            values) are treated as an empty post.
        remove_code: When True, every match of the module-level ``regex``
            is replaced by a space before tokenisation.

    Returns:
        The preprocessed tokens joined by single spaces ('' for missing input).
    """
    # Missing values carry no text; `text != text` is True only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = text.lower().encode('ascii', errors='ignore').decode('ascii')

    # Plain substring removal, applied in the order the keywords are listed.
    # NOTE(review): short keywords such as 'aim' are also removed from inside
    # longer words (e.g. "claim") -- confirm this is acceptable.
    for tool_keywords in tools_keywords.values():
        for tool_keyword in tool_keywords:
            text = text.replace(tool_keyword, '')

    if remove_code:
        # count/flags are passed by keyword: positional use is deprecated.
        text = re.sub(regex, ' ', text, count=0, flags=re.DOTALL)

    return ' '.join(preprocess_string(text))

def preprocess_style_post(text, remove_code=False):
    """Normalise an HTML-formatted post into a single space-joined token string.

    The markup is parsed with BeautifulSoup's ``html.parser``; ``<script>`` and
    ``<style>`` elements (and ``<code>`` elements when ``remove_code`` is True)
    are dropped before the visible text is extracted. The extracted text then
    goes through the same steps as ``preprocess_normal_post``: lowercase,
    drop non-ASCII characters, delete the ``tools_keywords`` entries,
    optionally blank out ``regex`` matches, and tokenise with gensim's
    ``preprocess_string``.

    Args:
        text: Raw post markup. ``None`` and NaN (what pandas yields for
            missing values) are treated as an empty post.
        remove_code: When True, ``<code>`` elements are removed and every
            match of the module-level ``regex`` is replaced by a space.

    Returns:
        The preprocessed tokens joined by single spaces ('' for missing input).
    """
    # Missing values carry no text; `text != text` is True only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    soup = BeautifulSoup(text, 'html.parser')
    remove_tags = ['script', 'style']
    if remove_code:
        remove_tags.append('code')
    for tag in soup(remove_tags):
        tag.decompose()
    text = soup.get_text()

    text = text.lower().encode('ascii', errors='ignore').decode('ascii')

    # Plain substring removal, applied in the order the keywords are listed.
    for tool_keywords in tools_keywords.values():
        for tool_keyword in tool_keywords:
            text = text.replace(tool_keyword, '')

    if remove_code:
        # count/flags are passed by keyword: positional use is deprecated.
        text = re.sub(regex, ' ', text, count=0, flags=re.DOTALL)

    return ' '.join(preprocess_string(text))

# exponential backoff

def retry_with_backoff(fn, retries=2, backoff_in_seconds=1, *args, **kwargs):
    """Call ``fn(*args, **kwargs)``, retrying failures with exponential backoff.

    After the n-th failed attempt (n starting at 0) the function sleeps for
    ``backoff_in_seconds * 2 ** n`` seconds plus a random jitter in [0, 1)
    and tries again, up to ``retries`` retries.

    Args:
        fn: Callable to invoke.
        retries: Maximum number of retries after the first attempt, so ``fn``
            is called at most ``retries + 1`` times.
        backoff_in_seconds: Base delay that is doubled after every failure.
        *args: Positional arguments forwarded to ``fn``. Because they follow
            ``retries`` and ``backoff_in_seconds`` in the signature, those two
            must be given positionally before any of them.
        **kwargs: Keyword arguments forwarded to ``fn``.

    Returns:
        Whatever ``fn`` returns on its first successful call.

    Raises:
        Exception: The exception of the last attempt, re-raised unchanged once
            the retries are exhausted. ``KeyboardInterrupt`` and ``SystemExit``
            are never retried and propagate immediately.
    """
    attempt = 0

    while True:
        try:
            return fn(*args, **kwargs)
        except Exception:
            # Only ordinary errors are retried; a bare `except:` would also
            # swallow Ctrl-C and keep the loop running.
            if attempt == retries:
                raise

            delay = backoff_in_seconds * 2 ** attempt + random.uniform(0, 1)
            time.sleep(delay)
            attempt += 1

In [None]:
# Load the issue posts and rename their columns to the unified
# Challenge_* / Discussion_* schema shared with the question posts.
df_issues = pd.read_json(os.path.join(path_dataset, 'issues.json'))

# Source column -> unified column name.
issue_column_map = {
    'Issue_title': 'Challenge_title',
    'Issue_body': 'Challenge_body',
    'Issue_link': 'Challenge_link',
    'Issue_created_time': 'Challenge_created_time',
    'Issue_comment_count': 'Challenge_comment_count',
    'Issue_score_count': 'Challenge_score_count',
    'Issue_closed_time': 'Challenge_closed_time',
    'Issue_repo_issue_count': 'Challenge_repo_issue_count',
    'Issue_repo_star_count': 'Challenge_repo_star_count',
    'Issue_repo_watch_count': 'Challenge_repo_watch_count',
    'Issue_repo_fork_count': 'Challenge_repo_fork_count',
    'Issue_repo_contributor_count': 'Challenge_repo_contributor_count',
    'Issue_self_closed': 'Challenge_self_closed',
    'Comment_body': 'Discussion_body',
    'Comment_score_count': 'Discussion_score_count',
}

# A single rename replaces the cell-by-cell copy followed by `del`.
# errors='raise' keeps the fail-fast behaviour when a source column is missing;
# the dtypes inferred by read_json are preserved.
df_issues = df_issues.rename(columns=issue_column_map, errors='raise')

# Keep the column order the copy-then-delete approach produced:
# untouched columns first, then the renamed ones in mapping order.
issue_renamed_columns = list(issue_column_map.values())
df_issues = df_issues[
    [column for column in df_issues.columns if column not in issue_renamed_columns]
    + issue_renamed_columns
]

# Load the question posts and rename their columns to the unified
# Challenge_* / Solution_* schema shared with the issue posts.
df_questions = pd.read_json(os.path.join(path_dataset, 'questions.json'))

# Source column -> unified column name.
question_column_map = {
    'Question_title': 'Challenge_title',
    'Question_body': 'Challenge_body',
    'Question_link': 'Challenge_link',
    'Question_created_time': 'Challenge_created_time',
    'Question_answer_count': 'Challenge_answer_count',
    'Question_comment_count': 'Challenge_comment_count',
    'Question_score_count': 'Challenge_score_count',
    'Question_closed_time': 'Challenge_closed_time',
    'Question_favorite_count': 'Challenge_favorite_count',
    'Question_last_edit_time': 'Challenge_last_edit_time',
    'Question_view_count': 'Challenge_view_count',
    'Question_self_closed': 'Challenge_self_closed',
    'Answer_body': 'Solution_body',
    'Answer_comment_count': 'Solution_comment_count',
    'Answer_last_edit_time': 'Solution_last_edit_time',
    'Answer_score_count': 'Solution_score_count',
}

# A single rename replaces the cell-by-cell copy followed by `del`.
# errors='raise' keeps the fail-fast behaviour when a source column is missing;
# the dtypes inferred by read_json are preserved.
df_questions = df_questions.rename(columns=question_column_map, errors='raise')

# Keep the column order the copy-then-delete approach produced:
# untouched columns first, then the renamed ones in mapping order.
question_renamed_columns = list(question_column_map.values())
df_questions = df_questions[
    [column for column in df_questions.columns if column not in question_renamed_columns]
    + question_renamed_columns
]

# Stack issues and questions into one frame (columns present in only one of
# them are filled with nulls by concat) and persist it as the base dataset.
df = pd.concat([df_issues, df_questions], ignore_index=True)
df.to_json(os.path.join(path_dataset, 'original.json'), indent=4, orient='records')


In [11]:
# Experiment 1: preprocess the original title + body of every post.

df = pd.read_json(os.path.join(path_dataset, 'original.json'))

for index, row in df.iterrows():
    # Stack Overflow posts and posts from the forums listed in
    # style_post_tools are parsed as HTML; everything else is plain text.
    is_style_post = (row['Platform'] == 'Stack Overflow') or (
        (row['Platform'] == 'Tool-specific') and (row['Tool'] in style_post_tools)
    )
    preprocess = preprocess_style_post if is_style_post else preprocess_normal_post

    content = preprocess(row['Challenge_title']) + ' ' + preprocess(row['Challenge_body'])

    # Store the result: it was previously computed and then discarded, although
    # the stop-word removal cell reads 'Challenge_original_content'.
    df.at[index, 'Challenge_original_content'] = content

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

  soup = BeautifulSoup(text, 'html.parser')


In [10]:
# Experiment 2: summarise every post with the chat model.
# The loop is resumable: rows that already have a summary are skipped, and
# progress is checkpointed to preprocessed.json along the way.

df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

summary_column = 'Challenge_gpt_summary_original_content'

# On the very first run the column does not exist yet; create it empty so the
# "already summarised?" check below does not raise KeyError.
if summary_column not in df.columns:
    df[summary_column] = None

has_unsaved_changes = False

for index, row in df.iterrows():
    # Checkpoint every 100 rows, but only when something new was produced;
    # rewriting the whole file while skipping finished rows is wasted I/O.
    if index % 100 == 99 and has_unsaved_changes:
        print(f'persisting on post {index}')
        df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')
        has_unsaved_changes = False

    if pd.notna(row[summary_column]):
        continue

    try:
        prompt = prompt_summary + 'Title: ' + row['Challenge_title'] + ' Body: ' + row['Challenge_body'] + '###\n'
        # NOTE(review): `timeout` is forwarded as-is to ChatCompletion.create;
        # confirm it is the intended request-timeout argument for the
        # installed openai version.
        response = retry_with_backoff(
            openai.ChatCompletion.create,
            model='gpt-4',
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=150,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            timeout=50,
            stream=False
        )
        df.at[index, summary_column] = response['choices'][0]['message']['content']
        has_unsaved_changes = True
    except Exception as e:
        # Best effort: a failing post (e.g. a prompt longer than the model's
        # context window) is logged and left without a summary.
        print(f'{e} on post {row["Challenge_link"]}')

    # Fixed pause between posts.
    time.sleep(1)

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')


This model's maximum context length is 16385 tokens. However, your messages resulted in 16720 tokens. Please reduce the length of the messages. on post https://github.com/kedro-org/kedro/issues/308
This model's maximum context length is 16385 tokens. However, your messages resulted in 42298 tokens. Please reduce the length of the messages. on post https://github.com/Azure/MachineLearningNotebooks/issues/1668
This model's maximum context length is 16385 tokens. However, your messages resulted in 16742 tokens. Please reduce the length of the messages. on post https://stackoverflow.com/questions/73624005
This model's maximum context length is 16385 tokens. However, your messages resulted in 52631 tokens. Please reduce the length of the messages. on post https://community.wandb.ai/t/oserror-could-not-find-a-suitable-tls-ca-certificate/3913
This model's maximum context length is 16385 tokens. However, your messages resulted in 19521 tokens. Please reduce the length of the messages. on post 

In [12]:
# Experiment 2 (continued): preprocess the GPT summaries.

df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

for index, row in df.iterrows():
    summary = row['Challenge_gpt_summary_original_content']

    # Posts whose summarisation failed have a null summary; record an empty
    # string for them instead of crashing inside the preprocessing functions.
    if pd.isna(summary):
        df.at[index, 'Challenge_gpt_summary_preprocessed_content'] = ''
        continue

    # Same routing as for the post text: Stack Overflow posts and posts from
    # the forums listed in style_post_tools use the HTML-parsing path.
    is_style_post = (row['Platform'] == 'Stack Overflow') or (
        (row['Platform'] == 'Tool-specific') and (row['Tool'] in style_post_tools)
    )
    preprocess = preprocess_style_post if is_style_post else preprocess_normal_post

    df.at[index, 'Challenge_gpt_summary_preprocessed_content'] = preprocess(summary)

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

  soup = BeautifulSoup(text, 'html.parser')


In [13]:
# Experiment 3: preprocess title + body again, this time with code-like
# fragments removed, and store the result per post.

df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

for index, row in df.iterrows():
    platform = row['Platform']
    # 'Tool' is only consulted for tool-specific posts (short-circuit).
    uses_markup = platform == 'Stack Overflow' or (
        platform == 'Tool-specific' and row['Tool'] in style_post_tools
    )
    cleaner = preprocess_style_post if uses_markup else preprocess_normal_post

    # Title first, then body, joined by a single space.
    pieces = [
        cleaner(row[field], remove_code=True)
        for field in ('Challenge_title', 'Challenge_body')
    ]
    df.at[index, 'Challenge_preprocessed_content'] = ' '.join(pieces)

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

  soup = BeautifulSoup(text, 'html.parser')


In [16]:

# Scratch check of gensim's preprocessing on a single word.
# NOTE(review): this import duplicates the one in the first cell; the cell
# looks like a leftover experiment and is a candidate for removal.
from gensim.parsing.preprocessing import preprocess_string

# The recorded output is ['unknown'], i.e. the word comes back unchanged --
# the same spelling that appears in the custom stop-word list.
preprocess_string('unknown')

['unknown']

In [15]:
# remove custom stop words from challenges and solutions

# Custom stop words removed from the preprocessed text columns.
# Entries are written in the form the preprocessing step produces
# (e.g. 'abl', 'acknowledg'), because they are matched against already
# preprocessed tokens.
# NOTE(review): a few entries ('notice', 'perceive', 'pretty', 'realize',
# 'recognize') look unstemmed -- confirm they can actually match.
# Commented-out entries are words deliberately kept in the text.
stop_words_custom = [
    'abl',
    'abnorm',
    'acknowledg',
    'actual',
    'ad',
    'addition',
    'admit',
    'advis',
    'alright',
    'altern',
    'amaz',
    'answer',
    # The comma after 'anomali' was missing, which silently merged it with the
    # next entry into 'anomaliapi' so that neither word was removed.
    'anomali',
    'api',
    'appear',
    'applic',
    'appreci',
    'approach',
    'appropri',
    'aris',
    'ask',
    'assum',
    'astonish',
    'attempt',
    'avail',
    'aw',
    'awesom',
    'azur',
    'bad',
    'basic',
    'behavior',
    'behaviour',
    'best',
    'better',
    'bring',
    'bug',
    'case',
    'categori',
    'caus',
    'certain',
    'challeng',
    'chang',
    'check',
    'cloudera',
    'code',
    # 'colab',
    'command',
    'concern',
    'confirm',
    'confus',
    'consid',
    'consult',
    'contain',
    'content',
    'continu',
    'correct',
    'correctli',
    'correspond',
    'couldn',
    'crash',
    'curiou',
    'current',
    'custom',
    'deep',
    'demand',
    'demo',
    'deni',
    'depict',
    'describ',
    'despit',
    'detail',
    'develop',
    'differ',
    'differenti',
    'difficult',
    'difficulti',
    'discov',
    'discuss',
    'distinguish',
    'easi',
    'east',
    'effect',
    'emerg',
    'encount',
    # 'end',
    'enquiri',
    'ensur',
    'error',
    'especi',
    'exampl',
    'exception',
    'excit',
    'exist',
    'expect',
    'experi',
    'eventu',
    'databrick',
    'def',
    'domo',
    'dont',
    'face',
    'fact',
    'fascin',
    'fail',
    'failur',
    'fairli',
    'fals',
    'far',
    'fault',
    'favorit',
    'favourit',
    'feel',
    'find',
    'fine',
    'firstli',
    'fix',
    'float',
    'follow',
    'form',
    'frustrat',
    'gcp',
    'get',
    'give',
    'given',
    'go',
    'good',
    'googl',
    'got',
    'guarante',
    'handl',
    'happen',
    'hard',
    'have',
    'hear',
    'hei',
    'hello',
    'help',
    'ibm',
    'impli',
    'implic',
    'includ',
    'incorrect',
    'incorrectli',
    'incred',
    'indic',
    'info',
    'inform',
    'inner',
    'inquiri',
    'insight',
    'instead',
    'int',
    'interest',
    'invalid',
    'investig',
    'issu',
    'join',
    # 'jupyter',
    # 'keras',
    'kind',
    'know',
    'known',
    'lead',
    'learn',
    'let',
    'like',
    'long',
    'look',
    'lot',
    'machin',
    'malfunct',
    'make',
    'main',
    'major',
    'manag',
    'manner',
    'marvel',
    'max',
    'mean',
    'meaning',
    'meaningfulli',
    'meaningless',
    'meantim',
    'mention',
    'method',
    'min',
    'mind',
    'mistak',
    'mistakenli',
    # 'multipl',
    'name',
    'near',
    'necessari',
    'need',
    'new',
    'non',
    'north',
    'notice',
    'number',
    'obtain',
    'occas',
    'occasion',
    'occur',
    'offer',
    'old',
    'opinion',
    'own',
    # 'open',
    'oracl',
    'ought',
    'outcom',
    'part',
    'particip',
    'particular',
    'particularli',
    'perceive',
    # 'perform',
    'permit',
    'person',
    'perspect',
    'place',
    'point',
    'pointless',
    'possibl',
    'post',
    'pretty',
    'problem',
    'product',
    'program',
    'project',
    'proper',
    'provid',
    'python',
    # 'pytorch',
    'question',
    'real',
    'realize',
    'recent',
    'recognize',
    'recommend',
    'refer',
    'regard',
    'requir',
    'resolv',
    'respond',
    'result',
    'right',
    'rightli',
    'satisfi',
    'saw',
    'scenario',
    # 'scikit',
    'script',
    'second',
    'secondli',
    'seek',
    'seen',
    'self',
    'shall',
    'shan',
    'shock',
    'shouldn',
    'show',
    'similar',
    'simpl',
    'situat',
    # 'sklearn',
    'snippet',
    'snowflak',
    'solut',
    'solv',
    'sound',
    'sourc',
    'south',
    'special',
    'specif',
    # 'start',
    'startl',
    'strang',
    'string',
    'struggl',
    'stun',
    'succe',
    'success',
    'suggest',
    'super',
    # 'support',
    'sure',
    'suspect',
    'take',
    'talk',
    'tell',
    # 'tensorflow',
    'text',
    'thank',
    'thing',
    'think',
    'thirdli',
    'thought',
    'tool',
    'topic',
    'total',
    'true',
    'truth',
    'try',
    'unabl',
    'understand',
    'unexpect',
    'unknown',
    'unsur',
    'upcom',
    'us',
    'user',
    'usual',
    'valid',
    'view',
    'viewpoint',
    'wai',
    'want',
    'weird',
    'west',
    'will',
    'word',
    'worst',
    'won',
    'wonder',
    'work',
    'wors',
    'wouldn',
    'wrong',
    'wrongli',
    'xgboost',
    'ye',
]

df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

# Drop the custom stop words from each preprocessed text column in turn.
for text_column in (
    'Challenge_original_content',
    'Challenge_preprocessed_content',
    'Challenge_gpt_summary_preprocessed_content',
):
    df[text_column] = df[text_column].map(
        lambda text: remove_stopwords(text, stopwords=stop_words_custom)
    )

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')
