In [17]:
import os
import re
import time
import spacy
import openai
import random
import enchant
import textstat
import itertools
import collections
import pandas as pd
import plotly.graph_objects as go

from bs4 import BeautifulSoup
from collections import namedtuple
from gensim.parsing.preprocessing import remove_stopwords, strip_short, strip_punctuation, STOPWORDS

In [39]:
# --- Global configuration ---------------------------------------------------
# Seed value for stochastic steps.
# NOTE(review): none of the visible cells consume it (the retry jitter uses the
# unseeded `random` module) — confirm whether `random.seed(random_state)` was intended.
random_state = 42

# spaCy pipeline used for part-of-speech tagging and the enchant dictionary
# used to keep only words that pass an en_US spell check.
nlp = spacy.load("en_core_web_sm")
spell_checker = enchant.Dict("en_US")

# The dataset folder sits next to (not inside) the current working directory.
path_dataset = os.path.join(os.path.dirname(os.getcwd()), 'Dataset')

# Lift every pandas display limit so frames and long text cells render in full.
pd.set_option("display.max_rows", None, "display.max_columns", None, 'display.max_colwidth', None)

# Credentials come from the environment only. A secret must never be embedded
# in the notebook: any key that was previously committed here has to be treated
# as leaked and revoked. When OPENAI_API_KEY is unset this stays None and only
# the cells that actually call the OpenAI API will fail.
openai.api_key = os.getenv('OPENAI_API_KEY')

# Instruction prepended to every post that is sent for summarisation.
prompt_summary = 'Concisely convey the most significant points about the text in one or two brief sentences.\n###'

# Substrings used by analyze_links to bucket a URL. Matching is plain
# substring containment, so every entry is written exactly as it must appear
# inside the link.

# Image file extensions we want to recognize.
keywords_image = {".bmp", ".gif", ".jpeg", ".jpg", ".png", ".svg", ".tiff", ".webp"}

# Links to patches (pull requests).
keywords_patch = {'pull'}

# Links to issue trackers, forums and Q&A sites.
keywords_issue = {'answers', 'discussions', 'forums', 'issues', 'questions', 'stackoverflow'}

# Links to code hosting / package index sites.
keywords_tool = {'github', 'gitlab', 'pypi'}

# Links to reference documentation.
keywords_doc = {'developers', 'docs', 'documentation', 'features', 'library', 'org', 'wiki'}

# Links to learning material.
keywords_tutorial = {'guide', 'learn', 'tutorial'}

# Tool display name -> lower-case keywords that identify the tool (or its vendor).
tools_keyword_mapping = dict([
    ('Aim', ['aim']),
    ('Amazon SageMaker', ['sagemaker', 'amazon', 'aws']),
    ('Azure Machine Learning', ['azure', 'microsoft']),
    ('ClearML', ['clearml']),
    ('cnvrg.io', ['cnvrg']),
    ('Codalab', ['codalab']),
    ('Comet', ['comet']),
    ('Determined', ['determined']),
    ('Domino', ['domino']),
    ('DVC', ['dvc']),
    ('Guild AI', ['guild']),
    ('Kedro', ['kedro']),
    ('MLflow', ['mlflow', 'databricks']),
    ('MLRun', ['mlrun']),
    ('ModelDB', ['modeldb']),
    ('Neptune', ['neptune']),
    ('Optuna', ['optuna']),
    ('Polyaxon', ['polyaxon']),
    ('Sacred', ['sacred']),
    ('SigOpt', ['sigopt']),
    ('Valohai', ['valohai']),
    ('Vertex AI', ['vertex', 'google']),
    ('Weights & Biases', ['weights', 'biases', 'wandb']),
])

# Every tool keyword, flattened in mapping order.
tools_keyword_list = [
    keyword
    for tool_keywords in tools_keyword_mapping.values()
    for keyword in tool_keywords
]

# Domain-generic nouns that carry no topical signal for this corpus. They are
# removed in addition to the standard stop words. Entries are lower-case and
# grouped by first letter to keep the list easy to scan and extend.
stop_words_custom = {
    # a
    'action', 'actions', 'activity', 'advance', 'advice', 'analysis', 'analyses',
    'answer', 'answers', 'approach', 'approaches', 'article', 'assertion',
    # b
    'behavior', 'bit', 'block', 'body', 'bug', 'build', 'building',
    # c
    'case', 'cases', 'cause', 'change', 'changes', 'char', 'character', 'characters',
    'classification', 'collection', 'com', 'combination',
    'communication',  # was misspelt 'commmunication', so it never matched anything
    'community', 'company', 'confusion', 'content', 'contents', 'control', 'count',
    'couple', 'couples', 'course', 'courses', 'cross', 'custom', 'customer', 'customers',
    # d
    'day', 'days', 'decision', 'default', 'differ', 'difference', 'description',
    'desktop', 'detail', 'details',
    # e
    'edit', 'end', 'error', 'errors', 'example', 'examples', 'exception', 'exceptions',
    'experience', 'explanation', 'explanations', 'exit',
    # f
    'face', 'fact', 'facts', 'fail', 'feature', 'features', 'feedback', 'feedbacks',
    'fix', 'fixes', 'float', 'follow', 'following', 'forecast', 'forecasting',
    'form', 'forms', 'functionality', 'functionalities', 'future',
    # g
    'goal', 'goals', 'guidance', 'guide', 'guy', 'guys',
    # h
    'help', 'hour', 'hours',
    # i
    'idea', 'ideas', 'info', 'information', 'instruction', 'instructions', 'int',
    'issue', 'issues',
    # k
    'kind', 'kinds',
    # l
    'language', 'languages', 'laptop', 'learn', 'learning', 'level', 'levels',
    # 'location',
    # 'locations',
    'look', 'looks', 'lot', 'lots', 'luck',
    # m
    'machine', 'machines', 'message', 'messages', 'method', 'methods', 'minute',
    'minutes', 'mistake', 'mistakes', 'moment', 'month', 'months',
    # n
    'need', 'needs', 'note', 'number', 'numbers',
    # o
    'one', 'ones', 'org', 'organization',
    # p
    'part', 'parts', 'people', 'person', 'picture', 'place', 'plan', 'post', 'posts',
    'price', 'problem', 'problems', 'processing', 'product', 'products', 'program',
    'project', 'projects', 'purpose', 'purposes',
    # q
    'question', 'questions',
    # r
    'raise', 'reason', 'reasons', 'recommendation', 'recommendations', 'regression',
    'research', 'result', 'results', 'return', 'returns',
    # s
    'scenario', 'scenarios', 'science', 'screen', 'screenshot', 'screenshots',
    'second', 'seconds', 'section', 'self', 'sense', 'sentence', 'setup', 'shape',
    'site', 'situation', 'software', 'solution', 'solutions', 'speech', 'start',
    'state', 'statement', 'states', 'status', 'step', 'string', 'study', 'stuff',
    'success', 'suggest', 'suggestion', 'suggestions', 'summary', 'summaries', 'support',
    # t
    'task', 'tasks', 'text', 'time', 'times', 'thank', 'thanks', 'thing', 'things',
    'three', 'title', 'today', 'tomorrow', 'tool', 'tools', 'trouble', 'truth',
    'tutorial', 'tutorials', 'two',
    # u
    'understand', 'understanding', 'url', 'urls', 'use', 'user', 'users', 'uses',
    # v
    'value', 'values', 'video', 'videos', 'vision', 'voice',
    # w
    'way', 'ways', 'week', 'weeks', 'word', 'words', 'work', 'workaround',
    'workarounds', 'works',
    # y
    'year', 'years', 'yesterday',
}

# Full stop-word vocabulary: gensim's defaults plus every tool keyword plus the
# custom domain-generic words.
stop_words_list = STOPWORDS.union(tools_keyword_list, stop_words_custom)


In [32]:
def count_code_line(block_list):
    """Count the non-blank lines contained in a collection of code blocks.

    Args:
        block_list: Iterable whose items are either code-block strings or
            iterables of code-block strings. Both shapes may be mixed.

    Returns:
        int: Total number of lines that contain at least one non-whitespace
        character.
    """
    total_loc = 0
    for blocks in block_list:
        # Callers pass a flat list of strings. Iterating a bare string walks
        # its characters, which turned the result into a character count, so
        # wrap a string into a one-element group before descending.
        if isinstance(blocks, str):
            blocks = [blocks]
        for block in blocks:
            total_loc += sum(1 for line in block.splitlines() if line.strip())
    return total_loc

def extract_styles(content):
    """Split HTML content into plain text, link targets and a code-line count.

    Args:
        content: HTML markup of a post.

    Returns:
        tuple: ``(clean_text, links, code_line)`` where ``clean_text`` is the
        visible text with ``<code>``, ``<blockquote>`` and ``<a>`` elements
        removed, ``links`` is the list of ``href`` values found in the markup,
        and ``code_line`` is what ``count_code_line`` reports for the removed
        ``<code>``/``<blockquote>`` elements.
    """
    soup = BeautifulSoup(content, 'html.parser')
    code_tags = ['code', 'blockquote']

    # Only the outermost code-like elements are counted: a <code> nested in a
    # <blockquote> (or vice versa) is already part of its ancestor's text, so
    # counting both would report the same lines twice.
    outer_blocks = [
        tag for tag in soup.find_all(code_tags)
        if tag.find_parent(code_tags) is None
    ]
    code_line = count_code_line([tag.get_text() for tag in outer_blocks])

    # Collect link targets before anything is removed from the tree.
    links = [a['href'] for a in soup.find_all('a', href=True)]

    # Removing the outermost blocks removes everything nested inside them too.
    for tag in outer_blocks:
        tag.decompose()
    # Anchors are dropped together with their link text.
    for tag in soup.find_all('a'):
        tag.decompose()

    clean_text = soup.get_text(separator=' ', strip=True)
    return clean_text, links, code_line

def extract_code(content):
    """Strip backtick-delimited code from text and measure what was removed.

    Triple-, double- and single-backtick spans are handled in that order, each
    pass working on the text left over by the previous one. Spans may cross
    line breaks.

    Returns:
        tuple: ``(clean_text, code_line)`` — the text without code spans and
        the sum of ``count_code_line`` over the spans removed in each pass.
    """
    fence_patterns = (r'```.+?```', r'``.+?``', r'`.+?`')
    remaining = content
    loc_total = 0

    for raw_pattern in fence_patterns:
        matcher = re.compile(raw_pattern, flags=re.DOTALL)
        loc_total += count_code_line(matcher.findall(remaining))
        remaining = matcher.sub('', remaining)

    return remaining, loc_total

def extract_links(text):
    """Remove links from text and return them alongside the remaining text.

    Markdown links/images (``[label](target)`` / ``![alt](target)``) are taken
    out first; bare ``http(s)://`` URLs are then taken out of what is left.

    Returns:
        tuple: ``(clean_text, links)`` where ``links`` lists the full markdown
        matches followed by the bare URLs, each in order of appearance.
    """
    markdown_link = re.compile(r"\!?\[.*?\]\(.*?\)")
    bare_url = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")

    found = []
    remaining = text
    for matcher in (markdown_link, bare_url):
        found.extend(matcher.findall(remaining))
        remaining = matcher.sub('', remaining)
    return remaining, found

def analyze_links(links):
    """Classify links by keyword and count how many fall into each category.

    A link belongs to the first category (in the precedence order below) that
    has a keyword occurring as a substring of the link; links matching no
    keyword are counted as examples.

    Returns:
        namedtuple ``Analyzer`` with the integer fields ``image``,
        ``documentation``, ``tool``, ``issue``, ``patch``, ``tutorial`` and
        ``example``.
    """
    # Order matters: earlier categories win when several keywords match.
    precedence = (
        ('image', keywords_image),
        ('patch', keywords_patch),
        ('issue', keywords_issue),
        ('tool', keywords_tool),
        ('documentation', keywords_doc),
        ('tutorial', keywords_tutorial),
    )
    field_names = ['image', 'documentation', 'tool', 'issue', 'patch', 'tutorial', 'example']
    tally = dict.fromkeys(field_names, 0)

    for link in links:
        category = 'example'
        for name, keywords in precedence:
            if any(keyword in link for keyword in keywords):
                category = name
                break
        tally[category] += 1

    link_analysis = namedtuple('Analyzer', field_names)
    return link_analysis(**tally)

def analyze_text(text):
    """Collect textstat metrics for a piece of text.

    Returns:
        namedtuple ``Analyzer`` with ``word_count`` (``textstat.lexicon_count``),
        ``readability`` (``textstat.flesch_reading_ease``) and ``reading_time``
        (``textstat.reading_time``).
    """
    text_analysis = namedtuple('Analyzer', ['word_count', 'readability', 'reading_time'])
    return text_analysis(
        word_count=textstat.lexicon_count(text),
        readability=textstat.flesch_reading_ease(text),
        reading_time=textstat.reading_time(text),
    )

def extract_nouns(text):
    """Return the tokens spaCy tags as ``NOUN``, joined by single spaces."""
    noun_tokens = []
    for token in nlp(text):
        if token.pos_ == "NOUN":
            noun_tokens.append(token.text)
    return ' '.join(noun_tokens)

def extract_english_words(text):
    """Keep only the whitespace-separated words accepted by the spell checker."""
    accepted = filter(spell_checker.check, text.split())
    return ' '.join(accepted)

# def remove_text_containing_subwords(text, subwords):
#     clean_text = text.split()
#     for subword in subwords:
#         pattern = re.compile(subword)
#         clean_text = [word for word in clean_text if not pattern.search(subword)]
#     return ' '.join(clean_text)

def word_frequency(text):
    """Return a ``collections.Counter`` of the whitespace-separated words in ``text``."""
    tally = collections.Counter()
    tally.update(text.split())
    return tally

def split_content(content):
    """Separate a post into prose, links and a code-line count.

    The content goes through ``extract_styles`` (HTML), then ``extract_code``
    (backtick spans), then ``extract_links`` (markdown links and bare URLs);
    each stage works on the text left over by the previous one.

    Returns:
        namedtuple ``Analyzer`` with ``text`` (remaining prose), ``links``
        (HTML links followed by textual links) and ``code_line`` (sum of the
        counts from the HTML and backtick stages).
    """
    content_collection = namedtuple('Analyzer', ['text', 'links', 'code_line'])

    without_html, html_links, html_loc = extract_styles(content)
    without_code, backtick_loc = extract_code(without_html)
    plain_text, text_links = extract_links(without_code)

    return content_collection(
        text=plain_text,
        links=html_links + text_links,
        code_line=html_loc + backtick_loc,
    )

def preprocess_text(text):
    """Normalise text down to its topical nouns.

    Steps, in order: lower-case, strip punctuation, keep spell-checked English
    words, keep nouns only, drop short tokens (gensim ``strip_short``), and
    finally remove every word in ``stop_words_list``.
    """
    lowered = strip_punctuation(text.lower())
    english_nouns = extract_nouns(extract_english_words(lowered))
    return remove_stopwords(strip_short(english_nouns), stop_words_list)

# Exponential backoff
def retry_with_backoff(fn, retries=2, backoff_in_seconds=1, *args, **kwargs):
    """Call ``fn(*args, **kwargs)``, retrying failures with exponential backoff.

    After the n-th failure (n starting at 0) the function sleeps
    ``backoff_in_seconds * 2 ** n`` seconds plus up to one second of random
    jitter before trying again.

    Args:
        fn: Callable to invoke.
        retries: Number of retries after the first attempt, so ``fn`` is
            called at most ``retries + 1`` times.
        backoff_in_seconds: Base delay that is doubled after every failure.
        *args: Extra positional arguments for ``fn``. Because they follow
            ``retries`` and ``backoff_in_seconds`` in the signature, those two
            must be given positionally first; passing ``fn``'s arguments by
            keyword avoids the ambiguity.
        **kwargs: Keyword arguments forwarded to ``fn``.

    Returns:
        Whatever ``fn`` returns on its first successful call.

    Raises:
        Exception: The last exception raised by ``fn`` once the retries are
            exhausted. ``KeyboardInterrupt`` and ``SystemExit`` are never
            retried and propagate immediately.
    """
    attempt = 0
    while True:
        try:
            return fn(*args, **kwargs)
        except Exception:
            # `Exception` rather than a bare `except`, so that interrupting the
            # kernel stops the loop instead of triggering another retry.
            if attempt == retries:
                raise

            delay = backoff_in_seconds * 2 ** attempt + random.uniform(0, 1)
            time.sleep(delay)
            attempt += 1
            

In [34]:
# Corpus-wide word frequencies over the preprocessed challenge bodies.
df_issues = pd.read_json(os.path.join(path_dataset, 'original.json'))
total_counts = collections.Counter()
for body in df_issues['Challenge_body']:
    cleaned = preprocess_text(split_content(body).text)
    total_counts.update(word_frequency(cleaned))
total_counts

  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  soup = BeautifulSoup(content, 'html.parser')
  soup = BeautifulSoup(content, 'html.parser')


Counter({'model': 9263,
         'data': 5542,
         'file': 4842,
         'code': 3927,
         'training': 2734,
         'pipeline': 2478,
         'notebook': 2243,
         'script': 2009,
         'version': 2004,
         'endpoint': 1954,
         'image': 1860,
         'job': 1809,
         'instance': 1783,
         'time': 1767,
         'thanks': 1766,
         'files': 1664,
         'models': 1640,
         'service': 1570,
         'output': 1508,
         'input': 1459,
         'run': 1458,
         'environment': 1443,
         'experiment': 1394,
         'studio': 1374,
         'packages': 1364,
         'container': 1255,
         'type': 1236,
         'function': 1137,
         'python': 1078,
         'docker': 1054,
         'command': 997,
         'train': 996,
         'server': 984,
         'path': 950,
         'inference': 920,
         'documentation': 917,
         'workspace': 905,
         'user': 879,
         'ml': 877,
         'storage': 8

In [None]:
# Bring the issue and question datasets onto one shared Challenge_*/Solution_*
# schema and store the combined result as original.json.

# Source column -> unified column. These source columns disappear.
issue_renames = {
    'Issue_title': 'Challenge_title',
    'Issue_body': 'Challenge_body',
    'Issue_link': 'Challenge_link',
    'Issue_created_time': 'Challenge_created_time',
    'Issue_comment_count': 'Challenge_answer_count',
    'Issue_score_count': 'Challenge_score_count',
    'Issue_closed_time': 'Challenge_closed_time',
    'Issue_repo_issue_count': 'Challenge_repo_issue_count',
    'Issue_repo_star_count': 'Challenge_repo_star_count',
    'Issue_repo_watch_count': 'Challenge_repo_watch_count',
    'Issue_repo_fork_count': 'Challenge_repo_fork_count',
    'Issue_repo_contributor_count': 'Challenge_repo_contributor_count',
    'Issue_self_closed': 'Challenge_self_closed',
    'Comment_body': 'Solution_body',
    'Comment_score_count': 'Solution_score_count',
}
# Source column -> unified column, with the source column kept as well.
# NOTE(review): the previous version of this cell copied these columns but never
# deleted the originals, so they are preserved here to keep the output schema
# unchanged — confirm whether they should simply be renamed instead.
issue_copies = {
    'Issue_tag_count': 'Challenge_tag_count',
}

question_renames = {
    'Question_title': 'Challenge_title',
    'Question_body': 'Challenge_body',
    'Question_link': 'Challenge_link',
    'Question_created_time': 'Challenge_created_time',
    'Question_answer_count': 'Challenge_answer_count',
    'Question_comment_count': 'Challenge_comment_count',
    'Question_score_count': 'Challenge_score_count',
    'Question_closed_time': 'Challenge_closed_time',
    'Question_favorite_count': 'Challenge_favorite_count',
    'Question_last_edit_time': 'Challenge_last_edit_time',
    'Question_view_count': 'Challenge_view_count',
    'Question_self_closed': 'Challenge_self_closed',
    'Answer_body': 'Solution_body',
    'Answer_score_count': 'Solution_score_count',
    'Answer_comment_count': 'Solution_comment_count',
    'Answer_comment_body': 'Solution_comment_body',
    'Answer_last_edit_time': 'Solution_last_edit_time',
}
# NOTE(review): same situation as issue_copies — the originals were never dropped.
question_copies = {
    'Question_tag_count': 'Challenge_tag_count',
    'Question_topic_count': 'Challenge_topic_count',
}


def _to_challenge_schema(frame, renames, copies):
    """Return ``frame`` with ``copies`` duplicated and ``renames`` applied.

    Whole columns are renamed or copied at once instead of being written cell
    by cell, which keeps their dtypes and avoids a Python-level loop over the
    rows. Renamed columns stay at their original position in the frame rather
    than being appended at the end. A missing source column raises ``KeyError``.
    """
    with_copies = frame.assign(**{new: frame[old] for old, new in copies.items()})
    return with_copies.rename(columns=renames, errors='raise')


df_issues = _to_challenge_schema(
    pd.read_json(os.path.join(path_dataset, 'issues.json')),
    issue_renames,
    issue_copies,
)

df_questions = _to_challenge_schema(
    pd.read_json(os.path.join(path_dataset, 'questions.json')),
    question_renames,
    question_copies,
)

df = pd.concat([df_issues, df_questions], ignore_index=True)
df.to_json(os.path.join(path_dataset, 'original.json'), indent=4, orient='records')


In [None]:
# Draw a Sankey diagram of the flow Platform -> Tool -> State.

df = pd.read_json(os.path.join(path_dataset, 'original.json'))
# A challenge counts as closed as soon as it has a closing timestamp.
df['State'] = df['Challenge_closed_time'].apply(lambda x: 'closed' if not pd.isna(x) else 'open')

categories = ['Platform', 'Tool', 'State']
df_info = df.groupby(categories).size().reset_index(name='value')

# Node name -> number of posts carrying that value. The insertion order of
# this dict defines the node indices used by the diagram.
label_counts = {}
for category in categories:
    label_counts.update(df[category].value_counts().to_dict())

# One (source, target, value) frame per pair of adjacent categories, built in a
# list and concatenated once instead of growing a frame inside the loop.
pair_frames = [
    df_info[[src, dst, 'value']].rename(columns={src: 'source', dst: 'target'})
    for src, dst in zip(categories, categories[1:])
]
newDf = pd.concat(pair_frames)
newDf = newDf.groupby(['source', 'target']).agg({'value': 'sum'}).reset_index()

# Resolve node names to indices through a single lookup table rather than
# rebuilding the label list and scanning it for every row.
label_index = {name: position for position, name in enumerate(label_counts)}
source = newDf['source'].map(label_index)
target = newDf['target'].map(label_index)
value = newDf['value']

labels = [f'{k} ({v})' for k, v in label_counts.items()]
link = dict(source=source, target=target, value=value)
node = dict(label=labels)
data = go.Sankey(link=link, node=node)

fig = go.Figure(data)
fig.update_layout(width=1000, height=1000, font_size=20)
fig.write_image(os.path.join(path_dataset, 'Tool platform state sankey.png'))

In [6]:
# Experiment 1: preprocess title + body of every challenge, code kept.
# NOTE(review): style_post_tools, preprocess_style_post and
# preprocess_normal_post are not defined in the cells visible here — confirm
# they are defined before this cell in a fresh Restart & Run All.

df = pd.read_json(os.path.join(path_dataset, 'original.json'))

for index, row in df.iterrows():
    # Stack Overflow posts, and tool-specific posts of the tools listed in
    # style_post_tools, go through the "style" preprocessor.
    is_style_post = (row['Platform'] == 'Stack Overflow') or (
        (row['Platform'] == 'Tool-specific') and (row['Tool'] in style_post_tools)
    )
    preprocess = preprocess_style_post if is_style_post else preprocess_normal_post
    content = preprocess(row['Challenge_title']) + ' ' + preprocess(row['Challenge_body'])
    df.at[index, 'Challenge_original_content'] = content

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

  soup = BeautifulSoup(text, 'html.parser')


In [10]:
# Experiment 2: summarise every challenge (title + body) with the OpenAI chat API.
# The cell is resumable: posts that already have a summary are skipped and the
# frame is written back to disk every 100 posts.
# NOTE(review): the recorded output of this cell reports a 16385-token context
# limit, which looks stale relative to model='gpt-4' below — confirm which
# model the stored summaries were produced with.

summary_column = 'Challenge_gpt_summary_original_content'
preprocessed_path = os.path.join(path_dataset, 'preprocessed.json')

df = pd.read_json(preprocessed_path)

# On the very first run the summary column does not exist yet; without it the
# "already summarised" check below raises KeyError before any request is made.
if summary_column not in df.columns:
    df[summary_column] = None

for index, row in df.iterrows():
    if index % 100 == 99:
        print(f'persisting on post {index}')
        df.to_json(preprocessed_path, indent=4, orient='records')

    if pd.notna(row[summary_column]):
        continue

    try:
        prompt = prompt_summary + 'Title: ' + row['Challenge_title'] + ' Body: ' + row['Challenge_body'] + '###\n'
        response = retry_with_backoff(
            openai.ChatCompletion.create,
            model='gpt-4',
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=150,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            timeout=50,
            stream=False
        )
        df.at[index, summary_column] = response['choices'][0]['message']['content']
    except Exception as e:
        # Best effort: report the post and move on, leaving its summary empty
        # so that a later run picks it up again.
        print(f'{e} on post {row["Challenge_link"]}')

    # Fixed pause between posts.
    time.sleep(1)

df.to_json(preprocessed_path, indent=4, orient='records')


This model's maximum context length is 16385 tokens. However, your messages resulted in 16720 tokens. Please reduce the length of the messages. on post https://github.com/kedro-org/kedro/issues/308
This model's maximum context length is 16385 tokens. However, your messages resulted in 42298 tokens. Please reduce the length of the messages. on post https://github.com/Azure/MachineLearningNotebooks/issues/1668
This model's maximum context length is 16385 tokens. However, your messages resulted in 16742 tokens. Please reduce the length of the messages. on post https://stackoverflow.com/questions/73624005
This model's maximum context length is 16385 tokens. However, your messages resulted in 52631 tokens. Please reduce the length of the messages. on post https://community.wandb.ai/t/oserror-could-not-find-a-suitable-tls-ca-certificate/3913
This model's maximum context length is 16385 tokens. However, your messages resulted in 19521 tokens. Please reduce the length of the messages. on post 

In [7]:
# Experiment 2 (continued): preprocess the GPT summaries with the same
# preprocessor that the original post would use.

df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

for index, row in df.iterrows():
    is_style_post = (row['Platform'] == 'Stack Overflow') or (
        (row['Platform'] == 'Tool-specific') and (row['Tool'] in style_post_tools)
    )
    preprocess = preprocess_style_post if is_style_post else preprocess_normal_post
    df.at[index, 'Challenge_gpt_summary_preprocessed_content'] = preprocess(row['Challenge_gpt_summary_original_content'])

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

  soup = BeautifulSoup(text, 'html.parser')


In [8]:
# Experiment 3: preprocess title + body of every challenge with code removed.

df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

for index, row in df.iterrows():
    is_style_post = (row['Platform'] == 'Stack Overflow') or (
        (row['Platform'] == 'Tool-specific') and (row['Tool'] in style_post_tools)
    )
    preprocess = preprocess_style_post if is_style_post else preprocess_normal_post
    content = preprocess(row['Challenge_title'], remove_code=True) + ' ' + preprocess(row['Challenge_body'], remove_code=True)

    df.at[index, 'Challenge_preprocessed_content'] = content

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

  soup = BeautifulSoup(text, 'html.parser')


In [4]:
# Remove every word containing a custom stop word from the three preprocessed
# text columns.
# NOTE(review): remove_words_containing_string is not defined in the cells
# visible here — confirm it is defined before this cell runs.

df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

stop_word_columns = (
    'Challenge_original_content',
    'Challenge_preprocessed_content',
    'Challenge_gpt_summary_preprocessed_content',
)

for index, row in df.iterrows():
    for column in stop_word_columns:
        text = row[column]
        for stop_word in stop_words_custom:
            text = remove_words_containing_string(text, stop_word)
        df.at[index, column] = text

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')
