In [128]:
import os
import re
import time
import openai
import random
import string
import enchant
import textstat
import pandas as pd
import plotly.graph_objects as go

from langdetect import detect
from bs4 import BeautifulSoup
from collections import namedtuple
from gensim.parsing.preprocessing import preprocess_string

In [26]:
# Seed value for reproducible runs.
random_state = 42

# Spell-check dictionary; preprocess_normal_post uses it to keep only English words.
english = enchant.Dict("en_US")

# 'Dataset' folder located inside the parent of the current working directory.
path_dataset = os.path.join(os.path.dirname(os.getcwd()), 'Dataset')

pd.set_option("display.max_rows", None, "display.max_columns", None, 'display.max_colwidth', None)

# The key is read from the environment only. A literal secret must never be
# committed as a fallback value; export OPENAI_API_KEY before running the
# cells that call the OpenAI API.
openai.api_key = os.getenv('OPENAI_API_KEY')

# NOTE(review): style_post_tools, regex_char and regex_code are still referenced
# by later cells (preprocess_normal_post and the "Experiment" loops) but are only
# present here as comments; restore them before running those cells.
# style_post_tools = ['Azure Machine Learning', 'DVC', 'Guild AI', 'SigOpt', 'Weights & Biases']

# regex_char = r'[^a-z]'
# regex_code = r'''((.)\2{2,})|(<.*?>)|({.*?})|((!)?\[.*?\])|(\(.*?\))|(\`{3}.+?\`{3})|(\`{2}.+?\`{2})|(\`{1}.+?\`{1})|([^\s]*[<=>]=[^\s]+)|(@[^\s]+)|([^\s]*\\[^\s]+)|([^\s]+\/[^\s]+)|([^\s]+\.[^\s]+)|([^\s]+-[^\s]+)|([^\s]+_[^\s]+)|(_+[^\s]+_*)|(_*[^\s]+_+)|([0-9\|\-\r\n\t\"\-#*=~:{}\(\)\[\]<>]+)'''

# Instruction prepended to every post sent to the chat model for summarisation.
prompt_summary = 'Concisely convey the most significant points about the text in one or two brief sentences.\n###'

# Tool name -> lowercase keywords that the preprocessing functions strip
# from post text. Insertion order is the order in which keywords are removed.
tools_keywords = {
    "Aim": ["aim"],
    "Amazon SageMaker": ["sagemaker", "amazon", "aws"],
    "Azure Machine Learning": [
        "azure machine learning",
        "azure ml",
        "azure-ml",
        "azureml",
        "azure",
        "microsoft",
    ],
    "ClearML": ["clearml"],
    "cnvrg.io": ["cnvrg"],
    "Codalab": ["codalab"],
    "Comet": ["comet"],
    "Determined": ["determined"],
    "Domino": ["domino"],
    "DVC": ["dvc"],
    "Guild AI": ["guild ai", "guild-ai", "guildai"],
    "Kedro": ["kedro"],
    "MLflow": ["mlflow", "databricks"],
    "MLRun": ["mlrun"],
    "ModelDB": ["modeldb"],
    "Neptune": ["neptune"],
    "Optuna": ["optuna"],
    "Polyaxon": ["polyaxon"],
    "Sacred": ["sacred"],
    "SigOpt": ["sigopt"],
    "Valohai": ["valohai"],
    "Vertex AI": ["vertex ai", "vertex-ai", "vertexai", "google"],
    "Weights & Biases": [
        "weights and biases",
        "wandb",
        "weights & biases",
        "weights&biases",
        "w & b",
        "w&b",
    ],
}

# Substring sets used by analyze_links to classify a URL; a link falls into
# the first category (patch, issue, tool, doc, tutorial) with a matching substring.
keywords_patch = {'pull'}

keywords_issue = {'answers', 'discussions', 'forums', 'issues', 'questions', 'stackoverflow'}

keywords_tool = {'github', 'gitlab', 'pypi'}

keywords_doc = {'developers', 'docs', 'documentation', 'features', 'library', 'org', 'wiki'}

keywords_tutorial = {'guide', 'learn', 'tutorial'}


In [150]:
def count_code_line(code_blocks):
    """Return the number of non-blank lines summed over all code snippets.

    Args:
        code_blocks: iterable of strings, each holding one code snippet.

    Returns:
        Count of lines that contain at least one non-whitespace character.
    """
    return sum(
        1
        for snippet in code_blocks
        for candidate in snippet.splitlines()
        if candidate.strip()
    )

def extract_styles(content):
    """Split HTML content into plain text, links, code-line count and image count.

    Args:
        content: HTML (or plain) string of a post.

    Returns:
        Tuple ``(clean_text, non_image_links, code_line, image_count)`` where
        ``clean_text`` is the text left after dropping <img>, <code>,
        <blockquote> and <a> elements, ``non_image_links`` are the hrefs that
        do not point at i.stack.imgur.com, ``code_line`` is the number of
        non-blank lines inside <code> plus <blockquote> elements, and
        ``image_count`` is the number of hrefs that do point at i.stack.imgur.com.
    """
    image_prefix = 'https://i.stack.imgur.com'
    soup = BeautifulSoup(content, 'html.parser')

    # Count code lines before any element is removed from the tree.
    code_line = sum(
        count_code_line([node.get_text() for node in soup.find_all(tag_name)])
        for tag_name in ('code', 'blockquote')
    )

    # Anchors are split into image links (only counted) and regular links (returned).
    hrefs = [anchor['href'] for anchor in soup.find_all('a', href=True)]
    non_image_links = []
    image_count = 0
    for href in hrefs:
        if href.startswith(image_prefix):
            image_count += 1
        else:
            non_image_links.append(href)

    # Strip images, both kinds of code containers and anchors, in that order.
    for tag_name in ('img', 'code', 'blockquote', 'a'):
        for node in soup.find_all(tag_name):
            node.decompose()

    clean_text = soup.get_text(separator=' ', strip=True)
    return clean_text, non_image_links, code_line, image_count

def extract_code(content):
    """Remove backtick-delimited code spans and count their non-blank lines.

    Args:
        content: text that may contain `code` spans.

    Returns:
        Tuple ``(clean_text, code_line)``: the text without the spans and the
        number of non-blank lines found inside them.
    """
    backtick_span = re.compile(r"`([^`]+)`", flags=re.DOTALL)
    snippets = backtick_span.findall(content)
    remaining = backtick_span.sub('', content)
    return remaining, count_code_line(snippets)

def extract_links(text):
    """Pull http(s) URLs out of a text.

    Args:
        text: input string.

    Returns:
        Tuple ``(clean_text, links)``: the text with every matched URL removed
        and the list of matched URLs in order of appearance.
    """
    url_regex = re.compile(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )
    found = url_regex.findall(text)
    stripped = url_regex.sub('', text)
    return stripped, found

def count_image_number(content):
    """Remove Markdown image tags (``![alt](url)``) and report how many were found.

    Args:
        content: input string.

    Returns:
        Tuple ``(clean_text, image_count)``.
    """
    markdown_image = re.compile(r"\!\[.*?\]\(.*?\)")
    matches = markdown_image.findall(content)
    without_images = markdown_image.sub('', content)
    return without_images, len(matches)

def split_content(content):
    """Separate a post into plain text, links, code-line count and image count.

    The content is passed through extract_styles (HTML), extract_code
    (backtick spans), count_image_number (Markdown images) and extract_links
    (bare URLs) in that order, each stage working on the text left by the
    previous one.

    Args:
        content: raw post body.

    Returns:
        An ``Analyzer`` namedtuple with fields ``text``, ``links``,
        ``code_count`` and ``image_count``.
    """
    text, html_links, html_code_lines, html_images = extract_styles(content)
    text, inline_code_lines = extract_code(text)
    text, markdown_images = count_image_number(text)
    text, bare_links = extract_links(text)

    content_collection = namedtuple('Analyzer', ['text', 'links', 'code_count', 'image_count'])
    return content_collection(
        text=text,
        links=html_links + bare_links,
        code_count=html_code_lines + inline_code_lines,
        image_count=html_images + markdown_images,
    )

def analyze_links(links):
    """Classify links by the first keyword category their URL matches.

    Categories are tried in the order patch, issue, tool, documentation,
    tutorial; a link matching none of them is counted as an example link.
    Matching is a plain substring test against the keyword sets.

    Args:
        links: iterable of URL strings.

    Returns:
        An ``Analyzer`` namedtuple with the integer fields ``documentation``,
        ``tool``, ``issue``, ``patch``, ``tutorial`` and ``example``.
    """
    documentation_links = 0
    tool_links = 0
    issue_links = 0
    patch_links = 0
    tutorial_links = 0
    example_links = 0

    for link in links:
        if any(keyword in link for keyword in keywords_patch):
            patch_links += 1
        elif any(keyword in link for keyword in keywords_issue):
            issue_links += 1
        elif any(keyword in link for keyword in keywords_tool):
            tool_links += 1
        elif any(keyword in link for keyword in keywords_doc):
            documentation_links += 1
        elif any(keyword in link for keyword in keywords_tutorial):
            tutorial_links += 1
        else:
            example_links += 1

    # The field list must name all six counters; the 'tutorial' field was
    # missing, which made every call fail with a TypeError.
    link_analysis = namedtuple(
        'Analyzer',
        ['documentation', 'tool', 'issue', 'patch', 'tutorial', 'example'],
    )
    return link_analysis(
        documentation_links,
        tool_links,
        issue_links,
        patch_links,
        tutorial_links,
        example_links,
    )

def analyze_text(text):
    """Compute basic textstat metrics for a text.

    Args:
        text: string to analyse.

    Returns:
        An ``Analyzer`` namedtuple with ``word_count``
        (``textstat.lexicon_count``), ``readability``
        (``textstat.flesch_reading_ease``) and ``reading_time``
        (``textstat.reading_time``).
    """
    text_analysis = namedtuple('Analyzer', ['word_count', 'readability', 'reading_time'])
    return text_analysis(
        word_count=textstat.lexicon_count(text),
        readability=textstat.flesch_reading_ease(text),
        reading_time=textstat.reading_time(text),
    )

def check_english(content):
    """Return True when langdetect identifies ``content`` as English ('en')."""
    return detect(content) == 'en'
            
def preprocess_content(content):
    """Lowercase the content and blank out every known tool keyword.

    Args:
        content: raw text of a post.

    Returns:
        The lowercased text with each keyword listed in ``tools_keywords``
        replaced by a single space.
    """
    clean_text = content.lower()
    for tool_keywords in tools_keywords.values():
        for tool_keyword in tool_keywords:
            # Keywords are lowercase, so they are replaced in the lowercased
            # text; str.replace is a no-op when the keyword is absent.
            clean_text = clean_text.replace(tool_keyword, ' ')
    return clean_text

# exponential backoff
def retry_with_backoff(fn, retries=2, backoff_in_seconds=1, *args, **kwargs):
    """Call ``fn(*args, **kwargs)``, retrying with exponential backoff and jitter.

    After the k-th failure (k starting at 0) the function sleeps
    ``backoff_in_seconds * 2 ** k`` seconds plus a random 0-1 s jitter before
    trying again.

    Args:
        fn: callable to invoke.
        retries: number of retries after the first failed attempt.
        backoff_in_seconds: base delay of the backoff schedule.
        *args: positional arguments for ``fn``. Because they follow
            ``retries`` and ``backoff_in_seconds`` in the signature, those two
            must be given positionally first; keyword arguments are simpler.
        **kwargs: keyword arguments for ``fn``.

    Returns:
        Whatever ``fn`` returns on its first successful call.

    Raises:
        Exception: the last error raised by ``fn`` once the retries are used up.
    """
    attempt = 0
    while True:
        try:
            return fn(*args, **kwargs)
        # Only ordinary errors are retried; KeyboardInterrupt and SystemExit
        # propagate immediately instead of being swallowed by a sleep.
        except Exception:
            if attempt == retries:
                raise

            delay = backoff_in_seconds * 2 ** attempt + random.uniform(0, 1)
            time.sleep(delay)
            attempt += 1


# def remove_non_printable(content):
#     return ''.join(c for c in content if c not in string.printable)

# def remove_words_containing_string(word, string):
#     words = word.split()
#     pattern = re.compile(string)
#     filtered_words = [word for word in words if not pattern.search(word)]
#     return ' '.join(filtered_words)

# def remove_keywords(content):
#     word_list = []
#     for word in content.split():
#         if english.check(word):
#             word_list.append(word)

# df = pd.read_json(os.path.join(path_dataset, 'issues.json'))
# df = df[df['Platform'].str.contains('Git')]
# for index, row in df.iterrows():
#     print(analyze_content(row['Issue_body']).text)


In [151]:
# df = pd.read_json(os.path.join(path_dataset, 'issues.json'))
# for index, row in df.iterrows():
#     if 'Check for Athena => SageMaker region when' in row['Issue_title']:
#         content_collection = split_content(row['Issue_body'])
#         print(content_collection.text)
#         print(content_collection.links)
#         print(content_collection.code_count)
#         print(content_collection.image_count)




might want to add this to the 01_setup/ Check Dependencies, but only if we want to enforce this.
[]
0
1


  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  soup = BeautifulSoup(content, 'html.parser')


In [3]:
# content preprocessing patterns

def remove_non_printable(s):
    """Return ``s`` with every character outside ``string.printable`` removed.

    Args:
        s: input string.

    Returns:
        The string made of the characters of ``s`` that belong to
        ``string.printable``, in their original order.
    """
    # The membership test used to be inverted, which kept only the
    # non-printable characters.
    printable = set(string.printable)
    return ''.join(c for c in s if c in printable)

def remove_words_containing_string(text, string):
    """Drop every whitespace-separated word of ``text`` that matches ``string``.

    Args:
        text: input text; it is split on whitespace.
        string: regular-expression pattern searched for inside each word.

    Returns:
        The surviving words joined by single spaces.
    """
    matcher = re.compile(string)
    return ' '.join(
        token for token in text.split() if matcher.search(token) is None
    )

def preprocess_normal_post(text, remove_code=False):
    """Normalise a plain-text post into a space-separated string of tokens.

    Steps: lowercase and drop non-ASCII characters, delete every tool keyword
    from ``tools_keywords``, optionally blank out ``regex_code`` matches,
    blank out ``regex_char`` matches, keep only words accepted by the
    ``english`` dictionary, then run gensim's ``preprocess_string``.

    NOTE(review): ``regex_code`` and ``regex_char`` must be defined at module
    level; in the visible setup cell they only appear as commented-out lines.

    Args:
        text: raw post text.
        remove_code: when True, text matching ``regex_code`` is replaced by a space.

    Returns:
        The preprocessed tokens joined by single spaces.
    """
    text = text.lower().encode('ascii', errors='ignore').decode('ascii')

    for keyword_group in tools_keywords.values():
        for keyword in keyword_group:
            # replace() leaves the text untouched when the keyword is absent
            text = text.replace(keyword, '')

    if remove_code:
        text = re.sub(regex_code, ' ', text, count=0, flags=re.DOTALL)
    text = re.sub(regex_char, ' ', text)

    english_words = [token for token in text.split() if english.check(token)]
    return ' '.join(preprocess_string(' '.join(english_words)))

def preprocess_style_post(text, remove_code=False):
    """Strip HTML from a styled post and run the plain-text preprocessing.

    <script> and <style> elements are always removed; <code> elements are
    removed too when ``remove_code`` is True. The remaining text is handed to
    ``preprocess_normal_post`` with the same ``remove_code`` flag.

    Args:
        text: HTML content of the post.
        remove_code: whether code should be dropped.

    Returns:
        The preprocessed text returned by ``preprocess_normal_post``.
    """
    soup = BeautifulSoup(text, 'html.parser')
    unwanted_tags = ['script', 'style', 'code'] if remove_code else ['script', 'style']
    for node in soup(unwanted_tags):
        node.decompose()
    return preprocess_normal_post(soup.get_text(), remove_code)


In [None]:
df_issues = pd.read_json(os.path.join(path_dataset, 'issues.json'))

# Issue/comment column -> unified challenge/solution column. The insertion
# order is the order in which the new columns are created.
issue_column_map = {
    'Issue_title': 'Challenge_title',
    'Issue_body': 'Challenge_body',
    'Issue_link': 'Challenge_link',
    'Issue_tag_count': 'Challenge_tag_count',
    'Issue_created_time': 'Challenge_created_time',
    'Issue_comment_count': 'Challenge_answer_count',
    'Issue_score_count': 'Challenge_score_count',
    'Issue_closed_time': 'Challenge_closed_time',
    'Issue_repo_issue_count': 'Challenge_repo_issue_count',
    'Issue_repo_star_count': 'Challenge_repo_star_count',
    'Issue_repo_watch_count': 'Challenge_repo_watch_count',
    'Issue_repo_fork_count': 'Challenge_repo_fork_count',
    'Issue_repo_contributor_count': 'Challenge_repo_contributor_count',
    'Issue_self_closed': 'Challenge_self_closed',
    'Comment_body': 'Solution_body',
    'Comment_score_count': 'Solution_score_count',
}

# Copy the values cell by cell into the unified columns.
for index, row in df_issues.iterrows():
    for source_column, unified_column in issue_column_map.items():
        df_issues.at[index, unified_column] = row[source_column]

# Drop the source columns that were copied.
# NOTE(review): 'Issue_tag_count' is copied above but not dropped here -
# confirm whether keeping it is intended.
for obsolete_column in [
    'Issue_title',
    'Issue_body',
    'Issue_link',
    'Issue_created_time',
    'Issue_comment_count',
    'Issue_score_count',
    'Issue_closed_time',
    'Issue_repo_issue_count',
    'Issue_repo_star_count',
    'Issue_repo_watch_count',
    'Issue_repo_fork_count',
    'Issue_repo_contributor_count',
    'Issue_self_closed',
    'Comment_body',
    'Comment_score_count',
]:
    del df_issues[obsolete_column]

df_questions = pd.read_json(os.path.join(path_dataset, 'questions.json'))

# Question/answer column -> unified challenge/solution column. The insertion
# order is the order in which the new columns are created.
question_column_map = {
    'Question_title': 'Challenge_title',
    'Question_body': 'Challenge_body',
    'Question_link': 'Challenge_link',
    'Question_tag_count': 'Challenge_tag_count',
    'Question_topic_count': 'Challenge_topic_count',
    'Question_created_time': 'Challenge_created_time',
    'Question_answer_count': 'Challenge_answer_count',
    'Question_comment_count': 'Challenge_comment_count',
    'Question_score_count': 'Challenge_score_count',
    'Question_closed_time': 'Challenge_closed_time',
    'Question_favorite_count': 'Challenge_favorite_count',
    'Question_last_edit_time': 'Challenge_last_edit_time',
    'Question_view_count': 'Challenge_view_count',
    'Question_self_closed': 'Challenge_self_closed',
    'Answer_body': 'Solution_body',
    'Answer_score_count': 'Solution_score_count',
    'Answer_comment_count': 'Solution_comment_count',
    'Answer_comment_body': 'Solution_comment_body',
    'Answer_last_edit_time': 'Solution_last_edit_time',
}

# Copy the values cell by cell into the unified columns.
for index, row in df_questions.iterrows():
    for source_column, unified_column in question_column_map.items():
        df_questions.at[index, unified_column] = row[source_column]

# Drop the source columns that were copied.
# NOTE(review): 'Question_tag_count' and 'Question_topic_count' are copied
# above but not dropped here - confirm whether keeping them is intended.
for obsolete_column in [
    'Question_title',
    'Question_body',
    'Question_link',
    'Question_created_time',
    'Question_last_edit_time',
    'Question_answer_count',
    'Question_comment_count',
    'Question_score_count',
    'Question_closed_time',
    'Question_view_count',
    'Question_favorite_count',
    'Question_self_closed',
    'Answer_body',
    'Answer_comment_count',
    'Answer_last_edit_time',
    'Answer_comment_body',
    'Answer_score_count',
]:
    del df_questions[obsolete_column]

# Stack issues and questions into one table and persist it.
df = pd.concat([df_issues, df_questions], ignore_index=True)
df.to_json(os.path.join(path_dataset, 'original.json'), indent=4, orient='records')


In [7]:
# Draw sankey diagram of tool and platform

df = pd.read_json(os.path.join(path_dataset, 'original.json'))
# A challenge with a closing timestamp is 'closed', otherwise 'open'.
df['State'] = df['Challenge_closed_time'].apply(lambda x: 'open' if pd.isna(x) else 'closed')

categories = ['Platform', 'Tool', 'State']
df_info = df.groupby(categories).size().reset_index(name='value')

# Node labels (value -> occurrence count) and the flows between each pair of
# neighbouring categories.
labels = {}
newDf = pd.DataFrame()
last_position = len(categories) - 1
for position, category in enumerate(categories):
    labels.update(df[category].value_counts().to_dict())
    if position == last_position:
        break
    tempDf = df_info[[category, categories[position + 1], 'value']]
    tempDf.columns = ['source', 'target', 'value']
    newDf = pd.concat([newDf, tempDf])

# Merge duplicate (source, target) pairs and translate names to node indices.
newDf = newDf.groupby(['source', 'target']).agg({'value': 'sum'}).reset_index()
label_order = list(labels)
source = newDf['source'].apply(lambda name: label_order.index(name))
target = newDf['target'].apply(lambda name: label_order.index(name))
value = newDf['value']

labels = [f'{k} ({v})' for k, v in labels.items()]
link = {'source': source, 'target': target, 'value': value}
node = {'label': labels}
data = go.Sankey(link=link, node=node)

fig = go.Figure(data)
fig.update_layout(width=1000, height=1000, font_size=20)
fig.write_image(os.path.join(path_dataset, 'Tool platform state sankey.png'))


invalid value encountered in cast


invalid value encountered in cast


invalid value encountered in cast


invalid value encountered in cast


invalid value encountered in cast


invalid value encountered in cast



In [6]:
# Experiment 1

df = pd.read_json(os.path.join(path_dataset, 'original.json'))

for index, row in df.iterrows():
    # Stack Overflow posts and posts from the listed tool-specific forums are
    # handled as HTML; everything else is treated as plain text.
    is_style_post = row['Platform'] == 'Stack Overflow' or (
        row['Platform'] == 'Tool-specific' and row['Tool'] in style_post_tools
    )
    preprocess = preprocess_style_post if is_style_post else preprocess_normal_post
    df.at[index, 'Challenge_original_content'] = ' '.join(
        [preprocess(row['Challenge_title']), preprocess(row['Challenge_body'])]
    )

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

  soup = BeautifulSoup(text, 'html.parser')


In [10]:
# Experiment 2

df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

for index, row in df.iterrows():
    # Checkpoint every 100 posts so an interrupted run can be resumed.
    if index % 100 == 99:
        print(f'persisting on post {index}')
        df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

    # Skip posts that already have a summary. Series.get returns None when the
    # column does not exist yet (first run), where plain indexing would raise
    # a KeyError.
    if pd.notna(row.get('Challenge_gpt_summary_original_content')):
        continue

    try:
        prompt = prompt_summary + 'Title: ' + row['Challenge_title'] + ' Body: ' + row['Challenge_body'] + '###\n'
        response = retry_with_backoff(
            openai.ChatCompletion.create,
            model='gpt-4',
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=150,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            timeout=50,
            stream=False
        )
        df.at[index, 'Challenge_gpt_summary_original_content'] = response['choices'][0]['message']['content']
    except Exception as e:
        # Best effort: report the failing post and carry on with the next one.
        print(f'{e} on post {row["Challenge_link"]}')

    # Pause between requests.
    time.sleep(1)

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')


This model's maximum context length is 16385 tokens. However, your messages resulted in 16720 tokens. Please reduce the length of the messages. on post https://github.com/kedro-org/kedro/issues/308
This model's maximum context length is 16385 tokens. However, your messages resulted in 42298 tokens. Please reduce the length of the messages. on post https://github.com/Azure/MachineLearningNotebooks/issues/1668
This model's maximum context length is 16385 tokens. However, your messages resulted in 16742 tokens. Please reduce the length of the messages. on post https://stackoverflow.com/questions/73624005
This model's maximum context length is 16385 tokens. However, your messages resulted in 52631 tokens. Please reduce the length of the messages. on post https://community.wandb.ai/t/oserror-could-not-find-a-suitable-tls-ca-certificate/3913
This model's maximum context length is 16385 tokens. However, your messages resulted in 19521 tokens. Please reduce the length of the messages. on post 

In [7]:
# Experiment 2

df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

for index, row in df.iterrows():
    # Same HTML / plain-text routing as for the original content.
    is_style_post = row['Platform'] == 'Stack Overflow' or (
        row['Platform'] == 'Tool-specific' and row['Tool'] in style_post_tools
    )
    preprocess = preprocess_style_post if is_style_post else preprocess_normal_post
    df.at[index, 'Challenge_gpt_summary_preprocessed_content'] = preprocess(
        row['Challenge_gpt_summary_original_content']
    )

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

  soup = BeautifulSoup(text, 'html.parser')


In [8]:
# Experiment 3

df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

for index, row in df.iterrows():
    # Route HTML-styled posts and plain-text posts to their preprocessor,
    # this time with code removal switched on.
    is_style_post = row['Platform'] == 'Stack Overflow' or (
        row['Platform'] == 'Tool-specific' and row['Tool'] in style_post_tools
    )
    preprocess = preprocess_style_post if is_style_post else preprocess_normal_post
    title_part = preprocess(row['Challenge_title'], remove_code=True)
    body_part = preprocess(row['Challenge_body'], remove_code=True)

    df.at[index, 'Challenge_preprocessed_content'] = title_part + ' ' + body_part

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

  soup = BeautifulSoup(text, 'html.parser')


In [4]:
# remove custom stop words from challenges and solutions

# Stemmed stop words; any word containing one of these strings is removed
# from the preprocessed texts. Two missing commas used to fuse neighbouring
# entries into 'anomaliapi' and 'guidelinhandl', so 'api', 'guidelin' and
# 'handl' were never applied; every entry is now separated explicitly.
stop_words_custom = [
    'abil',
    'abl',
    'abnorm',
    'accid',
    'achiev',
    'acknowledg',
    'activ',
    'actual',
    'ad',
    'addition',
    'admit',
    'advis',
    'alright',
    'altern',
    'amaz',
    'announc',
    'anomali',
    'answer',
    'api',
    'app',
    'appear',
    'applic',
    'appreci',
    'approach',
    'appropri',
    'aris',
    'artifici',
    'ask',
    'assist',
    'assum',
    'astonish',
    'attempt',
    'attent',
    'author',
    'avail',
    'avoid',
    'aw',
    'awesom',
    'azur',
    'bad',
    'basic',
    'behavior',
    'behaviour',
    'benefici',
    'benefit',
    'best',
    'better',
    'blog',
    'bring',
    'bug',
    'case',
    'categori',
    'cau',
    'caus',
    'certain',
    'challeng',
    'chang',
    'check',
    'choic',
    'choos',
    'chose',
    'chosen',
    'clarif',
    'clear',
    'cloud',
    'cloudera',
    'code',
    'com',
    'command',
    'commun',
    'compani',
    'concept',
    'concern',
    'concis',
    'condit',
    'confirm',
    'confus',
    'consid',
    'consist',
    'consciou',
    'consult',
    'contact',
    'contain',
    'content',
    'continu',
    'correct',
    'correctli',
    'correspond',
    'couldn',
    'crash',
    'curiou',
    'current',
    'custom',
    'decid',
    'deep',
    'deliber',
    'demand',
    'demo',
    'deni',
    'depict',
    'describ',
    'desir',
    'despit',
    'detail',
    'develop',
    'differ',
    'differenti',
    'difficult',
    'difficulti',
    'discov',
    'discrep',
    'discuss',
    'dislik',
    'distinguish',
    'easi',
    'east',
    'effect',
    'emerg',
    'encount',
    # 'end',
    'engin',
    'enquiri',
    'ensur',
    'error',
    'especi',
    'exampl',
    'except',
    'exception',
    'excit',
    'exist',
    'expect',
    'experi',
    'expert',
    'explain',
    'express',
    'eventu',
    'evid',
    'dai',
    'databrick',
    'def',
    'direct',
    'directli',
    'domo',
    'dont',
    'face',
    'fact',
    'fascin',
    'fail',
    'failur',
    'fairli',
    'fals',
    'fanci',
    'far',
    'fault',
    'favorit',
    'favourit',
    'feedback',
    'feel',
    'final',
    'find',
    'fine',
    'firstli',
    'fix',
    'float',
    'follow',
    'form',
    'frustrat',
    'futur',
    'gcp',
    'get',
    'give',
    'given',
    'go',
    'good',
    'googl',
    'got',
    'guarante',
    'guidanc',
    'guidelin',
    'handl',
    'hang',
    'happen',
    'happi',
    'hard',
    'harm',
    'have',
    'hear',
    'hei',
    'hello',
    'help',
    'highlight',
    'hinder',
    'horribl',
    'hour',
    'ibm',
    'immedi',
    'impli',
    'implic',
    'improv',
    'includ',
    'incorrect',
    'incorrectli',
    'incred',
    'indic',
    'individu',
    'info',
    'inform',
    'inner',
    'inquir',
    'inquiri',
    'insid',
    'insight',
    'instead',
    'instruct',
    'int',
    'intellig',
    'interest',
    'introduc',
    'invalid',
    'investig',
    'invit',
    'involv',
    'issu',
    'java',
    'join',
    'keep',
    'kind',
    'know',
    'known',
    'lack',
    'lastli',
    'late',
    'later',
    'latest',
    'lazi',
    'lead',
    'learn',
    'let',
    'like',
    'long',
    'look',
    'lot',
    'machin',
    'malfunct',
    'make',
    'mail',
    'main',
    'major',
    'manag',
    'manner',
    'manual',
    'mark',
    'marvel',
    'max',
    'mayb',
    'mean',
    'meaning',
    'meaningfulli',
    'meaningless',
    'meantim',
    'mention',
    'method',
    'min',
    'mind',
    'mistak',
    'mistakenli',
    'month',
    'name',
    'near',
    'necessari',
    'need',
    'neg',
    'network',
    'neural',
    'new',
    'non',
    'north',
    'note',
    'notic',
    'number',
    'obtain',
    'occas',
    'occasion',
    'occur',
    'offer',
    'offici',
    'old',
    'opinion',
    'option',
    'org',
    'organ',
    'outsid',
    'overal',
    'own',
    # 'open',
    'oracl',
    'ought',
    'outcom',
    'part',
    'particip',
    'particular',
    'particularli',
    'past',
    'perceive',
    'perform',
    'permit',
    'person',
    'perspect',
    'place',
    'plan',
    'point',
    'pointless',
    'poor',
    'posit',
    'possibl',
    'post',
    'potenti',
    'practic',
    'pretty',
    'prevent',
    'previou',
    'primari',
    'problem',
    'product',
    'program',
    'project',
    'proper',
    'propos',
    'provid',
    'purpos',
    'python',
    'question',
    'randomli',
    'real',
    'realize',
    'recent',
    'recognize',
    'recommend',
    'refer',
    'regard',
    'relat',
    'relev',
    'repeatedli',
    'requir',
    'research',
    'resolv',
    'respond',
    'result',
    'return',
    'right',
    'rightli',
    'satisfi',
    'saw',
    'scenario',
    'scienc',
    'screenshot',
    'script',
    'second',
    'secondli',
    'see',
    'seek',
    'seen',
    'self',
    'shall',
    'shan',
    'shock',
    'shouldn',
    'show',
    'similar',
    'simpl',
    'situat',
    'snippet',
    'snowflak',
    'solut',
    'solv',
    'soon',
    'sound',
    'sourc',
    'south',
    'special',
    'specif',
    'spend',
    'spent',
    # 'start',
    'startl',
    'straight',
    'straightforward',
    'strang',
    'string',
    'struggl',
    'stuck',
    'studi',
    'studio',
    'stun',
    'succe',
    'success',
    'successfulli',
    'suggest',
    'summari',
    'super',
    'surpris',
    'support',
    'suppos',
    'sure',
    'survei',
    'suspect',
    'take',
    'talk',
    'task',
    'technic',
    'technolog',
    'tell',
    'temporari',
    'temporarili',
    'term',
    'text',
    'thank',
    'thing',
    'think',
    'thirdli',
    'thought',
    'time',
    'tool',
    'topic',
    'total',
    'tri',
    'troubl',
    'true',
    'truli',
    'truth',
    'try',
    'tutori',
    'unabl',
    'unclear',
    'underli',
    'understand',
    'unexpect',
    'unknown',
    'unsur',
    'upcom',
    'us',
    'user',
    'usual',
    'valid',
    'valu',
    'variant',
    'vertex',
    'versu',
    'view',
    'viewpoint',
    'wai',
    'want',
    'weak',
    'websit',
    'weird',
    'west',
    'will',
    'word',
    'worst',
    'won',
    'wonder',
    'work',
    'wors',
    'worth',
    'wouldn',
    'written',
    'wrong',
    'wrongli',
    'www',
    'xgboost',
    'ye',
]

df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

# Text columns that are filtered with the custom stop words, in processing order.
stop_word_columns = (
    'Challenge_original_content',
    'Challenge_preprocessed_content',
    'Challenge_gpt_summary_preprocessed_content',
)

for index, row in df.iterrows():
    for column in stop_word_columns:
        text = row[column]
        # Each stop word removes every word that contains it.
        for stop_word in stop_words_custom:
            text = remove_words_containing_string(text, stop_word)
        df.at[index, column] = text

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')
