In [30]:
import os
import re
import time
import spacy
import openai
import random
import enchant
import textstat
import itertools
import collections
import pandas as pd
import plotly.graph_objects as go

from bs4 import BeautifulSoup
from collections import namedtuple
from gensim.parsing.preprocessing import remove_stopwords, strip_short, STOPWORDS

In [60]:
# Seed value reserved for stochastic steps of the analysis.
random_state = 42

# Shared NLP helpers: spaCy pipeline (POS tagging) and an en_US dictionary.
nlp = spacy.load("en_core_web_sm")
spell_checker = enchant.Dict("en_US")

# The dataset folder is a sibling of the directory the notebook is started from.
path_dataset = os.path.join(os.path.dirname(os.getcwd()), 'Dataset')

pd.set_option("display.max_rows", None, "display.max_columns", None, 'display.max_colwidth', None)

# Credentials are read from the environment only. A literal key must never be
# committed as a fallback value: anything that was ever stored in the notebook
# should be treated as leaked and revoked.
# When OPENAI_API_KEY is unset this is None and only the OpenAI-backed cell fails.
openai.api_key = os.getenv('OPENAI_API_KEY')

# style_post_tools = ['Azure Machine Learning', 'DVC', 'Guild AI', 'SigOpt', 'Weights & Biases']

# regex_char = r'[^a-z]'
# regex_code = r'''((.)\2{2,})|(<.*?>)|({.*?})|((!)?\[.*?\])|(\(.*?\))|(\`{3}.+?\`{3})|(\`{2}.+?\`{2})|(\`{1}.+?\`{1})|([^\s]*[<=>]=[^\s]+)|(@[^\s]+)|([^\s]*\\[^\s]+)|([^\s]+\/[^\s]+)|([^\s]+\.[^\s]+)|([^\s]+-[^\s]+)|([^\s]+_[^\s]+)|(_+[^\s]+_*)|(_*[^\s]+_+)|([0-9\|\-\r\n\t\"\-#*=~:{}\(\)\[\]<>]+)'''

# Instruction prepended to every post sent for summarisation; the trailing
# '###' is the delimiter between the instruction and the post itself.
prompt_summary = 'Concisely convey the most significant points about the text in one or two brief sentences.\n###'

# File-name suffixes that mark a link as pointing at an image.
image_extensions = {
    ".bmp",
    ".gif",
    ".jpeg",
    ".jpg",
    ".png",
    ".svg",
    ".tiff",
    ".webp",
}

# Keyword sets used to categorise a URL. Each keyword is tested as a plain
# substring of the link; `analyze_links` checks the sets in the order
# patch -> issue -> tool -> doc -> tutorial and the first hit wins.
keywords_patch = {'pull'}

keywords_issue = {'answers', 'discussions', 'forums', 'issues', 'questions', 'stackoverflow'}

keywords_tool = {'github', 'gitlab', 'pypi'}

keywords_doc = {'developers', 'docs', 'documentation', 'features', 'library', 'org', 'wiki'}

keywords_tutorial = {'guide', 'learn', 'tutorial'}

# Tool display name -> lowercase keywords associated with that tool.
tools_keyword_mapping = {
    'Aim':                    ['aim'],
    'Amazon SageMaker':       ['sagemaker', 'amazon', 'aws'],
    'Azure Machine Learning': ['azure', 'microsoft'],
    'ClearML':                ['clearml'],
    'cnvrg.io':               ['cnvrg'],
    'Codalab':                ['codalab'],
    'Comet':                  ['comet'],
    'Determined':             ['determined'],
    'Domino':                 ['domino'],
    'DVC':                    ['dvc'],
    'Guild AI':               ['guild'],
    'Kedro':                  ['kedro'],
    'MLflow':                 ['mlflow', 'databricks'],
    'MLRun':                  ['mlrun'],
    'ModelDB':                ['modeldb'],
    'Neptune':                ['neptune'],
    'Optuna':                 ['optuna'],
    'Polyaxon':               ['polyaxon'],
    'Sacred':                 ['sacred'],
    'SigOpt':                 ['sigopt'],
    'Valohai':                ['valohai'],
    'Vertex AI':              ['vertex', 'google'],
    'Weights & Biases':       ['weights', 'biases', 'wandb'],
}

# Every keyword of every tool, flattened in mapping order.
tools_keyword_list = [
    keyword
    for tool_keywords in tools_keyword_mapping.values()
    for keyword in tool_keywords
]

# Project-specific stop words, applied on top of the generic stop-word list.
# 'dist' and 'flow' are present in the history of this list but kept disabled.
stop_words_custom = set("""
    action advance advice analysis approach assertion
    behavior bit block body bug
    case change classification com company content control custom
    day default difference description detail
    edit end error example exception exit
    fact fail fix follow functionality
    goal
    help hour
    idea info issue
    key kind
    language learn level line location loop lot
    machine message method minute month
    note number
    page people place point post prediction price problem product program project
    question
    reason regression result return
    scenario screenshot second section self sense setup shape site solution
    speech start state status step success suggest summary support
    task text times thank thing today truth
    understand url use
    value video vision
    way week works wrong
    year
""".split())

In [66]:
def count_code_line(block_list):
    """Count the non-blank lines contained in a collection of code snippets.

    Args:
        block_list: Iterable whose items are either snippet strings or
            iterables of snippet strings (one level of nesting).

    Returns:
        int: Number of lines that contain at least one non-whitespace character.
    """
    total_loc = 0
    for blocks in block_list:
        # A bare string is ONE snippet. Iterating over it would walk it
        # character by character and count characters instead of lines.
        snippets = [blocks] if isinstance(blocks, str) else blocks
        for block in snippets:
            total_loc += sum(1 for line in block.splitlines() if line.strip())
    return total_loc

def extract_styles(content):
    """Split HTML content into plain text, links, a code measure and an image count.

    Args:
        content: Markup parsed with BeautifulSoup's 'html.parser'.

    Returns:
        tuple: ``(clean_text, non_image_links, code_line, image_count)`` where
        ``clean_text`` is the text left after every <img>, <code>, <blockquote>
        and <a> element has been removed, ``non_image_links`` are the hrefs that
        do not start with the Stack Overflow image host, ``code_line`` is the
        sum of `count_code_line` over the <code> and the <blockquote> texts, and
        ``image_count`` is the number of hrefs that do start with that host.
    """
    stack_image_prefix = 'https://i.stack.imgur.com'
    soup = BeautifulSoup(content, 'html.parser')

    # Measure code while the elements are still part of the tree.
    code_line = sum(
        count_code_line([node.get_text() for node in soup.find_all(tag_name)])
        for tag_name in ('code', 'blockquote')
    )

    # Images are recognised through their link target only; <img> elements
    # are dropped below without being counted.
    non_image_links = []
    image_count = 0
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if href.startswith(stack_image_prefix):
            image_count += 1
        else:
            non_image_links.append(href)

    # Strip everything that is not prose, in the same order as before.
    for tag_name in ('img', 'code', 'blockquote', 'a'):
        for node in soup.find_all(tag_name):
            node.decompose()

    clean_text = soup.get_text(separator=' ', strip=True)
    return clean_text, non_image_links, code_line, image_count

def extract_code(content):
    """Remove back-tick delimited code from text and measure what was removed.

    Triple-, double- and single-back-tick spans are handled in that order, so
    a longer fence is consumed before a shorter one can match inside it.

    Args:
        content: Text that may contain back-tick delimited code.

    Returns:
        tuple: ``(clean_text, code_line)`` - the text without the matched spans
        and the accumulated `count_code_line` result for those spans.
    """
    fence_patterns = [
        re.compile(expression, flags=re.DOTALL)
        for expression in (r'```.+?```', r'``.+?``', r'`.+?`')
    ]

    clean_text, code_line = content, 0
    for fence_pattern in fence_patterns:
        code_line += count_code_line(fence_pattern.findall(clean_text))
        clean_text = fence_pattern.sub('', clean_text)

    return clean_text, code_line

def extract_links(text):
    """Pull every http(s) URL out of a piece of text.

    Args:
        text: Text to scan.

    Returns:
        tuple: ``(clean_text, links)`` - the text with all matched URLs removed
        and the list of matched URLs in order of appearance.
    """
    url_regex = re.compile(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    )
    found_links = url_regex.findall(text)
    return url_regex.sub('', text), found_links

def count_image_number(content):
    """Strip Markdown image tags (``![alt](target)``) and count them.

    Args:
        content: Text that may contain Markdown image tags.

    Returns:
        tuple: ``(clean_text, image_count)`` - the text without the image tags
        and the number of tags that were removed.
    """
    # subn performs the removal and reports how many matches it replaced.
    clean_text, image_count = re.subn(r"\!\[.*?\]\(.*?\)", '', content)
    return clean_text, image_count

def extract_image(links):
    """Separate image links from all other links.

    A link is treated as an image when any entry of `image_extensions` occurs
    anywhere inside it (substring test).

    Args:
        links: Iterable of URL strings.

    Returns:
        tuple: ``(non_image_links, image_count)``.
    """
    remaining_links = []
    image_total = 0
    for url in links:
        if any(suffix in url for suffix in image_extensions):
            image_total += 1
        else:
            remaining_links.append(url)
    return remaining_links, image_total

def analyze_links(links):
    """Count links per category using substring keyword matching.

    Each link is assigned to exactly one category: the first of patch, issue,
    tool, documentation, tutorial whose keyword set has an entry occurring in
    the link. A link matching none of them is counted as an example.

    Args:
        links: Iterable of URL strings.

    Returns:
        Analyzer: namedtuple with the integer fields ``documentation``,
        ``tool``, ``issue``, ``patch``, ``tutorial`` and ``example``.
    """
    # Precedence order of the checks; earlier entries win.
    keyword_sets = (
        ('patch', keywords_patch),
        ('issue', keywords_issue),
        ('tool', keywords_tool),
        ('documentation', keywords_doc),
        ('tutorial', keywords_tutorial),
    )
    counts = dict.fromkeys(
        ['documentation', 'tool', 'issue', 'patch', 'tutorial', 'example'], 0
    )

    for link in links:
        for category, keywords in keyword_sets:
            if any(keyword in link for keyword in keywords):
                counts[category] += 1
                break
        else:
            counts['example'] += 1

    # The field list is derived from the counters so that the number of fields
    # always equals the number of values (the 'tutorial' field used to be
    # missing, which made every call fail with a TypeError).
    link_analysis = namedtuple('Analyzer', list(counts))
    return link_analysis(**counts)

def analyze_text(text):
    """Compute textstat metrics for a piece of text.

    Args:
        text: Text to analyse.

    Returns:
        Analyzer: namedtuple with ``word_count`` (textstat.lexicon_count),
        ``readability`` (textstat.flesch_reading_ease) and ``reading_time``
        (textstat.reading_time).
    """
    text_analysis = namedtuple('Analyzer', ['word_count', 'readability', 'reading_time'])
    return text_analysis(
        word_count=textstat.lexicon_count(text),
        readability=textstat.flesch_reading_ease(text),
        reading_time=textstat.reading_time(text),
    )

def extract_nouns(text):
    """Keep only the tokens that the spaCy pipeline tags as NOUN.

    Args:
        text: Text to run through `nlp`.

    Returns:
        str: The noun tokens, in order, joined by single spaces.
    """
    return ' '.join(token.text for token in nlp(text) if token.pos_ == "NOUN")

def extract_english_words(text):
    """Keep only the whitespace-separated words accepted by the spell checker.

    Args:
        text: Text to filter.

    Returns:
        str: The accepted words, in order, joined by single spaces.
    """
    # A regex-based selection (r"\b([a-zA-Z]+)\b") exists in the history of
    # this function but is disabled; the dictionary check is what is used.
    accepted_words = filter(spell_checker.check, text.split())
    return ' '.join(accepted_words)

def word_frequency(text):
    """Count how often each whitespace-separated word occurs in the text.

    Args:
        text: Text to tokenise with ``str.split()``.

    Returns:
        collections.Counter: Mapping of word to number of occurrences.
    """
    return collections.Counter(text.split())

def split_content(content):
    """Decompose a post into prose, links, a code measure and an image count.

    The content goes through `extract_styles` (HTML), `extract_code`
    (back-tick code), `count_image_number` (Markdown images) and
    `extract_links` (bare URLs), in that order; each stage works on the text
    left over by the previous one.

    Args:
        content: Raw post content.

    Returns:
        Analyzer: namedtuple with ``text``, ``links``, ``code_count`` and
        ``image_count``; the counts and links of the stages are added together.
    """
    remaining, html_links, html_code, html_images = extract_styles(content)
    remaining, fenced_code = extract_code(remaining)
    remaining, markdown_images = count_image_number(remaining)
    remaining, bare_links = extract_links(remaining)

    content_collection = namedtuple('Analyzer', ['text', 'links', 'code_count', 'image_count'])
    return content_collection(
        text=remaining,
        links=html_links + bare_links,
        code_count=html_code + fenced_code,
        image_count=html_images + markdown_images,
    )

def preprocess_content(content):
    """Reduce text to lowercase dictionary nouns without stop words.

    Steps, in order: lowercase, keep spell-checked words, keep nouns, drop
    short tokens (gensim `strip_short`), drop gensim stop words plus every
    tool keyword from `tools_keyword_list`.

    Args:
        content: Text to normalise.

    Returns:
        str: The normalised text.
    """
    stop_words = STOPWORDS.union(tools_keyword_list)

    lowered = content.lower()
    nouns_only = extract_nouns(extract_english_words(lowered))
    return remove_stopwords(strip_short(nouns_only), stop_words)

# exponential backoff
def retry_with_backoff(fn, retries=2, backoff_in_seconds=1, *args, **kwargs):
    """Call ``fn`` and retry it with exponential backoff when it raises.

    Between attempts the function sleeps ``backoff_in_seconds * 2 ** attempt``
    seconds plus a random jitter in [0, 1).

    Args:
        fn: Callable to invoke.
        retries: Number of retries after the first failed call.
        backoff_in_seconds: Base delay of the backoff schedule.
        *args: Positional arguments forwarded to ``fn``. Because they follow
            ``retries`` and ``backoff_in_seconds`` in the signature, those two
            must be given positionally before any of them.
        **kwargs: Keyword arguments forwarded to ``fn``.

    Returns:
        Whatever ``fn`` returns on its first successful call.

    Raises:
        Exception: The last error raised by ``fn`` once the retries are used up.
    """
    attempt = 0
    while True:
        try:
            return fn(*args, **kwargs)
        # Only ordinary errors are retried; KeyboardInterrupt and SystemExit
        # propagate immediately so an interrupted notebook cell really stops.
        except Exception:
            if attempt == retries:
                raise

            delay = backoff_in_seconds * 2 ** attempt + random.uniform(0, 1)
            time.sleep(delay)
            attempt += 1

def remove_words_containing_string(word, string):
    """Drop every whitespace-separated word in which a pattern occurs.

    This helper is called by the custom stop-word cell further down, so it has
    to be defined for that cell to run on a fresh kernel.

    Args:
        word: Text to filter (split on whitespace).
        string: Regular expression; a word is removed when the expression
            matches anywhere inside it.

    Returns:
        str: The surviving words joined by single spaces.
    """
    pattern = re.compile(string)
    kept_words = [token for token in word.split() if not pattern.search(token)]
    return ' '.join(kept_words)

In [47]:
# Vocabulary statistics: preprocess every challenge body of the merged
# dataset and accumulate the word frequencies over all posts.
df_issues = pd.read_json(os.path.join(path_dataset, 'original.json'))

total_counts = collections.Counter()
for challenge_body in df_issues['Challenge_body']:
    text = preprocess_content(split_content(challenge_body).text)
    total_counts.update(word_frequency(text))

total_counts

  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)
  soup = BeautifulSoup(content, 'html.parser')
  soup = BeautifulSoup(content, 'html.parser')


Counter({'model': 8367,
         'error': 6357,
         'data': 5144,
         'file': 4629,
         'code': 3778,
         'training': 2609,
         'way': 2335,
         'pipeline': 2283,
         'notebook': 2160,
         'machine': 2025,
         'script': 1916,
         'endpoint': 1839,
         'thanks': 1749,
         'version': 1735,
         'image': 1708,
         'time': 1704,
         'job': 1693,
         'instance': 1692,
         'issue': 1629,
         'files': 1618,
         'models': 1607,
         'line': 1483,
         'service': 1482,
         'dataset': 1423,
         'environment': 1404,
         'problem': 1374,
         'example': 1361,
         'output': 1333,
         'studio': 1309,
         'input': 1250,
         'api': 1238,
         'run': 1204,
         'project': 1201,
         'container': 1197,
         'experiment': 1185,
         'message': 1172,
         'packages': 1171,
         'function': 1128,
         'custom': 1048,
         'type': 10

In [59]:
# Distinct words of the vocabulary in reverse alphabetical order.
sorted(total_counts.keys(), reverse=True)

['zzzzzzzzz',
 'zzzz',
 'zxd',
 'zulip',
 'zsolt',
 'zotero',
 'zoom',
 'zoo',
 'zones',
 'zone',
 'zipped',
 'zipfile',
 'zip',
 'zinc',
 'zhouyangyu',
 'zhouweiqing',
 'zeroshot',
 'zeros',
 'zeroes',
 'zenml',
 'zenfiles',
 'zendesk',
 'zcowd',
 'zangr',
 'zaharia',
 'yyyy',
 'yyy',
 'yum',
 'yticks',
 'yrs',
 'youu',
 'youtubers',
 'youtube',
 'youth',
 'yourrole',
 'yourneptuneendpoint',
 'yourkey',
 'yolox',
 'yolact',
 'yml',
 'ymin',
 'yields',
 'yield',
 'yesterdays',
 'yesterday',
 'yes',
 'yeet',
 'years',
 'year',
 'yay',
 'yaxis',
 'yassine',
 'yarn',
 'yappi',
 'yannara',
 'yamnet',
 'yaml',
 'xxxxxxxxxxxxxxxxxxxxxxxxxx',
 'xxxxxxxxxxxxxxxxxxxxxxxx',
 'xxxxxxxxxxxxxxx',
 'xxxxxxxxxxxx',
 'xxxxxxxxxx',
 'xxxxxxxx',
 'xxxxxxx',
 'xxxxxx',
 'xxxxx',
 'xxxx',
 'xxx',
 'xmltodict',
 'xml',
 'xmin',
 'xmax',
 'xlsx',
 'xlarge',
 'xjiwsevphigaaesrgfrgs',
 'xgdmatrixsetdenseinfo',
 'xgbregressor',
 'xgboostsagemakerestimator',
 'xgboostreport',
 'xgboostlabelencoder',
 'xgboostcl

In [None]:
# Bring the issue export and the question export onto one shared
# `Challenge_*` / `Solution_*` schema, then persist the merged table.
#
# The columns are renamed in one vectorised step instead of being copied cell
# by cell and deleted afterwards. Two visible consequences:
#   * values keep their original dtype and column position;
#   * `Issue_tag_count`, `Question_tag_count` and `Question_topic_count` no
#     longer survive next to their `Challenge_*` copies (they used to be
#     copied but, unlike every other source column, never deleted).

issue_column_mapping = {
    'Issue_title': 'Challenge_title',
    'Issue_body': 'Challenge_body',
    'Issue_link': 'Challenge_link',
    'Issue_tag_count': 'Challenge_tag_count',
    'Issue_created_time': 'Challenge_created_time',
    # Comments on an issue play the role of answers to a question.
    'Issue_comment_count': 'Challenge_answer_count',
    'Issue_score_count': 'Challenge_score_count',
    'Issue_closed_time': 'Challenge_closed_time',
    'Issue_repo_issue_count': 'Challenge_repo_issue_count',
    'Issue_repo_star_count': 'Challenge_repo_star_count',
    'Issue_repo_watch_count': 'Challenge_repo_watch_count',
    'Issue_repo_fork_count': 'Challenge_repo_fork_count',
    'Issue_repo_contributor_count': 'Challenge_repo_contributor_count',
    'Issue_self_closed': 'Challenge_self_closed',
    'Comment_body': 'Solution_body',
    'Comment_score_count': 'Solution_score_count',
}

question_column_mapping = {
    'Question_title': 'Challenge_title',
    'Question_body': 'Challenge_body',
    'Question_link': 'Challenge_link',
    'Question_tag_count': 'Challenge_tag_count',
    'Question_topic_count': 'Challenge_topic_count',
    'Question_created_time': 'Challenge_created_time',
    'Question_answer_count': 'Challenge_answer_count',
    'Question_comment_count': 'Challenge_comment_count',
    'Question_score_count': 'Challenge_score_count',
    'Question_closed_time': 'Challenge_closed_time',
    'Question_favorite_count': 'Challenge_favorite_count',
    'Question_last_edit_time': 'Challenge_last_edit_time',
    'Question_view_count': 'Challenge_view_count',
    'Question_self_closed': 'Challenge_self_closed',
    'Answer_body': 'Solution_body',
    'Answer_score_count': 'Solution_score_count',
    'Answer_comment_count': 'Solution_comment_count',
    'Answer_comment_body': 'Solution_comment_body',
    'Answer_last_edit_time': 'Solution_last_edit_time',
}

# errors='raise' keeps the fail-fast behaviour: a source column that is
# missing from the export still aborts the cell with a KeyError.
df_issues = pd.read_json(os.path.join(path_dataset, 'issues.json'))
df_issues = df_issues.rename(columns=issue_column_mapping, errors='raise')

df_questions = pd.read_json(os.path.join(path_dataset, 'questions.json'))
df_questions = df_questions.rename(columns=question_column_mapping, errors='raise')

df = pd.concat([df_issues, df_questions], ignore_index=True)
df.to_json(os.path.join(path_dataset, 'original.json'), indent=4, orient='records')


In [None]:
# Draw sankey diagram of platform -> tool -> state

df = pd.read_json(os.path.join(path_dataset, 'original.json'))
# A challenge is 'closed' as soon as it carries a closing timestamp.
df['State'] = df['Challenge_closed_time'].notna().map({True: 'closed', False: 'open'})

categories = ['Platform', 'Tool', 'State']
df_info = df.groupby(categories).size().reset_index(name='value')

# Node labels: each distinct value of each category mapped to its row count.
# Insertion order defines the node index used by the links below.
labels = {}
for category in categories:
    labels.update(df[category].value_counts().to_dict())

# One link table per pair of adjacent categories, concatenated in one go
# (renaming a copy avoids assigning `.columns` on a slice of `df_info`).
link_frames = [
    df_info[[left, right, 'value']].rename(columns={left: 'source', right: 'target'})
    for left, right in zip(categories, categories[1:])
]
newDf = pd.concat(link_frames)
newDf = newDf.groupby(['source', 'target']).agg({'value': 'sum'}).reset_index()

# Built once instead of calling list(labels).index(...) for every row.
label_position = {label: position for position, label in enumerate(labels)}
source = newDf['source'].apply(lambda x: label_position[x])
target = newDf['target'].apply(lambda x: label_position[x])
value = newDf['value']

labels = [f'{k} ({v})' for k, v in labels.items()]
link = dict(source=source, target=target, value=value)
node = dict(label=labels)
data = go.Sankey(link=link, node=node)

fig = go.Figure(data)
fig.update_layout(width=1000, height=1000, font_size=20)
fig.write_image(os.path.join(path_dataset, 'Tool platform state sankey.png'))

In [6]:
# Experiment 1: preprocess title + body of every challenge as-is.
# NOTE(review): this cell relies on `style_post_tools`, `preprocess_style_post`
# and `preprocess_normal_post` being defined by an earlier cell - confirm they
# are available when the notebook is run on a fresh kernel.

df = pd.read_json(os.path.join(path_dataset, 'original.json'))

for index, row in df.iterrows():
    is_style_post = (row['Platform'] == 'Stack Overflow') or (
        (row['Platform'] == 'Tool-specific') and (row['Tool'] in style_post_tools)
    )
    # Both parts of a post go through the same preprocessor.
    preprocess = preprocess_style_post if is_style_post else preprocess_normal_post
    content = preprocess(row['Challenge_title']) + ' ' + preprocess(row['Challenge_body'])
    df.at[index, 'Challenge_original_content'] = content

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

  soup = BeautifulSoup(text, 'html.parser')


In [10]:
# Experiment 2: summarise every challenge with the OpenAI chat API.
# The cell is resumable: rows that already have a summary are skipped and the
# table is written back to disk every 100 rows.

summary_column = 'Challenge_gpt_summary_original_content'
path_preprocessed = os.path.join(path_dataset, 'preprocessed.json')

df = pd.read_json(path_preprocessed)

# On the very first run the summary column does not exist yet; without it the
# resume check below raises a KeyError before a single request is sent.
if summary_column not in df.columns:
    df[summary_column] = None

for index, row in df.iterrows():
    if index % 100 == 99:
        print(f'persisting on post {index}')
        df.to_json(path_preprocessed, indent=4, orient='records')

    if pd.notna(row[summary_column]):
        continue

    try:
        prompt = prompt_summary + 'Title: ' + row['Challenge_title'] + ' Body: ' + row['Challenge_body'] + '###\n'
        response = retry_with_backoff(
            openai.ChatCompletion.create,
            model='gpt-4',
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=150,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            timeout=50,
            stream=False
        )
        df.at[index, summary_column] = response['choices'][0]['message']['content']
    except Exception as e:
        # Best effort: a failing post is reported and left without a summary
        # so that a later run can pick it up again.
        print(f'{e} on post {row["Challenge_link"]}')

    # Pause between consecutive posts.
    time.sleep(1)

df.to_json(path_preprocessed, indent=4, orient='records')


This model's maximum context length is 16385 tokens. However, your messages resulted in 16720 tokens. Please reduce the length of the messages. on post https://github.com/kedro-org/kedro/issues/308
This model's maximum context length is 16385 tokens. However, your messages resulted in 42298 tokens. Please reduce the length of the messages. on post https://github.com/Azure/MachineLearningNotebooks/issues/1668
This model's maximum context length is 16385 tokens. However, your messages resulted in 16742 tokens. Please reduce the length of the messages. on post https://stackoverflow.com/questions/73624005
This model's maximum context length is 16385 tokens. However, your messages resulted in 52631 tokens. Please reduce the length of the messages. on post https://community.wandb.ai/t/oserror-could-not-find-a-suitable-tls-ca-certificate/3913
This model's maximum context length is 16385 tokens. However, your messages resulted in 19521 tokens. Please reduce the length of the messages. on post 

In [7]:
# Experiment 2 (continued): preprocess the generated summaries.
# NOTE(review): this cell relies on `style_post_tools`, `preprocess_style_post`
# and `preprocess_normal_post` being defined by an earlier cell - confirm they
# are available when the notebook is run on a fresh kernel.

df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

for index, row in df.iterrows():
    is_style_post = (row['Platform'] == 'Stack Overflow') or (
        (row['Platform'] == 'Tool-specific') and (row['Tool'] in style_post_tools)
    )
    preprocess = preprocess_style_post if is_style_post else preprocess_normal_post
    df.at[index, 'Challenge_gpt_summary_preprocessed_content'] = preprocess(
        row['Challenge_gpt_summary_original_content']
    )

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

  soup = BeautifulSoup(text, 'html.parser')


In [8]:
# Experiment 3: preprocess title + body with `remove_code=True`.
# NOTE(review): this cell relies on `style_post_tools`, `preprocess_style_post`
# and `preprocess_normal_post` being defined by an earlier cell - confirm they
# are available when the notebook is run on a fresh kernel.

df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

for index, row in df.iterrows():
    is_style_post = (row['Platform'] == 'Stack Overflow') or (
        (row['Platform'] == 'Tool-specific') and (row['Tool'] in style_post_tools)
    )
    preprocess = preprocess_style_post if is_style_post else preprocess_normal_post
    title_part = preprocess(row['Challenge_title'], remove_code=True)
    body_part = preprocess(row['Challenge_body'], remove_code=True)
    content = title_part + ' ' + body_part

    df.at[index, 'Challenge_preprocessed_content'] = content

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')

  soup = BeautifulSoup(text, 'html.parser')


In [4]:
# Remove the custom stop words from the preprocessed challenge text columns.
# NOTE(review): this cell relies on `remove_words_containing_string` being
# defined by an earlier cell - confirm it is available on a fresh kernel.

# Every column listed here receives exactly the same treatment.
content_columns = [
    'Challenge_original_content',
    'Challenge_preprocessed_content',
    'Challenge_gpt_summary_preprocessed_content',
]

df = pd.read_json(os.path.join(path_dataset, 'preprocessed.json'))

for index, row in df.iterrows():
    for column in content_columns:
        text = row[column]
        # A word is dropped as soon as it contains any of the stop words.
        for stop_word in stop_words_custom:
            text = remove_words_containing_string(text, stop_word)
        df.at[index, column] = text

df.to_json(os.path.join(path_dataset, 'preprocessed.json'), indent=4, orient='records')
