# FAQ from Code4rena's Discord chat messages

### Objective
- The goal of this notebook is to create an FAQ from the chat messages in the Code4rena's (C4) Discord "questions" channel.
- The purpose of this FAQ is to primarily allow C4 to identify gaps in documentation and accordingly update it.


### High-level approach
- Parse chat messages in the HTML export into a structured format (timestamp, author, message, reply_to_author, reply_to_message)
- Create overlapping groups of messages and add each group to the LLM prompt to generate questions and answers with timestamp and answer source author
- To handle highly-similar questions and only keep the latest:
    - Add all the questions to the embeddings index
    - For each quesion look for highly similar questions in the index, with threshold of 0.9
    - For the similar questions, choose the one with the latest timestamp

In [14]:
%pip install -U beautifulsoup4 lxml matplotlib openai plotly pandas scipy scikit-learn python-dotenv langchain tiktoken chromadb tqdm

Collecting plotly
  Downloading plotly-5.18.0-py3-none-any.whl (15.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.6/15.6 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting pandas
  Downloading pandas-2.1.2-cp310-cp310-macosx_10_9_x86_64.whl (11.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.7/11.7 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting scipy
  Using cached scipy-1.11.3-cp310-cp310-macosx_10_9_x86_64.whl (37.3 MB)
Collecting scikit-learn
  Using cached scikit_learn-1.3.2-cp310-cp310-macosx_10_9_x86_64.whl (10.2 MB)
Collecting langchain
  Downloading langchain-0.0.324-py3-none-any.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting chromadb
  Downloading chromadb-0.4.15-py3-none-any.whl (479 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m479.8/479.8

In [15]:
import os
from dotenv import load_dotenv
from getpass import getpass

load_dotenv()

openai_api_key = os.environ.get('OPENAI_API_KEY') or getpass('Enter your OpenAI API key: ')
os.environ['OPENAI_API_KEY'] = openai_api_key

In [18]:
from bs4 import BeautifulSoup

formatted_msg_blocks = []

with open('./Code4rena_-_Main_-_questions.html', 'r', encoding='utf-8') as file:
    html_content = file.read()


soup = BeautifulSoup(html_content, 'html.parser')

### Parse chat messages in the HTML export into a structured format

In [19]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_content, 'html.parser')

message_groups = soup.find_all('div', class_='chatlog__message-group')

parsed_messages = []

for message_group in message_groups:
    author = message_group.find('span', class_='chatlog__author')
    if author:
        author = author.text
    else:
        author = ''
    timestamp = message_group.find('span', class_='chatlog__timestamp')
    if timestamp:
        timestamp = timestamp.text
    else:
        timestamp = ''
    markdown_preserves = message_group.find_all('span', class_='chatlog__markdown-preserve')
    message = '\n'.join([mp.text for mp in markdown_preserves])
    reply_to_author = message_group.find('div', class_='chatlog__reply-author')
    if reply_to_author:
        reply_to_author = reply_to_author.text
    else:
        reply_to_author = ''
    reply_to_message = message_group.find('span', class_='chatlog__reply-link')
    if reply_to_message:
        reply_to_message = reply_to_message.text
    else:
        reply_to_message = ''
    parsed_messages.append({'author': author, 'timestamp': timestamp, 'message': message, 'reply_to_author': reply_to_author, 'reply_to_message': reply_to_message})

len(parsed_messages)

6403

### Choose the last N messages for testing

In [20]:

NUM_MESSAGES = len(parsed_messages) # Change this limit to preferred last N messages

latest_messages = parsed_messages[-NUM_MESSAGES:]

### Generate message blocks/groups using sliding window with overlap

In [75]:

message_blocks = []
step = 10
lookout_range = 3
for i in range(0, len(latest_messages), step):    
    before = latest_messages[i-lookout_range:i]
    main_block = latest_messages[i:i+step]
    after = latest_messages[i+1:i+lookout_range]

    message_block = []
    message_block.extend(before)
    message_block.extend(main_block)
    message_block.extend(after)

    message_blocks.append(message_block)
        

### Format messages for prompt

In [76]:
formatted_msg_blocks = []
for mb in message_blocks:
    lines = ""
    
    for m in mb:
        timestamp = m['timestamp']
        author = m['author']
        message = m['message'].replace('"', "").replace("'", "")
        reply_to_author = m['reply_to_author']
        reply_to_message = m['reply_to_message'].replace('"', "").replace("'", "")
        formatted_message = f"{timestamp};{author};{message};{reply_to_author};{reply_to_message}"
        lines += formatted_message + "---"
        
    formatted_msg_blocks.append(lines)
len(formatted_msg_blocks)

641

### Setup the prompt with detailed instructions and a single example (one-shot)

In [77]:
SYSTEM_PROMPT = """You are an intelligent analyst capable of looking at chat messages and generating questions and answers from it to create an FAQ.

- For your task, you have been given chat messages from an organization called Code4rena (a.k.a C4) that specializes in crowd sourced smart contract audits.
- You are given chat messages below, each message is formatted as timestamp;author;message;reply_to_author;reply_to_message and separated by "---"
- To generate questions and answers, think step-by-step, first base it on the reply to author and reply to message, if they are not available, then solely based it on the messages before and after.
- **DO NOT** use any follow-on questions as an answer to previous question.
- If a message seems like a casual conversation and unrelated to the general subject, skip it.
- If a question does not have a helpful answer, feel free to skip it.
- Rephrase the questions and answers to be professional, suitable enough to be used in a FAQ.
- Use the message timestamp from the author as the timestamp for the question and answer.
- Do not mention any thing about the particular chat or author in the answer, it should be generic enough to be used in a FAQ.
- Any links mentioned in the messages are very important, please include them in the answer.
- Identify the true source author that contributed to the answer from the messages
- Output the results as a JSON list with fields "timestamp", "question", "answer", "answer_source_author"
- **DO NOT** make up questions and answers, only use the chat messages as the source of truth.

## Eample:
### Chat messages:
06/28/2023 5:39 PM;DadeKuma;thats old, it doesnt work like that anymore;lsaudit; according to that .cvs file, Low issues are ranked by uniquess too ---06/28/2023 5:40 PM;lsaudit;so if all As get the same award, no matter how many Low findings there are - why should auditors bother to put more than one Low findins in QA?\nif one Low finding is enough to be scored as A ?\nOr maybe Ill rephrase my question. Lets assume that there are only three QA reports. 1st reports issues: A, B, C, D. 2nd: B, C, D, E; 3rd: F. Can only one report be choosen for a final report?\nOr the report will merge: A, B, C, D, E, F. So 1st report will get bonus for A uniquness, and 3rd report, would get bonus for reporting F issue?;;---06/28/2023 5:47 PM;🦙 liveactionllama | C4;The info here might be helpful:\nhttps://docs.code4rena.com/awarding/judging-criteria#qa-reports-low-non-critical\nhttps://docs.code4rena.com/awarding/incentive-model-and-awards#qa-and-gas-optimization-reports\n\nJudges look at both quantity and quality when judging QA reports. If a wardens QA submission only had 1 item, it would be pretty unlikely to receive a high grade. Especially if other wardens QA submissions within that audit contained many high quality items in comparison.;lsaudit; so if all As get the same award, no matter how many Low findings there are - why should auditors bother to put more than one Low findins in QA?

### JSON result:
{{
"timestamp": "06/28/2023 5:40 PM",
"question": "Why should auditors put more than one Low findings in QA if all As get the same award, no matter how many Low findings there are?",
"answer": "Judges look at both quantity and quality when judging QA reports. If a warden's QA submission only had 1 item, it would be pretty unlikely to receive a high grade. Especially if other wardens' QA submissions within that audit contained many high-quality items in comparison. More information can be found at https://docs.code4rena.com/awarding/judging-criteria#qa-reports-low-non-critical and https://docs.code4rena.com/awarding/incentive-model-and-awards#qa-and-gas-optimization-reports.",
"answer_source_author": "🦙 liveactionllama | C4"
}}

## Chat messages:
{chat_messages}

## JSON result:"""

from langchain.prompts import PromptTemplate

prompt = PromptTemplate(
    input_variables=["chat_messages"],
    template=SYSTEM_PROMPT,
)

In [78]:
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(
    model="gpt-4",
    temperature=0,
)

### Call the LLM for each message block

In [79]:
LLM_RESULTS_DIR = './output/llm_results'

In [82]:
formatted_msg_blocks[73]

'02/01/2022 10:00 PM;0xleastwood;to account for the increase in judging fee\nit was a beast of a contest, so there was a lot for the judge to handle;wildmolasses; hi! why was the malt prize pool changed? ---02/02/2022 12:53 AM;wildmolasses;gotcha, did i miss an announcement?;0xleastwood; it was a beast of a contest, so there was a lot for the judge to handle ---02/02/2022 12:53 AM;0xleastwood;I think this was raised a couple of weeks ago\nnot sure if there was an announcement;wildmolasses; gotcha, did i miss an announcement? ---02/02/2022 12:55 AM;🧦 sockdrawer | C4;Discussed in the wardens channel. Our backlog came to a head based on overwhelming levels of issues on some contests and limited judge availability and we needed to ramp up our offers for judging comp for a window of time in order to just clear out the seriously lagging contests in the backlog\nThankfully we had some judges do some SERIOUS overtime and we’re starting to get caught up. The recent process improvements and tool

In [85]:
from langchain.chains import LLMChain
import json
from langchain.callbacks import get_openai_callback
from tqdm import tqdm

START_BLOCK_INDEX = 74 # start from preferred block
END_BLOCK_INDEX = len(formatted_msg_blocks) - 1 # end at preferred block

chain = LLMChain(llm=llm, prompt=prompt)

total_tokens = 0
total_cost = 0
last_processed_block_index = 0

if not os.path.exists(LLM_RESULTS_DIR):
    os.makedirs(LLM_RESULTS_DIR)

with get_openai_callback() as cb:
    print(f"Running for blocks {START_BLOCK_INDEX} to {END_BLOCK_INDEX}")
    for i in tqdm(range(START_BLOCK_INDEX, END_BLOCK_INDEX + 1)):
        block = formatted_msg_blocks[i]
        result = chain.run(chat_messages=block)
        #print(result)
        json_obj = json.loads(result)
        if isinstance(json_obj, dict):
            json_obj = [json_obj]
        total_tokens += cb.total_tokens
        total_cost += cb.total_cost
        last_processed_block_index = i
        with open(f'{LLM_RESULTS_DIR}/faq-{i}.json', 'w') as f:
            json.dump(json_obj, f, indent=2)

Running for blocks 74 to 640


  0%|          | 2/567 [01:08<5:24:20, 34.44s/it]

In [81]:
print(f"Last processed message block index: {last_processed_block_index}")
print(f"Total tokens: {total_tokens}")
print(f"Total cost: {total_cost}")

Last processed message block index: 72
Total tokens: 6005212
Total cost: 206.23014


### Read the LLM result JSON files

In [47]:
import os
import json

json_files = [f for f in os.listdir(LLM_RESULTS_DIR)]

qa_list = []

for file in json_files:
    with open(os.path.join(LLM_RESULTS_DIR, file), 'r') as f:
        json_data = json.load(f)
        qa_list.extend(json_data)

print(qa_list)

[{'timestamp': '02/16/2021 9:26 PM', 'question': 'Should we create a page for the contest and list or link to wardens, judges, and sponsors? Should we also have a form for people to fill out when joining as a warden, including links to their socials, bio, avi, etc.?', 'answer': 'Yes, these are all good ideas.', 'answer_source_author': 'zscole'}, {'timestamp': '02/16/2021 9:27 PM', 'question': 'Should we start a channel specific to the website?', 'answer': 'Yes, we can add a channel here. Also, feel free to submit PRs with any ideas to the GitHub. The website was put together quickly and any help is appreciated.', 'answer_source_author': 'zscole'}, {'timestamp': '02/16/2021 11:17 PM', 'question': 'When will we have access to the codebase?', 'answer': 'Access to the codebase will be available on February 17 @ 1400 UTC (9AM EST), which is a little less than 13 hours from now.', 'answer_source_author': 'zscole'}, {'timestamp': '02/17/2021 3:32 PM', 'question': 'Are the smart contracts from

### Generate Langchain Document objects from the resultant questions and answers

In [41]:
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import Document
from datetime import datetime


docs = []
for i, qa in enumerate(qa_list):
    question = qa['question']
    doc = Document(page_content=question, metadata={
        'ques_id': i,
        'timestamp': qa['timestamp'],
        'epoch_time': int(datetime.strptime(qa['timestamp'], '%m/%d/%Y %I:%M %p').timestamp()),
        'answer': qa['answer'],
        'answer_source_author': qa['answer_source_author']
    })
    docs.append(doc)

### Add Documents to the vector index

In [44]:
import chromadb

embeddings = OpenAIEmbeddings()
collection_name = "questions"

chroma = chromadb.Client()
try:
    collection = chroma.get_collection(collection_name)
    if collection:
        chroma.delete_collection(collection_name)
except:
    pass

ques_db = Chroma(collection_name=collection_name, embedding_function=embeddings, collection_metadata={"hnsw:space": "cosine"})
ques_db.add_documents(docs)

['2ceb3632-744d-11ee-a814-367dda1ae1c5',
 '2ceb36dc-744d-11ee-a814-367dda1ae1c5',
 '2ceb3722-744d-11ee-a814-367dda1ae1c5',
 '2ceb3754-744d-11ee-a814-367dda1ae1c5',
 '2ceb3786-744d-11ee-a814-367dda1ae1c5',
 '2ceb37b8-744d-11ee-a814-367dda1ae1c5',
 '2ceb37ea-744d-11ee-a814-367dda1ae1c5',
 '2ceb381c-744d-11ee-a814-367dda1ae1c5']

### Filter for latest questions

In [45]:
skip_question_ids = []
final_qa_docs = []

for d in docs:
    q = d.page_content
    ques_id = d.metadata['ques_id']

    if ques_id in skip_question_ids:
        continue

    results = ques_db.similarity_search_with_relevance_scores(q, k=4, score_threshold=0.9, )
    latest_question = d
    for r in results:
        skip_question_ids.append(r[0].metadata['ques_id'])
        ques_id = r[0].metadata['ques_id']
        epoch_time = r[0].metadata['epoch_time']
        if epoch_time > latest_question.metadata['epoch_time']:
            latest_question = r[0]
    final_qa_docs.append(latest_question)

  query: input text


### Create markdown file with the final results

In [46]:
qa_to_store = []
for d in final_qa_docs:
    qa_to_store.append({
        'question': d.page_content,
        'answer': d.metadata['answer'],
        'timestamp': d.metadata['timestamp'],
        'answer_source_author': d.metadata['answer_source_author']
    })

with open('./output/faq.json', 'w') as f:
    json.dump(qa_to_store, f, indent=4)

with open('./output/faq.md', 'w') as f:
    for i, qa in enumerate(final_qa_docs):
        question = qa.page_content
        answer = qa.metadata['answer']
        author = qa.metadata['answer_source_author']
        timestamp = qa.metadata['timestamp']
        f.write(f"#### {i+1}. {question}\n")
        f.write(f"{answer}\n\n")
        f.write(f"*Answer Source Author: {author}*\n\n")
        f.write(f"*Source Timestamp: {timestamp}*\n\n")