In [None]:
%pip install beautifulsoup4 lxml
%pip install matplotlib openai plotly pandas scipy scikit-learn python-dotenv langchain tiktoken chromadb

In [None]:
import os
from dotenv import load_dotenv
from getpass import getpass

# Pull variables from a local .env file (if present) into the environment.
load_dotenv()

# Resolve the OpenAI API key: use the environment value when it is set and
# non-empty, otherwise prompt for it interactively without echoing the input.
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    openai_api_key = getpass('Enter your OpenAI API key: ')

# Export the key so code that reads OPENAI_API_KEY from the environment sees it.
os.environ['OPENAI_API_KEY'] = openai_api_key

In [None]:
from bs4 import BeautifulSoup

chat_lines = []

# Read the exported chat-log HTML into memory as UTF-8 text.
html_path = './Code4rena_-_Main_-_questions.html'
with open(html_path, mode='r', encoding='utf-8') as html_file:
    html_content = html_file.read()

# Build the parse tree with Python's built-in HTML parser backend.
soup = BeautifulSoup(html_content, features='html.parser')

In [135]:
from bs4 import BeautifulSoup

def _text_or_empty(node):
    """Return the text of a looked-up element, or '' when the lookup found nothing."""
    return node.text if node else ''


# Parse the raw HTML (expects `html_content` to hold the exported chat log).
soup = BeautifulSoup(html_content, 'html.parser')

# Every message group is a `div.chatlog__message-group`.
message_groups = soup.find_all('div', class_='chatlog__message-group')

# One dict per message group. Fields come from these elements:
#   author           -> span.chatlog__author
#   timestamp        -> span.chatlog__timestamp
#   message          -> all span.chatlog__markdown-preserve, joined with newlines
#   reply_to_author  -> div.chatlog__reply-author
#   reply_to_message -> span.chatlog__reply-link
# Missing elements yield an empty string.
parsed_messages = []
for group in message_groups:
    body_spans = group.find_all('span', class_='chatlog__markdown-preserve')
    parsed_messages.append({
        'author': _text_or_empty(group.find('span', class_='chatlog__author')),
        'timestamp': _text_or_empty(group.find('span', class_='chatlog__timestamp')),
        'message': '\n'.join(span.text for span in body_spans),
        'reply_to_author': _text_or_empty(group.find('div', class_='chatlog__reply-author')),
        'reply_to_message': _text_or_empty(group.find('span', class_='chatlog__reply-link')),
    })

# Show how many message groups were parsed.
len(parsed_messages)

6403

In [199]:

# message_blocks = []
# lookout_range = 20
# for i, m in enumerate(parsed_messages):
#     if 'c4' in m['author'].lower():
#         before = parsed_messages[i-lookout_range:i]
#         c4_message = parsed_messages[i]
#         after = parsed_messages[i+1:i+lookout_range]

#         message_block = []
#         message_block.extend(before)
#         message_block.append(m)
#         message_block.extend(after)

#         message_blocks.append(message_block)

# Split the parsed messages into consecutive windows of `step` messages and
# pad each window with up to `lookout_range` messages of context on either
# side, so a question and its answer are less likely to be separated by a
# window boundary.
message_blocks = []
step = 25
lookout_range = 10
for i in range(0, len(parsed_messages), step):
    # Leading context. Clamp the start at 0: a negative start index would be
    # interpreted relative to the end of the list instead of its beginning.
    before = parsed_messages[max(0, i - lookout_range):i]
    main_block = parsed_messages[i:i + step]
    # Trailing context begins where the main window ends. Slicing from i + 1
    # would re-append messages that are already part of `main_block`.
    after = parsed_messages[i + step:i + step + lookout_range]

    message_block = before + main_block + after
    message_blocks.append(message_block)
        

In [200]:
# Flatten every message block into a single prompt-ready string. Each message
# is rendered as
#   timestamp;author;message;reply_to_author;reply_to_message
# and is followed by the "---" separator (including the last one).
chat_lines = []
for block in message_blocks:
    rendered = [
        f"{msg['timestamp']};{msg['author']};{msg['message']};"
        f"{msg['reply_to_author']};{msg['reply_to_message']}" + "---"
        for msg in block
    ]
    chat_lines.append("".join(rendered))

In [201]:
# Number of formatted chat strings (one per message block).
len(chat_lines)

257

In [210]:
# Instruction prompt for turning a block of chat messages into FAQ entries.
# `{chat_messages}` is the only placeholder; it receives one formatted chat
# string (messages joined with "---") when the template is rendered.
SYSTEM_PROMPT = """You are an intelligent analyst capable of looking at chat messages and generating questions and answers from it to create an FAQ.
- You are given chat messages below, each message is formatted as timestamp;author;message;reply_to_author;reply_to_message and separated by "---"
- To generate questions and answers, first base it on the reply to author and reply to message, if they are not available, then solely based it on the messages before and after.
- Give preference to messages from C4 authors to form answers, they can be identified with the term "C4" in their author name.
- Rephrase the questions and answers to be professional, suitable enough to be used in a FAQ.
- Use the message timestamp from the author as the timestamp for the question and answer.
- **DO NOT** make up questions and answers, only use the chat messages as the source of truth.
- Output the results in JSON format only with fields "timestamp", "question", "answer", "source_author"


## Chat messages:
{chat_messages}

## JSON result:"""

from langchain.prompts import PromptTemplate

# Wrap the prompt text in a template that declares `chat_messages` as its
# single input variable.
prompt = PromptTemplate(
    input_variables=["chat_messages"],
    template=SYSTEM_PROMPT,
)

In [211]:
from langchain.chat_models import ChatOpenAI

# Chat model used for FAQ extraction; temperature is pinned to 0.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

In [212]:
from langchain.chains import LLMChain
import json
from langchain.callbacks import get_openai_callback


# Chain that renders the FAQ prompt for one chat block and sends it to the LLM.
chain = LLMChain(llm=llm, prompt=prompt)

# Collect the question/answer entries generated for each chat block. Only the
# first block is processed here; widen the slice to run the full set.
qa_list = []
with get_openai_callback() as cb:
    for chat_block in chat_lines[:1]:
        result = chain.run(chat_messages=chat_block)
        try:
            parsed = json.loads(result)
        except json.JSONDecodeError as exc:
            # One malformed reply should not discard the entries gathered so
            # far; report it and continue with the next block.
            print(f"Skipping chat block with non-JSON response: {exc}")
            continue
        # A single JSON object would otherwise be extended as its keys.
        if isinstance(parsed, dict):
            parsed = [parsed]
        qa_list.extend(parsed)

# The callback's counters are cumulative over the whole `with` block, so they
# are read once at the end. Adding them on every iteration would count earlier
# calls again each time.
total_tokens = cb.total_tokens
total_cost = cb.total_cost

print(f"Total tokens: {total_tokens}")
print(f"Total cost: {total_cost}")

[
    {
        "timestamp": "02/15/2021 7:56 PM",
        "question": "Where does the name 'Code 423n4' come from?",
        "answer": "The name 'Code 423n4' is a play on words. It stands for 'Code Arena' and is a cute way of using numbers to represent words.",
        "source_author": "scott_L"
    },
    {
        "timestamp": "02/15/2021 7:59 PM",
        "question": "What is the meaning behind using numbers to make words?",
        "answer": "Using numbers to make words in the name 'Code 423n4' is simply a creative and playful choice.",
        "source_author": "Shauhin"
    },
    {
        "timestamp": "02/16/2021 9:02 PM",
        "question": "Is there a page for the contest that lists wardens, judges, and sponsors?",
        "answer": "Yes, there is a plan to create a page for the contest that will list and link to wardens, judges, and sponsors. There may also be a GitHub form for people to fill out when joining as a warden, providing links to their socials, bio, and avatar.",

In [205]:
# Number of question/answer entries currently held in qa_list.
len(qa_list)

6

In [None]:
"""
- filter for answers from c4 authors, get replies and 5 messages before and after
- prompt llm to generate questions and answers with timestamp for each of them
- add all questions to the index
- for each question, look for very similar questions in the index, with a threshold of 0.8
- among the similar questions, choose the one that is the latest and add all the similar questions to seen questions
- skip the question if it is in seen questions
- from the final questions, prompt llm to generate raw categories
- from the generated raw categories, prompt llm to generate category groups
"""

In [216]:
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import Document

# Wrap each Q&A entry in a Document: the question is the searchable text, and
# everything else (plus a positional id) rides along as metadata.
docs = [
    Document(
        page_content=entry['question'],
        metadata={
            'ques_id': idx,
            'timestamp': entry['timestamp'],
            'answer': entry['answer'],
            'source_author': entry['source_author'],
        },
    )
    for idx, entry in enumerate(qa_list)
]

docs

[Document(page_content="Where does the name 'Code 423n4' come from?", metadata={'ques_id': 0, 'timestamp': '02/15/2021 7:56 PM', 'answer': "The name 'Code 423n4' is a play on words. It stands for 'Code Arena' and is a cute way of using numbers to represent words.", 'source_author': 'scott_L'}),
 Document(page_content='What is the meaning behind using numbers to make words?', metadata={'ques_id': 1, 'timestamp': '02/15/2021 7:59 PM', 'answer': "Using numbers to make words in the name 'Code 423n4' is simply a creative and playful choice.", 'source_author': 'Shauhin'}),
 Document(page_content='Is there a page for the contest that lists wardens, judges, and sponsors?', metadata={'ques_id': 2, 'timestamp': '02/16/2021 9:02 PM', 'answer': 'Yes, there is a plan to create a page for the contest that will list and link to wardens, judges, and sponsors. There may also be a GitHub form for people to fill out when joining as a warden, providing links to their socials, bio, and avatar.', 'source_au

In [217]:
import chromadb

embeddings = OpenAIEmbeddings()
collection_name = "questions"

# Start from a clean index: drop any "questions" collection left over from a
# previous run of this cell so re-running does not add duplicate entries.
chroma = chromadb.Client()
try:
    collection = chroma.get_collection(collection_name)
except Exception:
    # NOTE(review): the exception raised for a missing collection appears to
    # differ between chromadb releases, hence the broad class — confirm against
    # the pinned version. Unlike a bare `except:`, this does not swallow
    # KeyboardInterrupt/SystemExit.
    collection = None

# Only the lookup is best-effort; a failure to delete an existing collection
# is a real error and is allowed to surface.
if collection is not None:
    chroma.delete_collection(collection_name)

# Embed the question documents and add them to the vector store; the returned
# ids are displayed as the cell output.
ques_db = Chroma(collection_name=collection_name, embedding_function=embeddings)
ques_db.add_documents(docs)

['6441ba6a-67d8-11ee-9b69-367dda1ae1c5',
 '6441bb00-67d8-11ee-9b69-367dda1ae1c5',
 '6441bb46-67d8-11ee-9b69-367dda1ae1c5',
 '6441bb82-67d8-11ee-9b69-367dda1ae1c5',
 '6441bbaa-67d8-11ee-9b69-367dda1ae1c5',
 '6441bbdc-67d8-11ee-9b69-367dda1ae1c5',
 '6441bc0e-67d8-11ee-9b69-367dda1ae1c5']

In [218]:
# Deduplicate questions: walk the documents in order, keep the first question
# of every group of near-duplicates, and mark all of its matches as seen so
# they are skipped when reached later.
skip_question_ids = []
final_qa_docs = []

for candidate in docs:
    candidate_id = candidate.metadata['ques_id']
    if candidate_id not in skip_question_ids:
        # Up to 4 indexed questions scoring at least 0.9 against this one.
        matches = ques_db.similarity_search_with_relevance_scores(
            candidate.page_content, k=4, score_threshold=0.9
        )
        # Each match is indexed at [0] for its document; record every matched id.
        skip_question_ids.extend(match[0].metadata['ques_id'] for match in matches)
        # TODO choose the latest timestamp
        final_qa_docs.append(candidate)

In [223]:
# Flatten the deduplicated documents back into plain dicts for serialization.
qa_to_store = [
    {
        'question': d.page_content,
        'answer': d.metadata['answer'],
        'timestamp': d.metadata['timestamp'],
        'source_author': d.metadata['source_author'],
    }
    for d in final_qa_docs
]

# Make sure the target directory exists; open() does not create it.
os.makedirs('./output', exist_ok=True)

# Write with an explicit UTF-8 encoding (matching how the source HTML was read)
# so non-ASCII chat text does not depend on the platform default encoding.
with open('./output/faq.json', 'w', encoding='utf-8') as f:
    json.dump(qa_to_store, f, indent=4)

# Markdown rendering: a numbered heading per question, then the answer and
# its source author.
with open('./output/faq.md', 'w', encoding='utf-8') as f:
    for i, qa in enumerate(final_qa_docs):
        question = qa.page_content
        answer = qa.metadata['answer']
        author = qa.metadata['source_author']
        f.write(f"#### {i+1}. {question}\n")
        f.write(f"{answer}\n\n")
        f.write(f"*Source Author: {author}*\n\n")