In [1]:
!pip install 'langchain[llms]'
!pip install Scrapy
!pip install html2text
!pip install lxml
!pip install python-dotenv
!pip install "unstructured[all-docs]"
!pip install tiktoken
!pip install faiss-cpu 
!pip install GitPython
!pip install notebook
!pip install chromadb
!pip install pandas
!pip install rank_bm25
!pip install weaviate-client

Collecting langchain[llms]
  Downloading langchain-0.0.302-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting jsonpatch<2.0,>=1.33
  Using cached jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting numpy<2,>=1
  Using cached numpy-1.26.0-cp310-cp310-macosx_10_9_x86_64.whl (20.6 MB)
Collecting requests<3,>=2
  Using cached requests-2.31.0-py3-none-any.whl (62 kB)
Collecting PyYAML>=5.3
  Using cached PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl (189 kB)
Collecting dataclasses-json<0.7,>=0.5.7
  Downloading dataclasses_json-0.6.1-py3-none-any.whl (27 kB)
Collecting async-timeout<5.0.0,>=4.0.0
  Using cached async_timeout-4.0.3-py3-none-any.whl (5.7 kB)
Collecting aiohttp<4.0.0,>=3.8.3
  Using cached aiohttp-3.8.5-cp310-cp310-macosx_10_9_x86_64.whl (365 kB)
Collecting SQLAlchemy<3,>=1.4
  Using cached SQLAlchemy-2.0.21-cp310-cp310-macosx_10_9_x86_64.whl (2.

In [1]:
import logging
from dotenv import load_dotenv
from IPython.display import display, Markdown, Latex

logging.getLogger().setLevel(logging.INFO)
load_dotenv()

True

In [2]:
import getpass
import os

OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY') 

WEAVIATE_API_KEY = os.environ.get('WEAVIATE_API_KEY')
WEAVIATE_CODEARENA_INDEX_NAME = "CodeArenaDocsV1"
WEAVIATE_URL = os.environ.get('WEAVIATE_URL')

assert OPENAI_API_KEY, "Please set OPENAI_API_KEY in your environment variables"
assert WEAVIATE_API_KEY, "Please set WEAVIATE_API_KEY in your environment variables"

In [3]:
C4_WEBSITE_STORAGE_DIR = "../knowledge_base/c4/website"
C4_GH_DOCS_STORAGE_DIR = "../knowledge_base/c4/gh_docs"

In [4]:
import json
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

def load_json_files(dir):
    loader = DirectoryLoader(dir, loader_cls=TextLoader)
    documents = loader.load()
    for d in documents:
        page_content_dict = json.loads(d.page_content)
        d.page_content = page_content_dict['md_content']
        d.metadata['url'] = page_content_dict['url']
    return documents

c4_website_data_list = load_json_files(C4_WEBSITE_STORAGE_DIR)

In [5]:
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

loader = DirectoryLoader(C4_GH_DOCS_STORAGE_DIR, loader_cls=TextLoader)
c4_gh_docs_data_list = loader.load()


for i, d in enumerate(c4_gh_docs_data_list):
    local_path = d.metadata['source']

    if "/README.md" in local_path:
        # remove README.md from the path
        local_path = local_path.replace("/README.md", "")
    
    if "/SUMMARY.md" in local_path:
        # remove SUMMARY.md from the path
        local_path = local_path.replace("/SUMMARY.md", "")
    
    # remove .md from the path
    local_path = local_path.replace(".md", "")

    d.metadata['url'] = f"{local_path.replace(C4_GH_DOCS_STORAGE_DIR, 'https://docs.code4rena.com')}"


In [6]:
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    Language,
)

md_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN, chunk_size=2000, chunk_overlap=200
)


website_chunks =  md_splitter.split_documents(c4_website_data_list)
gh_docs_chunks = md_splitter.split_documents(c4_gh_docs_data_list)




In [7]:
website_chunks_with_source = [d.copy(deep=True) for d in website_chunks]

for i, d in enumerate(website_chunks_with_source):
    d.metadata['source'] = f"{i}-pl"

website_chunks_offset = len(website_chunks_with_source)
website_chunks_offset

89

In [8]:

gh_docs_chunks_with_source = [d.copy(deep=True) for d in gh_docs_chunks]

for i, d in enumerate(gh_docs_chunks_with_source):
    local_path = d.metadata['source']
    d.metadata['source'] = f"{i+website_chunks_offset}-pl"

len(gh_docs_chunks_with_source)

72

In [9]:
import weaviate
import os
from langchain.vectorstores import Weaviate

weaviate_client = weaviate.Client(
    url=WEAVIATE_URL,
    auth_client_secret=weaviate.AuthApiKey(api_key=WEAVIATE_API_KEY),
    additional_headers={"X-OpenAI-Api-Key": OPENAI_API_KEY},
)
weaviate = Weaviate(weaviate_client, WEAVIATE_CODEARENA_INDEX_NAME, text_key='text')


In [10]:
schema = {
    "classes": [
        {
            "class": WEAVIATE_CODEARENA_INDEX_NAME,
            "description": "CodeArena docs index",
            "vectorizer": "text2vec-openai",
            "moduleConfig": {
                "text2vec-openai": {
                    "model": "ada",
                    "modelVersion": "002",
                    "type": "text",
                }
            },
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "The content of the chunk",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": 'text',
                },
                {
                    "dataType": ["text"],
                    "description": "The source id of the chunk",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": 'source',
                },
                {
                    "dataType": ["text"],
                    "description": "The reference url of the chunk",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": 'url',
                },
            ],
        },
    ]
}
weaviate_client.schema.create(schema)

In [11]:
weaviate_client.schema.delete_class(WEAVIATE_CODEARENA_INDEX_NAME)
weaviate.add_documents(website_chunks_with_source + gh_docs_chunks_with_source)

['a63885a0-cf89-494f-a3e3-039d1c520f74',
 '96a0876d-9b66-44bd-a9ad-8aff129bad0b',
 '1fa4a305-f40e-4c03-bff9-81d694eba9fd',
 'b3736a0e-3ef2-441c-b1f5-e2a8161101a6',
 '0730bfa7-c4c6-455c-b62e-88c4f3c16ca0',
 'f4e9c925-6589-4cf0-898d-a852b15f801f',
 '68528680-e144-4412-a353-2d17dad26cd8',
 '1940f313-042d-4bbd-929c-99071086e8c7',
 '9644f0f0-088f-4638-868e-9e6261741b89',
 '2174791b-b7b4-4fa2-a005-dcb81a113ac4',
 '3c234f73-5396-43bd-8037-ac1dc5e19b6b',
 '5f3fb3f7-c8c8-4844-8fa1-78ba10e0f738',
 '10230bb9-df61-4243-a895-602fa8bf7a9a',
 '81c2942c-3aa0-4362-803d-b164958ba319',
 '757d77c8-0908-4fa2-878b-06ed3f069148',
 'daad18e5-4ae1-429f-9f8d-e6d14cdce071',
 '3fd6fe74-a81f-46f5-a1da-ae3e672d0871',
 '100af503-5a7a-46ee-b8fa-91e417033f71',
 'a477f817-53c0-467f-86ca-29a26a59b726',
 'd2d5942b-04ba-4aac-abb5-1576582b5e78',
 'f9595f9c-6268-4345-a733-385867230b78',
 'd24b3b0a-31dd-457b-b779-906b98bae359',
 'a1b6ce72-a349-495c-8b20-d789b40efaee',
 '2dc16dfd-e1cd-40de-af2c-156aafaa0fa9',
 '73c493f0-e53f-

In [12]:
query_result = weaviate_client.query\
    .get(WEAVIATE_CODEARENA_INDEX_NAME, ["text", "source", "url"])\
    .with_hybrid(
        query="What is Scout"
    )\
    .with_limit(4)\
    .do()

print(query_result)

{'data': {'Get': {'CodeArenaDocsV1': [{'source': '157-pl', 'text': "### How can I become a Judge?\n\nComplete [this form](https://code4rena.com/judge-application/) and share: Short bio/intro and summary of relevant experience, links that help demonstrate your expertise, 3 example submissions to Code4rena contests that were judged high severity, description of how each submission demonstrates your depth of knowledge.\n\n### How can I become a Scout?\n\nYou can’t, just yet! Right now, Scouts are hand-picked by the C4 team as it’s a highly sensitive role. We’re looking at the possibility of opening up this process, but not in the near future.\n\n### I want Code4rena to audit my project, where do I start?\n\nIt’s really simple! Just visit [this link](https://code4rena.typeform.com/i-want-an-audit) and fill out the form. Our team will be in touch with you shortly after you’ve completed it.\n\n### Do you have a blog?\n\nWe do indeed, [here](https://medium.com/code-423n4). We post product upd