In [None]:
!pip install 'langchain[llms]'
!pip install Scrapy
!pip install html2text
!pip install lxml
!pip install python-dotenv
!pip install "unstructured[all-docs]"
!pip install tiktoken
!pip install faiss-cpu 
!pip install GitPython
!pip install notebook
!pip install chromadb
!pip install pandas
!pip install rank_bm25
!pip install weaviate-client

In [None]:
import logging
from dotenv import load_dotenv
from IPython.display import display, Markdown, Latex

# Let INFO-level records through the root logger.
logging.root.setLevel(logging.INFO)

# python-dotenv: load variables from a .env file, if one is found, before the
# environment lookups in the next cell.
load_dotenv()

In [None]:
import getpass
import os

# Credentials are taken from the environment; nothing secret is written here.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
WEAVIATE_API_KEY = os.getenv('WEAVIATE_API_KEY')

# Weaviate endpoint and the name of the class (index) used throughout the notebook.
WEAVIATE_URL = "http://localhost:8080"
WEAVIATE_CODEARENA_INDEX_NAME = "CodeArenaDocsV1"

# Stop right away, with an actionable message, if a key is missing or empty.
assert OPENAI_API_KEY, "Please set OPENAI_API_KEY in your environment variables"
assert WEAVIATE_API_KEY, "Please set WEAVIATE_API_KEY in your environment variables"

In [None]:
# Input directories, relative to the notebook's working directory.
# Website dump: files parsed as JSON objects with 'md_content' and 'url' keys.
C4_WEBSITE_STORAGE_DIR = "../knowledge_base/c4/website"
# Docs checkout: files loaded as plain text; their paths are later mapped to
# https://docs.code4rena.com URLs.
C4_GH_DOCS_STORAGE_DIR = "../knowledge_base/c4/gh_docs"

In [None]:
import json
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

def load_json_files(dir):
    """Load every file under ``dir`` and unpack its JSON payload.

    Each file is read as text and parsed as a JSON object. The object's
    ``'md_content'`` value replaces the document's page content and its
    ``'url'`` value is stored in the document metadata under ``'url'``.

    Args:
        dir: Directory handed to ``DirectoryLoader``.

    Returns:
        The list of loaded documents, updated in place.

    Raises:
        json.JSONDecodeError: If a file does not contain valid JSON.
        KeyError: If a payload lacks ``'md_content'`` or ``'url'``.
    """
    documents = DirectoryLoader(dir, loader_cls=TextLoader).load()
    for document in documents:
        payload = json.loads(document.page_content)
        document.page_content = payload['md_content']
        document.metadata['url'] = payload['url']
    return documents

# Website documents: page content is each file's 'md_content', metadata carries its 'url'.
c4_website_data_list = load_json_files(C4_WEBSITE_STORAGE_DIR)

In [None]:
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

C4_DOCS_BASE_URL = "https://docs.code4rena.com"


def _gh_doc_url(source_path, docs_dir=C4_GH_DOCS_STORAGE_DIR, base_url=C4_DOCS_BASE_URL):
    """Map the local path of a docs file to its public documentation URL.

    A trailing ``/README.md`` or ``/SUMMARY.md`` is dropped (the file stands for
    its directory); otherwise a trailing ``.md`` extension is dropped. The
    leading ``docs_dir`` is then replaced by ``base_url``.

    Only the suffix and the prefix are touched, so a path that merely contains
    ".md" or the storage directory somewhere in the middle is not mangled.

    Args:
        source_path: Path of the file as recorded in the document metadata.
        docs_dir: Local directory prefix to replace.
        base_url: URL that replaces ``docs_dir``.

    Returns:
        The URL as a string; ``source_path`` with only the suffix handling
        applied if it does not start with ``docs_dir``.
    """
    path = source_path
    if path.endswith("/README.md"):
        path = path[:-len("/README.md")]
    elif path.endswith("/SUMMARY.md"):
        path = path[:-len("/SUMMARY.md")]
    elif path.endswith(".md"):
        path = path[:-len(".md")]

    if path.startswith(docs_dir):
        path = base_url + path[len(docs_dir):]
    return path


loader = DirectoryLoader(C4_GH_DOCS_STORAGE_DIR, loader_cls=TextLoader)
c4_gh_docs_data_list = loader.load()

# Attach the public URL of every docs file next to its local 'source' path.
for d in c4_gh_docs_data_list:
    d.metadata['url'] = _gh_doc_url(d.metadata['source'])


In [None]:
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    Language,
)

# Markdown-aware recursive splitter, configured with chunk_size=2000 and
# chunk_overlap=200.
md_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN,
    chunk_size=2000,
    chunk_overlap=200,
)

# Split both document collections with the same splitter, website first.
website_chunks, gh_docs_chunks = [
    md_splitter.split_documents(docs)
    for docs in (c4_website_data_list, c4_gh_docs_data_list)
]




In [None]:
# Work on deep copies so the chunks produced by the splitter stay untouched,
# and overwrite each copy's 'source' with a positional id of the form "<n>-pl".
website_chunks_with_source = []
for position, chunk in enumerate(website_chunks):
    tagged_chunk = chunk.copy(deep=True)
    tagged_chunk.metadata['source'] = f"{position}-pl"
    website_chunks_with_source.append(tagged_chunk)

# Number of website chunks; the docs chunks continue numbering from here.
website_chunks_offset = len(website_chunks_with_source)
website_chunks_offset

In [None]:

# Deep copies, so the chunks produced by the splitter keep their original metadata.
gh_docs_chunks_with_source = [d.copy(deep=True) for d in gh_docs_chunks]

# Continue the "<n>-pl" numbering where the website chunks stopped, so the
# source ids do not collide across the two collections.
for i, d in enumerate(gh_docs_chunks_with_source, start=website_chunks_offset):
    d.metadata['source'] = f"{i}-pl"

len(gh_docs_chunks_with_source)

In [None]:
import weaviate
import os
from langchain.vectorstores import Weaviate

# Low-level client: API-key authentication against WEAVIATE_URL; the OpenAI key
# is forwarded in the X-OpenAI-Api-Key request header.
weaviate_auth = weaviate.AuthApiKey(api_key=WEAVIATE_API_KEY)
weaviate_client = weaviate.Client(
    url=WEAVIATE_URL,
    auth_client_secret=weaviate_auth,
    additional_headers={"X-OpenAI-Api-Key": OPENAI_API_KEY},
)

# NOTE: this rebinds the name `weaviate` from the imported module to the
# langchain vector store; the ingestion cell calls `weaviate.add_documents`.
weaviate = Weaviate(
    weaviate_client,
    WEAVIATE_CODEARENA_INDEX_NAME,
    text_key='text',
)


In [None]:
# Explicit class definition: vectorised with text2vec-openai (ada / 002).
# Only the 'text' property takes part in vectorisation; 'source' and 'url' are
# stored but marked "skip" for the vectoriser.
schema = {
    "classes": [
        {
            "class": WEAVIATE_CODEARENA_INDEX_NAME,
            "description": "CodeArena docs index",
            "vectorizer": "text2vec-openai",
            "moduleConfig": {
                "text2vec-openai": {
                    "model": "ada",
                    "modelVersion": "002",
                    "type": "text",
                }
            },
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "The content of the chunk",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": 'text',
                },
                {
                    "dataType": ["text"],
                    "description": "The source id of the chunk",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": 'source',
                },
                {
                    "dataType": ["text"],
                    "description": "The reference url of the chunk",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": 'url',
                },
            ],
        },
    ]
}

# schema.create() is rejected when the class already exists, which made this
# cell fail on every run after the first. Drop a class left over from an
# earlier run so the cell can be re-executed.
existing_class_names = {
    c["class"] for c in (weaviate_client.schema.get().get("classes") or [])
}
if WEAVIATE_CODEARENA_INDEX_NAME in existing_class_names:
    weaviate_client.schema.delete_class(WEAVIATE_CODEARENA_INDEX_NAME)

weaviate_client.schema.create(schema)

In [None]:
# Start from an empty class so re-running this cell does not append duplicates.
weaviate_client.schema.delete_class(WEAVIATE_CODEARENA_INDEX_NAME)

# Deleting the class also removes its explicit schema. Re-create it before
# ingesting; otherwise the documents would be written to a class without the
# vectoriser and per-property settings defined in `schema` (presumably an
# auto-generated one when Weaviate's auto-schema is enabled).
weaviate_client.schema.create(schema)

weaviate.add_documents(website_chunks_with_source + gh_docs_chunks_with_source)

In [None]:
# Smoke test: hybrid search over the new class, returning the four best hits
# with their text, source id and url.
query_result = (
    weaviate_client.query
    .get(WEAVIATE_CODEARENA_INDEX_NAME, ["text", "source", "url"])
    .with_hybrid(query="What is Scout")
    .with_limit(4)
    .do()
)

print(query_result)