### High-level Approach
- Crawl and scrape C4’s website and docs using the Scrapy library
- Convert the HTML content to Markdown so that the model can better understand the context
- Use the LangChain library to do the following:
    - Split the Markdown (header-separated) sections into semantic chunks
    - Embed and store the semantic chunks in the Weaviate vector database
    - Use retrieval-augmented generation to answer questions

In [None]:
# Install all the third-party packages
#
# Every line below is an IPython shell escape (`!`), so this cell only works
# when executed inside a Jupyter/IPython kernel, not as a plain Python script.
# NOTE(review): faiss-cpu, chromadb, pandas and rank_bm25 are installed here,
# but the cells visible in this notebook only talk to Weaviate — confirm
# whether they are still needed.

!pip install 'langchain[llms]'
!pip install Scrapy
!pip install html2text
!pip install lxml
!pip install python-dotenv
!pip install "unstructured[all-docs]"
!pip install tiktoken
!pip install faiss-cpu 
!pip install GitPython
!pip install notebook
!pip install chromadb
!pip install pandas
!pip install rank_bm25
!pip install weaviate-client

In [None]:
import logging
from dotenv import load_dotenv
from IPython.display import display, Markdown, Latex

# Set the root logger to INFO so that INFO-level messages are not filtered out.
logging.getLogger().setLevel(logging.INFO)
# Load variables from a local .env file into the process environment
# (python-dotenv); the keys read from os.environ in the next cell are
# presumably provided this way — TODO confirm.
load_dotenv()

In [None]:
import getpass
import os

# Secrets and endpoints are taken from the environment. Indexing os.environ
# directly means a missing variable raises KeyError right here instead of
# surfacing later as a failed request.
# NOTE(review): `getpass` is imported above but not used in the visible cells.
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']

# Connection details for the Weaviate instance used as the vector store.
WEAVIATE_URL = os.environ['WEAVIATE_URL']
WEAVIATE_API_KEY = os.environ['WEAVIATE_API_KEY']


In [None]:
# Paths to the data
#
# Everything the notebook downloads lives under one base directory:
#   - C4_WEBSITE_STORAGE_DIR: JSON files written by the website crawler
#   - C4_GH_DOCS_STORAGE_DIR: local clone of the docs GitHub repository

BASE_DATA_DIR = "./__data__"
C4_WEBSITE_STORAGE_DIR = os.path.join(BASE_DATA_DIR, "c4_website")
C4_GH_DOCS_STORAGE_DIR = os.path.join(BASE_DATA_DIR, "c4_gh_docs")

# exist_ok=True keeps the cell idempotent on re-runs and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(BASE_DATA_DIR, exist_ok=True)

### Download Knowledge Base data

#### Crawling and Scraping C4 website using Scrapy

In [None]:
import os
import scrapy
import html2text
import lxml.html
import json
from urllib.parse import urlparse
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor


class GenericSpider(scrapy.Spider):
    """Crawl one domain and store every text page as Markdown wrapped in JSON.

    For each crawled page the spider removes non-text elements, converts the
    remaining HTML to Markdown with html2text and writes a JSON file with the
    keys ``url`` and ``md_content`` into ``storage_dir``. Links found on the
    page are followed as long as their host is one of ``allowed_domains``.
    """

    name = 'generic'

    # Elements that carry no useful text for the knowledge base.
    _NON_TEXT_XPATH = (
        '//script|//img|//video|//audio|//iframe|//object|//embed|//canvas'
        '|//svg|//link|//source|//track|//map|//area'
    )

    def __init__(self, domain='', storage_dir='.', *args, **kwargs):
        """Configure the crawl.

        Args:
            domain: Host name to crawl; also the only allowed domain.
            storage_dir: Directory the per-page JSON files are written to.
            *args, **kwargs: Passed through to ``scrapy.Spider``.
        """
        super().__init__(*args, **kwargs)
        self.allowed_domains = [domain]
        self.start_urls = [f'http://{domain}/']
        self.storage_dir = storage_dir

    @staticmethod
    def _page_name(url):
        """Return a filesystem-safe file stem derived from the whole URL path.

        Using the full path (plus query string) instead of only the last
        segment keeps pages such as ``/audits/foo`` and ``/reports/foo`` from
        overwriting each other, and keeps ``/section/`` from overwriting the
        homepage's ``index`` file. Distinct URLs can still map to the same
        name in rare cases (e.g. ``/a/b`` and ``/a_b``).
        """
        parsed = urlparse(url)
        raw = parsed.path.strip('/')
        if parsed.query:
            raw = f'{raw}_{parsed.query}'
        # Anything outside [alnum . _ -] (path separators, '?', '&', ...) is
        # replaced so the result is a single valid file name.
        safe = ''.join(ch if ch.isalnum() or ch in '._-' else '_' for ch in raw)
        return safe.strip('_.') or 'index'

    def parse(self, response):
        """Persist one page as Markdown and yield requests for on-domain links.

        Args:
            response: The Scrapy response for the crawled URL.

        Yields:
            ``scrapy.Request`` objects for every link whose host is in
            ``allowed_domains``.
        """
        # Binary responses (PDFs, images, ...) expose neither text nor
        # selectors, and lxml rejects empty documents; skip both quietly
        # instead of raising inside the callback.
        if not isinstance(response, scrapy.http.TextResponse) or not response.text.strip():
            return

        # Remove non-text related tags using lxml
        tree = lxml.html.fromstring(response.text)
        for unwanted in tree.xpath(self._NON_TEXT_XPATH):
            unwanted.drop_tree()
        cleaned_html = lxml.html.tostring(tree).decode('utf-8')

        # Convert HTML to Markdown
        markdown_text = html2text.HTML2Text().handle(cleaned_html)

        # Save to a JSON file in the specified directory
        os.makedirs(self.storage_dir, exist_ok=True)
        filename = os.path.join(
            self.storage_dir, f'{self._page_name(response.url)}.json'
        )
        with open(filename, 'w', encoding='utf-8') as f:
            # Store the URL next to the Markdown so the source link survives
            # independently of the file name.
            json.dump({'url': response.url, 'md_content': markdown_text}, f)

        # Recursively follow links to other pages on the same domain
        for href in response.css('a::attr(href)').getall():
            next_url = response.urljoin(href)
            if urlparse(next_url).netloc in self.allowed_domains:
                yield scrapy.Request(next_url, self.parse)



# Run the spider inside the notebook process: a CrawlerRunner schedules the
# crawl and the Twisted reactor drives it.
settings = get_project_settings()
runner = CrawlerRunner(settings)
# The keyword arguments match GenericSpider.__init__ (domain, storage_dir).
runner.crawl(GenericSpider, domain="code4rena.com", storage_dir=C4_WEBSITE_STORAGE_DIR)
# join() returns a deferred for the scheduled crawl(s); addBoth stops the
# reactor on success and on failure alike, so reactor.run() below returns.
d = runner.join()
d.addBoth(lambda _: reactor.stop())
# NOTE(review): a Twisted reactor generally cannot be started a second time in
# the same process, so re-running this cell probably requires a kernel
# restart — confirm.
reactor.run()

#### Get docs from the GitHub repo

In [None]:
from git import Repo

# Clone the docs repository into the local data directory so that its files
# can be loaded from disk in the next section.
# NOTE(review): git normally refuses to clone into an existing non-empty
# directory, so re-running this cell after a successful clone likely fails
# unless C4_GH_DOCS_STORAGE_DIR is removed first — confirm.
repo = Repo.clone_from(
    "https://github.com/code-423n4/docs", to_path=C4_GH_DOCS_STORAGE_DIR
)

### Retrieval Augmented Generation using LangChain

#### Load locally saved scraped data

In [None]:
import json
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

def load_json_files(dir):
    """Load the scraped-page JSON files found in *dir* as documents.

    Each file is read as plain text and then parsed as JSON. The document's
    content is replaced by the ``md_content`` field, and the ``url`` field is
    copied into the document's metadata.

    Returns:
        The list of loaded documents, modified in place as described above.
    """
    docs = DirectoryLoader(dir, loader_cls=TextLoader).load()
    for doc in docs:
        payload = json.loads(doc.page_content)
        doc.page_content = payload['md_content']
        doc.metadata['url'] = payload['url']
    return docs


c4_website_data_list = load_json_files(C4_WEBSITE_STORAGE_DIR)

In [None]:
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader

import os  # already imported by earlier cells; repeated so this cell stands alone

loader = DirectoryLoader(C4_GH_DOCS_STORAGE_DIR, loader_cls=TextLoader)
c4_gh_docs_data_list = loader.load()


def _docs_url_for(local_path, docs_root, base_url="https://docs.code4rena.com"):
    """Map a file inside the cloned docs repo to its public docs URL.

    Args:
        local_path: Path of the loaded file (the document's ``source``).
        docs_root: Directory the docs repository was cloned into.
        base_url: Root of the published documentation site.

    Returns:
        ``base_url`` followed by the file's path relative to ``docs_root``,
        where a trailing ``README.md`` / ``SUMMARY.md`` is dropped (the URL
        then points at the containing folder) and a trailing ``.md`` is
        stripped from any other file.
    """
    # relpath compares the resolved paths, so it works whether or not the
    # loader reports the path with the same spelling as docs_root (e.g. a
    # normalised path without the leading "./"). A plain str.replace silently
    # leaves the local prefix in the URL when the spellings differ.
    rel_path = os.path.relpath(local_path, docs_root).replace(os.sep, "/")

    parent, _, filename = rel_path.rpartition("/")
    if filename in ("README.md", "SUMMARY.md"):
        rel_path = parent
    elif rel_path.endswith(".md"):
        # Strip the extension only at the end, never in the middle of a path.
        rel_path = rel_path[: -len(".md")]

    return f"{base_url}/{rel_path}" if rel_path else base_url


for d in c4_gh_docs_data_list:
    d.metadata['url'] = _docs_url_for(d.metadata['source'], C4_GH_DOCS_STORAGE_DIR)


#### Split markdown formatted data into chunks

In [None]:
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    Language,
)

# Splitter configured for Markdown via Language.MARKDOWN.
# chunk_size / chunk_overlap are measured by the splitter's length function —
# presumably characters (the library default); TODO confirm.
md_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN, chunk_size=2000, chunk_overlap=200
)


# Both corpora go through the same splitter, so website and docs chunks share
# the same size and overlap settings.
website_chunks =  md_splitter.split_documents(c4_website_data_list)
gh_docs_chunks = md_splitter.split_documents(c4_gh_docs_data_list)

In [None]:
# Build deep copies of the website chunks so the originals keep their
# metadata, and overwrite each copy's 'source' with a running id "<index>-pl".
website_chunks_with_source = []
for i, chunk in enumerate(website_chunks):
    d = chunk.copy(deep=True)
    d.metadata['source'] = f"{i}-pl"
    website_chunks_with_source.append(d)

# Number of ids used so far; the docs chunks continue numbering from here.
website_chunks_offset = len(website_chunks_with_source)
website_chunks_offset

In [None]:

# Deep-copy the docs chunks so the originals keep their file-path 'source'.
gh_docs_chunks_with_source = [d.copy(deep=True) for d in gh_docs_chunks]

# Continue the "<index>-pl" numbering where the website chunks stopped, so the
# ids stay unique across both corpora. enumerate(start=...) replaces the
# manual offset arithmetic; the previously unused read of the old 'source'
# value is dropped.
for i, d in enumerate(gh_docs_chunks_with_source, start=website_chunks_offset):
    d.metadata['source'] = f"{i}-pl"

len(gh_docs_chunks_with_source)

### Populate Weaviate Vector DB

In [None]:
import weaviate
import os
from langchain.vectorstores import Weaviate

# Name of the Weaviate class (index) that holds all chunks.
WEAVIATE_CODEARENA_INDEX_NAME = "CodeArenaDocsV1"

# Low-level Weaviate client, authenticated with the API key. The OpenAI key is
# sent as an extra header — presumably so the server-side text2vec-openai
# module can call OpenAI on our behalf; TODO confirm.
weaviate_client = weaviate.Client(
    url=WEAVIATE_URL,
    auth_client_secret=weaviate.AuthApiKey(api_key=WEAVIATE_API_KEY),
    additional_headers={"X-OpenAI-Api-Key": OPENAI_API_KEY},
)
# LangChain vector-store wrapper around the client; chunk text is stored under
# the 'text' property.
# NOTE: this rebinds the name `weaviate` from the imported module to the
# vector-store object. Later cells use `weaviate` as the store; the module is
# only reachable under that name again after this cell's import is re-run.
weaviate = Weaviate(weaviate_client, WEAVIATE_CODEARENA_INDEX_NAME, text_key='text')

In [None]:
# Weaviate schema with a single class. Vectors are produced by the
# text2vec-openai module; only the 'text' property is vectorized, while
# 'source' and 'url' are stored but skipped by the vectorizer.
schema = {
    "classes": [
        {
            "class": WEAVIATE_CODEARENA_INDEX_NAME,
            "description": "CodeArena docs index",
            "vectorizer": "text2vec-openai",
            # Embedding model selection passed to the vectorizer module.
            "moduleConfig": {
                "text2vec-openai": {
                    "model": "ada",
                    "modelVersion": "002",
                    "type": "text",
                }
            },
            "properties": [
                {
                    # Chunk body: the only property that is vectorized.
                    "dataType": ["text"],
                    "description": "The content of the chunk",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": 'text',
                },
                {
                    # Chunk id: stored only, skipped by the vectorizer.
                    "dataType": ["text"],
                    "description": "The source id of the chunk",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": 'source',
                },
                {
                    # Public URL of the page: stored only, skipped by the vectorizer.
                    "dataType": ["text"],
                    "description": "The reference url of the chunk",
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": True,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": 'url',
                },
            ],
        },
    ]
}


In [None]:

# Rebuild the index from scratch: drop the class, recreate it from `schema`,
# then upload the website and docs chunks together.
# NOTE(review): the delete is destructive — presumably it also removes every
# object stored under the class — and its behaviour when the class does not
# exist yet (first run) should be confirmed.
weaviate_client.schema.delete_class(WEAVIATE_CODEARENA_INDEX_NAME)
weaviate_client.schema.create(schema)
weaviate.add_documents(website_chunks_with_source + gh_docs_chunks_with_source)

#### Run test queries on the populated vector db

In [None]:
# Smoke test: hybrid search over the populated class, returning the stored
# text, source id and URL of the top 4 matches.
query_result = (
    weaviate_client.query
    .get(WEAVIATE_CODEARENA_INDEX_NAME, ["text", "source", "url"])
    .with_hybrid(query="What is Scout")
    .with_limit(4)
    .do()
)

print(query_result)