In [1]:
from bs4 import BeautifulSoup
import chromadb
from collections import defaultdict
from langchain_text_splitters import HTMLHeaderTextSplitter, HTMLSectionSplitter
import os
from pathlib import Path

In [2]:
def chunk_document(splitter, doc):
    documents = splitter.split_text(doc)
    
    grouped_docs = defaultdict(list)
    
    for doc in documents:
        key = " > ".join(doc.metadata.values())
        grouped_docs[key].append(doc.page_content)

    agg_grouped_docs = {}
    for key, values in grouped_docs.items():
        agg_grouped_docs[key] = "\n\n".join(values)
        
    return agg_grouped_docs.values()

In [5]:
def persist_documents():
    crdb = chromadb.PersistentClient(path="pagevector")
    collection = crdb.get_or_create_collection(name="reports")

    headers_to_split_on = [("h1", "Main Topic"), ("h2", "Sub Topic")]
    splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    pagedata = Path("pagedata")
    for source in os.listdir(pagedata):
        print(f"Indexing documents for {source}")
        for page in os.listdir(pagedata / source):
            fullpath = os.path.join(pagedata / source / page)
            report_id = Path(page).stem

            documents = []
            with open(fullpath) as f:
                documents = chunk_document(splitter, f.read())

            ids = []
            crdb_docs = []
            for i, value in enumerate(documents):
                ids.append(f"{i}:{report_id}")
                crdb_docs.append(value)

            try:
                collection.upsert(
                    ids=ids,
                    documents=crdb_docs,
                )
            except Exception as e:
                raise Exception(f"error upserting for {source} {report_id}: {e}")

        print(f"Finished indexing documents for {source}. Total {collection.count()} documents")

    print(f"Indexed {collection.count()} documents")

In [7]:
if __name__ == '__main__':
    persist_documents()

Indexing documents for SentinelOne
Finished indexing documents for SentinelOne. Total 1940 documents
Indexing documents for Check Point Research
Finished indexing documents for Check Point Research. Total 1940 documents
Indexing documents for Subscribe to Security Magazine's RSS Feed
Finished indexing documents for Subscribe to Security Magazine's RSS Feed. Total 1940 documents
Indexing documents for Microsoft Security Blog
Finished indexing documents for Microsoft Security Blog. Total 1940 documents
Indexing documents for abuse.ch | IT-Security Blog
Finished indexing documents for abuse.ch | IT-Security Blog. Total 1940 documents
Indexing documents for All CISA Advisories
Finished indexing documents for All CISA Advisories. Total 1940 documents
Indexing documents for Unit 42
Finished indexing documents for Unit 42. Total 1940 documents
Indexing documents for Blog
Finished indexing documents for Blog. Total 1940 documents
Indexing documents for WeLiveSecurity
Finished indexing document