In [1]:
from bs4 import BeautifulSoup
import chromadb
from collections import Counter
from collections import defaultdict
from langchain_text_splitters import HTMLHeaderTextSplitter, HTMLSectionSplitter
import os
from pathlib import Path
import sqlite3

In [2]:
def chunk_document(splitter, doc):
    documents = splitter.split_text(doc)
    
    grouped_docs = defaultdict(list)
    
    for doc in documents:
        key = " > ".join(doc.metadata.values())
        grouped_docs[key].append(doc.page_content)

    agg_grouped_docs = {}
    for key, values in grouped_docs.items():
        agg_grouped_docs[key] = "\n\n".join(values)
        
    return list(agg_grouped_docs.values())

In [3]:
def persist_documents():
    crdb = chromadb.PersistentClient(path="pagevector")
    collection = crdb.get_or_create_collection(name="reports", metadata={
        "hnsw:M": 32,
        "hnsw:search_ef": 100
    })

    headers_to_split_on = [("h1", "Main Topic"), ("h2", "Sub Topic")]
    splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    pagedata = Path("pagedata")
    for source in os.listdir(pagedata):
        print(f"Indexing documents for {source}")
        for page in os.listdir(pagedata / source):
            fullpath = os.path.join(pagedata / source / page)
            report_id = Path(page).stem

            documents = []
            with open(fullpath) as f:
                documents = chunk_document(splitter, f.read())

            ids = []
            crdb_docs = []
            for i, value in enumerate(documents):
                ids.append(f"{i}:{report_id}")
                crdb_docs.append(value)

            try:
                collection.upsert(
                    ids=ids,
                    documents=crdb_docs,
                )
            except Exception as e:
                raise Exception(f"error upserting for {source} {report_id}: {e}")

        print(f"Finished indexing documents for {source}. Total {collection.count()} documents")

    print(f"Indexed {collection.count()} documents")

In [4]:
def compute_similarities(sq_conn):
    read_cur = sq_conn.cursor()
    update_cur = sq_conn.cursor()

    crdb = chromadb.PersistentClient(path="pagevector")
    collection = crdb.get_collection("reports")

    headers_to_split_on = [("h1", "Main Topic"), ("h2", "Sub Topic")]
    splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    count = 0
    for row in read_cur.execute("SELECT id, source FROM report"):
        report_id, source = row
        
        pagepath = os.path.join("pagedata", source, f"{report_id}.html")
        with open(pagepath) as f:
            contents = f.read()
            chunks = chunk_document(splitter, contents)

            results = collection.query(query_texts=chunks, n_results=5)
            ids = results["ids"]
            result_report_ids = [rid.split(":")[1] for id_set in ids for rid in id_set]
            filtered_report_ids = [rid for rid in result_report_ids if rid != report_id]

            result_count = Counter(filtered_report_ids)
            top_match_ids = [match[0] for match in result_count.most_common(10)]

            with sq_conn:
                update_cur.execute("""
                    UPDATE report SET
                        related_report_ids = ?
                    WHERE id = ?
                """, (",".join(top_match_ids), report_id,))
                
                if update_cur.rowcount != 1:
                    raise Exception(f"update_cur.rowcount = {update_cur.rowcount}")
            
            count += 1

    print(f"Computed similarities for {count} rows")

In [5]:
if __name__ == '__main__':
    persist_documents()
    
    sq_conn = sqlite3.connect("reports.db")
    try:
        compute_similarities(sq_conn)
    finally:
        sq_conn.close()

Indexing documents for SentinelOne
Finished indexing documents for SentinelOne. Total 1949 documents
Indexing documents for Check Point Research
Finished indexing documents for Check Point Research. Total 1949 documents
Indexing documents for Subscribe to Security Magazine's RSS Feed
Finished indexing documents for Subscribe to Security Magazine's RSS Feed. Total 2087 documents
Indexing documents for Microsoft Security Blog
Finished indexing documents for Microsoft Security Blog. Total 2101 documents
Indexing documents for abuse.ch | IT-Security Blog
Finished indexing documents for abuse.ch | IT-Security Blog. Total 2102 documents
Indexing documents for All CISA Advisories
Finished indexing documents for All CISA Advisories. Total 2148 documents
Indexing documents for Unit 42
Finished indexing documents for Unit 42. Total 2148 documents
Indexing documents for Blog
Finished indexing documents for Blog. Total 2171 documents
Indexing documents for WeLiveSecurity
Finished indexing document