## 0. Setup

Make sure to check the dependencies in the requirements.txt file.

In [1]:
import os
import joblib

import torch
from dotenv import load_dotenv

from pymilvus.model.hybrid import BGEM3EmbeddingFunction
from pymilvus.model.sparse.bm25 import BM25EmbeddingFunction
from pymilvus import (
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
    connections,
)

from ContentProcessor import ContentDocProcessor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load variables from a .env file into the process environment; the Zilliz
# connection settings used later in this notebook are read with os.getenv.
load_dotenv()

True

## 1. HTML to Split Documents Conversion

First, run the CourseScraper to scrape the submodule HTML contents. If running from the project root directory, the command would look like this:
```bash
python ETL/CourseScraper.py --input_json course_materials/emgt605/module_urls.json --ouput_dir course_materials/emgt605/module_content
```

Then we will load the output JSON files from the `module_content` directory and convert the HTML content into documents (dict) with the text split into sections. 

In [4]:
# Directory containing the JSON files written by CourseScraper.py
JSON_DIR = "../course_materials/emgt605/module_content"

# CSS selectors of the elements to exclude when extracting text from the HTML content
EXCLUDED_ELEMENTS_CSS='div.quiz-card__feedback, div.block-knowledge__retake-container, iframe, img, a'
# Chunking parameters handed to ContentDocProcessor (size and overlap, in tokens)
CHUNK_TOKEN_SIZE = 500
CHUNK_TOKEN_OVERLAP = 50

# Process every JSON file in JSON_DIR and collect all the resulting documents.
# os.listdir() returns entries in arbitrary order, so the file names are sorted to
# keep the order of combined_docs reproducible from one run to the next.
combined_docs = []
json_files = sorted(f for f in os.listdir(JSON_DIR) if f.endswith('.json'))
processor = ContentDocProcessor(
    excluded_elements_css=EXCLUDED_ELEMENTS_CSS,
    chunk_token_size=CHUNK_TOKEN_SIZE,
    chunk_token_overlap=CHUNK_TOKEN_OVERLAP,
)
for json_file in json_files:
    json_path = os.path.join(JSON_DIR, json_file)
    docs = processor.run(json_path)
    combined_docs.extend(docs)

## 2. Document Embeddings and Vector DB Loading

We will use the plain chunk text for the sparse (BM25) embedding and the same text prefixed with a contextual header for the dense embedding.

In [None]:
def add_contextual_headers(doc):
    """Return the document text prefixed with a one-line contextual header.

    The header is built from the ``module_title``, ``subsection`` and
    ``submodule_title`` entries of ``doc`` and is separated from
    ``doc['text']`` by a single newline.
    """
    header = "Content snippet of: {} - {}: {}".format(
        doc['module_title'], doc['subsection'], doc['submodule_title']
    )
    return "\n".join((header, doc['text']))

In [5]:
# Sparse (BM25) input: the chunk text exactly as produced by the processor.
sparse_doc_texts = [chunk['text'] for chunk in combined_docs]

# Dense input: the same chunks, each prefixed with its contextual header.
dense_doc_texts = list(map(add_contextual_headers, combined_docs))

Initialize the dense embedding model. Note that using GPU is highly recommended for this task as it will be much faster.

In [3]:
# Use the GPU when PyTorch reports one as available; otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# BGE-M3 embedding function configured to return dense vectors only.
dense_embeddings = BGEM3EmbeddingFunction(
    device=DEVICE,
    use_fp16=False,
    return_dense=True,
    return_sparse=False,
)
# Dimensionality of the dense vectors; reused when declaring the collection schema.
dense_dim = dense_embeddings.dim['dense']

Fetching 30 files: 100%|██████████| 30/30 [00:00<?, ?it/s]


Fit the BM25 sparse embedding function on the document texts and save it to disk so it can be reused later.

In [None]:
# Location of the serialized BM25 embedding function
SPARSE_EMBEDDINGS_PATH = "../artifact/emgt605/sparse_embeddings.joblib"

# Build the BM25 embedding function from the plain document texts.
sparse_embeddings = BM25EmbeddingFunction(corpus=sparse_doc_texts)

# Make sure the target folder exists, then persist the embedding function with joblib.
folder_path = os.path.dirname(SPARSE_EMBEDDINGS_PATH)
os.makedirs(name=folder_path, exist_ok=True)
joblib.dump(value=sparse_embeddings, filename=SPARSE_EMBEDDINGS_PATH)

Create the dense and sparse vectors for the documents

In [None]:
# Encode the whole corpus once with each embedding function.
sparse_vectors = sparse_embeddings.encode_documents(sparse_doc_texts)
dense_vectors = dense_embeddings.encode_documents(dense_doc_texts)

# Attach the vectors to each document dict in place, matched by position.
for i in range(len(combined_docs)):
    doc = combined_docs[i]
    # NOTE(review): indexing with the list [i] presumably keeps the selected
    # row two-dimensional — confirm against the sparse matrix type returned.
    doc['sparse_vector'] = sparse_vectors[[i], :]
    doc['dense_vector'] = dense_vectors['dense'][i]

Make a connection to the Zilliz vector database and load the embeddings into the vector database.

In [9]:
# Connection parameters for the Zilliz vector database, read from environment variables
ZILLIZ_URI = os.environ.get("ZILLIZ_URI")
ZILLIZ_USER = os.environ.get("ZILLIZ_USER")
ZILLIZ_PASSWORD = os.environ.get("ZILLIZ_PASSWORD")
# Name of the collection that stores the embedded course content
COLLECTION_NAME = "emgt_605_bge_bm25_500_50"

In [None]:
connections.connect(uri=ZILLIZ_URI, user=ZILLIZ_USER, password=ZILLIZ_PASSWORD)

# The four metadata columns share one definition: VARCHAR with max_length=500.
metadata_fields = [
    FieldSchema(name=field_name, dtype=DataType.VARCHAR, max_length=500)
    for field_name in ("module_title", "subsection", "submodule_title", "submodule_url")
]

fields = [
    # String primary key with auto_id enabled
    FieldSchema(
        name="pk",
        dtype=DataType.VARCHAR,
        is_primary=True,
        auto_id=True,
        max_length=100,
    ),
    # Chunk text
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=8192),
    # One sparse and one dense vector per chunk
    FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
    FieldSchema(name="dense_vector", dtype=DataType.FLOAT_VECTOR, dim=dense_dim),
    *metadata_fields,
    FieldSchema(name="start_index", dtype=DataType.INT32),
    FieldSchema(name="data_block_ranges", dtype=DataType.JSON),
]

schema = CollectionSchema(
    fields,
    description="Dense (BGE-M3) and Sparse (BM25) Embeddings for EMGT605 Course Content",
)
col = Collection(COLLECTION_NAME, schema)

# One index per vector field: inverted index with inner product for the sparse
# vectors, FLAT index with cosine similarity for the dense vectors.
sparse_index = dict(index_type="SPARSE_INVERTED_INDEX", metric_type="IP")
dense_index = dict(index_type="FLAT", metric_type="COSINE")
col.create_index("sparse_vector", sparse_index)
col.create_index("dense_vector", dense_index)

# NOTE(review): if the collection already exists, re-running this cell presumably
# inserts the same documents a second time — confirm before re-executing.
col.insert(combined_docs)
col.load()

## 3. Retrieval Testing

Check that the documents have been loaded correctly into the vector database. Note that we initialize the retriever with the pymilvus SDK instead of Langchain because Langchain does not currently support the BM25 retriever.

In [10]:
# Connect again and get a handle to the collection by name (no schema is passed here).
connections.connect(uri=ZILLIZ_URI, user=ZILLIZ_USER, password=ZILLIZ_PASSWORD)
col = Collection(name=COLLECTION_NAME)

In [13]:
# Sample question used to test both the dense and the sparse retrieval below
query = "What are the core pillars for sustainability?"

In [37]:
# Embed the query with the dense model, then fetch the top 3 matches on the
# dense vector field using the COSINE metric, returning only the chunk text.
dense_query = dense_embeddings.encode_queries([query])['dense']
dense_results = col.search(
    data=dense_query,
    anns_field="dense_vector",
    param={"metric_type": "COSINE"},
    limit=3,
    output_fields=['text'],
)
# Uncomment to inspect the hits returned for the query:
# print(dense_results[0])

In [43]:
# Encode the query with the BM25 function, then fetch the top 3 matches on the
# sparse vector field using the IP metric, returning only the chunk text.
sparse_query = sparse_embeddings.encode_queries([query])
sparse_results = col.search(
    data=sparse_query,
    anns_field="sparse_vector",
    param={"metric_type": "IP"},
    limit=3,
    output_fields=['text'],
)
# Uncomment to inspect the hits returned for the query:
# print(sparse_results[0])