## 0. Setup

Before running this notebook, make sure the dependencies listed in the `requirements.txt` file are installed.

In [9]:
import os
import joblib

import torch
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_milvus.vectorstores import Milvus
from langchain_milvus.utils.sparse import BM25SparseEmbedding
from pymilvus import connections, Collection

from ContentProcessor import ContentDocProcessor

In [2]:
# Load environment variables from a .env file; the ZILLIZ_* settings are read
# later in this notebook with os.getenv().
load_dotenv()

True

## 1. HTML to LangChain Document Conversion

First, run the CourseScraper to scrape the submodule HTML contents. If running from the project root directory, the command would look like this:
```bash
python ETL/CourseScraper.py --input_json course_materials/emgt605/module_urls.json --output_dir course_materials/emgt605/module_content
```

Then we will load the output JSON files from the `module_content` directory and convert them into LangChain documents.

In [3]:
# Path to the directory containing the JSON files written by CourseScraper.py
JSON_DIR = "../course_materials/emgt605/module_content"

# CSS selectors for the elements to exclude when extracting text from the HTML content
EXCLUDED_ELEMENTS_CSS='div.quiz-card__feedback, div.block-knowledge__retake-container, iframe, img, a'

# Process every JSON file in JSON_DIR and collect all the resulting Documents.
# os.listdir() returns entries in arbitrary order, so the file names are sorted
# to keep the document order reproducible across runs and machines.
combined_docs = []
json_files = sorted(f for f in os.listdir(JSON_DIR) if f.endswith('.json'))
if not json_files:
    # Later cells need a non-empty corpus; stop here with a clear message
    # instead of failing further down in the notebook.
    raise FileNotFoundError(
        f"No .json files found in {JSON_DIR!r}; run CourseScraper.py first.")

processor = ContentDocProcessor(excluded_elements_css=EXCLUDED_ELEMENTS_CSS)
for json_file in json_files:
    json_path = os.path.join(JSON_DIR, json_file)
    docs = processor.run(json_path)
    combined_docs.extend(docs)

## 2. Document Embeddings and Vector DB Loading

Initialize the dense embedding model. Using a GPU is highly recommended for this step, as embedding will be much faster than on a CPU.

In [11]:
# Run the embedding model on the GPU when CUDA is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
EMBEDDING_MODEL_NAME = "jinaai/jina-embeddings-v3"

# Keyword arguments forwarded to the underlying model by HuggingFaceEmbeddings
model_kwargs = dict(device=DEVICE, trust_remote_code=True)
dense_embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs=model_kwargs,
)

Initialize the BM25 sparse embeddings and save them to disk.

In [4]:
# File the BM25 sparse embedding object is written to
SPARSE_EMBEDDINGS_PATH = "sparse_embeddings.joblib"

# Build the BM25 embedding from the raw text of every document collected in step 1
combined_texts = list(map(lambda document: document.page_content, combined_docs))
sparse_embeddings = BM25SparseEmbedding(combined_texts)

# Persist the BM25 object to disk; joblib.dump returns the list of written
# file names, which is displayed as this cell's output
joblib.dump(sparse_embeddings, SPARSE_EMBEDDINGS_PATH)

  from .autonotebook import tqdm as notebook_tqdm


['sparse_embeddings.joblib']

Make a connection to the Zilliz vector database and load the embeddings into the vector database.

In [13]:
# Connection parameters for the Zilliz vector database, read from the environment
ZILLIZ_URI = os.getenv("ZILLIZ_URI")
ZILLIZ_USER = os.getenv("ZILLIZ_USER")
ZILLIZ_PASSWORD = os.getenv("ZILLIZ_PASSWORD")
COLLECTION_NAME = "emgt_605_jina_hybrid"

# os.getenv() returns None for unset variables; fail fast with a readable
# message instead of passing None into the connection arguments.
missing_env_vars = [
    name
    for name, value in (
        ("ZILLIZ_URI", ZILLIZ_URI),
        ("ZILLIZ_USER", ZILLIZ_USER),
        ("ZILLIZ_PASSWORD", ZILLIZ_PASSWORD),
    )
    if not value
]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_env_vars))

# Initialize the Milvus vector store through the LangChain Python SDK, pairing
# each embedding function with its vector field (dense first, then sparse)
vector_db = Milvus(
    embedding_function=[dense_embeddings, sparse_embeddings],
    vector_field=["dense_vectors", "sparse_vectors"],
    connection_args={
        "uri": ZILLIZ_URI,
        "user": ZILLIZ_USER,
        "password": ZILLIZ_PASSWORD,
        "secure": True},
    collection_name=COLLECTION_NAME,
    auto_id=True)

In [None]:
# Add the documents from step 1 to the vector database.
# NOTE(review): the vector store is configured with auto_id=True, so re-running
# this cell presumably inserts the same documents again under new IDs — run it
# once per collection (TODO confirm against the collection contents).
inserted_ids = vector_db.add_documents(combined_docs)

# Display only the number of inserted documents instead of dumping every generated ID
len(inserted_ids)

## 3. Retrieval Testing

Check whether the documents have been loaded correctly into the vector database. Note that we initialize the retriever using the pymilvus SDK instead of LangChain because LangChain does not currently support the BM25 retriever.

In [12]:
# Open a pymilvus connection with the same credentials as above, then get a
# handle on the collection that holds the loaded documents
connections.connect(
    uri=ZILLIZ_URI,
    user=ZILLIZ_USER,
    password=ZILLIZ_PASSWORD,
)
col = Collection(name=COLLECTION_NAME)

In [56]:
# Sample question used to exercise both the dense and the sparse search below
query = "What are the core pillars for sustainability?"

In [59]:
# Embed the question with the dense model and fetch the 3 nearest documents
# from the dense vector field using the L2 metric
dense_query = dense_embeddings.embed_query(query)
dense_results = col.search(
    data=[dense_query],
    anns_field="dense_vectors",
    param={"metric_type": "L2"},
    limit=3,
    output_fields=['text'],
)
# To inspect the hits for the query, uncomment:
# print(dense_results[0])

In [60]:
# Embed the question with the BM25 model and fetch the 3 best-matching documents
# from the sparse vector field using the inner-product (IP) metric
sparse_query = sparse_embeddings.embed_query(query)
sparse_results = col.search(
    data=[sparse_query],
    anns_field="sparse_vectors",
    param={"metric_type": "IP"},
    limit=3,
    output_fields=['text'],
)
# To inspect the hits for the query, uncomment:
# print(sparse_results[0])