## 0. Setup

Make sure to check the dependencies in the requirements.txt file.

In [1]:
import os

import torch
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_milvus.vectorstores import Milvus

from ContentProcessor import ContentDocProcessor

In [2]:
# Load variables from a .env file so the ZILLIZ_* settings read later via os.getenv are available
load_dotenv()

True

## 1. HTML to LangChain Document Conversion

First, run the CourseScraper to scrape the submodule HTML contents. If running from the project root directory, the command would look like this:
```bash
python ETL/CourseScraper.py --input_json course_materials/emgt605/module_urls.json --ouput_dir course_materials/emgt605/module_content
```

Then we will load the output JSON files from the `module_content` directory and convert them into LangChain documents.

In [29]:
# The path to the directory containing the JSON files output by CourseScraper.py
JSON_DIR = "../course_materials/emgt605/module_content"

# The CSS elements to exclude when extracting text from the HTML content
EXCLUDED_ELEMENTS_CSS='div.quiz-card__feedback, div.block-knowledge__retake-container, iframe, img, a'

# Process every JSON file in JSON_DIR and collect all the resulting Documents.
# os.listdir() returns entries in arbitrary order, so the file names are sorted
# to keep the order of combined_docs reproducible across runs and machines.
json_files = sorted(f for f in os.listdir(JSON_DIR) if f.endswith('.json'))
processor = ContentDocProcessor(excluded_elements_css=EXCLUDED_ELEMENTS_CSS)
combined_docs = []
for json_file in json_files:
    json_path = os.path.join(JSON_DIR, json_file)
    # One processor run per scraped JSON file; its results are appended in file order
    docs = processor.run(json_path)
    combined_docs.extend(docs)

## 2. Document Embeddings and Vector DB Loading

Initialize the embedding model. Note that using a GPU is highly recommended for this task, as it will be much faster.

In [None]:
# Name of the embedding model to load
EMBEDDING_MODEL_NAME = "jinaai/jina-embeddings-v3"

# Load and run the embedding model on the GPU when CUDA is available, otherwise on the CPU
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# NOTE(review): trust_remote_code=True presumably lets the model's own code run on load — confirm it is required
model_kwargs = dict(device=DEVICE, trust_remote_code=True)
dense_embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs=model_kwargs,
)

In [7]:
# Name of the collection used in the Zilliz vector database
COLLECTION_NAME = "emgt_605_jina"

# Connection parameters for the Zilliz vector database, read from environment variables
ZILLIZ_URI = os.getenv("ZILLIZ_URI")
ZILLIZ_USER = os.getenv("ZILLIZ_USER")
ZILLIZ_PASSWORD = os.getenv("ZILLIZ_PASSWORD")

# Initialize the Milvus vector database client with the dense embedding model
vector_db = Milvus(
    embedding_function=[dense_embeddings],
    connection_args=dict(
        uri=ZILLIZ_URI,
        user=ZILLIZ_USER,
        password=ZILLIZ_PASSWORD,
        secure=True,
    ),
    collection_name=COLLECTION_NAME,
    auto_id=True,
)

In [None]:
# Add the collected documents to the vector database.
# NOTE(review): the store is created with auto_id=True, so re-running this cell
# presumably inserts the same documents again as duplicates — confirm before re-running.
vector_db.add_documents(combined_docs)

Check that the documents have been loaded correctly into the vector database.

In [None]:
# Build a retriever on top of the vector database using its default settings
retriever = vector_db.as_retriever()

# Run a sample query as a sanity check on the loaded collection
query = "What are different ways to calculate GDP?"
results = retriever.invoke(query)

# Show the retrieved documents; ending the cell on an assignment would display
# nothing, leaving the check impossible to inspect
results