## 0. Setup

In [33]:
from kaggle_secrets import UserSecretsClient
import os

user_secrets = UserSecretsClient()
os.environ['GITHUB_PAT'] = user_secrets.get_secret("GITHUB_PAT")
os.environ['ZILLIZ_PASSWORD'] = user_secrets.get_secret("ZILLIZ_PASSWORD")
os.environ['ZILLIZ_URI'] = user_secrets.get_secret("ZILLIZ_URI")
os.environ['ZILLIZ_USER'] = user_secrets.get_secret("ZILLIZ_USER")

In [3]:
repo_url = f"https://{os.getenv('GITHUB_PAT')}@github.com/tmtsmrsl/uconline_poc.git"
!git clone -b dev {repo_url}

Cloning into 'uconline_poc'...
remote: Enumerating objects: 353, done.[K
remote: Counting objects: 100% (353/353), done.[K
remote: Compressing objects: 100% (190/190), done.[K
remote: Total 353 (delta 199), reused 296 (delta 151), pack-reused 0 (from 0)[K
Receiving objects: 100% (353/353), 198.29 KiB | 6.20 MiB/s, done.
Resolving deltas: 100% (199/199), done.


In [4]:
%cd /kaggle/working/uconline_poc

/kaggle/working/uconline_poc


In [76]:
!cp -r /kaggle/input/artifact /kaggle/working/uconline_poc/artifact

  pid, fd = os.forkpty()


In [6]:
pip install -q -r requirements.txt

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.7/169.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.8/161.8 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.1/57.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.8/41.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.2/44.2 MB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m3.8 MB/s[

In [7]:
import os
import joblib
import json 
import torch
from dotenv import load_dotenv

from pymilvus.model.hybrid import BGEM3EmbeddingFunction
from pymilvus.model.sparse.bm25 import BM25EmbeddingFunction
from pymilvus import (
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
    connections,
)

from ETL.ContentProcessor import ContentDocProcessor
from ETL.TranscriptProcessor import TranscriptDocProcessor

## 1. Scrape the HTML content and video transcripts
Check the README.md file for the instructions on how to scrape the HTML content and video transcripts.

## 2. Convert the HTML content and video transcripts to documents

In [216]:
# This it the chunking option for the text processing.
CHUNK_TOKEN_SIZE = 500
CHUNK_TOKEN_OVERLAP = 50
TEXT_SPLITTER_OPTIONS = {"chunk_token_size": CHUNK_TOKEN_SIZE, "chunk_token_overlap": CHUNK_TOKEN_OVERLAP}

# We want the output as Langchain Document
RETURN_DICT = False

In [217]:
HTML_CONTENT_DIR = "artifact/emgt605/html_content"

# The CSS elements to exclude when extracting text from the HTML content
EXCLUDED_ELEMENTS_CSS='div.quiz-card__feedback, div.block-knowledge__retake-container, img, a, iframe'

# Traverse the JSON_DIR and process all the JSON files 
html_content_docs = []
json_files = [f for f in os.listdir(HTML_CONTENT_DIR) if f.endswith('.json')]
content_doc_processor = ContentDocProcessor(text_splitter_options=TEXT_SPLITTER_OPTIONS, excluded_elements_css=EXCLUDED_ELEMENTS_CSS, return_dict=RETURN_DICT)

for json_file in json_files:
    json_path = os.path.join(HTML_CONTENT_DIR, json_file)
    docs = content_doc_processor.run(json_path)
    html_content_docs.extend(docs)
    
for doc in html_content_docs:
    doc.metadata['content_type'] = 'html_content'

b. Load the transcript files and metadata from the `transcripts` directory (output directory of `TranscriptScraper.py`) and convert the video transcripts into documents.

In [218]:
TRANSCRIPT_DIR = "artifact/emgt605/transcripts"

# Traverse the TRANSCRIPT_DIR and process all the transcript files
transcript_docs = []
module_dirs = os.listdir(TRANSCRIPT_DIR)
transcript_doc_processor = TranscriptDocProcessor(text_splitter_options=TEXT_SPLITTER_OPTIONS, return_dict=RETURN_DICT)

for module_dir in module_dirs:
    module_path = os.path.join(TRANSCRIPT_DIR, module_dir)
    docs = transcript_doc_processor.process_module_transcripts(module_path)
    transcript_docs.extend(docs)
    
for doc in transcript_docs:
    doc.metadata['content_type'] = 'video_transcript'



In [219]:
combined_docs = html_content_docs + transcript_docs

## 3. Document Embeddings and Vector DB Loading

Initialize the dense embedding model. Note that using GPU is highly recommended for this task as it will be much faster.

In [220]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
dense_embeddings = BGEM3EmbeddingFunction(use_fp16=False, device=DEVICE, return_dense=True, return_sparse=False)
dense_dim = dense_embeddings.dim['dense']

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

Initialize the BM25 sparse embeddings and save them to disk. Notice that we fit the BM25 embedding model to the original text to preserve the original distibution of the word count.

In [221]:
SPARSE_EMBEDDINGS_PATH = "artifact/emgt605/sparse_embeddings.joblib"

original_texts = [doc.page_content for doc in combined_docs]
sparse_embeddings = BM25EmbeddingFunction(corpus=original_texts)

folder_path = os.path.dirname(SPARSE_EMBEDDINGS_PATH)
os.makedirs(folder_path, exist_ok=True)
joblib.dump(sparse_embeddings, SPARSE_EMBEDDINGS_PATH)

['artifact/emgt605/sparse_embeddings.joblib']

We will use the text with contextual header for both the sparse and dense embedding. The added context will improve the representation of both embeddings types. I don't modify the text directly in the document as it would complicate the indexing and deduplication step during the post-retrieval step.

In [260]:
def add_contextual_headers(doc, video_desc_char_limit=1000):
    if doc.metadata['content_type'] == 'html_content':
        context = f"Content snippet of: {doc.metadata['module_title']} - {doc.metadata['subsection']}: {doc.metadata['submodule_title']}"
    elif doc.metadata['content_type'] == 'video_transcript':
        # just to anticipate a very long paragraph for the video description
        video_desc = video_desc[:video_desc_char_limit]
        context = f"{video_desc}\nSnippet of video transcript:"
    return context + "\n" + doc.page_content

In [226]:
contextual_texts = [add_contextual_headers(doc) for doc in combined_docs]

Create the dense and sparse vectors for the documents.

In [227]:
sparse_vectors = sparse_embeddings.encode_documents(contextual_texts)
dense_vectors = dense_embeddings.encode_documents(contextual_texts)

pre tokenize: 100%|██████████| 20/20 [00:00<00:00, 40.09it/s]
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Inference Embeddings: 100%|██████████| 20/20 [00:34<00:00,  1.72s/it]


In [228]:
def convert_doc_to_dict(doc):
    temp_doc = doc.dict()
    for k, v in temp_doc['metadata'].items():
        temp_doc[k] = v
    temp_doc['text'] = temp_doc.pop('page_content')
    temp_doc.pop('id', None)
    temp_doc.pop('metadata', None)
    temp_doc.pop('type', None)
    return temp_doc

In [229]:
combined_dict = [convert_doc_to_dict(doc) for doc in combined_docs]
for i, doc in enumerate(combined_dict):
    doc['sparse_vector'] = sparse_vectors[[i], :]
    doc['dense_vector'] = dense_vectors['dense'][i]

<ipython-input-228-79ea7bc2452a>:2: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  temp_doc = doc.dict()


Make a connection to the Zilliz vector database and load the embeddings into the vector database.

In [230]:
# This is the parameter to connect to the Zilliz vector database
ZILLIZ_URI = os.getenv("ZILLIZ_URI")
ZILLIZ_USER = os.getenv("ZILLIZ_USER")
ZILLIZ_PASSWORD = os.getenv("ZILLIZ_PASSWORD")
COLLECTION_NAME = "emgt_605_video_bge_bm25_500_50"

In [231]:
connections.connect(user=ZILLIZ_USER, password=ZILLIZ_PASSWORD, uri=ZILLIZ_URI)

fields = [
    FieldSchema(
        name="pk", dtype=DataType.VARCHAR, is_primary=True, auto_id=True, max_length=100
    ),
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=8192),
    FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
    FieldSchema(name="dense_vector", dtype=DataType.FLOAT_VECTOR, dim=dense_dim),
    FieldSchema(name="module_title", dtype=DataType.VARCHAR, max_length=500),
    FieldSchema(name="subsection", dtype=DataType.VARCHAR, max_length=500),
    FieldSchema(name="submodule_title", dtype=DataType.VARCHAR, max_length=500),
    FieldSchema(name="submodule_url", dtype=DataType.VARCHAR, max_length=500),
    FieldSchema(name="content_type", dtype=DataType.VARCHAR, max_length=500),
]

schema = CollectionSchema(fields, 
                          "Dense (BGE-M3) and Sparse (BM25) Embeddings for EMGT605 Course Content", 
                          enable_dynamic_field=True)
col = Collection(COLLECTION_NAME, schema)

sparse_index = {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"}
dense_index = {"index_type": "FLAT", "metric_type": "COSINE"}
col.create_index("sparse_vector", sparse_index)
col.create_index("dense_vector", dense_index)

col.insert(combined_dict)
col.load()

## 4. Retrieval Testing

Check if the documents has been loaded correctly into the vector database. Note that we will initialize the retriever using the pymilvus SDK instead of Langchain because currently Langchain does not support the BM25 retriever.

In [232]:
connections.connect(user=ZILLIZ_USER, password=ZILLIZ_PASSWORD, uri=ZILLIZ_URI)
col = Collection(COLLECTION_NAME)

In [233]:
query = "What are the core pillars for sustainability?"

In [261]:
dense_query = dense_embeddings.encode_queries([query])['dense']
dense_results = col.search(dense_query, 
                           anns_field="dense_vector", 
                           limit=3, param={"metric_type": "COSINE"}, 
                           output_fields=['text'],
                          expr='content_type=="html_content"')
# print(dense_results[0])

In [262]:
sparse_query = sparse_embeddings.encode_queries([query])
sparse_results = col.search(sparse_query,
                            anns_field="sparse_vector", 
                            limit=3, param={"metric_type": "IP"}, 
                            output_fields=['text'],
                           expr='content_type=="video_transcript"')
# print(sparse_results[0])