## 0. Setup

In [1]:
from kaggle_secrets import UserSecretsClient
import os

# Copy each Kaggle secret into an environment variable of the same name so the
# rest of the notebook can read them through os.getenv().
user_secrets = UserSecretsClient()
for secret_name in (
    "GITHUB_PAT",
    "ZILLIZ_PASSWORD",
    "ZILLIZ_URI",
    "ZILLIZ_USER",
    "KAGGLE_KEY",
    "KAGGLE_USERNAME",
):
    os.environ[secret_name] = user_secrets.get_secret(secret_name)

In [2]:
# Clone the `dev` branch of the project repository over HTTPS, authenticating
# with the personal access token stored in the GITHUB_PAT environment variable.
# NOTE(review): the token is embedded in the clone URL -- confirm it cannot end
# up in saved cell output or in the clone's stored remote URL.
repo_url = f"https://{os.getenv('GITHUB_PAT')}@github.com/tmtsmrsl/uconline_poc.git"
!git clone -b dev {repo_url}

Cloning into 'uconline_poc'...
remote: Enumerating objects: 571, done.[K
remote: Counting objects: 100% (101/101), done.[K
remote: Compressing objects: 100% (40/40), done.[K
remote: Total 571 (delta 62), reused 75 (delta 58), pack-reused 470 (from 1)[K
Receiving objects: 100% (571/571), 260.33 KiB | 3.57 MiB/s, done.
Resolving deltas: 100% (334/334), done.


In [3]:
# Work from the cloned repository root: the relative paths used below
# (requirements.txt, artifact/...) are resolved against this directory.
%cd /kaggle/working/uconline_poc

/kaggle/working/uconline_poc


In [4]:
# Copy the `artifact` directory from the Kaggle input into the repository so
# that the relative `artifact/...` paths used below resolve.
# NOTE(review): presumably the repository has no `artifact` directory of its own;
# if it does, `cp -r` would nest the copy inside it -- confirm.
!cp -r /kaggle/input/artifact /kaggle/working/uconline_poc/artifact

In [5]:
pip install -q -r requirements.txt

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.7/169.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.8/161.8 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.1/57.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.8/41.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.2/44.2 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m3.4 MB/s[

In [6]:
import os
import joblib
import json 
import re
import torch
from dotenv import load_dotenv

import kagglehub
from pymilvus.model.hybrid import BGEM3EmbeddingFunction
from pymilvus.model.sparse.bm25 import BM25EmbeddingFunction
from pymilvus import (
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
    connections,
)

from ETL.ContentProcessor import ContentDocProcessor
from ETL.TranscriptProcessor import TranscriptDocProcessor

## 1. Scrape the HTML content and video transcripts
Check the README.md file for the instructions on how to scrape the HTML content and video transcripts.

## 2. Convert the HTML content and video transcripts to documents

In [7]:
# These are the chunking options for the text processing; both values are
# passed to the document processors through TEXT_SPLITTER_OPTIONS.
CHUNK_TOKEN_SIZE = 500
CHUNK_TOKEN_OVERLAP = 50
TEXT_SPLITTER_OPTIONS = {"chunk_token_size": CHUNK_TOKEN_SIZE, "chunk_token_overlap": CHUNK_TOKEN_OVERLAP}

# We want the output as Langchain Document
RETURN_DICT = False

In [8]:
HTML_CONTENT_DIR = "artifact/emgt605/html_content"

# The CSS elements to exclude when extracting text from the HTML content
EXCLUDED_ELEMENTS_CSS='div.quiz-card__feedback, div.block-knowledge__retake-container, a, iframe'

# Traverse HTML_CONTENT_DIR and process all the JSON files.
# os.listdir() returns entries in arbitrary order, so the file names are sorted
# to keep the resulting document order reproducible between runs.
html_content_docs = []
json_files = sorted(f for f in os.listdir(HTML_CONTENT_DIR) if f.endswith('.json'))
content_doc_processor = ContentDocProcessor(text_splitter_options=TEXT_SPLITTER_OPTIONS, excluded_elements_css=EXCLUDED_ELEMENTS_CSS, return_dict=RETURN_DICT)

for json_file in json_files:
    json_path = os.path.join(HTML_CONTENT_DIR, json_file)
    docs = content_doc_processor.run(json_path)
    # Tag every chunk with its source type; the contextual header and the
    # optional search filters further down rely on this key.
    for doc in docs:
        doc.metadata['content_type'] = 'html_content'
    html_content_docs.extend(docs)

b. Load the transcript files and metadata from the `transcripts` directory (output directory of `TranscriptScraper.py`) and convert the video transcripts into documents.

In [9]:
TRANSCRIPT_DIR = "artifact/emgt605/transcripts"

# Traverse TRANSCRIPT_DIR and process the transcript files of every module.
# os.listdir() returns entries in arbitrary order, so the entries are sorted
# to keep the resulting document order reproducible between runs.
transcript_docs = []
module_dirs = sorted(os.listdir(TRANSCRIPT_DIR))
transcript_doc_processor = TranscriptDocProcessor(text_splitter_options=TEXT_SPLITTER_OPTIONS, return_dict=RETURN_DICT)

for module_dir in module_dirs:
    module_path = os.path.join(TRANSCRIPT_DIR, module_dir)
    docs = transcript_doc_processor.process_module_transcripts(module_path)
    # Tag every chunk with its source type; the contextual header and the
    # optional search filters further down rely on this key.
    for doc in docs:
        doc.metadata['content_type'] = 'video_transcript'
    transcript_docs.extend(docs)

config.json:   0%|          | 0.00/914 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/447 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [10]:
# Single list of all documents: HTML content chunks first, then transcript chunks.
# The vectors computed later are matched to documents by position in this list.
combined_docs = html_content_docs + transcript_docs

## 3. Document Embeddings and Vector DB Loading

Initialize the dense embedding model. Note that using GPU is highly recommended for this task as it will be much faster.

In [11]:
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Only the dense output of BGE-M3 is requested (return_sparse=False); the sparse
# side of the hybrid setup comes from BM25 instead.
dense_embeddings = BGEM3EmbeddingFunction(use_fp16=False, device=DEVICE, return_dense=True, return_sparse=False)
# Dense vector dimensionality, used as `dim` of the dense vector field in the collection schema.
dense_dim = dense_embeddings.dim['dense']

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/15.8k [00:00<?, ?B/s]

imgs/.DS_Store:   0%|          | 0.00/6.15k [00:00<?, ?B/s]

colbert_linear.pt:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

imgs/bm25.jpg:   0%|          | 0.00/132k [00:00<?, ?B/s]

imgs/nqa.jpg:   0%|          | 0.00/158k [00:00<?, ?B/s]

imgs/long.jpg:   0%|          | 0.00/485k [00:00<?, ?B/s]

imgs/mkqa.jpg:   0%|          | 0.00/608k [00:00<?, ?B/s]

imgs/miracl.jpg:   0%|          | 0.00/576k [00:00<?, ?B/s]

long.jpg:   0%|          | 0.00/127k [00:00<?, ?B/s]

imgs/others.webp:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

onnx/Constant_7_attr__value:   0%|          | 0.00/65.6k [00:00<?, ?B/s]

onnx/config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

model.onnx_data:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

onnx/tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/725k [00:00<?, ?B/s]

sparse_linear.pt:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

Initialize the BM25 sparse embeddings and save them to disk. Note that we fit the BM25 embedding model on the original text (without the contextual header) to preserve the original distribution of the word counts.

In [12]:
SPARSE_EMBEDDINGS_PATH = "artifact/emgt605/sparse_embeddings_v3.joblib"

# Fit BM25 on the raw chunk texts (no contextual header).
original_texts = [doc.page_content for doc in combined_docs]
sparse_embeddings = BM25EmbeddingFunction(corpus=original_texts)

# Make sure the target folder exists, then persist the fitted model so it can be
# reused for query encoding elsewhere.
folder_path = os.path.dirname(SPARSE_EMBEDDINGS_PATH)
os.makedirs(folder_path, exist_ok=True)
joblib.dump(sparse_embeddings, SPARSE_EMBEDDINGS_PATH)

['artifact/emgt605/sparse_embeddings_v3.joblib']

In [13]:
# Upload the saved BM25 model to Kaggle Models under
# <KAGGLE_USERNAME>/<MODEL_SLUG>/other/<VARIATION_SLUG>.
MODEL_SLUG = 'bm25' 
VARIATION_SLUG = 'emgt605' 
KAGGLE_USERNAME = os.getenv('KAGGLE_USERNAME')
                        
# NOTE(review): `local_model_dir` is given the path of the single .joblib file,
# not a directory; the recorded output shows the upload succeeding with it.
kagglehub.model_upload(
  handle = f"{KAGGLE_USERNAME}/{MODEL_SLUG}/other/{VARIATION_SLUG}",
  local_model_dir = SPARSE_EMBEDDINGS_PATH)

Uploading Model https://www.kaggle.com/models/tmtsmrsl/bm25/other/emgt605 ...
Starting upload for file artifact/emgt605/sparse_embeddings_v3.joblib


Uploading: 100%|██████████| 112k/112k [00:00<00:00, 270kB/s]

Upload successful: artifact/emgt605/sparse_embeddings_v3.joblib (110KB)





Your model instance version has been created.
Files are being processed...
See at: https://www.kaggle.com/models/tmtsmrsl/bm25/other/emgt605


We will use the text with a contextual header for both the sparse and dense embeddings. The added context improves the representation of both embedding types. I don't modify the text directly in the document, as that would complicate the indexing and deduplication in the post-retrieval step.

In [14]:
def generate_contextual_header(doc):
    """Build a one-sentence header describing where a document chunk comes from.

    The header depends on ``doc.metadata['content_type']``:

    * ``'video_transcript'``: built from the first 1000 characters of
      ``metadata['video_desc']``, with every run of newlines collapsed into a
      single space and surrounding whitespace stripped.
    * ``'html_content'``: built from ``metadata['module_title']``,
      ``metadata['subsection']`` and ``metadata['submodule_title']``.

    Args:
        doc: Object exposing a ``metadata`` mapping with the keys listed above.

    Returns:
        The header string, or ``None`` for any other content type.

    Raises:
        KeyError: If a metadata key required for the content type is missing.
    """
    meta = doc.metadata
    kind = meta['content_type']

    if kind == 'html_content':
        module_title = meta['module_title']
        subsection = meta['subsection']
        submodule_title = meta['submodule_title']
        return (
            f"Content snippet of: {module_title} - "
            f"{subsection}: {submodule_title}."
        )

    if kind == 'video_transcript':
        # Truncate first, then flatten newlines, so at most 1000 source characters are used.
        truncated_desc = meta['video_desc'][:1000]
        flattened_desc = re.sub(r'\n+', ' ', truncated_desc)
        return f"Video transcript snippet from video with a description of: {flattened_desc.strip()}."

    # Unrecognised content types produce no header.
    return None

Create the dense and sparse vectors for the documents.

In [15]:
# Store the header on each document's metadata and build the header-prefixed
# text that is fed to the embedding models; page_content itself is untouched.
contextual_texts = []
for doc in combined_docs:
    header = generate_contextual_header(doc)
    doc.metadata['contextual_header'] = header
    contextual_texts.append(f"{header}\n{doc.page_content}")

In [16]:
# Encode the header-prefixed texts with both models. Row i of each result
# corresponds to combined_docs[i].
sparse_vectors = sparse_embeddings.encode_documents(contextual_texts)
dense_vectors = dense_embeddings.encode_documents(contextual_texts)

pre tokenize: 100%|██████████| 21/21 [00:00<00:00, 57.54it/s]
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Inference Embeddings: 100%|██████████| 21/21 [00:25<00:00,  1.21s/it]


In [17]:
def convert_doc_to_dict(doc):
    """Flatten a document into the record layout inserted into the collection.

    The document is dumped to a plain dict, then reshaped:

    * ``metadata['index_metadata']`` is lifted to a top-level ``index_metadata``
      key (``[]`` when absent);
    * ``page_content`` is renamed to ``text``;
    * the ``id`` and ``type`` keys are dropped when present.

    Args:
        doc: A Pydantic-based document (e.g. a Langchain ``Document``) exposing
            ``model_dump()`` (Pydantic v2) or ``dict()`` (Pydantic v1).

    Returns:
        dict: The reshaped record with ``metadata``, ``index_metadata`` and ``text`` keys.
    """
    # `.dict()` is deprecated since Pydantic v2 and emits a deprecation warning;
    # prefer `.model_dump()` and only fall back for objects that lack it.
    dump = getattr(doc, 'model_dump', None)
    temp_doc = dump() if callable(dump) else doc.dict()

    temp_doc['index_metadata'] = temp_doc['metadata'].pop('index_metadata', [])
    temp_doc['text'] = temp_doc.pop('page_content')
    temp_doc.pop('id', None)
    temp_doc.pop('type', None)

    return temp_doc

In [18]:
# Build one insertable record per document and attach its vectors by position.
combined_dict = []
for row_idx, source_doc in enumerate(combined_docs):
    record = convert_doc_to_dict(source_doc)
    # Index with a list ([row_idx]) so the sparse row stays two-dimensional.
    record['sparse_vector'] = sparse_vectors[[row_idx], :]
    record['dense_vector'] = dense_vectors['dense'][row_idx]
    combined_dict.append(record)

<ipython-input-17-238f70520eac>:2: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  temp_doc = doc.dict()


Make a connection to the Zilliz vector database and load the embeddings into the vector database.

In [19]:
# These are the parameters used to connect to the Zilliz vector database.
# They are read from the environment variables populated in the setup cell.
ZILLIZ_URI = os.getenv("ZILLIZ_URI")
ZILLIZ_USER = os.getenv("ZILLIZ_USER")
ZILLIZ_PASSWORD = os.getenv("ZILLIZ_PASSWORD")
# Name of the collection that is created, filled and queried below.
COLLECTION_NAME = "emgt_605_bge_bm25_500_50_v3"

In [20]:
# Connect to Zilliz, declare the collection schema and create one index per vector field.
connections.connect(user=ZILLIZ_USER, password=ZILLIZ_PASSWORD, uri=ZILLIZ_URI)

# Field names match the keys of the records built from the documents:
# text, sparse_vector, dense_vector, metadata and index_metadata.
# The primary key is generated by the database (auto_id=True).
fields = [
    FieldSchema(
        name="pk", dtype=DataType.VARCHAR, is_primary=True, auto_id=True, max_length=100
    ),
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=8192),
    FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
    FieldSchema(name="dense_vector", dtype=DataType.FLOAT_VECTOR, dim=dense_dim),
    FieldSchema(name="metadata", dtype=DataType.JSON),
    FieldSchema(name="index_metadata", dtype=DataType.JSON)
    # Disabled alternative: dedicated scalar fields instead of the JSON `metadata` field.
    # NOTE(review): presumably superseded by the `metadata` JSON field -- confirm before removing.
    # FieldSchema(name="module_title", dtype=DataType.VARCHAR, max_length=500),
    # FieldSchema(name="subsection", dtype=DataType.VARCHAR, max_length=500),
    # FieldSchema(name="submodule_title", dtype=DataType.VARCHAR, max_length=500),
    # FieldSchema(name="submodule_url", dtype=DataType.VARCHAR, max_length=500),
    # FieldSchema(name="content_type", dtype=DataType.VARCHAR, max_length=500),
]

schema = CollectionSchema(fields, 
                          "Dense (BGE-M3) and Sparse (BM25) Embeddings for EMGT605 Course Content", 
                          enable_dynamic_field=True)
col = Collection(COLLECTION_NAME, schema)

# The metric types chosen here (IP for sparse, COSINE for dense) are the same
# ones passed in the search parameters of the retrieval tests.
sparse_index = {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"}
dense_index = {"index_type": "FLAT", "metric_type": "COSINE"}
col.create_index("sparse_vector", sparse_index)
col.create_index("dense_vector", dense_index)

Status(code=0, message=)

In [21]:
# Insert all records, then load the collection before it is searched below.
# NOTE(review): re-running this cell presumably inserts the same records again
# (primary keys are auto-generated) -- confirm before re-executing.
col.insert(combined_dict)
col.load()

## 4. Retrieval Testing

Check if the documents have been loaded correctly into the vector database. Note that we initialize the retriever using the pymilvus SDK instead of Langchain because Langchain currently does not support the BM25 retriever.

In [22]:
# Connect again and get a handle on the existing collection by name only
# (no schema is passed here).
connections.connect(user=ZILLIZ_USER, password=ZILLIZ_PASSWORD, uri=ZILLIZ_URI)
col = Collection(COLLECTION_NAME)

In [23]:
# Fields requested back with every search hit (the vector fields are not included).
output_fields = [
            "pk",
            "metadata",
            "index_metadata",
            "text"   
        ]

In [24]:
# Sample question used for both the dense and the sparse retrieval checks.
query = "How to calculate GDP?"

In [25]:
# Dense retrieval check: embed the query with BGE-M3 and fetch the top 5 hits
# from the dense vector field using the COSINE metric.
dense_query = dense_embeddings.encode_queries([query])['dense']
dense_results = col.search(dense_query, 
                           anns_field="dense_vector", 
                           limit=5, param={"metric_type": "COSINE"}, 
                           output_fields=output_fields,
                          # Optional filter: uncomment to restrict hits to video transcripts.
                          # expr='metadata["content_type"]=="video_transcript"'
                          )
# Uncomment to inspect the hits for the query.
# print(dense_results[0])

In [26]:
# Sparse retrieval check: encode the query with the fitted BM25 model and fetch
# the top 5 hits from the sparse vector field using the IP metric.
sparse_query = sparse_embeddings.encode_queries([query])
sparse_results = col.search(sparse_query,
                            anns_field="sparse_vector", 
                            limit=5, param={"metric_type": "IP"}, 
                            output_fields=output_fields,
                           # Optional filter: uncomment to restrict hits to HTML content.
                           # expr='metadata["content_type"]=="html_content"'
                           )
# Uncomment to inspect the hits for the query.
# print(sparse_results[0])