## 0. Setup

In [1]:
%cd ../..

c:\Users\Marselo\OneDrive\Documents\GitHub\uconline_poc


In [2]:
# !pip install -q langchain-community azure-search-documents==11.6.0b8 azure-identity

In [3]:
import os
import re
from dotenv import load_dotenv
import json
import uuid

from langchain_openai import OpenAIEmbeddings

from azure.search.documents.indexes.models import (
    SearchIndex, 
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    CorsOptions,
    ComplexField,
    SemanticSearch,
    VectorSearch,
    HnswAlgorithmConfiguration, 
    VectorSearchProfile,
    SemanticConfiguration,
    SemanticField, 
    SemanticPrioritizedFields
    
)
from azure.search.documents.indexes import SearchIndexClient
from azure.core.credentials import AzureKeyCredential

from ETL.ContentProcessor import ContentDocProcessor
from ETL.TranscriptProcessor import TranscriptDocProcessor

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
AZURE_SEARCH_ENDPOINT = os.getenv("AZURE_SEARCH_ENDPOINT")
AZURE_SEARCH_KEY = os.getenv("AZURE_SEARCH_KEY")
MODEL = "text-embedding-3-large"
INDEX_NAME = "emgt605_v3"

## 1. Scrape the HTML content and video transcripts
Check the README.md file for the instructions on how to scrape the HTML content and video transcripts.

## 2. Convert the HTML content and video transcripts to documents

In [5]:
# This it the chunking option for the text processing.
CHUNK_TOKEN_SIZE = 500
CHUNK_TOKEN_OVERLAP = 50
TEXT_SPLITTER_OPTIONS = {"chunk_token_size": CHUNK_TOKEN_SIZE, "chunk_token_overlap": CHUNK_TOKEN_OVERLAP}

# We want the output as Langchain Document
RETURN_DICT = False

In [6]:
HTML_CONTENT_DIR = "artifact/emgt605/html_content"

# The CSS elements to exclude when extracting text from the HTML content
EXCLUDED_ELEMENTS_CSS='div.quiz-card__feedback, div.block-knowledge__retake-container, a, iframe'

# Traverse the JSON_DIR and process all the JSON files 
html_content_docs = []
json_files = [f for f in os.listdir(HTML_CONTENT_DIR) if f.endswith('.json')]
content_doc_processor = ContentDocProcessor(text_splitter_options=TEXT_SPLITTER_OPTIONS, excluded_elements_css=EXCLUDED_ELEMENTS_CSS, return_dict=RETURN_DICT)

for json_file in json_files:
    json_path = os.path.join(HTML_CONTENT_DIR, json_file)
    docs = content_doc_processor.run(json_path)
    for doc in docs:
        doc.metadata['content_type'] = 'html_content'
    html_content_docs.extend(docs)

In [7]:
TRANSCRIPT_DIR = "artifact/emgt605/transcripts"

# Traverse the TRANSCRIPT_DIR and process all the transcript files
transcript_docs = []
module_dirs = os.listdir(TRANSCRIPT_DIR)
transcript_doc_processor = TranscriptDocProcessor(text_splitter_options=TEXT_SPLITTER_OPTIONS, return_dict=RETURN_DICT)

for module_dir in module_dirs:
    module_path = os.path.join(TRANSCRIPT_DIR, module_dir)
    docs = transcript_doc_processor.process_module_transcripts(module_path)
    for doc in docs:
        doc.metadata['content_type'] = 'video_transcript'
    transcript_docs.extend(docs)



In [8]:
combined_docs = html_content_docs + transcript_docs

## 3. Document Embeddings and Vector DB Loading

In [9]:
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=MODEL)

We will use the text with contextual header for dense embedding. The added context will improve the representation of the embeddings. I don't modify the text directly in the document as it would complicate the indexing and deduplication step during the post-retrieval step.

In [10]:
def generate_contextual_header(doc):
    source_metadata = doc.metadata
    if source_metadata['content_type'] == 'video_transcript':
        # Replace newlines with a single space and truncate to 1000 characters
        video_desc = re.sub(r'\n+', ' ', source_metadata['video_desc'][:1000])
        return f"Video transcript snippet from video with a description of: {video_desc.strip()}."
        
    elif source_metadata['content_type'] == 'html_content':
        return (
            f"Content snippet of: {source_metadata['module_title']} - "
            f"{source_metadata['subsection']}: {source_metadata['submodule_title']}."
        )

In [11]:
contextual_texts = []
for doc in combined_docs:
    doc.metadata['contextual_header'] = generate_contextual_header(doc)
    contextual_texts.append(f"{doc.metadata['contextual_header']}\n{doc.page_content}")

In [21]:
# dense_vectors = embeddings.embed_documents(contextual_texts)
# with open("artifact/emgt605/openai_dense_vectors.json", "w") as f:
#     json.dump(dense_vectors, f)

In [12]:
dense_vectors = json.load(open("artifact/emgt605/openai_dense_vectors.json"))

Create the Azure Search Index with `SearchIndexClient`

In [13]:
def create_index(client: SearchIndexClient,
                index_name: str, embeddings_dim: int):
    
    vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw"
        )
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
        )
    ]
    )
    
    fields = [
        SimpleField(name="pk", type=SearchFieldDataType.String, key=True, filterable=True),
        SearchableField(name="text", type=SearchFieldDataType.String),
        SearchField(
        name="dense_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=embeddings_dim,
        vector_search_profile_name="myHnswProfile",
        ),
        ComplexField(name="metadata", fields=[
            SimpleField(name="start_index", type=SearchFieldDataType.Int32),
            SimpleField(name="module_title", type=SearchFieldDataType.String, filterable=True),
            SimpleField(name="subsection", type=SearchFieldDataType.String, filterable=True),
            SimpleField(name="submodule_title", type=SearchFieldDataType.String, filterable=True),
            SimpleField(name="submodule_url", type=SearchFieldDataType.String),
            SimpleField(name="video_title", type=SearchFieldDataType.String),
            SimpleField(name="video_url", type=SearchFieldDataType.String),
            SimpleField(name="video_desc", type=SearchFieldDataType.String),
            SimpleField(name="content_type", type=SearchFieldDataType.String, filterable=True),
            SimpleField(name="contextual_header", type=SearchFieldDataType.String)]),
        SimpleField(name="index_metadata", type=SearchFieldDataType.String),
        ]
    
    cors_options = CorsOptions(allowed_origins=["*"], max_age_in_seconds=60)
    
    semantic_config = SemanticConfiguration(
        name="my-semantic-config",
        prioritized_fields=SemanticPrioritizedFields(
            title_field=None, 
            keywords_fields=None,
            content_fields=[SemanticField(field_name="text")]
        )
    )

    semantic_search = SemanticSearch(configurations=[semantic_config])
    
    index = SearchIndex(
        name=index_name,
        fields=fields,
        scoring_profiles=[],
        cors_options=cors_options, 
        vector_search=vector_search,
        semantic_search=semantic_search)

    client.create_index(index)

In [15]:
client = SearchIndexClient(AZURE_SEARCH_ENDPOINT, AzureKeyCredential(AZURE_SEARCH_KEY))
embeddings_dim = len(dense_vectors[0])

create_index(client, INDEX_NAME, embeddings_dim)

Convert the Langchain documents to a dictionary with the required fields.

In [16]:
def convert_doc_to_dict(doc):
    doc_dict = doc.dict()
    doc_dict['index_metadata'] = doc_dict['metadata'].pop('index_metadata', [])
    doc_dict['index_metadata'] = json.dumps(doc_dict['index_metadata'])
    doc_dict['text'] = doc_dict.pop('page_content', None)
    doc_dict.pop('id', None)
    doc_dict.pop('type', None)
    doc_dict['pk'] = uuid.uuid4().hex
    return doc_dict

In [17]:
combined_dict = [convert_doc_to_dict(doc) for doc in combined_docs]

C:\Users\Marselo\AppData\Local\Temp\ipykernel_15396\4040010649.py:2: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  doc_dict = doc.dict()


In [18]:
for i, doc in enumerate(combined_dict):
    doc['dense_vector'] = dense_vectors[i]

Load the Azure Search Index with  `SearchClient`

In [19]:
from azure.search.documents import SearchClient

search_client = SearchClient(endpoint=AZURE_SEARCH_ENDPOINT, index_name=INDEX_NAME, credential=AzureKeyCredential(AZURE_SEARCH_KEY))

In [20]:
result = search_client.upload_documents(combined_dict)

## Query documents

In [32]:
from azure.search.documents import SearchClient
from azure.search.documents.models import HybridSearch
from azure.search.documents.models import VectorizedQuery

In [33]:
search_client = SearchClient(endpoint=AZURE_SEARCH_ENDPOINT, index_name=INDEX_NAME, credential=AzureKeyCredential(AZURE_SEARCH_KEY))

In [52]:
query = "What is the difference between finance and economy?" 
embedded_query = embeddings.embed_query(query)

In [66]:
results = search_client.search(  
    search_text=query,  
    search_fields=["text"],
    # max_text_recall_size indicates the number of results to return from the text search
    hybrid_search=HybridSearch(max_text_recall_size=5),
    # k_nearest_neighbors indicates the number of results to return from the vector search
    vector_queries= [VectorizedQuery(vector=embedded_query, k_nearest_neighbors=5, fields="dense_vector")],
    # top indicates the number of results after reranking the vector and sparse results
    top=5,
    select=["pk", "text", "metadata", "index_metadata"],
    query_type="semantic",
    semantic_configuration_name="my-semantic-config"
)  

In [55]:
results = list(results)

# convert index_metadata back as a list of dictionaries from a string
for r in results:
    r['index_metadata'] = json.loads(r['index_metadata'])