# <span style="color:yellow;">1 - IMPORT SDK</span>

In [2]:
# Shared setup for the notebook: imports, environment configuration,
# the Azure OpenAI client, and helper functions used by the later cells.
import os
import re
import json
import pathlib
import tiktoken
import numpy as np
from openai import AzureOpenAI
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# MarkItDown is used in section 2 to convert the PDF files in 1-input/ into Markdown files in 2-parsed/
from markitdown import MarkItDown

# Resolve the notebook's working directory. pathlib is used because __file__
# is not defined inside a notebook.
current_dir = str(pathlib.Path().absolute())

# Pipeline directories, all located directly under the working directory:
#   1-input  -> source PDF files
#   2-parsed -> Markdown files converted from the PDFs
#   3-output -> JSON knowledge-base files (topic, chunk, page content, vector)
in_dir = os.path.join(current_dir, "1-input")
md_dir = os.path.join(current_dir, "2-parsed")
out_dir = os.path.join(current_dir, "3-output")

# Values passed as `model=` to the chat-completion and embedding calls below.
# NOTE(review): with AzureOpenAI these are presumably deployment names —
# confirm they match the deployments configured on the resource.
gpt_model = "gpt-4.1-mini"
embedding_model = "text-embedding-3-small"

# Initialize the Azure OpenAI client with key-based authentication.
# The endpoint and key are read from the environment after load_dotenv() has
# run; os.getenv returns None when a variable is not set.
client = AzureOpenAI(
    azure_endpoint=os.getenv("AOAI_ENDPOINT"),
    api_key=os.getenv("AOAI_API_KEY"),
    api_version="2025-01-01-preview",
)

# Get tokens length of the text
def get_tokens_length(text, encoding_name="cl100k_base"):
    """
    Count how many tokens `text` occupies under a tiktoken encoding.

    Args:
        text (str): Text to tokenize.
        encoding_name (str): Name of the tiktoken encoding to look up.

    Returns:
        int: Length of the token sequence produced for `text`.
    """
    token_ids = tiktoken.get_encoding(encoding_name).encode(text)
    return len(token_ids)


def cosine_similarity(vec1, vec2):
    """
    Compute the cosine of the angle between two equally shaped vectors.

    Args:
        vec1 (array-like): First vector.
        vec2 (array-like): Second vector.

    Returns:
        float: Dot product of the inputs divided by the product of their norms.

    Raises:
        ValueError: When the shapes differ or either vector has zero norm.
    """
    left, right = np.array(vec1), np.array(vec2)

    if left.shape != right.shape:
        raise ValueError("Vectors must have the same dimensions.")

    left_norm, right_norm = np.linalg.norm(left), np.linalg.norm(right)

    # A zero-norm vector has no direction, so the ratio below is undefined.
    if 0 in (left_norm, right_norm):
        raise ValueError("One of the vectors is zero and cannot be normalized.")

    return np.dot(left, right) / (left_norm * right_norm)




# <span style="color:yellow;">2 - Convert PDF to Markdown Text</span>

In [3]:
# Initialize the MarkItDown converter
md = MarkItDown()

# Get a list of all PDF files in the input folder
pdf_files = [f for f in os.listdir(in_dir) if f.lower().endswith('.pdf')]

def pdf_to_markdown():
    """
    Convert every PDF listed in `pdf_files` to a Markdown file in `md_dir`.

    Each output file keeps the PDF's base name with a `.md` extension. A
    failure on one file is reported and does not stop the remaining files.

    Returns:
        int: Number of PDF files that were converted and written successfully.
    """
    # Create the output folder up front; without it every open() below would
    # fail and the errors would only show up as printed messages.
    os.makedirs(md_dir, exist_ok=True)

    converted_count = 0

    for pdf_file in pdf_files:
        try:
            # Create full path for input file
            pdf_path = os.path.join(in_dir, pdf_file)

            # Create output filename (replace .pdf extension with .md)
            output_filename = os.path.splitext(pdf_file)[0] + '.md'
            output_path = os.path.join(md_dir, output_filename)

            print(f"Converting: {pdf_file}")

            # Convert the PDF file to markdown
            result = md.convert(pdf_path)
            content = result.text_content

            # Write the content to the markdown file
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(content)

            print(f"✓ Saved to: {output_filename}")
            converted_count += 1
        except Exception as e:
            # Best effort: report the failure and continue with the next file
            print(f"Error processing {pdf_file}: {str(e)}")

    return converted_count

# Report the number of files that actually converted, not the number found
pdf_converted_count = pdf_to_markdown()
print(f"\nConversion complete! {pdf_converted_count} PDF files processed.")
print(f"Markdown files saved to: {md_dir}")

Converting: architecture-styles.pdf
✓ Saved to: architecture-styles.md

Conversion complete! 1 PDF files processed.
Markdown files saved to: /workspaces/azworkshop/notebooks/2-parsed


# <span style="color:yellow;">3 - Convert Text to Vector DB</span>

In [4]:
# System prompt for topic chunking: asks the model to split one page of the
# document into {"topics": [{"topic", "content"}, ...]}. The sample structure
# must have balanced brackets so the model is shown a well-formed shape.
system_topic = """
Using the input document below, return a JSON object separated by topic with the following fields: topic, content
- Make sure to faithfully retain all the document details. Do not remove or add any new information.
- Each topic should be a concise summary of the content.
- The content should be a detailed explanation of the topic.

- Sample JSON output structure:
{
  "topics": [
    {  
      "topic": "<topic>",
      "content": "<content>"
    },
    ...
  ]
}

Document:
"""

# Get a list of all Markdown files in the md folder
md_files = [f for f in os.listdir(md_dir) if f.lower().endswith('.md')]

def markdown_to_vectordb(similarity_threshold=0.95):
    """
    Build a JSON knowledge base from each Markdown file in `md_files`.

    For every file the text is split into pages on form-feed characters, each
    page is sent to the chat model with `system_topic` to get a list of
    topics, and each topic is embedded. A topic is dropped when its embedding
    is more similar than `similarity_threshold` to one already kept for the
    same file. The kept entries are written to `<out_dir>/<file stem>.json`.

    Args:
        similarity_threshold (float): Cosine similarity above which a topic is
            treated as a duplicate of an earlier one. Defaults to 0.95.

    Returns:
        None. Progress and per-file errors are printed; an error in one file
        abandons that file and processing continues with the next one.
    """
    # Create the output folder before any model calls are made, so the chat
    # and embedding work is not lost to a failed write at the very end.
    os.makedirs(out_dir, exist_ok=True)

    for md_file in md_files:
        try:
            # Create full path for input file
            input_path = os.path.join(md_dir, md_file)
            print(f"Processing: {input_path}")

            with open(input_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Split on form-feed characters and drop pages that are blank
            pages = [
                page.strip()
                for page in re.split(r'\f', content.strip())
                if page.strip()
            ]
            print(f"Found {len(pages)} pages in {md_file}.")

            contents = []

            for i, page in enumerate(pages):
                print(f"Page {i+1}:")
                print("-" * 40)

                messages = [
                    {"role": "system", "content": system_topic},
                    {"role": "user", "content": page},
                ]

                # Ask the model for the page's topics as a JSON object
                completion = client.chat.completions.create(
                    model=gpt_model,
                    messages=messages,
                    temperature=0,
                    top_p=1,
                    response_format={ "type": "json_object" },
                )
                raw_topics = completion.choices[0].message.content
                print("topic chunking..", raw_topics)

                data = json.loads(raw_topics)

                for topic in data['topics']:
                    topic_name = topic['topic']
                    topic_chunk = topic['content']

                    response1 = client.embeddings.create(
                        input=f"{topic_name}\n\n{topic_chunk}",
                        model=embedding_model
                    )
                    embedding = response1.data[0].embedding
                    print(" - embedding created.")

                    # any() stops at the first near-duplicate instead of
                    # scoring every remaining entry after a match is found
                    is_duplicate = any(
                        float(cosine_similarity(embedding, item['vector'])) > similarity_threshold
                        for item in contents
                    )

                    if not is_duplicate:
                        contents.append({
                            "topic": topic_name,
                            "chunk": topic_chunk,
                            # The full page text is stored with every topic
                            "content": page,
                            # 1-based page number within this file
                            "contentId": i+1,
                            "vector": embedding,
                        })

            # Save the structured data to a JSON file
            output_filename = os.path.splitext(md_file)[0]
            output_path = os.path.join(out_dir, output_filename + '.json')

            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(contents, f, indent=4)
            print(f"Knowledge base saved to: {output_path}")

        except Exception as e:
            # Best effort: report the failure and continue with the next file
            print(f"Error processing {md_file}: {str(e)}")


markdown_to_vectordb()

Processing: /workspaces/azworkshop/notebooks/2-parsed/architecture-styles.md
Found 6 pages in architecture-styles.md.
Page 1:
----------------------------------------
topic chunking.. {
  "topics": [
    {
      "topic": "Definition of Architecture Styles",
      "content": "An architecture style is a family of architectures that share specific characteristics. These styles provide a way to categorize and understand different architectural approaches. For example, the N-tier architecture style is a common and well-known style. More recently, microservice architectures have been gaining popularity. Architecture styles do not mandate the use of specific technologies, but certain technologies are better suited to particular styles. For instance, containers are particularly well-suited for microservice architectures."
    },
    {
      "topic": "Common Architecture Styles in Cloud Applications",
      "content": "A set of architecture styles commonly found in cloud applications has been i

# <span style="color:yellow;">4 - Vector Search</span>

In [8]:
# Get a list of all JSON knowledge-base files in the output folder
vector_files = [f for f in os.listdir(out_dir) if f.lower().endswith('.json')]

def vector_search(query, min_similarity=0.5):
    """
    Find stored pages whose topics are similar to `query`.

    The query is embedded with `embedding_model` and compared by cosine
    similarity against every entry in every file listed in `vector_files`.

    Args:
        query (str): The user's question.
        min_similarity (float): Entries must score strictly above this value
            to be returned. Defaults to 0.5.

    Returns:
        dict: Maps each matching `contentId` to a string made of the topic, a
        newline, and the stored page content. When several entries share a
        `contentId`, the one with the highest similarity is kept.
    """
    # Create the vector for the query
    query_vector = client.embeddings.create(
        input=query,
        model=embedding_model
    )
    query_embedding = query_vector.data[0].embedding

    results = {}
    # Best similarity seen so far for each contentId in `results`
    best_scores = {}

    for vector_file in vector_files:
        # Create full path for input file
        vector_path = os.path.join(out_dir, vector_file)

        with open(vector_path, 'r', encoding='utf-8') as f:
            vectors = json.load(f)

        for vector in vectors:
            similarity = float(cosine_similarity(vector['vector'], query_embedding))

            if similarity <= min_similarity:
                continue

            content_id = vector['contentId']

            # Several topics of one page share a contentId. Keep the strongest
            # match rather than whichever entry happens to be read last.
            # NOTE(review): contentId is a per-file page number, so ids from
            # different files also collide here; the best-scoring one wins.
            if similarity > best_scores.get(content_id, float("-inf")):
                best_scores[content_id] = similarity
                results[content_id] = f"{vector['topic']}\n{vector['content']}"

    return results

# Question to run against the knowledge base. The commented lines are
# alternative sample questions; uncomment one to try it instead.
user_query = "tell me about web queue"
# user_query = "how does it compare with big data?"
# user_query = "difference between web queue and message queue"
# user_query = "difference between web queue and big data"
# user_query = "is web queue invented by a computer scientist?"

# Retrieve the matching pages and pretty-print them as JSON for inspection.
# search_results is reused by the final-response cell below.
search_results = vector_search(user_query)
print("Search results:", json.dumps(search_results, indent=4))


Search results: {
    "2": "Web-Queue-Worker Architecture\nN-tier is a traditional architecture for enterprise applications. Dependencies are managed by\n\ndividing the application into layers that perform logical functions, such as presentation, business\n\nlogic, and data access. A layer can only call into layers that sit below it. However, this horizontal\n\nlayering can be a liability. It can be hard to introduce changes in one part of the application\n\nwithout touching the rest of the application. That makes frequent updates a challenge, limiting\n\nhow quickly new features can be added.\n\nN-tier is well-suited for migrating existing applications that already use a layered architecture. For\n\nthat reason, N-tier is most often seen in infrastructure as a service (IaaS) solutions or applications\n\nthat use a combination of IaaS and managed services.\n\nWeb-Queue-Worker\n\nIdentity\nProvider\n\nRemote\nService\n\nClient\n\nWeb Front\nEnd\n\nCache\n\nWorker\n\nDatabase\n\nQueue\n\

# <span style="color:yellow;">5 - Final Response</span>

In [9]:
# System prompt for the answering step: restricts the model to the supplied
# DOCUMENT_CHUNK, allows greetings, and fixes the exact refusal sentence.
system_retrieval = """
You are a helpful AI Assistant. Follow these rules exactly:

1. PURPOSE
   - Use the supplied DOCUMENT_CHUNK to answer user questions.
   - Allow simple greetings and light chitchat.
   - If the user's question is outside the scope of DOCUMENT_CHUNK, respond with a polite refusal.

2. INPUT FORMAT
   The model always receives two inputs in this order:
   a) DOCUMENT_CHUNK: A block of text containing the relevant information.
   b) USER_QUERY: The user's message or question.

3. BEHAVIOR
   a) Greetings & Chitchat
      - If USER_QUERY is a greeting (e.g. “hi”, “hello”, “good morning”) or simple chitchat (“how are you?”, “what's up?”), respond with a friendly greeting or brief chitchat. Do not reference DOCUMENT_CHUNK.
   b) On-Topic Questions
      - If USER_QUERY asks about a fact or detail that is directly supported by DOCUMENT_CHUNK, answer accurately using only information from DOCUMENT_CHUNK.
      - Cite the relevant passage or phrase when possible: “According to the document: ‹…›”.
   c) Off-Topic or Irrelevant Questions
      - If USER_QUERY cannot be answered from DOCUMENT_CHUNK, reply:
        “I'm sorry, but I don't have information on that. Please ask something related to the document.”
      - Do NOT attempt to hallucinate or introduce outside knowledge.

4. RESPONSE FORMAT
   - Keep answers concise (2-4 sentences).
   - Use neutral, professional tone.
   - If refusing, use exactly: “I'm sorry, but I don't have information on that. Please ask something related to the document.”

5. EXAMPLES

Example 1 - Greeting  
DOCUMENT_CHUNK: “...”  
USER_QUERY: “Hey there!”  
→ “Hello! How can I help you with the document today?”

Example 2 - On-Topic  
DOCUMENT_CHUNK: “The Eiffel Tower is 300 meters tall.”  
USER_QUERY: “How tall is the Eiffel Tower?”  
→ “According to the document, the Eiffel Tower is 300 meters tall.”

Example 3 - Off-Topic  
DOCUMENT_CHUNK: “...”  
USER_QUERY: “What's the weather today?”  
→ “I'm sorry, but I don't have information on that. Please ask something related to the document.”
"""

# User-message template. {{DOCUMENT}} and {{QUERY}} are placeholders that are
# filled in with the retrieved content and the user's question before sending.
user_retrieval = """
DOCUMENT_CHUNK: {{DOCUMENT}}

USER_QUERY: {{QUERY}}
"""


def chat_with_pdf(query, search):
    """
    Answer `query` from the retrieved documents and save the exchange.

    The `user_retrieval` template is filled with the JSON-serialized `search`
    results and the query, sent to the chat model together with
    `system_retrieval`, and the reply is printed. The query, the reply, and
    the documents are then written to `response.json` in `current_dir`.

    Args:
        query (str): The user's question.
        search (dict): Retrieved documents, as returned by `vector_search`.

    Returns:
        None. Any exception is caught and reported with a printed message.
    """
    try:
        print(f"User Query: {query}")

        # Fill both placeholders in one pass. Chained str.replace calls would
        # re-scan the inserted document text, so a literal "{{QUERY}}" inside
        # a retrieved document would itself be replaced by the query.
        substitutions = {
            "{{DOCUMENT}}": json.dumps(search),
            "{{QUERY}}": query,
        }
        user_prompt = re.sub(
            r"\{\{(?:DOCUMENT|QUERY)\}\}",
            # A callable is used so backslashes in the values stay literal
            lambda match: substitutions[match.group(0)],
            user_retrieval,
        )
        messages = [
            {"role": "system", "content": system_retrieval},
            {"role": "user", "content": user_prompt},
        ]

        # Generate the completion
        completion = client.chat.completions.create(
            model=gpt_model,
            messages=messages,
            temperature=0,
            top_p=1,
            response_format={ "type": "text" },
        )

        response = completion.choices[0].message.content
        print(f"Response: {response}")
        results = {
            "query": query,
            "response": response,
            "documents": search,
        }

        # Save the structured data to a JSON file
        output_path = os.path.join(current_dir, 'response.json')

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=4)
        print(f"Results saved to: {output_path}")

    except Exception as e:
        print(f"Error occurred: {e}")


chat_with_pdf(user_query, search_results)

User Query: tell me about web queue
Response: According to the document, the Web-Queue-Worker architecture is a style where the application has a web front end that handles HTTP requests and a back-end worker that performs CPU-intensive or long-running tasks. The front end communicates with the worker through an asynchronous message queue. This architecture is suitable for relatively simple domains with some resource-intensive tasks and is easy to understand, with managed services simplifying deployment and operations. However, it can lead to large, monolithic components that are hard to maintain and update, potentially reducing the frequency of updates and limiting innovation.
Results saved to: /workspaces/azworkshop/notebooks/response.json


In [10]:
user_retrieval

'\nDOCUMENT_CHUNK: {{DOCUMENT}}\n\nUSER_QUERY: {{QUERY}}\n'

In [14]:
user_query

'tell me about web queue'

In [13]:
search_results

{2: 'Web-Queue-Worker Architecture\nN-tier is a traditional architecture for enterprise applications. Dependencies are managed by\n\ndividing the application into layers that perform logical functions, such as presentation, business\n\nlogic, and data access. A layer can only call into layers that sit below it. However, this horizontal\n\nlayering can be a liability. It can be hard to introduce changes in one part of the application\n\nwithout touching the rest of the application. That makes frequent updates a challenge, limiting\n\nhow quickly new features can be added.\n\nN-tier is well-suited for migrating existing applications that already use a layered architecture. For\n\nthat reason, N-tier is most often seen in infrastructure as a service (IaaS) solutions or applications\n\nthat use a combination of IaaS and managed services.\n\nWeb-Queue-Worker\n\nIdentity\nProvider\n\nRemote\nService\n\nClient\n\nWeb Front\nEnd\n\nCache\n\nWorker\n\nDatabase\n\nQueue\n\nCDN\n\n\uf80a\nStatic\

In [16]:
# Manual step-through of the prompt-building step from chat_with_pdf, run at
# notebook level with user_query and search_results so that the intermediate
# values (user_prompt, messages) can be inspected in the following cells.
user_prompt = user_retrieval.replace("{{DOCUMENT}}", json.dumps(search_results)).replace("{{QUERY}}", user_query)
messages = [
            {"role": "system", "content": system_retrieval},
            {"role": "user", "content": user_prompt},
        ]

In [17]:
user_prompt

'\nDOCUMENT_CHUNK: {"2": "Web-Queue-Worker Architecture\\nN-tier is a traditional architecture for enterprise applications. Dependencies are managed by\\n\\ndividing the application into layers that perform logical functions, such as presentation, business\\n\\nlogic, and data access. A layer can only call into layers that sit below it. However, this horizontal\\n\\nlayering can be a liability. It can be hard to introduce changes in one part of the application\\n\\nwithout touching the rest of the application. That makes frequent updates a challenge, limiting\\n\\nhow quickly new features can be added.\\n\\nN-tier is well-suited for migrating existing applications that already use a layered architecture. For\\n\\nthat reason, N-tier is most often seen in infrastructure as a service (IaaS) solutions or applications\\n\\nthat use a combination of IaaS and managed services.\\n\\nWeb-Queue-Worker\\n\\nIdentity\\nProvider\\n\\nRemote\\nService\\n\\nClient\\n\\nWeb Front\\nEnd\\n\\nCache\\n\

In [20]:
        # Manual step-through: send the notebook-level `messages` to the chat
        # model with the same arguments chat_with_pdf passes. Requires the
        # cell that builds `messages` to have been run first.
        completion = client.chat.completions.create(  
            model=gpt_model,
            messages=messages,
            temperature=0,
            top_p=1,
            response_format={ "type": "text" },
        )



In [21]:
        # Manual step-through: take the reply text from the notebook-level
        # `completion`, print it, and collect it with the query and retrieved
        # documents. Unlike chat_with_pdf, nothing is written to disk here.
        response = completion.choices[0].message.content
        print(f"Response: {response}")
        results = {
            "query": user_query,
            "response": response,
            "documents": search_results,
        }

Response: According to the document, the Web-Queue-Worker architecture is a style where the application has a web front end that handles HTTP requests and a back-end worker that performs CPU-intensive or long-running tasks. The front end communicates with the worker through an asynchronous message queue. This architecture is suitable for relatively simple domains with some resource-intensive tasks and is easy to understand, with managed services simplifying deployment and operations. However, it can lead to large, monolithic components that are hard to maintain and update, potentially reducing the frequency of updates and limiting innovation.


In [23]:
results

{'query': 'tell me about web queue',
 'response': 'According to the document, the Web-Queue-Worker architecture is a style where the application has a web front end that handles HTTP requests and a back-end worker that performs CPU-intensive or long-running tasks. The front end communicates with the worker through an asynchronous message queue. This architecture is suitable for relatively simple domains with some resource-intensive tasks and is easy to understand, with managed services simplifying deployment and operations. However, it can lead to large, monolithic components that are hard to maintain and update, potentially reducing the frequency of updates and limiting innovation.',
 'documents': {2: 'Web-Queue-Worker Architecture\nN-tier is a traditional architecture for enterprise applications. Dependencies are managed by\n\ndividing the application into layers that perform logical functions, such as presentation, business\n\nlogic, and data access. A layer can only call into layer