In [3]:
!pip install boto3 faiss-cpu sentence-transformers transformers accelerate pypdf PyPDF2 PyCryptodome qdrant-client streamlit
!pip install nltk pdfminer.six scikit-learn localtunnel

import io
import os
import uuid
import boto3
import nltk
import string
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, FieldCondition, Filter, MatchValue
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from google.colab import userdata

# Tokenizer data for nltk's sent_tokenize (used by chunk_paragraphs).
nltk.download('punkt')
nltk.download('punkt_tab')

# AWS credentials are read from Colab's secret store, not from notebook text.
AWS_ACCESS_KEY = userdata.get("AWS_ACCESS_KEY")
AWS_SECRET_KEY = userdata.get("AWS_SECRET_KEY")
S3_BUCKET = "rag-vector-db-poc"
embed_model = SentenceTransformer("all-MiniLM-L6-v2")


s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY
)

QDRANT_HOST = "34.228.56.127"
QDRANT_PORT = 6333
COLLECTION = "docs_chunks"

client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT, timeout=30.0)

# Start from an empty collection on every run of this cell.
# WARNING: this deletes any vectors already stored under COLLECTION.
# `recreate_collection` is deprecated in qdrant-client, so the drop and the
# create are done explicitly.
if client.collection_exists(COLLECTION):
    client.delete_collection(collection_name=COLLECTION)

# size must match the length of the vectors produced by embed_model
# (384 here; the Streamlit app in this notebook uses the same EMBED_DIM).
client.create_collection(
    collection_name=COLLECTION,
    vectors_config=VectorParams(size=384, distance=Distance.COSINE)
)

Collecting pdfminer.six
  Using cached pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
[31mERROR: Could not find a version that satisfies the requirement localtunnel (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for localtunnel[0m[31m
[0m

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
  client.recreate_collection(


True

In [4]:
# === S3 Utilities ===
def list_text_and_pdf_keys(bucket, prefix=""):
    """Return the keys of every `.pdf` / `.txt` object in an S3 bucket.

    Args:
        bucket: Name of the S3 bucket to list.
        prefix: Optional key prefix restricting the listing.

    Returns:
        list[str]: Matching object keys in listing order. The suffix match is
        case-sensitive.
    """
    wanted_suffixes = (".pdf", ".txt")
    # The paginator walks all result pages of list_objects_v2.
    pages = s3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=prefix)
    return [
        entry["Key"]
        for page in pages
        for entry in page.get("Contents", [])  # "Contents" may be missing from a page
        if entry["Key"].endswith(wanted_suffixes)
    ]

def download_file_from_s3(bucket, key):
    """Fetch one S3 object and return its entire body as an in-memory buffer.

    Args:
        bucket: Name of the S3 bucket.
        key: Object key inside the bucket.

    Returns:
        io.BytesIO: Buffer holding the object's bytes.
    """
    body_stream = s3.get_object(Bucket=bucket, Key=key)['Body']
    payload = body_stream.read()
    return io.BytesIO(payload)

# === Text Extraction ===
def extract_text_from_pdf(pdf_io):
    """Extract the text of every page of a PDF, joined with newlines.

    Args:
        pdf_io: Binary file-like object containing the PDF.

    Returns:
        str: Page texts joined by "\\n"; pages that yield no text are skipped.
    """
    reader = PdfReader(pdf_io)
    # Extract each page exactly once: the previous version called
    # extract_text() twice per page (once to filter, once to collect).
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n".join(page_text for page_text in page_texts if page_text)

def extract_text_from_txt(txt_io):
    """Read a binary file-like object to the end and decode it as UTF-8.

    Args:
        txt_io: Binary file-like object (e.g. io.BytesIO).

    Returns:
        str: The decoded text.

    Raises:
        UnicodeDecodeError: If the bytes are not valid UTF-8.
    """
    raw_bytes = txt_io.read()
    return str(raw_bytes, "utf-8")

# === Chunking ===
def chunk_paragraphs(text, s3_path, chunk_token_limit=500):
    """Split text into sentence-aligned chunks and attach source metadata.

    Sentences (from nltk's sent_tokenize) are packed greedily into a chunk
    until adding the next one would exceed chunk_token_limit. A "token" here
    is a whitespace-separated word. A single sentence longer than the limit
    becomes its own oversized chunk.

    Args:
        text: Full document text.
        s3_path: Source location such as "s3://bucket/finance/2023/report.pdf".
        chunk_token_limit: Maximum number of words per chunk.

    Returns:
        list[dict]: One dict per chunk with keys text, s3_path, file_name,
        chunk_id, chunk_index, token_count, department and top_keywords.
    """
    # Key portion of the path (bucket name removed), e.g. "finance/2023/report.pdf".
    # partition() yields "" instead of raising when the path has no key part.
    _, _, key_path = s3_path.replace("s3://", "").partition("/")
    # The first folder of the key is treated as the department; keys that sit
    # directly in the bucket root get "root".
    department = key_path.split("/")[0] if "/" in key_path else "root"
    file_name = os.path.basename(s3_path)

    chunks, current_chunk = [], []
    token_count = 0

    for sentence in sent_tokenize(text):
        token_len = len(sentence.split())
        # Flush only when something is buffered; otherwise an over-long first
        # sentence would produce an empty chunk.
        if current_chunk and token_count + token_len > chunk_token_limit:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            token_count = 0
        current_chunk.append(sentence)
        token_count += token_len

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return [
        {
            "text": chunk,
            "s3_path": s3_path,
            "file_name": file_name,
            "chunk_id": f"{s3_path}_{i}",
            "chunk_index": i,
            "token_count": len(chunk.split()),
            "department": department,
            "top_keywords": extract_top_keywords(chunk)
        }
        for i, chunk in enumerate(chunks)
    ]


# === Keyword Extraction ===
def extract_top_keywords(text, top_n=5):
    """Return up to top_n keywords of a text with their TF-IDF weights.

    The vectorizer is fitted on this single text only, with English stop words
    removed and the vocabulary capped at top_n terms.

    Args:
        text: Text to analyse.
        top_n: Maximum number of keywords to keep.

    Returns:
        dict: term -> weight rounded to 4 decimals. Best effort: any failure
        (for example, no usable terms) yields an empty dict.
    """
    try:
        tfidf = TfidfVectorizer(stop_words="english", max_features=top_n)
        weights = tfidf.fit_transform([text]).toarray().flatten()
        terms = tfidf.get_feature_names_out()
        keywords = {}
        for term, weight in zip(terms, weights):
            keywords[term] = round(weight, 4)
        return keywords
    except Exception:
        return {}

# === Embedding & Upload ===
def embed_chunks(chunks):
    """Encode the "text" field of each chunk with the shared embedding model.

    Args:
        chunks: Sequence of chunk dicts, each carrying a "text" entry.

    Returns:
        Whatever embed_model.encode returns for the list of texts, in the same
        order as chunks. A progress bar is displayed while encoding.
    """
    return embed_model.encode(
        [chunk["text"] for chunk in chunks],
        show_progress_bar=True,
    )

def upload_chunks_to_qdrant(chunks, embeddings):
    """Upsert chunks and their vectors into the Qdrant collection.

    Args:
        chunks: Sequence of chunk metadata dicts, stored as each point's payload.
        embeddings: Vectors aligned one-to-one with chunks.

    Returns:
        None. Does nothing when there are no chunks to upload.
    """
    points = [
        PointStruct(
            id=str(uuid.uuid4()),
            # Send plain Python floats; array-like vectors are converted.
            vector=embedding.tolist() if hasattr(embedding, "tolist") else embedding,
            payload=chunk
        )
        for chunk, embedding in zip(chunks, embeddings)
    ]
    # A document with no extractable text produces no points; skip the request
    # instead of sending an empty upsert.
    if not points:
        return
    client.upsert(collection_name=COLLECTION, points=points)

# === Orchestration ===
def process_and_upload_files(bucket, keys):
    """Download, extract, chunk, embed and upload each S3 object in keys.

    Failures are reported per file and do not stop the loop, so one bad
    document cannot abort the whole ingestion run.

    Args:
        bucket: Name of the S3 bucket holding the documents.
        keys: Iterable of object keys; only .pdf and .txt are processed.

    Returns:
        None. Progress is printed for every key.
    """
    for key in keys:
        try:
            print(f"📄 Processing: {key}")

            # Decide on the file type before downloading so unsupported
            # objects are never fetched. The check is case-insensitive.
            lowered_key = key.lower()
            if lowered_key.endswith(".pdf"):
                extract_text = extract_text_from_pdf
            elif lowered_key.endswith(".txt"):
                extract_text = extract_text_from_txt
            else:
                print(f"Skipping unsupported file type: {key}")
                continue

            file_io = download_file_from_s3(bucket, key)
            text = extract_text(file_io)

            # Documents with no extractable text would otherwise lead to an
            # empty embedding batch and an empty upsert.
            if not text or not text.strip():
                print(f"⚠️ No text extracted, skipping: {key}")
                continue

            chunks = chunk_paragraphs(text, s3_path=f"s3://{bucket}/{key}")
            embeddings = embed_chunks(chunks)
            upload_chunks_to_qdrant(chunks, embeddings)
            print(f"✅ Uploaded: {key}")
        except Exception as e:
            # Deliberate best effort: report the failure and continue.
            print(f"❌ Failed: {key}\n{e}")

# === Run ===
# Ingest every .pdf/.txt object found in S3_BUCKET: list the keys, then
# extract, chunk, embed and upsert each file into the Qdrant collection.
file_keys = list_text_and_pdf_keys(S3_BUCKET)
process_and_upload_files(S3_BUCKET, file_keys)

📄 Processing: Arrakis.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Uploaded: Arrakis.pdf
📄 Processing: Crash_Consistency.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Uploaded: Crash_Consistency.pdf
📄 Processing: Demikernel.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Uploaded: Demikernel.pdf
📄 Processing: Exokernel.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Uploaded: Exokernel.pdf
📄 Processing: FFS_Unix.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Uploaded: FFS_Unix.pdf
📄 Processing: FireCracker.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Uploaded: FireCracker.pdf
📄 Processing: HeMem.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Uploaded: HeMem.pdf
📄 Processing: LogFS.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Uploaded: LogFS.pdf
📄 Processing: Pond.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Uploaded: Pond.pdf
📄 Processing: Unix.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Uploaded: Unix.pdf
📄 Processing: ghOSt.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Uploaded: ghOSt.pdf
📄 Processing: navarro.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Uploaded: navarro.pdf
📄 Processing: scheduler-activations.pdf
❌ Failed: scheduler-activations.pdf
Missed the stop code in LZWDecode!
📄 Processing: the_linux_schedule_a_decade_of_wasted_cores.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Uploaded: the_linux_schedule_a_decade_of_wasted_cores.pdf
📄 Processing: xen_and_the_art_of_virtualization.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Uploaded: xen_and_the_art_of_virtualization.pdf


In [12]:
from sklearn.metrics import precision_score, recall_score
from difflib import SequenceMatcher

def compute_overlap_score(a, b):
    """Return difflib's similarity ratio between two sequences.

    Args:
        a: First sequence (typically the generated answer string).
        b: Second sequence (typically the expected answer string).

    Returns:
        float: SequenceMatcher.ratio(), in the range [0, 1].
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

def evaluate_rag_query(query, expected_sources=None, expected_answer=None, k=5):
    """Run one RAG query end to end and score it against optional ground truth.

    Args:
        query: Question sent to retrieval and to the chat model.
        expected_sources: Optional collection of relevant source identifiers,
            compared against the s3_path values returned by build_prompt.
        expected_answer: Optional reference answer for the overlap score.
        k: Number of chunks requested from search_qdrant.

    Returns:
        dict: The query, answer, retrieved sources, precision@k / recall@k /
        mrr@k (None when expected_sources is not given), overlap_score (None
        when expected_answer is not given), token counts, latency in
        milliseconds and the retrieved chunks.

    Side effects:
        Appends the result to monitoring["query_log"], records the latency and
        increments monitoring["access_count"] for each retrieved source.
    """
    start = time.time()
    chunks = search_qdrant(query, k)
    prompt, refs = build_prompt(query, chunks)

    # LLM Call
    response = azure_client.chat.completions.create(
        model=AZURE_DEPLOYMENT_NAME,
        messages=[
            {"role": "system", "content": "You are a helpful assistant..."},
            {"role": "user", "content": prompt}
        ]
    )

    # message.content can be None (e.g. a filtered or empty completion);
    # treat that as an empty answer instead of raising on .strip().
    answer = (response.choices[0].message.content or "").strip()
    # Latency covers retrieval, prompt building and the LLM call.
    latency = round((time.time() - start) * 1000, 2)

    # Monitoring & Logging
    retrieved_sources = list(refs.values())
    usage = getattr(response, "usage", None)
    prompt_tokens = getattr(usage, "prompt_tokens", None)
    completion_tokens = getattr(usage, "completion_tokens", None)

    # Evaluation Metrics
    precision = recall = mrr = None
    overlap_score = None

    if expected_sources:
        # Precision and recall are computed over de-duplicated sources, since
        # several retrieved chunks can come from the same document.
        retrieved_set = set(retrieved_sources)
        relevant_set = set(expected_sources)
        true_positives = len(retrieved_set & relevant_set)
        precision = true_positives / len(retrieved_set) if retrieved_set else 0
        recall = true_positives / len(relevant_set) if relevant_set else 0
        # Reciprocal rank of the first relevant source; 0 when none was retrieved.
        mrr = next(
            (
                1 / rank
                for rank, source in enumerate(retrieved_sources, start=1)
                if source in relevant_set
            ),
            0,
        )

    if expected_answer:
        overlap_score = compute_overlap_score(answer, expected_answer)

    query_eval = {
        "query": query,
        "answer": answer,
        "expected_answer": expected_answer,
        "expected_sources": expected_sources,
        "retrieved_sources": retrieved_sources,
        "precision@k": precision,
        "recall@k": recall,
        "mrr@k": mrr,
        "overlap_score": overlap_score,
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "latency_ms": latency,
        "citations_used": len(refs),
        "top_chunks": chunks,
        "errors": [],
    }

    monitoring["query_log"].append(query_eval)
    for src in retrieved_sources:
        monitoring["access_count"][src] += 1
    monitoring["latencies"].append(latency)

    return query_eval


In [11]:
import uuid, time
import numpy as np
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
from openai import AzureOpenAI

# Azure OpenAI Config
import os
from google.colab import userdata

AZURE_OPENAI_ENDPOINT = "https://ironclad-openai-001.openai.azure.com/"
# SECURITY: the API key must never be stored in the notebook. It is read from
# the AZURE_OPENAI_API_KEY environment variable, falling back to the Colab
# secret of the same name. The key that used to be hard-coded here should be
# treated as leaked and rotated.
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY") or userdata.get("AZURE_OPENAI_API_KEY")
AZURE_DEPLOYMENT_NAME = "gpt-4o"

azure_client = AzureOpenAI(
    api_key=AZURE_OPENAI_API_KEY,
    api_version="2023-05-15",
    azure_endpoint=AZURE_OPENAI_ENDPOINT
)

# ------------------- QDRANT -------------------
# Reconnect with a longer timeout than the ingestion cell used. QDRANT_HOST and
# QDRANT_PORT come from the setup cell and must already be defined.
client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT, timeout=120)

# ------------------- MONITORING -------------------
# In-memory usage statistics, reset every time this cell runs.
monitoring = {
    "access_count": defaultdict(int),  # source path -> times retrieved
    "latencies": [],                   # per-query latency in milliseconds
    "query_log": []                    # one record per query
}

# ------------------- RAG FUNCTIONS -------------------
def search_qdrant(query, k=5):
    """Embed the query and return the k best-matching chunks from Qdrant.

    Args:
        query: Natural-language question.
        k: Maximum number of points to retrieve.

    Returns:
        list[dict]: One dict per hit with keys score, text, s3_path,
        file_name, department and top_keywords. Missing payload fields fall
        back to "" (or {} for top_keywords).
    """
    query_vector = embed_model.encode([query])[0]

    response = client.query_points(
        collection_name=COLLECTION,
        query=query_vector,
        limit=k,
        with_payload=True,
        timeout=120
    )

    return [
        {
            "score": hit.score,
            "text": hit.payload.get("text", ""),
            "s3_path": hit.payload.get("s3_path", ""),
            "file_name": hit.payload.get("file_name", ""),
            "department": hit.payload.get("department", ""),
            "top_keywords": hit.payload.get("top_keywords", {}),
        }
        for hit in response.points
    ]

def build_prompt(query, top_chunks):
    """Assemble the grounded-answer prompt and the citation lookup table.

    Each chunk is rendered as "[n] (source):" followed by its text, numbered
    from 1 in the order given.

    Args:
        query: The user's question.
        top_chunks: Sequence of chunk dicts; each needs "text" and may carry
            "s3_path" (defaults to "unknown").

    Returns:
        tuple[str, dict]: The prompt text and a mapping from citation label
        (e.g. "[1]") to the chunk's source path.
    """
    source_refs = {}
    sections = []

    for position, chunk in enumerate(top_chunks, start=1):
        ref = f"[{position}]"
        source = chunk.get("s3_path", "unknown")
        sections.append(f"{ref} ({source}):\n{chunk['text']}\n\n")
        source_refs[ref] = source

    context = "".join(sections)

    prompt = (
        "You are a helpful assistant. Use only the following context to answer the question.\n"
        "Cite sources using [1], [2], etc., based only on the exact chunks below. Do not make up citations. Do not include sources not explicitly mentioned.\n"
        "\n"
        "Context:\n"
        f"{context}\n"
        "\n"
        f"Question: {query}\n"
        "\n"
        "Answer:"
    )
    return prompt, source_refs

def rag_query(query, k=5):
    """Answer a question with retrieval-augmented generation.

    Retrieves k chunks from Qdrant, builds a citation-numbered prompt, asks the
    Azure OpenAI deployment for an answer and records monitoring statistics.

    Args:
        query: The user's question.
        k: Number of chunks to retrieve.

    Returns:
        tuple: (answer, refs, latency, chunks) where refs maps citation labels
        to source paths, latency is in milliseconds and chunks is the list
        returned by search_qdrant.
    """
    start = time.time()
    chunks = search_qdrant(query, k)
    prompt, refs = build_prompt(query, chunks)

    response = azure_client.chat.completions.create(
        model=AZURE_DEPLOYMENT_NAME,
        messages=[
            {"role": "system", "content": "You are a helpful assistant. Use only the following context to answer the question. Cite sources using [1], [2], etc., based only on the exact chunks below. Do not make up citations. Do not include sources not explicitly mentioned."},
            {"role": "user", "content": prompt}
        ],
    )

    # message.content can be None (e.g. a filtered or empty completion);
    # treat that as an empty answer instead of raising on .strip().
    answer = (response.choices[0].message.content or "").strip()
    # Latency covers retrieval, prompt building and the LLM call.
    latency = round((time.time() - start) * 1000, 2)

    sources = list(refs.values())
    for source in sources:
        monitoring["access_count"][source] += 1
    monitoring["latencies"].append(latency)
    monitoring["query_log"].append({
        "query": query,
        "sources": sources,
        "latency_ms": latency
    })

    return answer, refs, latency, chunks

# ------------------- MAIN SCRIPT -------------------

questions = [
    "How does Arrakis ensure security and isolation for applications that have direct access to hardware devices, bypassing traditional kernel mediation?",
    "What are the key hardware support features required to enable Arrakis's direct I/O access model, and how do these features impact hardware complexity and cost?",
    "In what ways can the Arrakis architecture be extended or adapted to virtualized environments and multi-tenant cloud data centers, and what challenges might arise in such contexts?",
    "How does OptFS's approach to decoupling ordering and durability via osync() and dsync() primitives impact overall system performance and reliability?",
    "In what ways does optimistic crash consistency differ from traditional journaling or soft updates methods, and what are its primary advantages and potential drawbacks?",
    "How do the case studies with gedit and SQLite demonstrate the practical benefits and limitations of the proposed optimistic crash consistency techniques?",
    "How does Demikernel achieve nanosecond-scale I/O processing overheads while maintaining portability across heterogeneous kernel-bypass devices?",
    "In what ways does the PDPIX API improve programmability for µs-scale datacenter systems compared to traditional POSIX or existing kernel-bypass APIs?",
    "What challenges did the authors face when integrating networking (e.g., DPDK, RDMA) and storage (e.g., SPDK) libOSes in a single Demikernel datapath OS, and how were they addressed?",
    "How does Pond balance the trade-off between latency sensitivity and DRAM savings when determining VM memory allocation, and what role do its machine learning models play in this process?",
    "Given the increasing access latency with larger CXL memory pool sizes, what are the practical scalability limits of Pond's architecture, and how do these limits impact overall datacenter design?",
    "How does Pond’s zNUMA approach differ from traditional NUMA memory management, and what mechanisms ensure performance remains consistent even with incorrect memory usage predictions?",
    "How does HeMem's asynchronous sampling via PEBS compare in scalability and accuracy to traditional page table scanning for hot data identification, especially as memory capacity reaches terabyte scale?",
    "Given HeMem’s user-space implementation and its reliance on userfaultfd and DMA migration, what are the potential challenges or limitations in extending it to support kernel-level memory or shared memory scenarios in multi-tenant cloud environments?",
    "Considering that HeMem achieves significant reductions in NVM wear, what design principles can be abstracted and applied to emerging memory technologies (e.g., MRAM, ReRAM) with similar asymmetries in read/write performance or endurance?",
    "What architectural and implementation choices enabled Firecracker to achieve both low overhead and strong isolation compared to traditional hypervisors like QEMU?",
    "How does Firecracker’s design and integration with AWS Lambda enable fast function startup and efficient resource utilization at massive scale?",
    "In what ways does Firecracker’s minimal device model and use of Rust for VMM development contribute to reducing the trusted computing base (TCB) and improving security?",
    "What key design strategies did the authors implement to manage physical memory fragmentation and ensure sustained superpage performance under memory pressure?",
    "How does the reservation-based allocation mechanism proposed in the paper differ from eager promotion or relocation-based superpage management strategies used in other operating systems like HP-UX or IRIX?",
    "What are the primary trade-offs involved in the incremental promotion and speculative demotion of superpages, and how do these impact system performance and memory overhead?",
    "What are the main advantages of using a log-structured file system (LFS) compared to traditional Unix file systems, especially in handling small file workloads?",
    "How does Sprite LFS manage free space using segment cleaning, and what is the role of the cost-benefit policy in optimizing write performance?",
    "In what ways does the crash recovery mechanism in Sprite LFS leverage the log structure to improve reliability and recovery time, and how does this differ from traditional file system recovery approaches?"

]

# Run every evaluation question through the RAG pipeline and print the answer,
# the retrieved chunks and the latency.
for question_number, q in enumerate(questions, start=1):
    print(f"\n========== QUESTION {question_number}: {q} ==========\n")
    answer, refs, latency, retrieved_chunks = rag_query(q)

    print("->> Answer:\n")
    print(answer)

    # Report the real number of chunks instead of a hard-coded "5".
    print(f"\n->> Top {len(retrieved_chunks)} Retrieved Chunks:\n")
    # A separate loop variable keeps the question counter from being
    # overwritten by the chunk index.
    for rank, chunk in enumerate(retrieved_chunks, start=1):
        print(f"[{rank}] File: {chunk['file_name']}")
        print(f"    Path: {chunk['s3_path']}")
        print(f"    Department: {chunk.get('department', 'unknown')}")
        # Only a 50-character preview of the chunk text is shown.
        print(f"    Text: {chunk['text'][:50].strip()}...\n")

    print(f"->>Latency: {latency} ms")
    print("\n" + "=" * 50)

# ------------------- SUMMARY -------------------
# Aggregate view of everything recorded in `monitoring` by the queries above.

print("\n========== SUMMARY ==========\n")
print(f"Total Queries: {len(monitoring['query_log'])}")
# Latency and per-source statistics exist only after at least one query ran.
if monitoring["latencies"]:
    print(f"Average Latency: {np.mean(monitoring['latencies']):.2f} ms")
    print("\nTop Accessed Sources:")
    # Most frequently retrieved sources first.
    top_sources = sorted(
        monitoring["access_count"].items(),
        key=lambda item: item[1],
        reverse=True,
    )
    for src, count in top_sources:
        print(f"• {src}: {count}x")




->> Answer:

Arrakis ensures security and isolation for applications with direct access to hardware devices by using device hardware to deliver I/O directly to a customized user-level library, without compromising process isolation. The Arrakis kernel operates in the control plane, configuring the hardware to limit application misbehavior. It achieves this by effectively isolating I/O operations, relying on hardware support for virtualization to present multiple instances of devices that are mapped to separate protection domains. This setup allows applications to conduct I/O through their protected virtual device instance without requiring kernel intervention, ensuring that security checks are integrated into the device's management interface accessible from the control plane [3], [4]. Additionally, Arrakis uses secure user-level networking and storage with a hardware-independent device model that captures the functionality required to implement data plane operations traditionally ha

In [None]:
%%writefile app.py
import streamlit as st
import uuid, time
import numpy as np
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
from openai import AzureOpenAI
from google.colab import userdata

# ------------------- CONFIG -------------------
import os  # used only to read the API key from the environment

QDRANT_HOST = "54.152.12.154"
QDRANT_PORT = 6333
COLLECTION = "docs_chunks"
EMBED_DIM = 384

# Azure OpenAI Config
AZURE_OPENAI_ENDPOINT = "https://ironclad-openai-001.openai.azure.com/"
# SECURITY: the API key must never be written into this file. It is read from
# the environment of the process that launches Streamlit. The key that used to
# be hard-coded here should be treated as leaked and rotated.
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
if not AZURE_OPENAI_API_KEY:
    raise RuntimeError(
        "Set the AZURE_OPENAI_API_KEY environment variable before launching the app."
    )
AZURE_DEPLOYMENT_NAME = "gpt-4o"

# ------------------- LOAD MODELS -------------------
@st.cache_resource
def load_embedder():
    """Build the sentence-embedding model, cached by Streamlit between reruns."""
    model_name = "all-MiniLM-L6-v2"
    embedder = SentenceTransformer(model_name)
    return embedder

@st.cache_resource
def load_azure_client():
    """Build the Azure OpenAI client, cached by Streamlit between reruns."""
    client_settings = {
        "api_key": AZURE_OPENAI_API_KEY,
        "api_version": "2023-05-15",
        "azure_endpoint": AZURE_OPENAI_ENDPOINT,
    }
    return AzureOpenAI(**client_settings)

embed_model = load_embedder()
azure_client = load_azure_client()

# ------------------- QDRANT -------------------
@st.cache_resource
def _load_qdrant_client():
    """Create the Qdrant client once and reuse it across script reruns."""
    return QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT, timeout=120)

client = _load_qdrant_client()

# ------------------- MONITORING -------------------
@st.cache_resource
def _load_monitoring():
    """Create the monitoring store once per server process.

    Streamlit re-executes this script on every interaction, so a plain
    module-level dict would be reset each time and the sidebar could never
    show more than the current query. st.cache_resource returns the same
    mutable object on every rerun. Note that it is shared by all sessions.
    """
    return {
        "access_count": defaultdict(int),  # source path -> times retrieved
        "latencies": [],                   # per-query latency in milliseconds
        "query_log": []                    # one record per query
    }

monitoring = _load_monitoring()

# ------------------- RAG -------------------
def search_qdrant(query, k=3):
    """Embed the query and return the k best-matching chunks from Qdrant.

    Args:
        query: Natural-language question.
        k: Maximum number of points to retrieve.

    Returns:
        list[dict]: One dict per hit with keys score, text, s3_path,
        file_name, department and top_keywords. Missing payload fields fall
        back to "" (or {} for top_keywords).
    """
    vec = embed_model.encode([query])[0]

    results = client.query_points(
        collection_name=COLLECTION,
        query=vec,
        limit=k,
        with_payload=True,
        timeout=120
    )

    # The leftover debug print of the full hit list was removed: it wrote the
    # retrieved document text to the server's stdout on every query.
    return [
        {
            "score": point.score,
            "text": point.payload.get("text", ""),
            "s3_path": point.payload.get("s3_path", ""),
            "file_name": point.payload.get("file_name", ""),
            "department": point.payload.get("department", ""),
            "top_keywords": point.payload.get("top_keywords", {}),
        }
        for point in results.points
    ]

def build_prompt(query, top_chunks):
    """Assemble the grounded-answer prompt and the citation lookup table.

    Each chunk is rendered as "[n] (source):" followed by its text, numbered
    from 1 in the order given.

    Args:
        query: The user's question.
        top_chunks: Sequence of chunk dicts; each needs "text" and may carry
            "s3_path" (defaults to "unknown").

    Returns:
        tuple[str, dict]: The prompt text and a mapping from citation label
        (e.g. "[1]") to the chunk's source path.
    """
    source_refs = {}
    rendered_chunks = []

    for number, chunk in enumerate(top_chunks, start=1):
        ref = f"[{number}]"
        source = chunk.get("s3_path", "unknown")
        rendered_chunks.append(f"{ref} ({source}):\n{chunk['text']}\n\n")
        source_refs[ref] = source

    context = "".join(rendered_chunks)

    prompt = (
        "You are a helpful assistant. Use only the following context to answer the question.\n"
        "Cite sources using [1], [2], etc., based only on the exact chunks below. Do not make up citations. Do not include sources not explicitly mentioned.\n"
        "\n"
        "Context:\n"
        f"{context}\n"
        "\n"
        f"Question: {query}\n"
        "\n"
        "Answer:"
    )
    return prompt, source_refs

def rag_query(query, k=3):
    """Answer a question with retrieval-augmented generation.

    Retrieves k chunks from Qdrant, builds a citation-numbered prompt, asks the
    Azure OpenAI deployment for an answer and records monitoring statistics.

    Args:
        query: The user's question.
        k: Number of chunks to retrieve.

    Returns:
        tuple: (answer, refs, latency, chunks) where refs maps citation labels
        to source paths, latency is in milliseconds and chunks is the list
        returned by search_qdrant.
    """
    start = time.time()
    chunks = search_qdrant(query, k)
    prompt, refs = build_prompt(query, chunks)

    response = azure_client.chat.completions.create(
        model=AZURE_DEPLOYMENT_NAME,
        messages=[
            {"role": "system", "content": "You are a helpful assistant. Use only the following context to answer the question. Cite sources using [1], [2], etc., based only on the exact chunks below. Do not make up citations. Do not include sources not explicitly mentioned."},
            {"role": "user", "content": prompt}
        ],
    )

    # message.content can be None (e.g. a filtered or empty completion);
    # treat that as an empty answer instead of raising on .strip().
    answer = (response.choices[0].message.content or "").strip()
    # Latency covers retrieval, prompt building and the LLM call.
    latency = round((time.time() - start) * 1000, 2)

    sources = list(refs.values())
    for source in sources:
        monitoring["access_count"][source] += 1
    monitoring["latencies"].append(latency)
    monitoring["query_log"].append({
        "query": query,
        "sources": sources,
        "latency_ms": latency
    })

    return answer, refs, latency, chunks


# ------------------- UI -------------------
# NOTE(review): Streamlit's docs describe set_page_config as needing to be the
# first Streamlit command of a run, yet the cached loaders are called before
# this point — confirm this is accepted by the Streamlit version in use.
st.set_page_config(page_title="RAG Chat with Citations", layout="wide")
st.title("💬 RAG Assistant with Qdrant + Azure OpenAI")
st.markdown("Ask questions based on the preloaded document index.")

query = st.text_input("Enter your question here:")

if query:
    # Retrieval, generation and rendering all happen inside the spinner context.
    with st.spinner("Searching and generating response..."):
        answer, refs, latency, retrieved_chunks = rag_query(query)

        st.markdown("### 🧠 Answer")
        st.write(answer)

        st.markdown("### 📦 Retrieved Chunks")
        for i, chunk in enumerate(retrieved_chunks):
            # One expander per chunk, labelled with its 1-based citation number.
            expander_title = f"[{i+1}] Source: {chunk['file_name']}"
            department = chunk.get('department', 'unknown')
            file_path = chunk.get('s3_path', 'N/A')
            with st.expander(expander_title, expanded=True):
                st.write(chunk["text"])
                st.caption(f"📂 Department: `{department}` | 🧾 File Path: {file_path}")

        st.markdown("### ⏱️ Latency")
        st.write(f"{latency} ms")

# ------------------- Sidebar: Monitoring -------------------
# Usage statistics read from the `monitoring` store.
sidebar = st.sidebar
sidebar.title("📊 Monitoring")
sidebar.write(f"Total queries: {len(monitoring['query_log'])}")
# Latency and per-document statistics exist only after at least one query ran.
if monitoring["latencies"]:
    sidebar.write(f"Average latency: {np.mean(monitoring['latencies']):.2f} ms")
    sidebar.write("Top documents accessed:")
    # Most frequently retrieved sources first.
    top_sources = sorted(
        monitoring["access_count"].items(),
        key=lambda item: item[1],
        reverse=True,
    )
    for src, count in top_sources:
        sidebar.write(f"• {src}: {count}x")


Writing app.py


In [1]:
# Print this VM's public IPv4 address.
# NOTE(review): presumably needed as the localtunnel landing-page password — confirm.
!wget -q -O - ipv4.icanhazip.com
# Start the Streamlit app in the background and expose port 8501 via localtunnel.
# Assumes Streamlit serves on port 8501 (no --server.port is passed) and that the
# dependency-install cell has already run in this kernel; the captured output
# shows "streamlit: command not found".
! streamlit run app.py & npx localtunnel --port 8501

104.198.250.217
/bin/bash: line 1: streamlit: command not found
[1G[0K⠙^C


In [None]:
import streamlit as st
import uuid, time
import numpy as np
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
from openai import AzureOpenAI
from google.colab import userdata

# ------------------- CONFIG -------------------
import os

# NOTE(review): this host differs from the one used when `client` was created
# earlier in the notebook, and this cell does not rebuild `client` — confirm
# which Qdrant instance the queries below are meant to hit.
QDRANT_HOST = "54.152.12.154"
QDRANT_PORT = 6333
COLLECTION = "docs_chunks"
EMBED_DIM = 384

# Azure OpenAI Config
AZURE_OPENAI_ENDPOINT = "https://ironclad-openai-001.openai.azure.com/"
# SECURITY: the API key must never be stored in the notebook. It is read from
# the AZURE_OPENAI_API_KEY environment variable, falling back to the Colab
# secret of the same name. The key that used to be hard-coded here should be
# treated as leaked and rotated.
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY") or userdata.get("AZURE_OPENAI_API_KEY")
AZURE_DEPLOYMENT_NAME = "gpt-4o"
def load_embedder():
    """Build a fresh sentence-embedding model instance."""
    model_name = "all-MiniLM-L6-v2"
    embedder = SentenceTransformer(model_name)
    return embedder

def load_azure_client():
    """Build an Azure OpenAI client from the module-level configuration."""
    client_settings = {
        "api_key": AZURE_OPENAI_API_KEY,
        "api_version": "2023-05-15",
        "azure_endpoint": AZURE_OPENAI_ENDPOINT,
    }
    return AzureOpenAI(**client_settings)

# Instantiate the embedding model and the Azure OpenAI client used by the
# helper functions defined below in this cell.
embed_model = load_embedder()
azure_client = load_azure_client()
import time
def search_qdrant(query, k=3):
    """Embed the query, fetch the k best-matching chunks and print them.

    Relies on a module-level `client` that this cell does not create.

    Args:
        query: Natural-language question.
        k: Maximum number of points to retrieve.

    Returns:
        list[dict]: One dict per hit with keys score, text, s3_path,
        file_name, department and top_keywords. Missing payload fields fall
        back to "" (or {} for top_keywords). The list is also printed.
    """
    query_vector = embed_model.encode([query])[0]

    response = client.query_points(
        collection_name=COLLECTION,
        query=query_vector,
        limit=k,
        with_payload=True,
        timeout=120
    )

    hits = [
        {
            "score": hit.score,
            "text": hit.payload.get("text", ""),
            "s3_path": hit.payload.get("s3_path", ""),
            "file_name": hit.payload.get("file_name", ""),
            "department": hit.payload.get("department", ""),
            "top_keywords": hit.payload.get("top_keywords", {}),
        }
        for hit in response.points
    ]
    print(hits)
    return hits

def build_prompt(query, top_chunks):
    """Assemble the grounded-answer prompt and the citation lookup table.

    Each chunk is rendered as "[n] (source):" followed by its text, numbered
    from 1 in the order given.

    Args:
        query: The user's question.
        top_chunks: Sequence of chunk dicts; each needs "text" and may carry
            "s3_path" (defaults to "unknown").

    Returns:
        tuple[str, dict]: The prompt text and a mapping from citation label
        (e.g. "[1]") to the chunk's source path.
    """
    source_refs = {}
    context_parts = []

    for ordinal, chunk in enumerate(top_chunks, start=1):
        ref = f"[{ordinal}]"
        source = chunk.get("s3_path", "unknown")
        context_parts.append(f"{ref} ({source}):\n{chunk['text']}\n\n")
        source_refs[ref] = source

    context = "".join(context_parts)

    prompt = (
        "You are a helpful assistant. Use only the following context to answer the question.\n"
        "Cite sources using [1], [2], etc., based only on the exact chunks below. Do not make up citations. Do not include sources not explicitly mentioned.\n"
        "\n"
        "Context:\n"
        f"{context}\n"
        "\n"
        f"Question: {query}\n"
        "\n"
        "Answer:"
    )
    return prompt, source_refs

def rag_query(query, k=3):
    """Answer a question with retrieval-augmented generation and print the answer.

    This definition replaces any earlier `rag_query` in the same kernel, so it
    returns the same 4-tuple as the other definitions in this notebook.
    Returning nothing would break callers that unpack the result.

    Args:
        query: The user's question.
        k: Number of chunks to retrieve.

    Returns:
        tuple: (answer, refs, latency, chunks) where refs maps citation labels
        to source paths, latency is in milliseconds and chunks is the list
        returned by search_qdrant.
    """
    start = time.time()
    chunks = search_qdrant(query, k)
    prompt, refs = build_prompt(query, chunks)

    response = azure_client.chat.completions.create(
        model=AZURE_DEPLOYMENT_NAME,
        messages=[
            {"role": "system", "content": "You are a helpful assistant. Use only the following context to answer the question. Cite sources using [1], [2], etc., based only on the exact chunks below. Do not make up citations. Do not include sources not explicitly mentioned."},
            {"role": "user", "content": prompt}
        ],
    )

    # message.content can be None (e.g. a filtered or empty completion);
    # treat that as an empty answer instead of raising on .strip().
    answer = (response.choices[0].message.content or "").strip()
    # Latency covers retrieval, prompt building and the LLM call.
    latency = round((time.time() - start) * 1000, 2)
    print(answer)

    return answer, refs, latency, chunks

# Smoke test: run a single question through the pipeline defined in this cell.
rag_query("what is the backend service that powers up aws lambda")

[{'score': 0.466636, 'text': 'The seccomp-bpf proﬁle whitelists 24\nsyscalls, each with additional argument ﬁltering, and 30 ioctls\n(of which 22 are required by KVM ioctl-based API). 4 Firecracker In Production\n4.1 Inside A WS Lambda\nLambda [51] is a compute service which runs functions in re-\nsponse to events. Lambda offers a number of built-in language\nruntimes (including Python, Java, NodeJS, and C#) which al-\nlows functions to be provided as snippets of code implement-\ning a language-speciﬁc runtime interface. A "Hello, World!" Lambda function can be implemented in as few as three lines\nof Python or Javascript. It also supports an HTTP/REST run-\ntime API, allowing programs which implement this API to\nbe developed in any language, and provided either as bina-\nries or a bundle alongside their language implementation. Lambda functions run within a sandbox, which provides a\nminimal Linux userland and some common libraries and utili-\nties. When Lambda functions are created,