In [32]:
!pip install boto3 faiss-cpu sentence-transformers transformers accelerate pypdf PyPDF2 PyCryptodome qdrant-client streamlit
!pip install nltk pdfminer.six scikit-learn localtunnel

import io
import os
import uuid
import boto3
import nltk
import string
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, FieldCondition, Filter, MatchValue
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from google.colab import userdata

# Tokenizer data used by nltk.tokenize.sent_tokenize in the chunking step.
nltk.download('punkt')
nltk.download('punkt_tab')

# AWS credentials are read from the Colab secret store, never from source.
AWS_ACCESS_KEY = userdata.get("AWS_ACCESS_KEY")
AWS_SECRET_KEY = userdata.get("AWS_SECRET_KEY")
S3_BUCKET = "rag-vector-db-poc"
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
# Ask the model for its embedding width so the collection schema cannot drift
# from the model if the model name above is ever changed.
EMBED_DIM = embed_model.get_sentence_embedding_dimension()


s3 = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY
)

QDRANT_HOST = "54.152.12.154"
QDRANT_PORT = 6333
COLLECTION = "docs_chunks"

client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)

# `recreate_collection` is deprecated in qdrant-client (it emitted a warning
# when this cell ran). The explicit exists/delete/create sequence below keeps
# the same semantics: every run of this cell starts from an EMPTY collection,
# so any previously indexed points are discarded.
if client.collection_exists(COLLECTION):
    client.delete_collection(collection_name=COLLECTION)

client.create_collection(
    collection_name=COLLECTION,
    vectors_config=VectorParams(size=EMBED_DIM, distance=Distance.COSINE)
)

Collecting pdfminer.six
  Using cached pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
[31mERROR: Could not find a version that satisfies the requirement localtunnel (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for localtunnel[0m[31m
[0m

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
  client.recreate_collection(


True

In [33]:
# === S3 Utilities ===
def list_text_and_pdf_keys(bucket, prefix=""):
    """Return the keys of all ``.pdf`` and ``.txt`` objects in an S3 bucket.

    Args:
        bucket: Name of the S3 bucket to list.
        prefix: Optional key prefix used to restrict the listing.

    Returns:
        A list of object keys, in listing order, whose names end with
        ``.pdf`` or ``.txt`` (case-sensitive match).
    """
    wanted_suffixes = (".pdf", ".txt")
    # The paginator walks every result page of list_objects_v2.
    pages = s3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=prefix)
    return [
        entry["Key"]
        for page in pages
        # A page without a "Contents" entry contributes nothing.
        for entry in page.get("Contents", [])
        if entry["Key"].endswith(wanted_suffixes)
    ]

def download_file_from_s3(bucket, key):
    """Fetch one S3 object and return its full content as an in-memory stream.

    Args:
        bucket: Name of the S3 bucket.
        key: Key of the object to download.

    Returns:
        An ``io.BytesIO`` holding the object's bytes, positioned at the start.
    """
    body = s3.get_object(Bucket=bucket, Key=key)['Body']
    payload = body.read()
    return io.BytesIO(payload)

# === Text Extraction ===
def extract_text_from_pdf(pdf_io):
    """Extract the text of every page of a PDF and join it with newlines.

    Args:
        pdf_io: A binary file-like object containing the PDF.

    Returns:
        The text of all pages that yielded any text, separated by ``"\\n"``.
        Pages for which extraction returns an empty/falsy value are skipped,
        so a PDF with no extractable text produces ``""``.
    """
    reader = PdfReader(pdf_io)
    # Extract each page exactly once; the generator feeds the filter below so
    # the (comparatively expensive) extraction is not repeated per page.
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n".join(page_text for page_text in page_texts if page_text)

def extract_text_from_txt(txt_io):
    """Read a binary stream and decode its content as UTF-8 text.

    Args:
        txt_io: A binary file-like object (e.g. ``io.BytesIO``).

    Returns:
        The decoded text. A leading UTF-8 byte-order mark, if present, is
        removed so it does not leak into the first chunk as a stray character.

    Raises:
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # "utf-8-sig" decodes exactly like "utf-8" but drops a leading BOM.
    return txt_io.read().decode("utf-8-sig")

# === Chunking ===
def chunk_paragraphs(text, s3_path, chunk_token_limit=500):
    """Split ``text`` into sentence-aligned chunks and attach metadata to each.

    Sentences (from ``sent_tokenize``) are accumulated until adding the next
    one would push the running whitespace-separated word count above
    ``chunk_token_limit``; the accumulated sentences then become one chunk.
    Sentences are never split, so a single sentence longer than the limit
    becomes its own, oversized chunk.

    Args:
        text: The document text to chunk.
        s3_path: Source location, e.g. ``"s3://bucket/finance/2023/report.pdf"``.
        chunk_token_limit: Maximum number of whitespace-separated words per
            chunk ("tokens" here are words, not model tokens).

    Returns:
        A list of dicts, one per chunk, with the keys ``text``, ``s3_path``,
        ``file_name``, ``chunk_id``, ``chunk_index``, ``token_count``,
        ``department`` and ``top_keywords``. Empty text yields ``[]``.
    """
    # Drop the scheme and the bucket so only the object key remains, e.g.
    # "s3://bucket/finance/2023/report.pdf" -> "finance/2023/report.pdf".
    # partition() yields "" (instead of raising) when there is no key part.
    _, _, key_path = s3_path.replace("s3://", "").partition("/")
    # The first folder of the key is treated as the department; objects that
    # sit directly at the bucket root are labelled "root".
    department = key_path.split("/")[0] if "/" in key_path else "root"
    file_name = os.path.basename(s3_path)

    chunks, current_chunk = [], []
    token_count = 0

    for sentence in sent_tokenize(text):
        token_len = len(sentence.split())
        # Flush only when something has been collected. Without the
        # `current_chunk` guard, a first sentence longer than the limit
        # would emit an empty "" chunk.
        if current_chunk and token_count + token_len > chunk_token_limit:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            token_count = 0
        current_chunk.append(sentence)
        token_count += token_len

    # Whatever is left after the last sentence forms the final chunk.
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    # NOTE(review): the embedding model presumably truncates inputs beyond its
    # own maximum sequence length, so very long chunks may be only partially
    # embedded - confirm against the model's limits.
    return [
        {
            "text": chunk,
            "s3_path": s3_path,
            "file_name": file_name,
            "chunk_id": f"{s3_path}_{i}",
            "chunk_index": i,
            "token_count": len(chunk.split()),
            "department": department,
            "top_keywords": extract_top_keywords(chunk)
        }
        for i, chunk in enumerate(chunks)
    ]


# === Keyword Extraction ===
def extract_top_keywords(text, top_n=5):
    """Score the most frequent non-stop-word terms of ``text`` with TF-IDF.

    Args:
        text: The text to analyse (fitted as a single-document corpus).
        top_n: Maximum number of terms the vectorizer keeps.

    Returns:
        A dict mapping each kept term to its score rounded to 4 decimals.
        Best-effort: if anything goes wrong while vectorizing, ``{}`` is
        returned instead of raising.
    """
    try:
        tfidf = TfidfVectorizer(stop_words="english", max_features=top_n)
        matrix = tfidf.fit_transform([text])
        weights = matrix.toarray().flatten()
        terms = tfidf.get_feature_names_out()
        keywords = {}
        for term, weight in zip(terms, weights):
            keywords[term] = round(weight, 4)
        return keywords
    except Exception:
        # Keyword extraction is optional metadata; never fail chunking on it.
        return {}

# === Embedding & Upload ===
def embed_chunks(chunks):
    """Embed the ``"text"`` field of every chunk with the shared model.

    Args:
        chunks: Iterable of chunk dicts, each carrying a ``"text"`` entry.

    Returns:
        Whatever ``embed_model.encode`` returns for the list of texts, in the
        same order as ``chunks``. A progress bar is shown while encoding.
    """
    texts = []
    for chunk in chunks:
        texts.append(chunk["text"])
    return embed_model.encode(texts, show_progress_bar=True)

def upload_chunks_to_qdrant(chunks, embeddings):
    """Upsert chunks and their embeddings into the Qdrant collection.

    Each chunk dict is stored as the point's payload and the matching
    embedding as its vector.

    Args:
        chunks: List of chunk dicts (used verbatim as payload).
        embeddings: Sequence of vectors aligned with ``chunks``.

    Returns:
        None. Does nothing when ``chunks`` is empty.
    """
    # Nothing to index (e.g. a document with no extractable text): return
    # early rather than sending an empty upsert request to the server.
    if not chunks:
        return

    points = []
    for chunk, embedding in zip(chunks, embeddings):
        chunk_id = chunk.get("chunk_id")
        # Derive the point id from the stable chunk_id when there is one, so
        # re-ingesting the same document overwrites its points instead of
        # piling up duplicates under fresh random ids.
        if chunk_id:
            point_id = uuid.uuid5(uuid.NAMESPACE_URL, str(chunk_id))
        else:
            point_id = uuid.uuid4()
        # Send plain Python floats; array-like embeddings expose tolist().
        vector = embedding.tolist() if hasattr(embedding, "tolist") else list(embedding)
        points.append(PointStruct(id=str(point_id), vector=vector, payload=chunk))

    client.upsert(collection_name=COLLECTION, points=points)

# === Orchestration ===
def process_and_upload_files(bucket, keys):
    """Download, extract, chunk, embed and upload each S3 object in ``keys``.

    Failures are isolated per file: an exception while handling one key is
    printed and processing continues with the next key.

    Args:
        bucket: Name of the S3 bucket that holds the objects.
        keys: Iterable of object keys to ingest. Only keys ending in ``.pdf``
            or ``.txt`` (any letter case) are processed; others are skipped.

    Returns:
        None. Progress is reported through ``print``.
    """
    for key in keys:
        try:
            print(f"📄 Processing: {key}")

            # Decide on the extractor BEFORE downloading so unsupported
            # objects do not cost a transfer.
            lowered = key.lower()
            if lowered.endswith(".pdf"):
                extract = extract_text_from_pdf
            elif lowered.endswith(".txt"):
                extract = extract_text_from_txt
            else:
                print(f"Skipping unsupported file type: {key}")
                continue

            file_io = download_file_from_s3(bucket, key)
            text = extract(file_io)

            chunks = chunk_paragraphs(text, s3_path=f"s3://{bucket}/{key}")
            # Drop blank chunks: they carry no content worth embedding.
            chunks = [chunk for chunk in chunks if chunk["text"].strip()]
            if not chunks:
                # E.g. an image-only PDF: there is nothing to embed, and
                # reporting it as "Uploaded" would be misleading.
                print(f"⚠️ No extractable text, skipped: {key}")
                continue

            embeddings = embed_chunks(chunks)
            upload_chunks_to_qdrant(chunks, embeddings)
            print(f"✅ Uploaded: {key}")
        except Exception as e:
            # Deliberately broad: one bad file must not abort the whole batch.
            print(f"❌ Failed: {key}\n{e}")

# === Run ===
# Collect the keys of all .pdf/.txt objects in the bucket (no prefix filter),
# then run the full extract -> chunk -> embed -> upload pipeline on each one.
file_keys = list_text_and_pdf_keys(S3_BUCKET)
process_and_upload_files(S3_BUCKET, file_keys)

📄 Processing: GHC.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Uploaded: GHC.pdf
📄 Processing: Internship Certificate.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Uploaded: Internship Certificate.pdf
📄 Processing: Joining Letter.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Uploaded: Joining Letter.pdf
📄 Processing: academic standing.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Uploaded: academic standing.pdf
📄 Processing: buddy4study.pdf


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Uploaded: buddy4study.pdf


In [39]:
%%writefile app.py
import streamlit as st
import uuid, time
import numpy as np
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
from openai import AzureOpenAI
from google.colab import userdata

# ------------------- CONFIG -------------------
import os  # needed here to read the API key from the environment

QDRANT_HOST = "54.152.12.154"
QDRANT_PORT = 6333
COLLECTION = "docs_chunks"
EMBED_DIM = 384

# Azure OpenAI Config
AZURE_OPENAI_ENDPOINT = "https://ironclad-openai-001.openai.azure.com/"
# SECURITY: the API key must never live in source control. It is read from
# the environment of the process that launches Streamlit (for example, set
# os.environ["AZURE_OPENAI_API_KEY"] in the notebook before running
# `streamlit run app.py`). Any key that was previously committed in this
# file should be considered leaked and rotated.
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
AZURE_DEPLOYMENT_NAME = "gpt-4o"

if not AZURE_OPENAI_API_KEY:
    # Fail fast with an actionable message instead of a late auth error.
    raise RuntimeError(
        "AZURE_OPENAI_API_KEY is not set. Export it in the environment "
        "before starting the app."
    )

# ------------------- LOAD MODELS -------------------
@st.cache_resource
def load_embedder():
    """Build the sentence-embedding model used to encode user queries.

    Wrapped in ``st.cache_resource`` so the model object is created once and
    reused rather than rebuilt on each call.
    """
    model_name = "all-MiniLM-L6-v2"
    embedder = SentenceTransformer(model_name)
    return embedder

@st.cache_resource
def load_azure_client():
    """Build the Azure OpenAI client from the module-level configuration.

    Wrapped in ``st.cache_resource`` so the client object is created once and
    reused rather than rebuilt on each call.
    """
    client_options = {
        "api_key": AZURE_OPENAI_API_KEY,
        "api_version": "2023-05-15",
        "azure_endpoint": AZURE_OPENAI_ENDPOINT,
    }
    return AzureOpenAI(**client_options)

embed_model = load_embedder()
azure_client = load_azure_client()

# ------------------- QDRANT -------------------
@st.cache_resource
def _load_qdrant_client():
    """Create the Qdrant client once instead of on every script rerun."""
    return QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)

client = _load_qdrant_client()

# ------------------- MONITORING -------------------
@st.cache_resource
def _load_monitoring():
    """Return the process-wide monitoring store.

    Streamlit re-executes this script on every interaction, so a plain
    module-level dict would be reset before each query and the sidebar would
    never show more than the current query. ``st.cache_resource`` hands back
    the same mutable object on every rerun (and to every session), which lets
    the counters accumulate.
    """
    return {
        "access_count": defaultdict(int),  # source path -> times retrieved
        "latencies": [],                   # per-query latency in ms
        "query_log": []                    # one dict per answered query
    }

monitoring = _load_monitoring()

# ------------------- RAG -------------------
def search_qdrant(query, k=5):
    """Embed ``query`` and fetch the ``k`` nearest chunks from Qdrant.

    Args:
        query: The user's question.
        k: Maximum number of hits to request.

    Returns:
        A list with one dict per hit holding ``text``, ``file_name``,
        ``s3_path`` and ``department`` taken from the hit's payload; missing
        payload fields fall back to ``""`` (text) or ``"unknown"``.
    """
    # (payload field, fallback when the field is absent), in output key order.
    field_defaults = (
        ("text", ""),
        ("file_name", "unknown"),
        ("s3_path", "unknown"),
        ("department", "unknown"),
    )

    query_vector = embed_model.encode([query])[0]
    hits = client.search(COLLECTION, query_vector=query_vector, limit=k)

    records = []
    for hit in hits:
        records.append(
            {field: hit.payload.get(field, fallback) for field, fallback in field_defaults}
        )
    return records


def build_prompt(query, top_chunks):
    """Assemble the grounded-answer prompt and its citation lookup table.

    Every chunk is labelled ``[1]``, ``[2]``, ... in order and rendered as
    ``"[n] (source):\\n<text>\\n\\n"`` inside the context section.

    Args:
        query: The user's question.
        top_chunks: Chunk dicts; each must have ``"text"`` and may have
            ``"s3_path"`` (``"unknown"`` is used when it is absent).

    Returns:
        A ``(prompt, source_refs)`` tuple where ``source_refs`` maps each
        citation label to the chunk's source path.
    """
    source_refs = {}
    sections = []

    for position, chunk in enumerate(top_chunks, start=1):
        label = f"[{position}]"
        origin = chunk.get("s3_path", "unknown")
        sections.append(f"{label} ({origin}):\n{chunk['text']}\n\n")
        source_refs[label] = origin

    context = "".join(sections)

    instructions = (
        "You are a helpful assistant. Use only the following context to answer the question.\n"
        "Cite sources using [1], [2], etc., based only on the exact chunks below. "
        "Do not make up citations. Do not include sources not explicitly mentioned.\n"
    )
    prompt = f"{instructions}\nContext:\n{context}\n\nQuestion: {query}\n\nAnswer:"
    return prompt, source_refs

def rag_query(query, k=5):
    """Answer ``query`` with retrieval-augmented generation and log the call.

    Retrieves the top ``k`` chunks, builds a citation-aware prompt, asks the
    Azure OpenAI deployment for an answer, then records latency and accessed
    sources in the ``monitoring`` store.

    Args:
        query: The user's question.
        k: Number of chunks to retrieve.

    Returns:
        A tuple ``(answer, refs, latency, chunks)``: the answer text, the
        citation-label -> source mapping, the end-to-end latency in
        milliseconds (rounded to 2 decimals), and the retrieved chunk dicts.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # (or go negative) if the system clock is adjusted mid-request.
    start = time.perf_counter()
    chunks = search_qdrant(query, k)
    prompt, refs = build_prompt(query, chunks)

    response = azure_client.chat.completions.create(
        model=AZURE_DEPLOYMENT_NAME,
        messages=[
            {"role": "system", "content": "You are a helpful assistant. Use only the following context to answer the question. Cite sources using [1], [2], etc., based only on the exact chunks below. Do not make up citations. Do not include sources not explicitly mentioned."},
            {"role": "user", "content": prompt}
        ],
    )

    # The message content is optional in the API response (it can come back
    # as None); treat that as an empty answer instead of crashing on .strip().
    answer = (response.choices[0].message.content or "").strip()
    latency = round((time.perf_counter() - start) * 1000, 2)

    # One increment per retrieved chunk, keyed by the chunk's source path.
    for ref in refs.values():
        monitoring["access_count"][ref] += 1
    monitoring["latencies"].append(latency)
    monitoring["query_log"].append({
        "query": query,
        "sources": list(refs.values()),
        "latency_ms": latency
    })

    return answer, refs, latency, chunks


# ------------------- UI -------------------
st.set_page_config(page_title="RAG Chat with Citations", layout="wide")
st.title("💬 RAG Assistant with Qdrant + Azure OpenAI")

st.markdown("Ask questions based on the preloaded document index.")

# Strip the input so a whitespace-only entry counts as "no question" and
# does not trigger an embedding + LLM round trip.
query = st.text_input("Enter your question here:").strip()

if query:
    try:
        with st.spinner("Searching and generating response..."):
            answer, refs, latency, retrieved_chunks = rag_query(query)
    except Exception as exc:
        # Show backend failures (vector store or LLM call) as a readable
        # message instead of a raw traceback; the sidebar below still renders.
        st.error(f"Could not answer the question: {exc}")
    else:
        st.markdown("### 🧠 Answer")
        st.write(answer)

        st.markdown("### 📦 Retrieved Chunks")

        # Labels match the [n] citation markers used in the prompt.
        for i, chunk in enumerate(retrieved_chunks):
            with st.expander(f"[{i+1}] Source: {chunk['file_name']}", expanded=True):
                st.write(chunk["text"])
                st.caption(
                    f"📂 Department: `{chunk.get('department', 'unknown')}` | "
                    f"🧾 File Path: {chunk.get('s3_path', 'N/A')}"
                )

        st.markdown("### ⏱️ Latency")
        st.write(f"{latency} ms")

# ------------------- Sidebar: Monitoring -------------------
# Sidebar summary built from the monitoring store: query count, mean latency,
# and sources ordered by how often they were retrieved (most frequent first).
st.sidebar.title("📊 Monitoring")
st.sidebar.write(f"Total queries: {len(monitoring['query_log'])}")

recorded_latencies = monitoring["latencies"]
if recorded_latencies:
    mean_latency = np.mean(recorded_latencies)
    st.sidebar.write(f"Average latency: {mean_latency:.2f} ms")
    st.sidebar.write("Top documents accessed:")
    ranked_sources = sorted(
        monitoring["access_count"].items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for source, hits in ranked_sources:
        st.sidebar.write(f"• {source}: {hits}x")


Overwriting app.py


In [None]:
# Print this runtime's public IPv4 address.
# NOTE(review): presumably needed as the access password on the localtunnel
# landing page - confirm.
!wget -q -O - ipv4.icanhazip.com
# Start the Streamlit app in the background and expose port 8501 (the port
# Streamlit reports in its startup log) through a localtunnel public URL.
! streamlit run app.py & npx localtunnel --port 8501

34.106.210.38

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.106.210.38:8501[0m
[0m
[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0Kyour url is: https://wet-wolves-marry.loca.lt
2025-07-03 07:24:00.145724: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751527440.194626   16109 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751527440.212941   16109 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register fac