<a href="https://colab.research.google.com/github/vinaykrshnn-git2026/advanced-rag/blob/main/09_utility_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## General Purpose Utility

In [None]:
# Fetch the project sources, switch the working directory into the checkout,
# and install the packages listed in its requirements file.
# NOTE(review): `%cd` changes the kernel's working directory, so re-running this
# cell in the same session appears to clone a second, nested copy - confirm
# before re-running, or restart the runtime first.
!git clone https://github.com/vinaykrshnn-git2026/advanced-rag-refactored.git
%cd advanced-rag-refactored
!pip install -q -r requirement_rag_refactored.txt

In [None]:
# Mount Google Drive at /content/drive. The later cells read from and write to
# paths under /content/drive/MyDrive, so this cell has to run before them.

from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Copy all pdf files from Drive to RAG Labs

import os
import shutil
import glob
from pathlib import Path
import fnmatch

# Source and destination folders inside the mounted Drive.
# DEST_ROOT lives underneath SRC_ROOT, so the copy routine must not walk into
# it; otherwise files that were just copied get re-visited and reported as
# "skipped", which inflates the summary.
SRC_ROOT = '/content/drive/MyDrive'
DEST_ROOT = '/content/drive/MyDrive/RAG_Labs/pdf_files'


def copy_pdfs_flat(src_root, dest_root):
    """Copy every PDF found under ``src_root`` into the single folder ``dest_root``.

    The directory structure is flattened: only the file name is kept. A file
    whose name already exists in ``dest_root`` is skipped, so when two source
    folders contain a PDF with the same name only the first one visited is
    copied. Traversal order is sorted so that choice is deterministic.

    Args:
        src_root: Directory tree to search (recursively) for ``*.pdf`` files;
            the extension match is case-insensitive.
        dest_root: Folder that receives the copies. Created if missing. If it
            is located inside ``src_root`` it is excluded from the search.

    Returns:
        tuple[int, int]: ``(files_copied, files_skipped)``.
    """
    dest_abs = os.path.abspath(dest_root)

    # Create the destination folder if it doesn't exist.
    if not os.path.isdir(dest_root):
        os.makedirs(dest_root, exist_ok=True)
        print(f"Created folder: {dest_root}")

    copied = 0
    skipped = 0

    for root, dirs, files in os.walk(src_root):
        # Prune the destination folder in place (os.walk honours edits to
        # `dirs` in top-down mode) and sort for a reproducible visit order.
        dirs[:] = sorted(
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != dest_abs
        )

        for file in sorted(files):
            if not file.lower().endswith('.pdf'):
                continue

            source_file_path = os.path.join(root, file)
            destination_file_path = os.path.join(dest_root, file)

            # Never overwrite: an existing name in the destination wins.
            if os.path.exists(destination_file_path):
                print(f"Skipped (exists): {file}")
                skipped += 1
            else:
                shutil.copy2(source_file_path, destination_file_path)
                print(f"Copied: {file}")
                copied += 1

    return copied, skipped


# Only run when the source is really there. Creating DEST_ROOT while Drive is
# not mounted would leave a local directory under the mount point.
if os.path.isdir(SRC_ROOT):
    files_copied, files_skipped = copy_pdfs_flat(SRC_ROOT, DEST_ROOT)
    print(f"\nSummary:\nCopied: {files_copied}\nSkipped: {files_skipped}")
else:
    files_copied = files_skipped = 0
    print(f"Source folder not found: {SRC_ROOT} - is Google Drive mounted?")



## Upsert a new image into an existing Qdrant collection

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http.exceptions import UnexpectedResponse
from colpali_engine.models import ColPali, ColPaliProcessor
from google.colab import userdata
import torch

#####################################################################
#   Initializing Cloud Qdrant collection
#####################################################################


from qdrant_client import QdrantClient

# Replace this with the URL of your own Qdrant Cloud cluster.
# The API key is not stored in the notebook; it is read from the Colab secret
# named 'QDRANT_API_KEY'.
QDRANT_URL = "https://f7369634-b961-4d15-ba60-8b230e810658.us-east4-0.gcp.cloud.qdrant.io"

try:
    # Initialize the Cloud Client
    qdrant_client = QdrantClient(
        url=QDRANT_URL,
        api_key=userdata.get('QDRANT_API_KEY'),
    )
    # Building the client object alone does not show that the cluster is
    # reachable or that the key is accepted, so issue one cheap request before
    # reporting success.
    qdrant_client.get_collections()
    print("Connected to Qdrant Cloud!")
except Exception as e:
    print(f"Cloud connection failed: {e}")
    # Stop here: later cells need a working `qdrant_client`, and continuing
    # would only surface as a confusing NameError or request failure there.
    raise



#####################################################################
#   Initializing Colpali
#####################################################################


# Initialize ColPali model and processor.
# Pick the device at runtime instead of assuming a CUDA GPU, so the cell also
# loads (slowly) on a CPU-only runtime or on Apple Silicon.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    colpali_device = "cuda:0"
elif _mps_backend is not None and _mps_backend.is_available():
    colpali_device = "mps"
else:
    colpali_device = "cpu"

model_name = "vidore/colpali-v1.2"
colpali_model = ColPali.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map=colpali_device,
)
# NOTE(review): the processor is loaded from the base checkpoint rather than
# from `model_name` - presumably intentional; confirm the two stay compatible
# if `model_name` is changed.
colpali_processor = ColPaliProcessor.from_pretrained(
    "vidore/colpaligemma-3b-pt-448-base"
)


In [None]:
import PIL.Image
import torch
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
import base64
import uuid
from io import BytesIO

# Target collection for the manual upload.
COLLECTION_NAME = "identity_documents"

# 1. Load the image. The context manager closes the underlying file once the
#    RGB copy has been made, instead of leaving the handle open.
image_path = "/content/drive/MyDrive/PHOTO-2022-01-12-09-24-39.jpg"
with PIL.Image.open(image_path) as source_image:
    image = source_image.convert("RGB")

# 2. Encode to Base64 (to store in Qdrant payload so the UI can display it later)
buffered = BytesIO()
image.save(buffered, format="PNG")
base64_string = base64.b64encode(buffered.getvalue()).decode("utf-8")

# 3. Generate ColPali embeddings with the processor/model loaded in the
#    previous cell. Gradients are not needed for inference.
with torch.no_grad():
    batch_images = colpali_processor.process_images([image]).to(colpali_model.device)
    image_embeddings = colpali_model(**batch_images)

# The batch contains exactly one image: take its entry (this selects, it does
# not flatten) and convert it to plain Python lists for the Qdrant client.
# NOTE(review): presumably this is a list of per-token vectors, so the
# collection must be configured for multi-vector points - confirm.
vector = image_embeddings[0].cpu().float().numpy().tolist()

# 4. Upsert to Qdrant under a freshly generated UUID.
unique_id = str(uuid.uuid4())
try:
    qdrant_client.upsert(
        collection_name=COLLECTION_NAME,
        points=[
            PointStruct(
                id=unique_id,  # A unique integer or UUID
                vector=vector,
                payload={
                    "doc": "manual_upload",
                    "page": 1,
                    "base64_image": base64_string,
                },
            )
        ],
    )
except Exception as e:
    print(f"Error during upsert: {e}")
else:
    # Report the generated ID; it is random and otherwise never shown.
    print(f"Upserted point {unique_id} into '{COLLECTION_NAME}'")