<a href="https://colab.research.google.com/github/vinay-jose/rag-nbs/blob/main/Bi_encoder_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!sudo apt-get install poppler-utils
!sudo apt-get install tesseract-ocr

In [None]:
!pip install -Uqq "unstructured[all-docs]" pytesseract "qdrant-client[fastembed]" transformers huggingface_hub "rerankers"

In [None]:
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title
from qdrant_client import QdrantClient, models
from tqdm import tqdm
from rerankers import Reranker

In [None]:
# Path of the PDF that gets partitioned, chunked and indexed below.
# NOTE(review): hard-coded absolute path; presumably the file is uploaded to the Colab
# runtime under /content before running — confirm or make it configurable.
fname = "/content/THINKZ schedule of work.pdf"

In [None]:
# Options for partitioning the PDF into elements.
# Strategies: https://docs.unstructured.io/open-source/concepts/partitioning-strategies
# Models (yolox, yolox_quantized, detectron2_onnx): https://docs.unstructured.io/open-source/concepts/models
partition_options = dict(
    infer_table_structure=True,
    strategy="hi_res",
    hi_res_model_name="yolox_quantized",
)

elements = partition_pdf(filename=fname, **partition_options)

In [None]:
# Group the partitioned elements into chunks with unstructured's title-based chunker
# (no chunking options are passed, so the library defaults apply).
chunks = chunk_by_title(elements)

In [None]:
# Qdrant client used for indexing and search. ":memory:" selects the client's
# in-process mode, so no external server is needed for this notebook.
# To target a running server instead, use the commented line below.
# NOTE(review): 6334 looks like Qdrant's gRPC port (REST is usually 6333) — confirm before enabling.
# client = QdrantClient(url="http://localhost:6334")
client = QdrantClient(":memory:")

In [None]:
# Dense embedding model the client uses when encoding documents and queries.
client.set_model("sentence-transformers/all-MiniLM-L6-v2")
# Sparse embedding model; with both set, the collection stores dense and sparse vectors.
# comment the line below to use dense vectors only
client.set_sparse_model("prithivida/Splade_PP_en_v1")

In [None]:
# Create the collection once, with vector parameters derived from the models selected
# on the client. The name must be the one the indexing and query cells use
# ("tender_docs"); the previous name "docs" left this explicitly configured
# collection unused.
if not client.collection_exists("tender_docs"):
    client.create_collection(
        collection_name="tender_docs",
        vectors_config=client.get_fastembed_vector_params(),
        # comment this line to use dense vectors only
        sparse_vectors_config=client.get_fastembed_sparse_vector_params(),
    )

In [None]:
# Tags attached to every chunk's metadata so searches can be filtered per tenant/project.
data = dict(tenant="thinkz", project="tender")

In [None]:
# Tenant/project tags that get merged into each chunk's metadata.
tenant_tags = {"tenant": data.get("tenant"), "project": data.get("project")}

# Parallel lists: documents[i] is the text of chunk i, metadata[i] its tagged metadata.
documents, metadata = [], []
for piece in chunks:
    documents.append(piece.text)
    metadata.append({**piece.metadata.to_dict(), **tenant_tags})

In [None]:
# Sequential integer ids, wrapped in tqdm so a progress bar is shown while they are consumed.
point_ids = tqdm(range(len(documents)))

# Encode the chunk texts with the client's configured models and store them with their metadata.
client.add(
    collection_name="tender_docs",
    documents=documents,
    metadata=metadata,
    ids=point_ids,
    parallel=0,  # 0 = use all available CPU cores to encode data
)

In [None]:
# Restrict searches to a single tenant.
# (Set query_filter = None instead to search without any filtering.)
tenant_of_interest = "thinkz"

# Condition on the "tenant" metadata field stored with every chunk.
tenant_condition = models.FieldCondition(
    key="tenant",
    match=models.MatchValue(value=tenant_of_interest),
)

query_filter = models.Filter(must=[tenant_condition])

In [None]:
# Retrieve the closest chunks for the question, restricted by the tenant filter.
search_result = client.query(
    collection_name="tender_docs",
    query_text="Total cost",
    query_filter=query_filter,
    limit=5,  # number of closest results to return
    # with_vectors=True,  # uncomment to also fetch the stored vectors
)

search_result

In [None]:
# Metadata of each search hit, in result order.
# NOTE(review): this rebinds `metadata`, which earlier held the per-chunk metadata passed
# to client.add; re-running the indexing cell after this one would use the hits
# instead — consider a distinct name.
metadata = [hit.metadata for hit in search_result]
metadata

In [None]:
# Retrieval score of each hit, in the same order as search_result.
scores = [hit.score for hit in search_result]
scores

In [None]:
# Document of each search hit, in result order; these are handed to the reranker next.
# NOTE(review): this rebinds `documents`, which earlier held every chunk text passed
# to client.add; re-running the indexing cell after this one would index only the
# hits — consider a distinct name.
documents = [hit.document for hit in search_result]
documents

In [None]:
# Reranker built from the "colbert" preset of the rerankers library.
ranker = Reranker("colbert")

# Re-score the retrieved hits against the same question used for retrieval.
# (Explicit ids can be attached via doc_ids=[...], e.g. doc_ids=[0,1].)
results = ranker.rank(query="Total cost", docs=documents, metadata=metadata)
results

In [None]:
# Display the top 3 entries of the reranked results.
results.top_k(3)

In [None]:
# `query` was never assigned anywhere in the notebook, so the f-string below raised
# NameError. Take it from the reranked results so the prompt always carries the exact
# question that was ranked.
query = results.query

# Feed the model the plain text of the three best reranked chunks, numbered, instead
# of the raw repr of the results object (which mixes in scores, ranks and metadata).
context = "\n\n".join(
    f"[{position}] {result.document.text}"
    for position, result in enumerate(results.top_k(3), start=1)
)

# Final prompt for the generation step. Despite the name, this is the fully rendered text.
prompt_template = f"""
You are an AI assistant. Given the user query: "{query}", and the following list of relevant documents:

{context}

Generate a coherent and informative response that addresses the user's query using the information from the documents.
"""