In [1]:
import fitz  # PyMuPDF

def extract_text(pdf_path):
    """Return the concatenated plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` output, in page order.
        A document with no pages yields an empty string.
    """
    # The context manager closes the document (and its file handle) even when
    # extracting a page raises, instead of leaving it open.
    with fitz.open(pdf_path) as doc:
        # join() builds the result once rather than re-copying the growing
        # string for every page.
        return "".join(page.get_text() for page in doc)

# Source PDFs to ingest; add further textbook paths to this list.
pdf_files = [
    "C:\\Users\\prask\\OneDrive\\Desktop\\Internship\\Binary Quantization Testing\\level-4-Crime-Story-Penguin-Readers-1.pdf",
]

# One extracted-text string per PDF, in the same order as pdf_files.
texts = [extract_text(pdf_path) for pdf_path in pdf_files]

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# 'texts' is expected to be a list of strings, one per extracted textbook.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=2048,
    chunk_overlap=0
)

# Flatten the per-document chunk lists into one list, keeping document order.
all_chunks = []
for text in texts:
    all_chunks.extend(splitter.split_text(text))

print(f"Total number of chunks: {len(all_chunks)}")
# Optional: inspect the first chunk. Guarded because all_chunks is empty when
# 'texts' is empty or the splitter produced nothing, and indexing
# all_chunks[0] would then raise IndexError.
if all_chunks:
    print(all_chunks[0][:500])  # Print first 500 characters of the first chunk
else:
    print("No chunks produced - check that the PDFs contain extractable text.")

Total number of chunks: 83
Crime Story Collection 
 
 
 
 
 
Level 4 
 
Retold by John and Celia Turvey                                                
Series Editors: Andy Hopkins and Jocelyn Potter 
Pearson Education limited 
Edinburgh Gate, Harlow, 
Essex CM20 2JE, England 
and Associated Companies throughout the world. 
ISBN 0 582 419190 
 
This compilation first published in Longman Fiction 1998                                                        
This edition first published 1999 
NEW  EDITION 
 
5 7 9 10 8 6 
 



In [4]:
import requests

import os

# Read the Jina API key from the JINA_API_KEY environment variable so the real
# secret never has to be written into the notebook. The literal placeholder is
# kept as the fallback so the name is always defined.
JINA_API_KEY = os.environ.get("JINA_API_KEY", "JINA_API_KEY")
# URL that embedding requests are POSTed to.
endpoint = "https://api.jina.ai/v1/embeddings"

def get_jina_embeddings(texts, dimensions=1024, task="retrieval.passage", timeout=60):
    """Request embeddings for ``texts`` from the configured Jina endpoint.

    Args:
        texts: Value sent as the request's ``input`` field (a list of text
            chunks in this notebook).
        dimensions: Value sent as the request's ``dimensions`` field.
        task: Value sent as the request's ``task`` field.
        timeout: Seconds passed to ``requests.post`` as ``timeout``.

    Returns:
        The ``embedding`` entry of every item in the response's ``data``
        list, in response order. An empty list when ``texts`` is empty.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status()`` when the API
            answers with an error status.
    """
    # Nothing to embed: skip the network round trip entirely.
    if not texts:
        return []

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {JINA_API_KEY}"
    }
    data = {
        "input": texts,
        "model": "jina-embeddings-v3",
        "dimensions": dimensions,
        "task": task
    }
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection, which would freeze the notebook cell.
    response = requests.post(endpoint, headers=headers, json=data, timeout=timeout)
    response.raise_for_status()
    return [item["embedding"] for item in response.json()["data"]]

# Send the chunks in fixed-size slices so a single request stays within API limits.
batch_size = 32
dense_embeddings = []
batch_starts = range(0, len(all_chunks), batch_size)
for start in batch_starts:
    stop = start + batch_size
    dense_embeddings += get_jina_embeddings(all_chunks[start:stop])

In [6]:
from fastembed import SparseTextEmbedding

# Sparse encoder loaded by model name; all_chunks is the list of text chunks to embed.
model = SparseTextEmbedding(model_name="Qdrant/bm25")
sparse_embeddings = [embedding for embedding in model.embed(all_chunks)]
# Peek at the first chunk's sparse embedding: its indices and values.
first_sparse = sparse_embeddings[0]
indices, values = first_sparse.indices, first_sparse.values

In [6]:
# Plain-dict view of every sparse embedding, in the same order as sparse_embeddings.
sparse_vectors = []
for emb in sparse_embeddings:
    sparse_vectors.append(dict(indices=emb.indices, values=emb.values))

In [None]:
from qdrant_client import QdrantClient, models

client = QdrantClient(host="192.168.1.13", port=6333)

collection_name = "With_BQ"

# collection_exists() is used instead of catching exceptions from
# get_collection(): a broad except would also treat connection or validation
# errors as "collection missing". create_collection() is used instead of
# recreate_collection() because the latter drops an existing collection
# before creating it, so a misdetected "missing" could wipe stored data.
if client.collection_exists(collection_name):
    print(f"Collection '{collection_name}' already exists.")
else:
    client.create_collection(
        collection_name=collection_name,
        # Named dense vector; size matches the 1024-dimensional embeddings
        # requested from the embedding API.
        vectors_config={
            "dense": models.VectorParams(
                size=1024,
                distance=models.Distance.COSINE,
            ),
        },
        # Named sparse vector with default parameters.
        sparse_vectors_config={
            "sparse": models.SparseVectorParams()
        },
        shard_number=1,
    )
    print(f"Collection '{collection_name}' created.")

Collection 'finance_docs' already exists.


In [None]:
from qdrant_client import QdrantClient

# Client for the Qdrant server that receives the upserted points.
client = QdrantClient(
    host="192.168.1.13",
    port=6333,
)

def _to_plain_list(seq):
    """Return ``seq`` as a list, using ``tolist()`` when the object offers it."""
    to_list = getattr(seq, "tolist", None)
    # numpy arrays expose tolist(), which also unwraps numpy scalars into
    # native Python int/float; list(ndarray) would keep numpy scalar objects.
    return to_list() if callable(to_list) else list(seq)


def ensure_sparse_dict(sparse_emb):
    """
    Converts a sparse embedding to Qdrant's expected dict format.

    Args:
        sparse_emb: Either a dict (its ``"indices"``/``"values"`` entries, if
            present, are converted to plain lists; other keys are kept as-is)
            or an object exposing ``indices`` and ``values`` attributes.

    Returns:
        A new dict. For attribute-style input it has exactly the keys
        ``"indices"`` and ``"values"``, each a plain Python list.

    Raises:
        ValueError: If ``sparse_emb`` is neither a dict nor an object with
            both ``indices`` and ``values`` attributes.
    """
    if isinstance(sparse_emb, dict):
        # Normalise dict input too, so array-valued dicts end up in the same
        # plain-list form as attribute-style embeddings.
        return {
            key: _to_plain_list(val) if key in ("indices", "values") else val
            for key, val in sparse_emb.items()
        }
    elif hasattr(sparse_emb, "indices") and hasattr(sparse_emb, "values"):
        return {
            "indices": _to_plain_list(sparse_emb.indices),
            "values": _to_plain_list(sparse_emb.values),
        }
    else:
        raise ValueError(f"Unknown sparse embedding format: {type(sparse_emb)}")

# Target collection, named once so the upsert call and the final message
# cannot drift apart.
collection_name = "With_BQ"

# zip() stops at the shortest input, so a length mismatch would drop chunks
# without any notice. Fail loudly instead.
if not (len(all_chunks) == len(dense_embeddings) == len(sparse_embeddings)):
    raise ValueError(
        "Chunk/embedding count mismatch: "
        f"{len(all_chunks)} chunks, {len(dense_embeddings)} dense, "
        f"{len(sparse_embeddings)} sparse"
    )

# Prepare points for upsert: one point per chunk, carrying both named vectors
# and the chunk text as payload. The chunk's position is used as the point id.
points = []
for idx, (text, dense_emb, sparse_emb) in enumerate(zip(all_chunks, dense_embeddings, sparse_embeddings)):
    # Accept either array-like embeddings (with tolist()) or plain lists.
    dense_emb_list = dense_emb.tolist() if hasattr(dense_emb, "tolist") else dense_emb
    sparse_emb_dict = ensure_sparse_dict(sparse_emb)
    points.append({
        "id": idx,
        "vector": {
            "dense": dense_emb_list,
            "sparse": sparse_emb_dict
        },
        "payload": {"text": text}
    })

# Upsert points in batches for efficiency
batch_size = 256
for i in range(0, len(points), batch_size):
    batch = points[i:i+batch_size]
    client.upsert(collection_name=collection_name, points=batch)
    print(f"Upserted points {i} to {i+len(batch)-1}")

# Report the collection that was actually written to.
print(f"Inserted {len(points)} points into collection '{collection_name}'.")

In [10]:
from qdrant_client import QdrantClient, models

# Name of the collection to reconfigure; change it to target another collection.
collection_name = "With_BQ"

client = QdrantClient(url="http://192.168.1.13:6333")

# Build the two configuration objects first, then apply them in one update call.
hnsw_settings = models.HnswConfigDiff(m=32, ef_construct=200)
binary_quantization = models.BinaryQuantization(
    binary=models.BinaryQuantizationConfig(always_ram=True),
)
client.update_collection(
    collection_name=collection_name,
    hnsw_config=hnsw_settings,
    quantization_config=binary_quantization,
)

# Search parameters that keep quantization enabled (ignore=False), with
# rescoring turned on and an oversampling factor of 2.0.
quantization_search = models.QuantizationSearchParams(
    ignore=False,
    rescore=True,
    oversampling=2.0,
)
search_params = models.SearchParams(quantization=quantization_search)