In [1]:
import fitz  # PyMuPDF

def extract_text(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

# List your textbook PDF file paths here
pdf_files = ["C:\\Users\\prask\\OneDrive\\Desktop\\Internship\\1000 page pdf testing\\India_Complete_Information_1000_Pages.pdf"]

# Extract text from each textbook
texts = []
for file in pdf_files:
    text = extract_text(file)
    texts.append(text)

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Assuming 'texts' is a list of strings, each containing the extracted text from a textbook
splitter = RecursiveCharacterTextSplitter(
    chunk_size=2048,
    chunk_overlap=0
)

all_chunks = []
for text in texts:
    chunks = splitter.split_text(text)
    all_chunks.extend(chunks)

print(f"Total number of chunks: {len(all_chunks)}")
# Optional: Inspect the first chunk
print(all_chunks[0][:500])  # Print first 500 characters of the first chunk

Total number of chunks: 2046
INDIA
Comprehensive Encyclopedia
Complete Information about India, States, Cities and Culture
Compiled from Wikipedia
India
India, officially the Republic of India, is a country in South Asia. It is the seventh-largest country by area; the
most populous country since 2023; and, since its independence in 1947, the world's most populous democracy.
Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the
southeast, it shares land borders with Pakistan


In [5]:
import requests
from typing import List, Dict

def encode_chunks(chunks: List[str], 
                  api_url: str = "http://192.168.1.11:8074/encode_text") -> List[Dict]:
    """
    Encode chunks using the embedding API.
    
    Args:
        chunks: List of text chunks to embed
        api_url: Embedding API endpoint
    
    Returns:
        List of dictionaries containing chunk text and embeddings
    """
    embedded_chunks = []
    
    for idx, chunk in enumerate(chunks):
        try:
            # Prepare request payload - only text field
            payload = {'text': chunk}
            
            # Make POST request
            response = requests.post(
                api_url,
                headers={
                    'accept': 'application/json',
                    'Content-Type': 'application/x-www-form-urlencoded'
                },
                data=payload,
                timeout=30
            )
            
            # Check response
            if response.status_code == 200:
                result = response.json()
                
                if result.get('status') == 'success':
                    embedded_chunks.append({
                        'text': chunk,
                        'embedding': result['embeddings'],
                        'chunk_id': idx
                    })
                    
                    print(f"Embedded chunk {idx + 1}/{len(chunks)}")
                else:
                    print(f"Failed to embed chunk {idx}: {result}")
            else:
                print(f"HTTP Error {response.status_code} for chunk {idx}")
                
        except Exception as e:
            print(f"Error embedding chunk {idx}: {str(e)}")
    
    return embedded_chunks

# Embed all chunks
print("Starting embedding process...")
embedded_data = encode_chunks(all_chunks)
print(f"Successfully embedded {len(embedded_data)} chunks")

Starting embedding process...
Embedded chunk 1/2046
Embedded chunk 2/2046
Embedded chunk 3/2046
Embedded chunk 4/2046
Embedded chunk 5/2046
Embedded chunk 6/2046
Embedded chunk 7/2046
Embedded chunk 8/2046
Embedded chunk 9/2046
Embedded chunk 10/2046
Embedded chunk 11/2046
Embedded chunk 12/2046
Embedded chunk 13/2046
Embedded chunk 14/2046
Embedded chunk 15/2046
Embedded chunk 16/2046
Embedded chunk 17/2046
Embedded chunk 18/2046
Embedded chunk 19/2046
Embedded chunk 20/2046
Embedded chunk 21/2046
Embedded chunk 22/2046
Embedded chunk 23/2046
Embedded chunk 24/2046
Embedded chunk 25/2046
Embedded chunk 26/2046
Embedded chunk 27/2046
Embedded chunk 28/2046
Embedded chunk 29/2046
Embedded chunk 30/2046
Embedded chunk 31/2046
Embedded chunk 32/2046
Embedded chunk 33/2046
Embedded chunk 34/2046
Embedded chunk 35/2046
Embedded chunk 36/2046
Embedded chunk 37/2046
Embedded chunk 38/2046
Embedded chunk 39/2046
Embedded chunk 40/2046
Embedded chunk 41/2046
Embedded chunk 42/2046
Embedded chun

In [11]:
# Install the missing package in the notebook environment
%pip install fastembed

from fastembed import SparseTextEmbedding #type:ignore

# Replace all_chunks with your list of text chunks
model = SparseTextEmbedding(model_name="Qdrant/bm25")
sparse_embeddings = list(model.embed(all_chunks))
# Example: Get indices and values for the first chunk
indices = sparse_embeddings[0].indices
values = sparse_embeddings[0].values
print("Indices:", indices)
print("Values:", values)


[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.Collecting fastembed
  Using cached fastembed-0.7.3-py3-none-any.whl.metadata (10 kB)
Collecting loguru<0.8.0,>=0.7.2 (from fastembed)
  Using cached loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Collecting py-rust-stemmers<0.2.0,>=0.1.0 (from fastembed)
  Downloading py_rust_stemmers-0.1.5-cp312-none-win_amd64.whl.metadata (3.5 kB)
Collecting win32-setctime>=1.0.0 (from loguru<0.8.0,>=0.7.2->fastembed)
  Using cached win32_setctime-1.2.0-py3-none-any.whl.metadata (2.4 kB)
Using cached fastembed-0.7.3-py3-none-any.whl (105 kB)
Using cached loguru-0.7.3-py3-none-any.whl (61 kB)
Downloading py_rust_stemmers-0.1.5-cp312-none-win_amd64.whl (209 kB)
Using cached win32_setctime-1.2.0-py3-none-any.whl (4.1 kB)
Installing collected packages: py-rust-stemmers, win32-setctime, loguru, fastembed
Successfully installed fastembed-0.7.3 loguru-0.7.3 py-rust-stemmers-0.1.5 win32-setctime-1.2.0



Fetching 18 files:   0%|          | 0/18 [00:00<?, ?it/s]

arabic.txt: 0.00B [00:00, ?B/s]

dutch.txt:   0%|          | 0.00/453 [00:00<?, ?B/s]

greek.txt: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

german.txt: 0.00B [00:00, ?B/s]

french.txt:   0%|          | 0.00/813 [00:00<?, ?B/s]

danish.txt:   0%|          | 0.00/424 [00:00<?, ?B/s]

finnish.txt: 0.00B [00:00, ?B/s]

hungarian.txt: 0.00B [00:00, ?B/s]

norwegian.txt:   0%|          | 0.00/851 [00:00<?, ?B/s]

italian.txt: 0.00B [00:00, ?B/s]

portuguese.txt: 0.00B [00:00, ?B/s]

russian.txt: 0.00B [00:00, ?B/s]

spanish.txt: 0.00B [00:00, ?B/s]

romanian.txt: 0.00B [00:00, ?B/s]

[32m2025-11-04 12:50:40.660[0m | [31m[1mERROR   [0m | [36mfastembed.common.model_management[0m:[36mdownload_model[0m:[36m430[0m - [31m[1mCould not download model from HuggingFace: [WinError 1314] A required privilege is not held by the client: '..\\..\\blobs\\cafa0324b53763f7efadda5b0f3d0321ffa7ab38' -> 'C:\\Users\\prask\\AppData\\Local\\Temp\\fastembed_cache\\models--Qdrant--bm25\\snapshots\\e499a1f8d6bec960aab5533a0941bf914e70faf9\\dutch.txt' Falling back to other sources.[0m
[32m2025-11-04 12:50:40.668[0m | [31m[1mERROR   [0m | [36mfastembed.common.model_management[0m:[36mdownload_model[0m:[36m452[0m - [31m[1mCould not download model from either source, sleeping for 3.0 seconds, 2 retries left.[0m


Fetching 18 files:   0%|          | 0/18 [00:00<?, ?it/s]

turkish.txt:   0%|          | 0.00/260 [00:00<?, ?B/s]

swedish.txt:   0%|          | 0.00/559 [00:00<?, ?B/s]

Indices: [1844140545  760114740  638192719  307881509  647566671  722829366
 1966288579  620348882 1830956466 1139059686 2068970101 1679100045
  397050504  943113075 1873684806 1543697338  331359638 1214902428
  710973585  874403274  423733604 1448801182    9660375   74040069
  744461849 1564465246 1672556481 1852515656 1479549668 1815470626
 1570775410 1176126011 1432568069 1423243818 1181836714  142582585
 1069335456  138150528  828010589 1886779727 1932493447  443967653
 1417454340  959374964 1559768741  907197629 2005953269  625320497
  496968477    3608315 1718225784  346328789  440637894 1082468256
 1762032986 1572467819  208271046 1200518484 1356719865 1979157630
  822745112  880581804  953824239  274274677  109190994  898985849
  492650564 2099192796  842018159 1547122275 1612531086  760321374
 1438269958 2019785588 2073907658 1004152463 1360229003 1556279568
   12344922 1054717118  613148321 1016550947 1431284986 1127868892
  500560476 1391639301  978121085  397057502  5577738

In [12]:
sparse_vectors = [
    {"indices": emb.indices, "values": emb.values}
    for emb in sparse_embeddings
]

In [14]:
%pip install qdrant-client

from qdrant_client import QdrantClient, models

client = QdrantClient(host="192.168.1.13", port=6333)

collection_name = "1000_pages_no_overlap"

try:
    client.get_collection(collection_name)
    print(f"Collection '{collection_name}' already exists.")
except Exception:
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config={
            "dense": models.VectorParams(
                size=1024,
                distance=models.Distance.COSINE,
            ),
        },
        sparse_vectors_config={
            "sparse": models.SparseVectorParams()
        },
        shard_number=1,
    )
    print(f"Collection '{collection_name}' created.")

Collecting qdrant-client
  Using cached qdrant_client-1.15.1-py3-none-any.whl.metadata (11 kB)
Collecting portalocker<4.0,>=2.7.0 (from qdrant-client)
  Using cached portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Using cached qdrant_client-1.15.1-py3-none-any.whl (337 kB)
Using cached portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, qdrant-client
Successfully installed portalocker-3.2.0 qdrant-client-1.15.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Collection '1000_pages_no_overlap' created.


  client.recreate_collection(


In [16]:
from qdrant_client.models import PointStruct

# Prepare points for upsert
points = []
for idx, (embedded_chunk, sparse_vec) in enumerate(zip(embedded_data, sparse_vectors)):
    point = PointStruct(
        id=idx,
        vector={
            "dense": embedded_chunk['embedding'],
            "sparse": models.SparseVector(
                indices=sparse_vec['indices'],
                values=sparse_vec['values']
            )
        },
        payload={
            "text": embedded_chunk['text'],
            "chunk_id": embedded_chunk['chunk_id']
        }
    )
    points.append(point)

# Upsert points in batches (recommended for large datasets)
batch_size = 100
total_points = len(points)

print(f"Starting upsert of {total_points} points...")

for i in range(0, total_points, batch_size):
    batch = points[i:i + batch_size]
    client.upsert(
        collection_name=collection_name,
        points=batch
    )
    print(f"Upserted {min(i + batch_size, total_points)}/{total_points} points")

print("Upsert completed successfully!")

# Verify the collection
collection_info = client.get_collection(collection_name)
print(f"\nCollection info: {collection_info.points_count} points in collection")

Starting upsert of 2046 points...
Upserted 100/2046 points
Upserted 200/2046 points
Upserted 300/2046 points
Upserted 400/2046 points
Upserted 500/2046 points
Upserted 600/2046 points
Upserted 700/2046 points
Upserted 800/2046 points
Upserted 900/2046 points
Upserted 1000/2046 points
Upserted 1100/2046 points
Upserted 1200/2046 points
Upserted 1300/2046 points
Upserted 1400/2046 points
Upserted 1500/2046 points
Upserted 1600/2046 points
Upserted 1700/2046 points
Upserted 1800/2046 points
Upserted 1900/2046 points
Upserted 2000/2046 points
Upserted 2046/2046 points
Upsert completed successfully!

Collection info: 2046 points in collection
