In [2]:
import pandas as pd
import re
import os
from tqdm import tqdm
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, PointStruct
from FlagEmbedding import BGEM3FlagModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def create_fixed_chunks(text, max_word_count=400):
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks, current_chunk, word_count = [], "", 0
    for sentence in sentences:
        wc = len(sentence.split())
        if word_count + wc > max_word_count:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk, word_count = sentence, wc
        else:
            current_chunk += " " + sentence if current_chunk else sentence
            word_count += wc
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks


In [3]:
corpus_csv = r"D:\Data\Legal-Retrieval\data\corpus.csv"
collection_name = "law_corpus_bge"
df = pd.read_csv(corpus_csv)
print(f"Loaded {len(df)} corpus entries")

# Init model
model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)

# Init Qdrant
qdrant = QdrantClient("http://localhost", port=6333)

# Create collection
vector_size = 1024  # BGEM3 output dim
qdrant.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
)

Loaded 261597 corpus entries


Fetching 30 files: 100%|██████████| 30/30 [00:00<00:00, 29995.02it/s]
  qdrant.recreate_collection(


True

In [None]:
points = []
point_id = 0

for row in tqdm(df.itertuples(), total=len(df)):
    cid = int(row.cid)
    chunks = create_fixed_chunks(row.text, max_word_count=400)

    for chunk_index, chunk_text in enumerate(chunks):
        vec = model.encode(chunk_text)["dense_vecs"]
        payload = {
            "cid": cid,
            "chunk_index": chunk_index,
            "text": chunk_text
        }
        points.append(PointStruct(id=point_id, vector=vec, payload=payload))
        point_id += 1

In [5]:
print(f"Uploading {len(points)} vectors to Qdrant...")
qdrant.upsert(collection_name=collection_name, points=points)
print("✅ Done indexing corpus into Qdrant.")

Uploading 52 vectors to Qdrant...
✅ Done indexing corpus into Qdrant.


In [4]:
from qdrant_client.models import ScrollRequest

scroll_result = qdrant.scroll(
    collection_name="law_corpus_bge",
    limit=5,
    with_payload=True,
    with_vectors=True
)

for point in scroll_result[0]:
    print(f"ID: {point.id}")
    print(f"Text: {point.payload.get('text')}")
    print(f"Vector: {point.vector} ...")  # In 5 phần tử đầu của vector
    print("-" * 50)

In [None]:
query_text = df.iloc[0].text[:500]  # lấy 500 ký tự đầu
query_chunks = create_fixed_chunks(query_text, max_word_count=400)
query_vec = model.encode(query_chunks[0])["dense_vecs"]

hits = qdrant.search(
    collection_name=collection_name,
    query_vector=query_vec,
    limit=5,
    with_payload=True
)

for hit in hits:
    print(f"Score: {hit.score}")
    print(f"Text: {hit.payload.get('text')}")
    print("-" * 50)

In [None]:
stopp

In [None]:
def index_corpus_to_qdrant(corpus_path: str, collection_name: str):
    # Load data
    df = pd.read_csv(corpus_path)
    print(f"Loaded {len(df)} corpus entries")

    # Init model
    model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)

    # Init Qdrant
    qdrant = QdrantClient("http://localhost", port=6333)

    # Create collection
    vector_size = 1024  # BGEM3 output dim
    qdrant.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )

    # Build points
    points = []
    point_id = 0
    for row in tqdm(df.itertuples(), total=len(df)):
        chunks = split_text_keeping_sentences(row.text)
        for i, chunk in enumerate(chunks):
            vec = model.encode(chunk)["dense_vecs"]
            payload = {
                "cid": int(row.cid),
                "chunk_index": i,
                "text": chunk
            }
            points.append(PointStruct(id=point_id, vector=vec, payload=payload))
            point_id += 1

    # Upsert to Qdrant
    print(f"Uploading {len(points)} vectors to Qdrant...")
    qdrant.upsert(collection_name=collection_name, points=points)
    print("✅ Done indexing corpus.")