In [None]:
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
import google.generativeai as genai
from dotenv import load_dotenv
import os

# Load environment variables via python-dotenv; later cells read the Qdrant
# and Gemini credentials with os.getenv.
load_dotenv()

# **Load data**

In [None]:
# Collect the files in ./data that match '*.pdf' and parse each one with PyPDFLoader.
loader = DirectoryLoader(
    './data',
    glob='*.pdf',
    loader_cls=PyPDFLoader,
)
document = loader.load()

# Number of Document objects produced by the loader.
len(document)

In [None]:
# Display the first loaded Document as a quick sanity check of the parsed content.
document[0]

# **Divide the document into chunks**

In [None]:
# Split the loaded documents into overlapping chunks (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
docs = text_splitter.split_documents(document)
print(f"Số lượng chunks: {len(docs)}")

# Keep only the raw text of every chunk; this list is what gets embedded.
texts = [chunk.page_content for chunk in docs]
print(texts[0][:300])  # preview the first 300 characters of the first chunk

# **Set up Qdrant to store the document embedding vectors**

In [None]:
# Build the Qdrant client from the URL and API key found in the environment.
qdrant_client = QdrantClient(url=os.getenv("QDRANT_URL"), api_key=os.getenv("QDRANT_API_KEY"))

# Listing the collections also confirms that the connection works.
print(qdrant_client.get_collections())

In [None]:
# Create the collection that stores the chunk embeddings.
# Any existing "pdf_chunks" collection is dropped first so the notebook can be
# re-run from a clean state. This is the explicit form of the deprecated
# `recreate_collection` helper.
if qdrant_client.collection_exists("pdf_chunks"):
    qdrant_client.delete_collection("pdf_chunks")

qdrant_client.create_collection(
    collection_name="pdf_chunks",
    vectors_config=VectorParams(
        size=768,                   # vector size (must match the embedding model's output dimension)
        distance=Distance.COSINE,   # similarity metric (Cosine, Euclidean, Dot)
    ),
)

In [None]:
# List the collections again to confirm that "pdf_chunks" is now present.
print(qdrant_client.get_collections())

# **Embedding document chunks**

In [None]:
# Load the sentence-embedding model used to encode both the document chunks and the search query.
# NOTE(review): the Qdrant collection is created with size=768 — confirm this matches the model's output dimension.
embedding = SentenceTransformer('intfloat/multilingual-e5-base')

In [None]:
# E5 models are trained with role prefixes on every input: text that is indexed
# for retrieval must start with "passage: ". Only the encoder input is prefixed;
# `texts` itself is left untouched.
passage_inputs = [f"passage: {text}" for text in texts]
embedded_vectors = embedding.encode(passage_inputs, show_progress_bar=True)

print("Số vector:", len(embedded_vectors))
print("Kích thước vector:", embedded_vectors.shape)
print(embedded_vectors[0][:25])  # first 25 components of the first vector

# **Submit data to Qdrant Cloud**

In [None]:
# Prepare PointStruct data for upsert
points = []
for i, doc in enumerate(docs):
    point = PointStruct(
        id=i,
        vector=embedded_vectors[i].tolist(),
        payload={
            "text": doc.page_content,
            "source": doc.metadata.get("source", "unknown"),
            "page": doc.metadata.get("page", 0)
        }
    )
    points.append(point)

In [None]:
# Upload the points in fixed-size batches. Sending every point in a single
# request can exceed the server's request-size limit once the corpus grows;
# small batches keep each request bounded.
UPSERT_BATCH_SIZE = 256

for start in range(0, len(points), UPSERT_BATCH_SIZE):
    qdrant_client.upsert(
        collection_name="pdf_chunks",
        points=points[start:start + UPSERT_BATCH_SIZE],
    )

In [None]:
# Fetch and print the collection's metadata.
info = qdrant_client.get_collection(collection_name="pdf_chunks")
print(info)

# Request an exact count of the stored points and print it.
count = qdrant_client.count(collection_name="pdf_chunks", exact=True)
print("Tổng số điểm đã lưu:", count.count)

# **Search**

In [None]:
query = "Đối tượng dự tuyển sinh yêu cầu như thế nào?"

# E5 models are trained with role prefixes on every input: a search query must
# start with "query: ". `query` itself stays unprefixed so it can be shown or
# reused as plain text.
query_vector = embedding.encode([f"query: {query}"])[0]

# `query_points` replaces the deprecated `search` method. It returns a response
# object whose `.points` attribute holds the scored hits, so `results` is still
# a list of hits exposing `.score` and `.payload`.
results = qdrant_client.query_points(
    collection_name="pdf_chunks",
    query=query_vector.tolist(),
    limit=3,
).points

# Show the score, origin and a 300-character preview of each hit.
for result in results:
    print(f"Score: {result.score:.4f}")
    print(f"Source: {result.payload.get('source', 'unknown')} - Page: {result.payload.get('page', 0)}")
    print(f"Text: {result.payload.get('text', '')[:300]}...")
    print("-" * 50)

# **Combine the query and the retrieved context into a prompt for the LLM**

In [None]:
# Read the Gemini API key. The correctly spelled variable name is tried first;
# the misspelled legacy name ("GOOOEL_...") is still honoured so existing .env
# files keep working.
GEMINI_API_KEY = os.getenv("GOOGLE_GEMINI_API_KEY") or os.getenv("GOOOEL_GEMINI_API_KEY")
if not GEMINI_API_KEY:
    # Fail fast with a clear message instead of an opaque error from the API call.
    raise RuntimeError(
        "Gemini API key not found: set GOOGLE_GEMINI_API_KEY in the environment or the .env file."
    )
genai.configure(api_key=GEMINI_API_KEY)

# The context is the text of the chunks returned by the search cell.
context = "\n".join(result.payload.get('text', '') for result in results)
# Ask exactly the question the context was retrieved for, so the two cannot drift apart.
question = query

prompt = f"""
Dựa trên ngữ cảnh sau, trả lời câu hỏi:
Ngữ cảnh:
{context}

Câu hỏi:
{question}
"""

model = genai.GenerativeModel("gemini-2.5-flash-preview-05-20")
response = model.generate_content(prompt)
print("Response:", response.text)