In [None]:
!pip install transformers sentencepiece langchain faiss-cpu

!pip install pypdf unstructured bs4

# vinai/phobert-base


In [1]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

from langchain.embeddings.base import Embeddings
from transformers import AutoTokenizer, AutoModel
import torch

class PhoBERTEmbeddings(Embeddings):
    def __init__(self, model_name="vinai/phobert-base"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()
        self.model.to("cpu")

    def embed_documents(self, texts):
        return [self._get_cls_embedding(text) for text in texts]

    def embed_query(self, text):
        return self._get_cls_embedding(text)

    def _get_cls_embedding(self, text):
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            outputs = self.model(**inputs)
        cls_embedding = outputs.last_hidden_state[0][0]
        return cls_embedding.cpu().tolist()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
phobert_embedder = PhoBERTEmbeddings()

# Tải docs / tạo vector , lưu vector



In [14]:
import os
from langchain.document_loaders import PyPDFLoader

def load_all_pdfs(folder_path):
    all_docs = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            loader = PyPDFLoader(os.path.join(folder_path, filename))
            docs = loader.load()
            all_docs.extend(docs)
    return all_docs

documents = load_all_pdfs("Vpbank_docs/pdf_docs")


Multiple definitions in dictionary at byte 0x7cc814 for key /Info
Multiple definitions in dictionary at byte 0x7cc821 for key /Info
Multiple definitions in dictionary at byte 0x7cc82e for key /Info
Multiple definitions in dictionary at byte 0xb2723 for key /Info
Multiple definitions in dictionary at byte 0xb2730 for key /Info
Multiple definitions in dictionary at byte 0xb273d for key /Info
Multiple definitions in dictionary at byte 0xb90bc for key /Info
Multiple definitions in dictionary at byte 0xb90c9 for key /Info
Multiple definitions in dictionary at byte 0xb90d6 for key /Info
Multiple definitions in dictionary at byte 0xa0857f for key /Info
Multiple definitions in dictionary at byte 0xa0858c for key /Info
Multiple definitions in dictionary at byte 0xa08599 for key /Info
Multiple definitions in dictionary at byte 0xd0d94 for key /Info
Multiple definitions in dictionary at byte 0xd0da1 for key /Info
Multiple definitions in dictionary at byte 0xd0dae for key /Info
Multiple definition

In [15]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
docs = splitter.split_documents(documents)


In [17]:
len(documents)

2393

In [16]:
len(docs)

18529

In [18]:
from langchain.vectorstores import FAISS

phobert_embedder = PhoBERTEmbeddings()
vectorstore = FAISS.from_documents(docs, phobert_embedder)

In [24]:
vectorstore.save_local("phobert_faiss_index")

# truy vấn

In [3]:
from langchain.vectorstores import FAISS

# Khởi tạo lại PhoBERTEmbeddings (nếu chưa)
phobert_embedder = PhoBERTEmbeddings()

# Load lại FAISS index đã lưu
vectorstore = FAISS.load_local(
    folder_path="phobert_faiss_index",
    embeddings=phobert_embedder,
    allow_dangerous_deserialization=True  # Bắt buộc khi không dùng OpenAI
)


In [4]:
# Cách ổn định và dễ đọc nhất
def print_docs(docs):
    for i, doc in enumerate(docs):
        print(f"\n🔹 Kết quả {i+1}")
        filename = doc.metadata['source'].split("Vpbank_docs\\")[-1]  # hoặc dùng os.path.basename
        page = doc.metadata.get("page_label", doc.metadata.get("page", "?"))
        print(f"📄 Tệp: {filename}")
        print(f"🔖 Trang: {page}")
        print(f"📝 Nội dung:\n{doc.page_content.strip()}")


In [5]:
import numpy as np

query_vec = phobert_embedder.embed_query("Tôi muốn vay tiền mua nhà")
query_vec = np.array(query_vec, dtype=np.float32)  # ép kiểu bắt buộc

results = vectorstore.similarity_search_by_vector(query_vec)
print_docs(results)


🔹 Kết quả 1
📄 Tệp: Vpbank_docs/pdf_docs\20250708-bieu-lai-suat-niem-yet.pdf
🔖 Trang: 4
📝 Nội dung:
- 
- 
 
 
 
  
    
 


🔹 Kết quả 2
📄 Tệp: Vpbank_docs/pdf_docs\dieu-kien-giao-dich-chung-ve-cap-tin-dung-danh-cho-khach-hang-ca-nhan-tai-vpbank-15112022-hhl.pdf
🔖 Trang: 14
📝 Nội dung:
14

🔹 Kết quả 3
📄 Tệp: Vpbank_docs/pdf_docs\mb23-qd-gdcn-200.pdf
🔖 Trang: 1
📝 Nội dung:
giá tiền mà KH thực nộp

🔹 Kết quả 4
📄 Tệp: Vpbank_docs/pdf_docs\mb24-qd-gdcn-200.pdf
🔖 Trang: 1
📝 Nội dung:
giá tiền mà KH thực nộp
