In [1]:
import json
import pickle
from sentence_transformers import SentenceTransformer

# Hàm trích xuất văn bản từ sub_content (như trong ví dụ trước)
def extract_text_from_subcontent(subcontent):
    """
    subcontent có thể là:
    - 1 list các string
    - 1 list các dict (mỗi dict có thể có {"subheader":..., "items": [...]} hoặc {"paragraph": "...", "details": [...]} )
    Hàm trả về list các đoạn text.
    """
    results = []

    # Trường hợp subcontent là list nhưng phần tử bên trong chỉ là chuỗi
    if all(isinstance(item, str) for item in subcontent):
        results.extend(subcontent)
    else:
        # Trường hợp subcontent là list các dict
        for elem in subcontent:
            if isinstance(elem, dict):
                # subheader
                if "subheader" in elem and elem["subheader"]:
                    results.append(elem["subheader"])

                # paragraph
                if "paragraph" in elem and elem["paragraph"]:
                    results.append(elem["paragraph"])

                # items
                if "items" in elem and isinstance(elem["items"], list):
                    for it in elem["items"]:
                        if "paragraph" in it:
                            results.append(it["paragraph"])
                        if "details" in it and it["details"]:
                            for d in it["details"]:
                                results.append(d)

                # details (trực tiếp trong elem)
                if "details" in elem and elem["details"]:
                    for d in elem["details"]:
                        results.append(d)
    return results

# 1. Đọc file JSON
with open("cleaned_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# 2. Gom toàn bộ nội dung của mỗi header thành 1 chunk duy nhất
all_data_chunks = []
for idx, item in enumerate(data):
    # Lấy tên header
    header_name = item["header"]

    # Lấy danh sách các đoạn text từ sub_content
    extracted_text_list = extract_text_from_subcontent(item["sub_content"])

    # Gộp tất cả những đoạn này lại thành 1 string
    combined_text = "\n".join(extracted_text_list)

    # Lưu vào list (mỗi phần tử là 1 chunk duy nhất cho từng header)
    all_data_chunks.append({
        "header_index": idx,
        "header": header_name,
        "text": combined_text
    })

# 3. Tạo embeddings với sentence-transformers (chỉ 23 embedding cho 23 header)
model = SentenceTransformer("all-MiniLM-L6-v2")
texts_for_embedding = [item["text"] for item in all_data_chunks]
embeddings = model.encode(texts_for_embedding, show_progress_bar=True)

# 4. Gộp kết quả vào 1 list
indexed_data = []
for i, emb in enumerate(embeddings):
    indexed_data.append({
        "header_index": all_data_chunks[i]["header_index"],
        "header": all_data_chunks[i]["header"],
        "text": all_data_chunks[i]["text"],
        "embedding": emb  # numpy array
    })

# 5. (Tùy chọn) Lưu ra file .pkl để tái sử dụng
with open("indexed_data.pkl", "wb") as f:
    pickle.dump(indexed_data, f)

print("Số lượng chunk thu được:", len(indexed_data))


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.23s/it]

Số lượng chunk thu được: 23





In [15]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import pickle
import google.generativeai as genai

############################################
# 1. Cấu hình Gemini API
############################################
genai.configure(api_key="")

############################################
# 2. Khởi tạo mô hình SentenceTransformer
############################################
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

############################################
# 3. Đọc file data_embeddings.pkl
#    (lưu dạng list các dict)
############################################
with open('indexed_23_data.pkl', 'rb') as f:
    data = pickle.load(f)
    # data là list chứa các dict như:
    # [
    #   {"header_index": ..., "header": ..., "text": ..., "embedding": np.array(...)},
    #   {"header_index": ..., "header": ..., "text": ..., "embedding": np.array(...)},
    #   ...
    # ]

############################################
# 4. Tách embeddings và chunks
############################################
embeddings_list = []
chunks_list = []
for item in data:
    embeddings_list.append(item["embedding"])  # vector (1D)
    chunks_list.append(item["text"])           # string

# Chuyển list embedding (1D) thành mảng 2D
# Mỗi row là embedding của một chunk
embeddings_array = np.vstack(embeddings_list)

############################################
# 5. Hàm tìm đoạn văn phù hợp nhất
############################################
def find_best_answer(question: str) -> str:
    # Tạo embedding cho câu hỏi
    question_embedding = model.encode([question])  # shape (1, dim)

    # Tính cosine similarity với toàn bộ embeddings_array (shape (N, dim))
    similarities = cosine_similarity(question_embedding, embeddings_array)
    # similarities có shape (1, N)

    best_idx = np.argmax(similarities)  # lấy chỉ số chunk có độ tương đồng cao nhất
    return chunks_list[best_idx]

############################################
# 6. Hàm gọi Gemini để tạo câu trả lời
############################################
def generate_answer(text: str) -> str:
    # Tạo phiên hội thoại với model Gemini (thay tên model nếu cần)
    chat_session = genai.GenerativeModel(model_name="gemini-2.0-flash-exp").start_chat()

    # Gửi nội dung (text) tới Gemini
    response = chat_session.send_message(text)

    # Trả về phần text của phản hồi
    return response.text.strip()

############################################
# 7. Chatbot CLI
############################################
def chatbot():
    print("Hello! Ask me anything about the privacy policy. (Gõ 'exit' để thoát)")
    while True:
        question = input("\nYour question: ")
        if question.lower() == "exit":
            print("Goodbye!")
            break

        # 7.1. Tìm đoạn phù hợp nhất
        best_answer_text = find_best_answer(question)
        print(f"Found relevant text: {best_answer_text}")

        # 7.2. Gửi đoạn tìm được tới Gemini để tạo câu trả lời
        answer = generate_answer(best_answer_text)

        # 7.3. In câu trả lời
        print("Answer:", answer)

############################################
# 8. Thực thi (chạy chatbot)
############################################
if __name__ == "__main__":
    chatbot()


Hello! Ask me anything about the privacy policy. (Gõ 'exit' để thoát)

Your question: When is the lastest update?
Found relevant text: Last updated 15 Sep 2023
At Presight, we are committed to protecting the privacy of our customers and visitors to our website. This Privacy Policy explains how we collect, use, and disclose information about our customers and visitors.
Answer: Okay, this is a good, concise opening to a privacy policy. Here are some of my thoughts and observations:

**Strengths:**

* **Clear and Direct:** It immediately states its purpose: to explain how Presight handles information.
* **Focus on Protection:** The opening line emphasizes commitment to protecting privacy, building trust.
* **Identifies Target Audience:** It specifies "customers and visitors to our website," clearly defining who the policy applies to.
* **States Scope:** It clearly says the policy will explain collection, use, and disclosure of information.
* **Up-to-date:** The date indicates the policy i