In [18]:
import json
import pickle
from sentence_transformers import SentenceTransformer

def extract_text_from_subcontent(subcontent):
    """
    Flatten one header's ``sub_content`` into a list of text blocks.

    ``subcontent`` is normally a list whose elements are either plain
    strings or dicts. A dict may carry any of these keys:

    - ``subheader``: a string
    - ``paragraph``: a string
    - ``items``: a list of strings, or of dicts with their own
      ``paragraph`` / ``details``
    - ``details``: a list of strings (a single string is also accepted)

    Text is collected in document order: for each dict the subheader comes
    first, then the paragraph, then every item (paragraph before details),
    then the dict's own details. Empty or missing values are skipped, and
    elements that are neither strings nor dicts are ignored.

    Args:
        subcontent: list of strings and/or dicts; a bare string or ``None``
            is tolerated.

    Returns:
        list: the extracted text blocks, possibly empty.
    """
    if subcontent is None:
        return []
    if isinstance(subcontent, str):
        # A bare string is one block; iterating it would yield characters.
        return [subcontent] if subcontent else []

    results = []
    for elem in subcontent:
        if isinstance(elem, str):
            # Keep plain strings even when the list also contains dicts.
            results.append(elem)
            continue
        if not isinstance(elem, dict):
            continue

        if elem.get("subheader"):
            results.append(elem["subheader"])
        if elem.get("paragraph"):
            results.append(elem["paragraph"])

        items = elem.get("items")
        if isinstance(items, list):
            for it in items:
                if isinstance(it, str):
                    # `"details" in it` on a string is a substring test, so
                    # string items must be handled before any key lookup.
                    results.append(it)
                elif isinstance(it, dict):
                    if it.get("paragraph"):
                        results.append(it["paragraph"])
                    results.extend(_details_to_blocks(it.get("details")))

        results.extend(_details_to_blocks(elem.get("details")))

    return results


def _details_to_blocks(details):
    """Return a ``details`` value as a list of blocks (a lone string stays whole)."""
    if not details:
        return []
    if isinstance(details, str):
        return [details]
    return list(details)


########################################
# 1. Load the cleaned JSON document
########################################
with open("cleaned_data.json", encoding="utf-8") as json_file:
    data = json.load(json_file)

########################################
# 2. Build one chunk per header: the header name on the first line,
#    followed by every text block found in its sub_content
########################################
all_data_chunks = [
    {
        "header_index": position,
        "header": section["header"],
        "text": f"{section['header']}\n"
        + "\n".join(extract_text_from_subcontent(section["sub_content"])),
    }
    for position, section in enumerate(data)
]

########################################
# 3. Embed every chunk text with sentence-transformers
########################################
model = SentenceTransformer("all-MiniLM-L6-v2")
texts_for_embedding = [chunk["text"] for chunk in all_data_chunks]
embeddings = model.encode(texts_for_embedding, show_progress_bar=True)

########################################
# 4. Attach each embedding to its chunk record
########################################
indexed_data = [
    {**chunk, "embedding": vector}
    for chunk, vector in zip(all_data_chunks, embeddings)
]

########################################
# 5. Persist the index; later cells reload it from indexed_data.pkl
########################################
with open("indexed_data.pkl", "wb") as pickle_file:
    pickle.dump(indexed_data, pickle_file)

print("Number of chunks created:", len(indexed_data))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Number of chunks created: 23


In [19]:
import pickle

# Reload the index that the previous cell pickled to indexed_data.pkl.
# Note: this rebinds `data`, which earlier held the parsed JSON document.
with open("indexed_data.pkl", mode="rb") as pickle_file:
    data = pickle.load(pickle_file)


In [21]:
# Inspect the records loaded from indexed_data.pkl.
# NOTE(review): this echoes every record together with its full embedding
# array; consider data[:1], or a view without the "embedding" key, to keep the
# rendered output short.
data

[{'header_index': 0,
  'header': 'By Role',
  'text': 'By Role\nBusiness Leaders\nData Leaders\nData Analysts\nBusiness Teams',
  'embedding': array([-1.18501643e-02,  4.14280780e-02, -1.62397828e-02,  4.56051044e-02,
         -6.30490109e-02, -3.65320891e-02, -1.25255091e-02, -1.59797911e-02,
         -7.91620836e-03,  2.32125800e-02, -2.18259916e-02,  1.89399663e-02,
          6.11355826e-02,  3.04034725e-02,  1.45449666e-02,  3.98719944e-02,
         -4.34052572e-02, -1.00315465e-02, -7.07916245e-02, -1.18882850e-01,
         -9.72364843e-02, -1.09466836e-01, -6.91230223e-02, -5.42902425e-02,
         -2.42530070e-02, -2.09350046e-03, -2.55640899e-03, -3.91703919e-02,
          9.92233586e-03, -6.47066161e-02, -6.42732382e-02, -7.67711774e-02,
          8.40579569e-02,  8.40199813e-02, -4.67182547e-02,  2.83426195e-02,
          2.99433973e-02,  2.90693697e-02,  6.78663328e-02,  5.69281615e-02,
         -4.39241789e-02,  1.58043427e-03, -6.93218485e-02, -2.43740659e-02,
         -2.

In [46]:
import os
import pickle

import google.generativeai as genai
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

##############################################
# 1. Configure Gemini API
##############################################
# The key is read from the environment so that no credential is stored in the
# notebook. Any key that was previously committed here must be treated as
# leaked and revoked.
if not os.environ.get("GOOGLE_API_KEY"):
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this cell."
    )
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

##############################################
# 2. Load your data and model
##############################################
model = SentenceTransformer("all-MiniLM-L6-v2")

# Load indexed data (one record per header: header_index, header, text, embedding).
# NOTE: pickle.load can execute arbitrary code; only load an indexed_data.pkl
# that this notebook produced itself.
with open("indexed_data.pkl", "rb") as f:
    indexed_data = pickle.load(f)

##############################################
# 3. Separate header and content embeddings
##############################################
chunks_list = list(indexed_data)

# Encode all headers in one batch instead of one model call per record.
header_embeddings = list(model.encode([item["header"] for item in chunks_list]))
# Content embeddings were computed when the index was built.
content_embeddings = [item["embedding"] for item in chunks_list]

header_embeddings_array = np.vstack(header_embeddings)  # (N, dim)
content_embeddings_array = np.vstack(content_embeddings)  # (N, dim)

##############################################
# 4. Retrieval: weighted header + content similarity
##############################################
def find_best_chunks(question, header_weight=2.0, content_weight=1.0, k=2):
    """
    Rank the indexed chunks against a question and return the best ``k``.

    The question is embedded once and compared, by cosine similarity, with
    every header embedding and every content embedding. The two similarity
    vectors are blended as
    ``header_weight * header_sim + content_weight * content_sim`` and the
    chunks with the highest blended score are returned, best first.
    """
    query_vector = model.encode([question])

    def similarity_to(matrix):
        # cosine_similarity yields a (1, N) matrix; keep its single row.
        return cosine_similarity(query_vector, matrix)[0]

    blended_scores = (
        header_weight * similarity_to(header_embeddings_array)
        + content_weight * similarity_to(content_embeddings_array)
    )

    # argsort is ascending, so reverse it before taking the first k positions.
    best_first = np.argsort(blended_scores)[::-1][:k]
    return [chunks_list[position] for position in best_first]

##############################################
# 5. Send combined context to Gemini
##############################################
def generate_answer(question, chunks):
    """
    Ask Gemini to answer ``question`` from the text of the retrieved chunks.

    The ``text`` of each chunk is joined with a ``---`` separator line to form
    the context section of the prompt. The model's reply is returned with
    leading and trailing whitespace removed.
    """
    context = "\n\n---\n\n".join(chunk["text"] for chunk in chunks)
    prompt = f"""
You are an AI assistant. Use the provided context to answer the user's question in English.

Context:
{context}

User's Question:
{question}

Answer in natural language:
"""
    # A fresh chat session is opened for every question.
    gemini = genai.GenerativeModel(model_name="gemini-2.0-flash-exp")
    reply = gemini.start_chat().send_message(prompt)
    return reply.text.strip()

##############################################
# 6. Chatbot CLI
##############################################
def chatbot(header_weight=3.0, content_weight=2.0, k=3):
    """
    Run an interactive question/answer loop on the console.

    Each question is matched against the indexed chunks with
    ``find_best_chunks`` and answered with ``generate_answer``. The loop ends
    when the user types ``exit`` (any letter case, surrounding whitespace
    ignored) or when input is closed or interrupted (EOF, Ctrl-C, kernel
    interrupt). Blank input is ignored instead of being sent to the model.

    Args:
        header_weight: weight given to header similarity during retrieval.
        content_weight: weight given to content similarity during retrieval.
        k: number of chunks passed to the model as context.

    Returns:
        None
    """
    print("Hello! Ask me anything about the privacy policy. Type 'exit' to quit.")
    while True:
        try:
            user_question = input("\nYour question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted prompt ends the session cleanly
            # instead of surfacing a traceback.
            print("\nGoodbye!")
            break

        if not user_question:
            # Nothing to ask; prompt again without calling the model.
            continue
        if user_question.lower() == "exit":
            print("Goodbye!")
            break

        # Retrieve the best chunk(s)
        best_chunks = find_best_chunks(
            user_question,
            header_weight=header_weight,
            content_weight=content_weight,
            k=k,
        )
        print(f"Found relevant chunks:\n{[chunk['header'] for chunk in best_chunks]}")

        # Generate the answer from Gemini
        answer = generate_answer(user_question, best_chunks)
        print("\nAnswer:", answer)

##############################################
# 7. Run Chatbot
##############################################
# A notebook kernel runs cells with __name__ == "__main__", so executing this
# cell starts the interactive loop; the guard keeps it from starting when the
# code is imported as a module instead.
if __name__ == "__main__":
    chatbot()


Hello! Ask me anything about the privacy policy. Type 'exit' to quit.

Your question: What measures does Presight take to ensure data security?
Found relevant chunks:
['Data Security', 'Use of Data', 'PRIVACY POLICY']

Answer: Presight employs several measures to ensure data security. They encrypt all data both while it's being transferred and when it's stored using industry-standard encryption methods. Additionally, they regularly conduct security audits and vulnerability assessments. Furthermore, their employees are trained on data security best practices, and access to customer data is restricted to those who need it for their job functions.

Your question: exit
Goodbye!
