In [18]:
import json

# Load the extracted tables, keyed by source (the PDF name). The file is opened in a
# `with` block so the handle is closed deterministically, and decoded as UTF-8
# explicitly so the result does not depend on the platform's default encoding.
with open("final_results.json", encoding="utf-8") as results_file:
    tables_sources = json.load(results_file)

sources = sorted(tables_sources)
for source in sources:
    print(f"source {source} has {len(tables_sources[source])} tables")
    print("-" * 100)
    # Each table entry is a dict with the keys
    # ['table_content', 'page_numbers', 'source', 'table_idx'], where:
    #   table_content - the table rendered as markdown
    #   page_numbers  - list of the page numbers on which the table appears
    #   source        - name of the PDF
    #   table_idx     - index of the table within the PDF
    #
    # Uncomment to dump every table of the current source:
    # for table in tables_sources[source]:
    #     print(table['table_content'])
    #     print("-" * 100)


source 0c92f65db928c431023f59603039aa1e has 15 tables
----------------------------------------------------------------------------------------------------
source 0ed7f2ecfd607a42c745b0889e500ecf has 3 tables
----------------------------------------------------------------------------------------------------
source 0f2eac764c8d04d8d13d8a999342d106 has 9 tables
----------------------------------------------------------------------------------------------------
source 2c98e99a08ec5392d50e60370d871319 has 1 tables
----------------------------------------------------------------------------------------------------
source 2d64938a8d3e12dbb709f760ecd53e14 has 1 tables
----------------------------------------------------------------------------------------------------
source 2e1b34aa28e718f66bfe600045ed7387 has 5 tables
----------------------------------------------------------------------------------------------------
source 3e8e568ec576f0189cee0b6032720054 has 9 tables
----------------------

In [19]:
# Show the markdown content of every table extracted from one specific source.
selected_tables = tables_sources["0c92f65db928c431023f59603039aa1e"]
for table in selected_tables:
    table_markdown = table["table_content"]
    print(table_markdown)

# 0c92f65db928c431023f59603039aa1e_table_0
## Associated Context(s) Before Table:
- films: 7th Heaven (1927), Street Table key

This is a single-page table. Page number: 2

| 0   | 1                              |
|:----|:-------------------------------|
|     | Table key                      |
| ‡   | Indicates the winner           |
| †   | Indicates a posthumous nominee |

Shape: (3, 2)

# 0c92f65db928c431023f59603039aa1e_table_1
## Associated Context(s) Before Table:
- 1920s ​[ edit ]

This is a single-page table. Page number: 2

| 0        | 1                | 2                  | 3                      | 4    |
|:---------|:-----------------|:-------------------|:-----------------------|:-----|
| Year     | Actress          | Role(s)            | Film                   | Ref. |
| 1927/28  | Janet Gaynor ‡   | Diane              | 7th Heaven             | [6]  |
| (1st)    |                  |                    |                        |      |
|          |                  | Ang

## Summarize

In [20]:
import os

from dotenv import load_dotenv
from langchain.chains.summarize.chain import load_summarize_chain
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# os.environ only accepts string values, so assigning the result of os.getenv()
# directly raises an opaque TypeError when the key is missing. Fail early with an
# actionable message instead.
google_api_key = os.getenv("GOOGLE_API_KEY")
if google_api_key is None:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; define it in the environment or in a .env file."
    )
os.environ["GOOGLE_API_KEY"] = google_api_key

In [21]:
# `source` is the name of the PDF.
# tables_sources[sources[0]]

# Each source maps to a list of entries shaped like this:
# {
#     'table_content': '# c935e2902adf7040a6ffe0db0f7c11e6_table_0\n....
#     'page_numbers': [4, 5, 7, 8, 10, 11, 12, 13],
#     'source': 'c935e2902adf7040a6ffe0db0f7c11e6',
#     'table_idx': 0
#     }

### Cách đánh Metadata

In [22]:
from uuid import uuid4

# Wrap every extracted table in a Document. The metadata values are stored as
# strings (plus the boolean `is_full_table` flag); `source_table_idx` combines the
# source name and the table index into a single identifier for the table.
documents = []
for source in sources:
    for table_entry in tables_sources[source]:
        pdf_name = table_entry["source"]
        table_metadata = {
            "source": str(pdf_name),
            "page_numbers": str(table_entry["page_numbers"]),
            "source_table_idx": f"{pdf_name}_table_{table_entry['table_idx']}",
            "is_full_table": True,
        }
        full_table_document = Document(
            page_content=table_entry["table_content"],
            metadata=table_metadata,
        )
        documents.append(full_table_document)

In [23]:
# Sanity check: total number of table Documents collected (one per table entry).
len(documents)

180

In [24]:
from google import genai
from google.genai import types
import base64

# Lazily created client shared by every summary_fn call, so that summarizing many
# tables does not construct a new client per table.
_GENAI_CLIENT = None


def _get_genai_client():
    """Return the shared Vertex AI `genai.Client`, creating it on first use."""
    global _GENAI_CLIENT
    if _GENAI_CLIENT is None:
        _GENAI_CLIENT = genai.Client(
            vertexai=True,
            project="oval-crawler-460614-d3",
            location="global",
        )
    return _GENAI_CLIENT


def summary_fn(prompt):
    """Ask the Gemini model for a short summary of one table.

    Args:
        prompt: Text of the table to summarize; it is appended after the fixed
            instructions under a "Data:" header.

    Returns:
        The `text` attribute of the model response.
    """
    client = _get_genai_client()

    # The instruction text is kept exactly as before (including its indentation),
    # because it is sent to the model verbatim.
    prompt_part = types.Part.from_text(text=f"""You are given a table extracted from a PDF document using OCR.
            Your task is to generate a detailed and comprehensive summary of the content in this table.
            The summary should clearly describe:
                The main subject or topic of the table
                Key columns and their meanings
                Important patterns, trends, or observations in the data
                Any notable values or anomalies
                Contextual information needed to understand the data
            Write the summary in 1–3 full sentences , using clear and precise language.
            If applicable, mention that this fragment appears to be part of a larger table, and include any inferred continuity from the data.\nData: \n{prompt}""")

    model = "gemini-2.0-flash-lite-001"
    contents = [types.Content(role="user", parts=[prompt_part])]

    # Every listed harm category is configured with threshold "OFF".
    safety_settings = [
        types.SafetySetting(category=category, threshold="OFF")
        for category in (
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_HARASSMENT",
        )
    ]
    generate_content_config = types.GenerateContentConfig(
        temperature=1,
        top_p=1,
        max_output_tokens=3000,
        safety_settings=safety_settings,
    )

    response = client.models.generate_content(
        model=model,
        contents=contents,
        config=generate_content_config,
    )
    # NOTE(review): `response.text` may be None when the response carries no text
    # part — confirm against the SDK and decide how callers should handle that.
    return response.text


### Thêm metadata vào summarize documents

In [25]:
# Takes a Document that holds a full table together with its metadata.
# Summarizes the table, then creates a new Document whose page_content is the summary and whose metadata is: source, page_numbers, is_summary, source_table_idx
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace


def summarize_table(table_document):
    """Create a summary Document for one table Document.

    The table text is summarized with `summary_fn`. The returned Document carries
    the summary as its page_content and copies `source`, `page_numbers` and
    `source_table_idx` from the input's metadata, with `is_summary` set to True.
    """
    summary_text = summary_fn(table_document.page_content)

    parent_metadata = table_document.metadata
    # Keys are inserted in the order: source, page_numbers, is_summary, source_table_idx.
    summary_metadata = {key: parent_metadata[key] for key in ("source", "page_numbers")}
    summary_metadata["is_summary"] = True
    summary_metadata["source_table_idx"] = parent_metadata["source_table_idx"]

    return Document(page_content=summary_text, metadata=summary_metadata)


def get_detail_chunks(table_document, chunk_size=300, chunk_overlap=150):
    """Split one table Document into overlapping chunks tagged as detail chunks.

    Args:
        table_document: Document to split; its metadata must contain `source`
            and `page_numbers`.
        chunk_size: `chunk_size` handed to RecursiveCharacterTextSplitter.
            Defaults to 300, the value that was previously hard-coded.
        chunk_overlap: `chunk_overlap` handed to the splitter. Defaults to 150,
            the value that was previously hard-coded.

    Returns:
        The list of chunk Documents produced by the splitter. Each chunk's
        metadata is updated with its position (`chunk_id`), the parent's
        `source` and `page_numbers`, and `is_summary` / `is_full_table` set to
        False.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_documents([table_document])

    parent_metadata = table_document.metadata
    for i, chunk in enumerate(chunks):
        # NOTE(review): the splitter presumably already copies the parent's metadata
        # onto each chunk (including `source_table_idx`) — confirm; the explicit
        # update below overrides `is_full_table` for chunks either way.
        chunk.metadata.update({
            "chunk_id": i,
            "source": parent_metadata["source"],
            "page_numbers": parent_metadata["page_numbers"],
            "is_summary": False,
            "is_full_table": False
        })
    return chunks

## Vectorstore

In [26]:
from langchain_milvus import Milvus
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model used by create_vector_store and load_vector_store.
# HuggingFace alternative, kept for reference:
# embeddings = HuggingFaceEmbeddings(model_name="thenlper/gte-large")
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")


def create_vector_store(documents, name):
    """Create a Milvus store backed by ./{name}.db and insert `documents` into it.

    The store is configured with a FLAT index, the L2 metric and `drop_old=True`.
    Each document is inserted under a freshly generated UUID string.

    Returns:
        A tuple `(ids, vector_store)`, where `ids` lists the generated UUID strings
        in the same order as `documents`.
    """
    store = Milvus(
        embedding_function=embeddings,
        connection_args={"uri": f"./{name}.db"},
        index_params={"index_type": "FLAT", "metric_type": "L2"},
        drop_old=True,
    )

    # One random id per document.
    document_ids = []
    for _ in range(len(documents)):
        document_ids.append(str(uuid4()))

    store.add_documents(documents=documents, ids=document_ids)
    return document_ids, store

def load_vector_store(name):
    """Open the Milvus store at ./{name}.db using the module-level `embeddings`."""
    connection = {"uri": f"./{name}.db"}
    return Milvus(embeddings, connection_args=connection)

In [27]:
def encode_table_document(table_documents):
    """Index tables in two vector stores: one of summaries, one of full tables.

    Every input Document is summarized with `summarize_table`. The summaries go
    into the "summary_vectorstore" store and the unmodified input Documents into
    the "detail_vectorstore" store.

    Returns:
        A tuple `(summary_vectorstore, detail_vectorstore)`.
    """
    # Only full tables are indexed in the detail store for now; to add finer-grained
    # chunks as well, extend this list with get_detail_chunks(doc) for each table.
    full_tables = list(table_documents)
    table_summaries = [summarize_table(doc) for doc in full_tables]

    # The generated ids are not needed by the caller.
    _, summary_vectorstore = create_vector_store(table_summaries, "summary_vectorstore")
    _, detail_vectorstore = create_vector_store(full_tables, "detail_vectorstore")

    return summary_vectorstore, detail_vectorstore

In [28]:
# Summarize every table and build both vector stores. Each document triggers one
# summary_fn request, so re-running this cell repeats all of those model calls.
summary_vectorstore, detail_vectorstore = encode_table_document(documents)

In [29]:
# ## RELOAD VECTOR STORE
# summary_vectorstore = load_vector_store("summary_vectorstore")
# detail_vectorstore = load_vector_store("detail_vectorstore")

In [30]:
# from time import time


# def retrieve_hierachical(query, summary_vectorstore, detail_vectorstore, k_summaries=3, k_details=5, verbose=False):
#     start_time = time()
#     top_summaries = summary_vectorstore.similarity_search_with_score(
#         query, k=k_summaries)

#     if verbose:
#         for summary, score in top_summaries:
#             print(
#                 f"* [SIM={score:3f}] {summary.page_content} [{summary.metadata}]")
#         print("-"*100)
#     relevant_chunks = []
#     seen_chunks = set()  # Track seen chunks by their content

#     for summary, score in top_summaries:
#         source_table_idx = summary.metadata["source_table_idx"]

#         def source_filter(
#             metadata): return (metadata["source_table_idx"] == source_table_idx) and (metadata["is_full_table"] is True)
#         detail_results = detail_vectorstore.similarity_search_with_score(
#             query,
#             k=k_details,
#             filter=source_filter,
#         )
        
#         # Add detail results to relevant chunks if not seen before
#         for doc, doc_score in detail_results:
#             # Create a unique identifier for the chunk using its content
#             chunk_id = doc.page_content
#             if chunk_id not in seen_chunks:
#                 seen_chunks.add(chunk_id)
#                 relevant_chunks.append((doc, doc_score))
                
#     # Sort by relevance score (lower score is better)
#     if relevant_chunks:
#         relevant_chunks.sort(key=lambda x: x[1])
#     end_time = time()
#     print(f"Time taken: {end_time - start_time:.2f} seconds")
#     return relevant_chunks

## Bắt đầu đánh giá

In [31]:
import json

# Load the evaluation question/answer pairs. A `with` block closes the file
# handle deterministically, and the encoding is fixed to UTF-8 so parsing does
# not depend on the platform's default encoding.
with open("generated_qa_pairs_final.json", encoding="utf-8") as qa_file:
    generated_qa_pairs = json.load(qa_file)

In [32]:
def calculate_recall(qa, vectorstore, k = 5):
    """Compute recall@k for a single QA pair.

    Args:
        qa: Mapping with the keys `question`, `source` and `table_idx`; the
            ground-truth table id is "{source}_table_{table_idx}".
        vectorstore: Object exposing `similarity_search(question, k=...)` that
            returns items with a `metadata` mapping.
        k: Number of items to retrieve.

    Returns:
        The fraction of ground-truth table ids whose `source_table_idx` appears
        among the retrieved items, as a float in [0, 1].
    """
    question = qa["question"]
    retrieved_top_k_items = vectorstore.similarity_search(question, k=k)

    # There is always exactly one ground-truth id, so the division below is safe.
    ground_truth_source_ids = {f"{qa['source']}_table_{qa['table_idx']}"}

    # Compare distinct ids: several retrieved items may point at the same table
    # (for example multiple chunks of it), and counting each of them would push
    # recall above 1.
    retrieved_source_ids = {
        item.metadata.get("source_table_idx") for item in retrieved_top_k_items
    }
    found_source_ids = ground_truth_source_ids & retrieved_source_ids
    return len(found_source_ids) / len(ground_truth_source_ids)

def calculate_precision(qa, vectorstore, k=5):
    """Compute precision@k for a single QA pair.

    Retrieves the top-k items for `qa["question"]` and counts how many of them
    carry the ground-truth id "{source}_table_{table_idx}" in their
    `source_table_idx` metadata. The count is divided by `k` (not by the number
    of items actually returned); a non-positive `k` yields 0.
    """
    hits = vectorstore.similarity_search(qa["question"], k=k)
    expected_ids = [f"{qa['source']}_table_{qa['table_idx']}"]

    # An empty (falsy) result contributes no matches.
    matching_hits = [
        hit
        for hit in (hits or ())
        if hit.metadata.get("source_table_idx") in expected_ids
    ]

    return len(matching_hits) / k if k > 0 else 0
    
def calculate_metrics(generated_qa_pairs, vectorstore, k = 5):
    """Average precision@k, recall@k and F1@k over a collection of QA pairs.

    Args:
        generated_qa_pairs: Iterable of QA mappings accepted by
            `calculate_precision` and `calculate_recall`.
        vectorstore: Store that is queried for every question.
        k: Number of items retrieved per question.

    Returns:
        A tuple `(precision, recall, f1)` holding the mean of each per-question
        score. F1 is computed per question and then averaged. When there are no
        QA pairs, `(0.0, 0.0, 0.0)` is returned instead of dividing by zero.

    Note:
        Precision and recall each run their own similarity search, so every
        question is searched twice.
    """
    epsilon = 1e-10  # keeps the F1 denominator non-zero when both scores are 0

    precision_scores = []
    recall_scores = []
    f1_scores = []

    for qa in generated_qa_pairs:
        precision_score = calculate_precision(qa, vectorstore, k)
        recall_score = calculate_recall(qa, vectorstore, k)
        f1_score = 2 * (precision_score * recall_score) / (precision_score + recall_score + epsilon)

        precision_scores.append(precision_score)
        recall_scores.append(recall_score)
        f1_scores.append(f1_score)

    num_questions = len(precision_scores)
    if num_questions == 0:
        # Nothing was evaluated; report zeros rather than raising ZeroDivisionError.
        return 0.0, 0.0, 0.0

    precision_k = sum(precision_scores) / num_questions
    recall_k = sum(recall_scores) / num_questions
    f1_k = sum(f1_scores) / num_questions

    return precision_k, recall_k, f1_k

In [33]:
# Evaluate retrieval against both stores with the same cut-off and print one
# report per store (summary store first, then detail store).
K = 5
for report_header, store in (
    ("- Summary RAG\n    ", summary_vectorstore),
    ("- Detail RAG\n     ", detail_vectorstore),
):
    precision_k, recall_k, f1_k = calculate_metrics(generated_qa_pairs, store, K)
    print(f"{report_header}Precision@K: {precision_k:.3f}, Recall@K: {recall_k:.3f}, F1@K: {f1_k:.3f}")

- Summary RAG
    Precision@K: 0.038, Recall@K: 0.188, F1@K: 0.062
- Detail RAG
     Precision@K: 0.053, Recall@K: 0.263, F1@K: 0.087
