In [1]:
import json
from typing import Any, Dict, List

# Load the extracted tables, keyed by source id. The context manager ensures the
# file handle is closed instead of being left for the garbage collector.
with open("results.json") as results_file:
    tables_sources = json.load(results_file)

# Print a quick overview: how many tables were found for each source.
sources = list(tables_sources.keys())
for source in sources:
    print(source)
    print(len(tables_sources[source]), "tables")
    print("-" * 100)

c935e2902adf7040a6ffe0db0f7c11e6
3 tables
----------------------------------------------------------------------------------------------------
659dfd3e5c22a64d7597cb50c6fc455e
4 tables
----------------------------------------------------------------------------------------------------


## Summarize

In [2]:
import os

from dotenv import load_dotenv
from langchain.chains.summarize.chain import load_summarize_chain
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI

# Load variables from a .env file (if present) into the process environment.
load_dotenv()

# Fail fast with a clear message when the key is missing. Re-assigning
# os.environ from os.getenv() is a no-op when the key is set and raises an
# opaque TypeError (environment values must be str) when it is not.
if os.getenv("GOOGLE_API_KEY") is None:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it or add it to the .env file."
    )

In [3]:
# Inspect the raw table entries recorded for the first source.
tables_sources[sources[0]]

[{'table_content': '# c935e2902adf7040a6ffe0db0f7c11e6_table_0\nThis is a cross-page table. It spans multiple pages. Page numbers: [4, 5, 7, 8, 10, 11, 12, 13]\n| Air Jordan         | Year   | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |\n|:-------------------|:-------|:-----------------------------------------------------------------------------------------------------------------------------

In [64]:
from uuid import uuid4

# Sample entry kept for interactive inspection; the comprehension below uses
# its own (non-leaking) loop variable of the same name.
table_entry = tables_sources[sources[0]][0]

# Build one Document per table across ALL sources.
# The outer loop must come first: with the clauses reversed, the first iterable
# (`tables_sources[source]`) is evaluated against whatever `source` was left in
# the notebook's global scope, so only one source's tables were indexed, each
# repeated once per source.
documents = [
    Document(
        page_content=table_entry["table_content"],
        metadata={
            "source": str(table_entry["source"]),
            "page_numbers": str(table_entry["page_numbers"]),
            # Identifier later used to tie a summary back to its full table.
            "source_table_idx": f"{table_entry['source']}_table_{table_entry['table_idx']}",
        },
    )
    for source in sources
    for table_entry in tables_sources[source]
]

In [70]:
# Takes a Document that holds a large table together with its metadata.
# Summarizes the table, then creates a new Document whose page_content is the summary and whose metadata is: source, page_numbers, is_summary, source_table_idx
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace



def summarize_table(table_document, llm=None):
    """Summarize one table Document and return the summary as a new Document.

    Args:
        table_document: Document whose ``page_content`` is the table text and
            whose metadata contains ``source``, ``page_numbers`` and
            ``source_table_idx``.
        llm: Optional language model used by the summarize chain. Pass a shared
            instance when summarizing many tables so the client is not rebuilt
            on every call. When omitted, a ``HuggingFaceEndpoint`` for
            ``microsoft/Phi-3-mini-4k-instruct`` is created.

    Returns:
        A Document whose ``page_content`` is the chain's ``output_text`` and
        whose metadata copies ``source``, ``page_numbers`` and
        ``source_table_idx`` from the input, plus ``is_summary=True``.

    Raises:
        KeyError: If one of the expected metadata keys is missing.
    """
    if llm is None:
        # Alternative backend:
        # llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0, max_tokens=1000)
        llm = HuggingFaceEndpoint(
            repo_id="microsoft/Phi-3-mini-4k-instruct",
            task="text-generation",
            max_new_tokens=512,
            do_sample=False,
            repetition_penalty=1.03,
        )

    summary_chain = load_summarize_chain(llm=llm, chain_type="map_reduce")

    # The chain is given a single-document list; its result dict carries the
    # summary under 'output_text'.
    summary_output = summary_chain.invoke([table_document])
    summary = summary_output['output_text']

    source_metadata = table_document.metadata
    summary_document = Document(
        page_content=summary,
        metadata={
            "source": source_metadata["source"],
            "page_numbers": source_metadata["page_numbers"],
            # Marks this Document as a summary rather than the original table.
            "is_summary": True,
            # Lets retrieval map the summary back to the full table.
            "source_table_idx": source_metadata["source_table_idx"],
        },
    )
    return summary_document

# def get_detail_chunks(table_document):
#     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
#     chunks = text_splitter.split_documents([table_document])
    
#     for i, chunk in enumerate(chunks):
#         chunk.metadata.update({
#             "chunk_id": i,
#             "source": table_document.metadata["source"],
#             "page_numbers": table_document.metadata["page_numbers"],
#             "is_summary": False
#         })
#     return chunks




## Vectorstore

In [67]:
from langchain_milvus import Milvus
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Module-level embedding model; create_vector_store() below reads this global.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

def create_vector_store(documents, name):
    """Create a Milvus vector store at ``./<name>.db`` and load ``documents`` into it.

    Args:
        documents: Sized collection of Documents to embed and insert.
        name: Base name used to build the connection URI ``./<name>.db``.

    Returns:
        Tuple ``(ids, vector_store)`` where ``ids`` are the freshly generated
        UUID strings, one per document, in insertion order.
    """
    store = Milvus(
        embedding_function=embeddings,
        connection_args={"uri": f"./{name}.db"},
        index_params={"index_type": "FLAT", "metric_type": "L2"},
        # Passed through to Milvus; presumably replaces a collection left over
        # from a previous run (confirm against langchain-milvus docs).
        drop_old=True,
    )

    # One random UUID per document, used as the primary keys on insert.
    document_count = len(documents)
    document_ids = [str(uuid4()) for _ in range(document_count)]

    store.add_documents(documents=documents, ids=document_ids)
    return document_ids, store


In [68]:
# def encode_table_document(table_documents):
#     summary_documents = []
#     detail_chunks = []
    
#     for table_document in table_documents:
#         summary_documents.append(summarize_table(table_document))
#         detail_chunks.extend(get_detail_chunks(table_document))
    
#     uuids_summary, summary_vectorstore = create_vector_store(summary_documents, "summary_vectorstore")
#     uuids_detail, detail_vectorstore = create_vector_store(detail_chunks, "detail_vectorstore")
    
#     return summary_vectorstore, detail_vectorstore


def encode_table_document(table_documents):
    """Index every table twice: once as a summary and once in full.

    Args:
        table_documents: Iterable of table Documents.

    Returns:
        Tuple ``(summary_vectorstore, detail_vectorstore)``. The first holds
        one summary Document per table (from ``summarize_table``); the second
        holds the original, unsummarized Documents.
    """
    # Materialize once so both stores are built from the same sequence.
    full_tables = list(table_documents)
    table_summaries = [summarize_table(table) for table in full_tables]

    # The generated ids are not needed by callers, so they are discarded.
    _, summary_store = create_vector_store(table_summaries, "summary_vectorstore")
    _, detail_store = create_vector_store(full_tables, "detail_vectorstore")

    return summary_store, detail_store

In [None]:
# Build both vector stores (summaries and full tables) from `documents`.
summary_vectorstore, detail_vectorstore = encode_table_document(documents)

In [83]:
def retrieve_hierachical(query, summary_vectorstore, detail_vectorstore, k_summaries=3, k_details=5, verbose=False):
    """Two-stage retrieval: find relevant table summaries, then fetch their tables.

    Args:
        query: Natural-language query string.
        summary_vectorstore: Store searched first; each hit's metadata must
            contain ``source_table_idx``.
        detail_vectorstore: Store holding the full tables, searched once per
            distinct ``source_table_idx`` found in the summary hits.
        k_summaries: Number of summary hits to request.
        k_details: Number of detail hits to request per table.
        verbose: When True, print the summary hits and their scores.

    Returns:
        List of ``(document, score)`` tuples from the detail store, without
        duplicates, sorted by ascending score.

    Raises:
        KeyError: If a summary hit has no ``source_table_idx`` metadata.
    """
    top_summaries = summary_vectorstore.similarity_search_with_score(query, k=k_summaries)

    if verbose:
        for summary, score in top_summaries:
            # '.3f' (three decimals) replaces the original ':3f', which only set
            # a minimum width and still printed six decimals.
            print(f"* [SIM={score:.3f}] {summary.page_content} [{summary.metadata}]")
        print("-" * 100)

    relevant_chunks = []
    # Track seen documents to avoid duplicates.
    seen_documents = set()
    # Several summaries can point at the same table; query each table only once.
    seen_tables = set()

    for summary, _summary_score in top_summaries:
        source_table_idx = summary.metadata["source_table_idx"]
        if source_table_idx in seen_tables:
            continue
        seen_tables.add(source_table_idx)

        # The Milvus store filters with a boolean expression string (`expr`),
        # not with a Python callable passed as `filter`. Escape backslashes and
        # quotes so the id is always a valid string literal in the expression.
        escaped_idx = str(source_table_idx).replace("\\", "\\\\").replace('"', '\\"')
        detail_results = detail_vectorstore.similarity_search_with_score(
            query,
            k=k_details,
            expr=f'source_table_idx == "{escaped_idx}"',
        )

        # Only add documents that haven't been seen before.
        for doc, doc_score in detail_results:
            doc_id = (doc.page_content, doc.metadata.get("source_table_idx"))
            if doc_id not in seen_documents:
                relevant_chunks.append((doc, doc_score))
                seen_documents.add(doc_id)

    # Ascending order: the stores in this notebook are created with the L2
    # metric, so a smaller score means a closer match.
    relevant_chunks.sort(key=lambda pair: pair[1])

    return relevant_chunks

In [84]:
# Example query run through the two-stage retriever with the default k values.
query = "Who directed the film 'Star Trek'?"
relevant_chunks = retrieve_hierachical(query, summary_vectorstore, detail_vectorstore)

In [85]:
# Show which table each retrieved document came from, followed by its content.
for retrieved_doc, _score in relevant_chunks:
    print("table_idx: ", retrieved_doc.metadata['source_table_idx'])
    print("page_content: ", retrieved_doc.page_content)

table_idx:  659dfd3e5c22a64d7597cb50c6fc455e_table_0
page_content:  # 659dfd3e5c22a64d7597cb50c6fc455e_table_0
This is a cross-page table. It spans multiple pages. Page numbers: [5, 6, 7]
| Year   | Title                                         | Role                    | Director                     | Notes                                                                          |
|:-------|:----------------------------------------------|:------------------------|:-----------------------------|:-------------------------------------------------------------------------------|
| 2000   | A Man Is Mostly Water                         | Augie                   | Fred Parnes                  |                                                                                |
| 2001   | Delivering Milo                               | Milo                    | Nick Castle                  |                                                                                |
| 2001   | 15 Minutes   