In [18]:
import camelot

pdf_path = "content.pdf"

# Extract tables from every page of the PDF using camelot's 'lattice' flavor.
tables = camelot.read_pdf(pdf_path, flavor='lattice', pages="all")
print(f"Tổng cộng: {tables.n} bảng được tìm thấy.")

# Markdown rendering of each extracted table, kept in extraction order.
table_strings = []

for table_number, table in enumerate(tables, start=1):
    # index=False leaves out the DataFrame row index, which makes the table easier to read.
    markdown = table.df.to_markdown(index=False)
    table_strings.append(markdown)
    print(f"\n--- Bảng {table_number} ---\n{markdown}")



Tổng cộng: 10 bảng được tìm thấy.

--- Bảng 1 ---
| 0   | 1                                                 |
|:----|:--------------------------------------------------|
|     | Elected to the Hall of Fame on this ballot (named |
|     | in bold italics).                                 |
|     | Elected subsequently, as of 2025 (named in plain  |
|     | italics).                                         |
|     | Renominated for the 2019 BBWAA election by        |
|     | adequate performance on this ballot and has not   |
|     | subsequently been eliminated.                     |
|     | Eliminated from annual BBWAA consideration by     |
|     | poor performance or expiration on subsequent      |

--- Bảng 2 ---
| 0   | 1                                                 |
|:----|:--------------------------------------------------|
|     | ballots.                                          |
|     | Eliminated from annual BBWAA consideration by     |
|     | poor performance or expira

In [19]:
import requests
import json

def generate_response(prompt, model="gemma3:latest",
                      url="http://localhost:11434/api/generate",
                      timeout=(10, 600)):
    """
    Send table fragments to an Ollama server and return the complete reply.

    The given text is embedded in a fixed instruction asking the model to
    merge table fragments, the request is streamed, and the streamed chunks
    are concatenated into one string.

    Args:
        prompt (str): Text (the extracted table fragments) to insert into the
            instruction sent to the model.
        model (str): Name of the Ollama model to query.
        url (str): URL of the Ollama generate endpoint.
        timeout: Value passed to ``requests.post(timeout=...)``; by default a
            (connect, read) pair in seconds. Pass ``None`` to wait forever.

    Returns:
        str: The full model response with surrounding whitespace stripped.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
        RuntimeError: If the stream contains an ``error`` object.
    """
    context = f"""
    You are given a list of tables extracted from a PDF document using OCR. Due to page breaks or layout issues, some tables have been split across multiple pages, resulting in several smaller table fragments.

    Your task is to:
    1. Identify which table fragments belong to the same original table.
    2. Merge these fragments back into complete, coherent tables.
    3. Ensure that rows and columns align correctly as they would appear in the original document.
    4. Preserve all data without duplication or omission.

    Please just return the final merged tables in a clean format (e.g., markdown or plain text), clearly labeled if there are multiple tables.

    Here is the input list of table fragments:

    {prompt}
    """
    payload = {
        "model": model,
        "prompt": context,
        "stream": True
    }

    # Collect the pieces in a list and join once at the end.
    chunks = []
    with requests.post(url, json=payload, stream=True, timeout=timeout) as response:
        # Fail loudly on 4xx/5xx instead of returning an empty string.
        response.raise_for_status()
        for line in response.iter_lines():
            if not line:
                # Empty lines carry no data; skip them.
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                # Best effort: ignore lines that are not valid JSON.
                continue
            if 'error' in data:
                # Surface an error reported inside the stream rather than
                # silently producing a truncated or empty answer.
                raise RuntimeError(f"Ollama returned an error: {data['error']}")
            if 'response' in data:
                chunks.append(data['response'])
            if data.get('done', False):
                break

    return "".join(chunks).strip()

In [20]:
# Concatenate every table's markdown under a numbered header so the whole set
# can be sent to the model as one prompt. This reuses `table_strings` (the
# markdown rendered when the tables were extracted) instead of converting each
# DataFrame a second time, and joins once rather than growing the string with +=.
all_tables_combined = "".join(
    f"\n\n--- Bảng {number} ---\n{markdown}"
    for number, markdown in enumerate(table_strings, start=1)
)





In [21]:
# Dump the combined table text for a visual check before it is sent to the model.
print("\n--- Toàn bộ bảng đã gộp ---", all_tables_combined, sep="\n")


--- Toàn bộ bảng đã gộp ---


--- Bảng 1 ---
| 0   | 1                                                 |
|:----|:--------------------------------------------------|
|     | Elected to the Hall of Fame on this ballot (named |
|     | in bold italics).                                 |
|     | Elected subsequently, as of 2025 (named in plain  |
|     | italics).                                         |
|     | Renominated for the 2019 BBWAA election by        |
|     | adequate performance on this ballot and has not   |
|     | subsequently been eliminated.                     |
|     | Eliminated from annual BBWAA consideration by     |
|     | poor performance or expiration on subsequent      |

--- Bảng 2 ---
| 0   | 1                                                 |
|:----|:--------------------------------------------------|
|     | ballots.                                          |
|     | Eliminated from annual BBWAA consideration by     |
|     | poor performance or expiration

In [22]:
# Holds the model's merged-table answer; stays empty when there is nothing to send.
merged_tables = ""

# Only query the model when at least one table was extracted.
if all_tables_combined.strip():
    prompt = "Dưới đây là toàn bộ các bảng dữ liệu được trích xuất từ tài liệu PDF:\n\n" + all_tables_combined
    # Keep the response in a variable so later cells can reuse it without
    # repeating the model call.
    merged_tables = generate_response(prompt)
    print(merged_tables)

Okay, here's the merged and reconstructed table data, based on the provided fragments.  I've aimed for a coherent table structure, combining fragments where possible.  Note that due to the OCR quality and fragmented nature of the input, there might be minor inconsistencies.

**Table 1: BBWAA Election Candidates**

| Rank | Candidate            | Votes | Percent | Change | Year |
|------|----------------------|-------|---------|--------|------|
| 1    | Chipper Jones†        | 410   | 97.2%   | –      | 1st  |
| 2    | Vladimir Guerrero†    | 392   | 92.9%   | 21.2%  | 2nd  |
| 3    | Jim Thome†            | 379   | 89.8%   | –      | 1st  |
| 4    | Trevor Hoffman       | 337   | 79.9%   | 5.9%   | 3rd  |
| 5    | Edgar Martínez        | 297   | 70.4%   | 11.8%  | 9th  |
| 6    | Mike Mussina          | 268   | 63.5%   | 11.7%  | 5th  |
| 7    | Roger Clemens         | 242   | 57.3%   | 3.2%   | 6th  |
| 8    | Barry Bonds           | 238   | 56.4%   | 2.6%   | 6th  |
| 9    | Curt Sch