In [1]:
import camelot

# Path of the PDF document whose tables are read.
pdf_path = "content.pdf"

# Read every page, passing flavor='lattice' to camelot.
tables = camelot.read_pdf(pdf_path, flavor='lattice', pages="all")
print(f"Tổng cộng: {tables.n} bảng được tìm thấy.")

# Markdown text of each table, in the order camelot returned them.
table_strings = []

for i, table in enumerate(tables):
    # index=False leaves the DataFrame index out of the markdown, which reads better.
    table_str = table.df.to_markdown(index=False)
    # Header line followed by the table; sep="\n" puts them on separate lines.
    print(f"\n--- Bảng {i + 1} ---", table_str, sep="\n")
    table_strings.append(table_str)



  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


Tổng cộng: 10 bảng được tìm thấy.

--- Bảng 1 ---
| 0   | 1                                                 |
|:----|:--------------------------------------------------|
|     | Elected to the Hall of Fame on this ballot (named |
|     | in bold italics).                                 |
|     | Elected subsequently, as of 2025 (named in plain  |
|     | italics).                                         |
|     | Renominated for the 2019 BBWAA election by        |
|     | adequate performance on this ballot and has not   |
|     | subsequently been eliminated.                     |
|     | Eliminated from annual BBWAA consideration by     |
|     | poor performance or expiration on subsequent      |

--- Bảng 2 ---
| 0   | 1                                                 |
|:----|:--------------------------------------------------|
|     | ballots.                                          |
|     | Eliminated from annual BBWAA consideration by     |
|     | poor performance or expira

In [2]:
import requests
import json

def generate_response(prompt, model="gemma3:latest",
                      url="http://localhost:11434/api/generate", timeout=None):
    """
    Send table fragments to an Ollama model and return its complete reply.

    The fragments are placed at the end of a fixed instruction template that
    asks the model to merge tables split across pages. The reply is requested
    as a stream and reassembled into a single string.

    Args:
        prompt (str): Text (the table fragments) appended to the template.
        model (str): Name of the Ollama model to query.
        url (str): Address of the Ollama generate endpoint.
        timeout (float | tuple | None): Forwarded to ``requests.post``;
            ``None`` (the default) applies no timeout.

    Returns:
        str: The concatenated reply with surrounding whitespace stripped.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If a streamed message carries an ``error`` field.
    """
    # NOTE(review): the "EXAMPLE 1" .. "EXAMPLE 10" headings in the template
    # below have no example content under them, and the closing sentence
    # ("into one single table") reads at odds with the strict merge rules.
    # The text is left untouched here; confirm the intended prompt.
    context = f"""
   You are given a list of table fragments extracted from a PDF document using OCR. Some tables were split across multiple pages and therefore appear as separate fragments in the input.

### 📌 STRICT MERGE RULES:
- ONLY merge tables that have **exactly the same number of columns**
- DO NOT merge if data types or content meaning do not match
- Merge by appending rows (row-wise concatenation), NOT by adding new columns
- DO NOT add explanations, summaries, interpretations, or any text outside the tables themselves
- DO NOT infer missing rows, extrapolate values, or fill in gaps
- DO NOT modify any content — preserve original values exactly as they appear

Your task is to:
1. Identify which table fragments have the exact same number of columns and matching structure
2. Merge them into complete tables by appending rows
3. Leave all other tables unchanged
4. Output only:
   - The merged tables
   - The remaining original tables that couldn't be merged
5. Format all output in clean markdown tables.
6. Preserve original formatting and alignment — do not restructure or reorder data
Please follow these examples to understand the merging process:
---

### ✅ EXAMPLE 1 – Tables with matching column count and data pattern


---

### ✅ EXAMPLE 2 – Tables without headers but matching columns


---

### ✅ EXAMPLE 3 – Tables with empty cells or partial data



---

### ✅ EXAMPLE 4 – Tables with different structures should stay separate


---

### ✅ EXAMPLE 5 – Tables with similar names but mismatched columns


---

### ✅ EXAMPLE 6 – Multiple small fragments with matching columns



---

### ✅ EXAMPLE 7 – Fragments with blank rows


---

### ✅ EXAMPLE 8 – Table with inconsistent spacing


---

### ✅ EXAMPLE 9 – Merging based on numeric value patterns

---

### ✅ EXAMPLE 10 – Don’t merge even if similar content



---
Please concatenate all the table parts below row by row into one single table, making sure they form a single table. The result should be the concatenated table and any completely separate tables remaining.
{prompt}
    """
    payload = {
        "model": model,
        "prompt": context,
        "stream": True
    }

    # Collect the streamed pieces and join once at the end.
    chunks = []
    with requests.post(url, json=payload, stream=True, timeout=timeout) as response:
        # Fail loudly on an HTTP error instead of returning an empty string.
        response.raise_for_status()
        for line in response.iter_lines():
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                # Best effort: skip lines that are not valid JSON.
                continue
            if not isinstance(data, dict):
                # Only JSON objects carry the fields read below.
                continue
            if 'error' in data:
                # Surface errors reported inside the stream rather than dropping them.
                raise RuntimeError(f"Ollama returned an error: {data['error']}")
            if 'response' in data:
                chunks.append(data['response'])
            if data.get('done', False):
                break

    return "".join(chunks).strip()

In [3]:
# One "header + markdown table" section per extracted table, joined at the end.
sections = []

for i, table in enumerate(tables):
    table_str = table.df.to_markdown(index=False)
    sections.append(f"\n\n--- Bảng {i + 1} ---\n" + table_str)

all_tables_combined = "".join(sections)





In [4]:
# Show a heading line and then the combined text of all tables.
print("\n--- Toàn bộ bảng đã gộp ---", all_tables_combined, sep="\n")


--- Toàn bộ bảng đã gộp ---


--- Bảng 1 ---
| 0   | 1                                                 |
|:----|:--------------------------------------------------|
|     | Elected to the Hall of Fame on this ballot (named |
|     | in bold italics).                                 |
|     | Elected subsequently, as of 2025 (named in plain  |
|     | italics).                                         |
|     | Renominated for the 2019 BBWAA election by        |
|     | adequate performance on this ballot and has not   |
|     | subsequently been eliminated.                     |
|     | Eliminated from annual BBWAA consideration by     |
|     | poor performance or expiration on subsequent      |

--- Bảng 2 ---
| 0   | 1                                                 |
|:----|:--------------------------------------------------|
|     | ballots.                                          |
|     | Eliminated from annual BBWAA consideration by     |
|     | poor performance or expiration

In [5]:
# Only call the model when the combined text is not blank.
if all_tables_combined.strip() != "":
    prompt = all_tables_combined
    merged_tables = generate_response(prompt)
    print(merged_tables)

```markdown
## Merged Tables

### Table 1 & 2

| 0                 | 1                                                 |
|:------------------|:--------------------------------------------------|
| Elected to the Hall of Fame on this ballot (named |
| in bold italics). | Elected subsequently, as of 2025 (named in plain  |
| italics).          | Renominated for the 2019 BBWAA election by        |
|                     | adequate performance on this ballot and has not   |
|                     | subsequently been eliminated.                     |
|                     | Eliminated from annual BBWAA consideration by     |
|                     | poor performance or expiration on subsequent      |

### Table 3 & 7

| 0   | 1             | 2   |
|:----|:--------------|:----|
|     |               |     |
|     |               |     |
|     | Chipper Jones |     |
|     | Jim Thome |     |

### Table 4 & 5

| 0                 | 1     | 2       | 3      | 4    |
|:------------------|:------|: