## Định nghĩa Pipeline

In [1]:
import pymupdf
from extract_tables import (
    generate,
    get_column_types,
    get_input_df,
    preprocess_df,
    solve_non_header_table,
    ProcessedTableEntry,
    main_concatenation_logic
)
    

In [2]:
from typing import List, Tuple, TypedDict

import pandas as pd
import pymupdf


# Define TypedDict for main extracted data
class ExtractedDataType(TypedDict):
    """Tables extracted from a document together with their page numbers.

    The two lists are parallel: ``page_numbers[i]`` is the page on which
    ``dataframes[i]`` was found.
    """
    dataframes: List[pd.DataFrame]
    page_numbers: List[int] # User-visible page numbers (e.g., 1, 2, 3...)

def get_limited_text_before_table(
    page: pymupdf.Page,
    current_table_bbox: pymupdf.Rect,
    all_table_bboxes_on_page: List[pymupdf.Rect],  # Bboxes of all tables on the page
    n_tokens: int,
    search_height_multiplier: float = 2.0,
    min_search_height: int = 30,
) -> str:
    """Return the last ``n_tokens`` tokens of text found just above a table.

    A full-width strip directly above ``current_table_bbox`` is searched. Its
    height is ``max(table_height * search_height_multiplier,
    min_search_height)``, clamped so it never extends above the page top.
    Text blocks whose integer bounding box overlaps any table in
    ``all_table_bboxes_on_page`` are discarded so that the content of a
    neighbouring table is not mistaken for context.

    Args:
        page: Page the table lives on.
        current_table_bbox: Bounding box of the table, as a ``pymupdf.Rect``
            or a 4-tuple.
        all_table_bboxes_on_page: Bounding boxes (``Rect`` or tuple) of every
            table on the page.
        n_tokens: Maximum number of whitespace-separated tokens to return.
        search_height_multiplier: Factor applied to the table height to size
            the search strip.
        min_search_height: Lower bound for the search strip height.

    Returns:
        The trailing tokens joined by single spaces, or ``""`` when the bbox
        is unusable, the search strip is degenerate, ``n_tokens`` is not
        positive, or no text is found.
    """
    # A non-positive budget means "no context". Without this guard,
    # tokens[-0:] would return the *entire* text instead of nothing.
    if n_tokens <= 0:
        return ""

    # Normalise the bbox to a Rect; anything else is rejected.
    if isinstance(current_table_bbox, tuple):
        if len(current_table_bbox) != 4:
            return ""
        try:
            current_table_bbox = pymupdf.Rect(current_table_bbox)
        except Exception:
            # Best effort: an unusable bbox simply yields no context.
            return ""
    elif not isinstance(current_table_bbox, pymupdf.Rect):
        return ""

    if not current_table_bbox or current_table_bbox.is_empty:
        return ""

    page_width = page.rect.width

    table_height = current_table_bbox.height
    if table_height <= 0:
        # Defensive fallback for a degenerate height.
        table_height = 10

    # In page coordinates y grows downwards, so the strip's bottom edge sits
    # just above the table top (-1 to avoid touching the table itself).
    table_top = current_table_bbox.y0
    strip_bottom = max(0, table_top - 1)
    strip_height = max(table_height * search_height_multiplier, min_search_height)
    strip_top = max(0, strip_bottom - strip_height)

    if strip_top >= strip_bottom:
        # The table is too close to the page top; retry with a smaller strip.
        if table_top <= min_search_height / 2:
            strip_top = max(0, strip_bottom - min_search_height / 2)
        else:
            strip_top = max(0, table_top - min_search_height)
        if strip_top >= strip_bottom:
            return ""

    search_rect = pymupdf.Rect(0, strip_top, page_width, strip_bottom)
    if search_rect.is_empty or search_rect.height <= 0:
        return ""

    # Convert the table bboxes once instead of once per text block.
    table_irects = []
    for table_bbox in all_table_bboxes_on_page:
        table_rect = pymupdf.Rect(table_bbox) if isinstance(table_bbox, tuple) else table_bbox
        if not table_rect.is_empty:
            table_irects.append(table_rect.irect)

    # Each block is (x0, y0, x1, y1, text, block_no, block_type);
    # block_type is 0 for text and 1 for image.
    blocks_in_search_area = page.get_text("blocks", clip=search_rect, sort=True)

    tokens: List[str] = []
    for block_data in blocks_in_search_area:
        if block_data[6] != 0:
            continue

        block_irect = pymupdf.Rect(*block_data[:4]).irect
        # Skip text that belongs to any table structure on the page.
        if any(not (block_irect & table_irect).is_empty for table_irect in table_irects):
            continue

        # split() also collapses newlines and repeated whitespace.
        tokens.extend(block_data[4].split())

    # Slicing clamps automatically when fewer than n_tokens tokens exist.
    return " ".join(tokens[-n_tokens:])

def extract_tables_and_contexts(doc: pymupdf.Document, 
                                page_numbers_to_process, 
                                n_tokens_context: int,
                                search_height_multiplier: float = 2.5, 
                                min_search_height: int = 40
                               ) -> Tuple[ExtractedDataType, List[str]]:
    """Extract every non-empty table, plus the text preceding it, from selected pages.

    Args:
        doc: An open ``pymupdf.Document``.
        page_numbers_to_process: A single 1-based page number or a list of
            them. Duplicates are dropped and pages are visited in ascending
            order; out-of-range pages are reported and skipped.
        n_tokens_context: Maximum number of context tokens kept per table.
        search_height_multiplier: Forwarded to ``get_limited_text_before_table``.
        min_search_height: Forwarded to ``get_limited_text_before_table``.

    Returns:
        ``(extracted, contexts)`` where ``extracted`` holds the preprocessed
        dataframes and their 1-based page numbers, and ``contexts[i]`` is the
        text found above ``extracted["dataframes"][i]``. On invalid arguments
        an empty result and an empty list are returned after printing an
        error message.
    """
    empty_result: ExtractedDataType = {"dataframes": [], "page_numbers": []}

    if not isinstance(doc, pymupdf.Document):
        print("Lỗi: 'doc' phải là một đối tượng pymupdf.Document.")
        return empty_result, []

    # Translate the user-facing 1-based page numbers into 0-based indices.
    if isinstance(page_numbers_to_process, int):
        zero_based_pages = [page_numbers_to_process - 1]
    elif isinstance(page_numbers_to_process, list) and all(
        isinstance(number, int) for number in page_numbers_to_process
    ):
        zero_based_pages = sorted({number - 1 for number in page_numbers_to_process})
    else:
        print("Lỗi: page_numbers_to_process phải là số nguyên hoặc danh sách số nguyên (bắt đầu từ 1).")
        return empty_result, []

    frames: List[pd.DataFrame] = []
    contexts: List[str] = []
    frame_pages: List[int] = []

    for idx in zero_based_pages:
        if idx < 0 or idx >= len(doc):
            print(f"Cảnh báo: Số trang {idx + 1} (index {idx}) không hợp lệ. Bỏ qua.")
            continue

        page = doc[idx]
        visible_page_number = page.number + 1

        # Collect the bboxes of all tables first so that context lookup can
        # exclude text belonging to any of them.
        tables = page.find_tables().tables or []
        page_table_rects: List[pymupdf.Rect] = [
            pymupdf.Rect(candidate.bbox) for candidate in tables if candidate.bbox
        ]

        for table in tables:
            if not table.bbox:
                # A table without a bbox cannot be located on the page.
                continue

            cleaned = preprocess_df(table.to_pandas())
            if cleaned.empty:
                continue

            context = get_limited_text_before_table(
                page,
                pymupdf.Rect(table.bbox),
                page_table_rects,
                n_tokens_context,
                search_height_multiplier=search_height_multiplier,
                min_search_height=min_search_height,
            )
            contexts.append(context)
            frames.append(cleaned)
            frame_pages.append(visible_page_number)

    result: ExtractedDataType = {"dataframes": frames, "page_numbers": frame_pages}

    # Sanity check: the three lists are filled in lockstep and must match.
    if len(frames) != len(contexts) or len(contexts) != len(frame_pages):
        print(f"Cảnh báo logic nghiêm trọng: Độ dài các list không khớp! DFs: {len(frames)}, Contexts: {len(contexts)}, PageNums: {len(frame_pages)}")
        return empty_result, []

    return result, contexts

In [3]:
import os

def extract_tables_from_sources_pymu(
    sources: List[str],
) -> List[ProcessedTableEntry]:
    """Run the table-extraction pipeline over several PDF files.

    For every path in ``sources`` the tables and their preceding context are
    extracted from all pages, a textual prompt describing each table is
    built and passed to ``generate``, and the result is handed to
    ``main_concatenation_logic`` together with the extracted dataframes.

    Args:
        sources: Paths of the PDF files to process.

    Returns:
        The entries returned by ``main_concatenation_logic`` for all sources,
        in the order the sources were given. Each source is labelled with its
        file name without directory and extension.
    """
    all_final_results: List[ProcessedTableEntry] = []

    for source_path in sources:
        # The context manager closes the document as soon as extraction is
        # done; the extracted dataframes do not depend on it afterwards.
        with pymupdf.open(source_path) as doc:
            # Page numbers are 1-based, so the range must end at
            # len(doc) + 1 to include the last page.
            pages_to_scan_list = list(range(1, len(doc) + 1))
            extracted_data, contexts = extract_tables_and_contexts(
                doc, pages_to_scan_list, n_tokens_context=15
            )

        source_name_short = os.path.splitext(os.path.basename(source_path))[0]

        prompt_parts: List[str] = []
        for i, df in enumerate(extracted_data['dataframes']):
            if df.empty:
                continue

            df_string = get_input_df(df, n_rows=7)
            column_types = get_column_types(df=df)
            n_columns = len(df.columns)

            prompt_parts.append(f"Table {i}:\n")
            if contexts[i]:
                prompt_parts.append(f"Context before table:\n{contexts[i]}\n\n")
            prompt_parts.append(
                f"{df_string}\nNumber of columns: {n_columns}\nColumn types: {column_types}\n\n"
            )

        concat_json = generate("".join(prompt_parts))
        results_for_current_source = main_concatenation_logic(
            extracted_data=ExtractedDataType(
                dataframes=extracted_data['dataframes'],
                page_numbers=extracted_data['page_numbers'],
            ),
            concat_json=concat_json,
            source_name=source_name_short,
        )
        all_final_results.extend(results_for_current_source)

    return all_final_results

## Chạy Pipeline

In [4]:
# Run the full pipeline on the sample PDFs; `results` is consumed by the
# cell that converts it with get_table_content.
results = extract_tables_from_sources_pymu([
    "2c98e99a08ec5392d50e60370d871319.pdf",
    "4f37fc393094547bbbe030cecedf9a3b.pdf",
    "b014b8ca3c8ee543b655c29747cc6090.pdf",
    "c46a0e327c07d6d3a04b23f7de59b55d.pdf",
    "c935e2902adf7040a6ffe0db0f7c11e6.pdf"
])

In [12]:
from extract_tables import get_table_content
# Convert the pipeline output into a structure that json.dump can serialize.
json_result = get_table_content(results)
# Persist the result so later cells can reload it without re-running extraction.
import json
with open('new_result.json', 'w') as f:
    json.dump(json_result, f)



source c935e2902adf7040a6ffe0db0f7c11e6 has 1 tables
source c46a0e327c07d6d3a04b23f7de59b55d has 3 tables
source b014b8ca3c8ee543b655c29747cc6090 has 8 tables
source 2c98e99a08ec5392d50e60370d871319 has 2 tables
source 4f37fc393094547bbbe030cecedf9a3b has 7 tables


In [13]:
import json

# Reload the saved tables. The with-block guarantees the file handle is
# closed instead of relying on garbage collection.
with open("new_result.json") as f:
    tables_sources = json.load(f)

# One summary line per source, followed by a separator.
sources = list(tables_sources.keys())
for source in sources:
    print(f"source {source} has {len(tables_sources[source])} tables")
    print("-" * 100)

source c935e2902adf7040a6ffe0db0f7c11e6 has 1 tables
----------------------------------------------------------------------------------------------------
source c46a0e327c07d6d3a04b23f7de59b55d has 3 tables
----------------------------------------------------------------------------------------------------
source b014b8ca3c8ee543b655c29747cc6090 has 8 tables
----------------------------------------------------------------------------------------------------
source 2c98e99a08ec5392d50e60370d871319 has 2 tables
----------------------------------------------------------------------------------------------------
source 4f37fc393094547bbbe030cecedf9a3b has 7 tables
----------------------------------------------------------------------------------------------------


In [16]:
# Dump the content of every table, grouped by source.
for source in sources:
    print(f"----SOURCE: {source}----")
    for table in tables_sources[source]:
        # Table text followed by a dashed separator line.
        print(table['table_content'], "-" * 100, sep="\n")
    # A heavier rule marks the end of each source.
    print("=" * 100)

----SOURCE: c935e2902adf7040a6ffe0db0f7c11e6----
# c935e2902adf7040a6ffe0db0f7c11e6_table_0
This is a cross-page table. It spans multiple pages. Page numbers: [2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13]
| Model              | Year   | Notes                                                            |
|:-------------------|:-------|:-----------------------------------------------------------------|
| Model              | Year   | Notes                                                            |
| Air Jordan II      | 1986   | The success of the Air Jordan I encouraged Nike to release a     |
|                    |        | new Air Jordan in 1986 for the new basketball season.            |
|                    |        | Designed by Peter Moore and Bruce Kilgore, the Air Jordan II     |
|                    |        | was originally made in Italy.[24][25] In early tests, Michael    |
|                    |        | Jordan wore a prototype that fused the upper of the original     |
|          

## Test

In [4]:
pdf_file = "c935e2902adf7040a6ffe0db0f7c11e6.pdf"

doc = pymupdf.open(pdf_file)

# Example 2: process multiple pages.
# Page numbers are 1-based, so the range must end at len(doc) + 1;
# range(1, len(doc)) would silently skip the last page.
pages_to_scan_list = list(range(1, len(doc) + 1))
results_multiple_pages = extract_tables_and_contexts(doc, pages_to_scan_list, n_tokens_context=15)

In [5]:
# Show every extracted dataframe (first element of the result tuple),
# separated by a dashed line.
for df in results_multiple_pages[0]['dataframes']:
    display(df)
    print("-" * 100)

Unnamed: 0,Models ​[ edit ],None,None.1
0,Model,Year,Notes


----------------------------------------------------------------------------------------------------


Unnamed: 0,Air Jordan I,1984,"The first Air Jordan was produced for use by Michael Jordan\nin November 1984. They were designed by Peter B. Moore,\nand released during Jordan's sophomore season with the\nChicago Bulls. The Jordan 1 Royal was never worn by\nMichael Jordan on an NBA court. The black and red Air\nJordan 1 has been re-released several times, starting in\n[20][21]\n1994.\nThe red and black colorway of the Nike Air Ship, the\nprototype for the Jordan I, was later outlawed by then-NBA\nCommissioner David Stern for having very little white on\nthem. (This rule, known as the ""51 percent"" rule, was\n[22][23]\nrepealed in the late 2000s.)"
0,Air Jordan II,1986,The success of the Air Jordan I encouraged Nik...
1,Air Jordan III,1988,The Air Jordan III featured the debut of the J...


----------------------------------------------------------------------------------------------------


Unnamed: 0,Air Jordan IV,1989,"In December 1988, Nike released the Air Jordan IV to the\npublic. Designed by Tinker Hatfield, it was the first Air Jordan\nreleased on the global market. It had four colorways:\nWhite/Black, Black/Cement Grey, White/Fire Red-Black, and\nOff White/Military Blue. Nike featured director and actor\nSpike Lee in ads for the shoe.[27] Lee had featured the shoe\n[25]\nin his movie Do The Right Thing.\nMichael Jordan wore the Air Jordan IV when he made ""The\nShot"", a series winner in Game 5 of the 1989 NBA First\nRound between the Chicago Bulls and the Cleveland\nCavaliers. In 2012 a Cavalier colorway dubbed the ""Cavs""\nwas released to honor ""The Shot""."
0,Air Jordan V,1990,The Air Jordan V was released in February 1990...
1,Air Jordan VI,1991,"Designed after a German sports car, Michael Jo..."
2,Air Jordan VII,1992,"The Air Jordan VII introduced ""huarache"" techn..."


----------------------------------------------------------------------------------------------------


Unnamed: 0,None,None.1,"Olympic color combo of the Air Jordan VII model which had\nJordan's Olympic jersey number 9, instead of the usual ""23""\n[25]\nfound on other colorways.\nVarious models of the Air Jordan VII were re-released\n[33]\nbeginning with its 10-year anniversary in 2002."
0,Air Jordan VIII,1993,The Air Jordan VIII was released to coincide w...
1,Air Jordan IX,1993,"Originally released in November 1993, the Air ..."
2,Air Jordan X,1994,This was released in different colors represen...
3,Air Jordan XI,1995,This model was designed by Tinker Hatfield. Wh...


----------------------------------------------------------------------------------------------------


Unnamed: 0,None,None.1,[39]\nMichael Jordan's favorite.
0,Air Jordan XII,1996,"The Air Jordan XII featured a ""Rising Sun"" mot..."
1,Air Jordan XIII,1997,"This model had a carbon fiber plate, designed ..."
2,Air Jordan XIV,1998,Inspired by the Ferrari 550 M which Michael Jo...


----------------------------------------------------------------------------------------------------


Unnamed: 0,None,None.1,in his final game with the Chicago Bulls in the 1998 NBA\nFinals.\nThere are 14 Jumpman logos—7 on each shoe—\ncorresponding the shoes' number in the series.
0,Air Jordan XV,1999,This was the first shoe after Jordan's retirem...
1,Air Jordan XVI,2001,"The shoe came with spats, and the design was i..."
2,Air Jordan XVII,2002,This pair of Jordans came with a multimedia CD...
3,Air Jordan XVIII,2003,The Air Jordan XVIII shoe was released during ...


----------------------------------------------------------------------------------------------------


Unnamed: 0,Air Jordan XIX,2004,"This is the first Jordan release after his third, and final,\nretirement which came after the 2002–03 NBA season. The\ndesign was inspired by the black mamba snake,[25] and two\noriginal colorways where released: white/flint gray and\nblack/red. Three regional colorways and three special edition\ncolorways were released. They consisted of the East, West,\nand Midwest edition for regular and West, East, and Olympic\nfor the SE (special edition).\nThe Air Jordan XIX used innovative materials. The upper\nsection of shoe was developed in collaboration with the\nglobal materials consultancy Material ConneXion, who\nsourced Nike a sleeving normally used in architectural\napplications for protecting PVC pipes from bursting.[42] In\ntheory, this allowed for a shoe without laces, because the\nsleeving does not stretch. Nonetheless, the Air Jordan XIX\nmodel did include a set of laces behind the sleeve to better\nsecure the shoe. They are known to be the lightest Air\n[43]\nJordans ever made.\nThe shoes appeared on the sitcom My Wife and Kids, in the\nepisode ""Fantasy Camp: Part 2"", when the protagonist\nMichael Kyle (Damon Wayans) steals it from Jordan's hotel\nroom and uses it to play against Jordan himself later in the\nepisode. Michael Jordan wears ""AJ IV Cool Grey"" in the\nepisode. The shoe was re-released in 2008."
0,Air Jordan XX,2005,The Air Jordan XX was inspired by low-cut moto...


----------------------------------------------------------------------------------------------------


Unnamed: 0,Air Jordan XXI,2006,"The Air Jordan XXI model of shoes was designed by\nD'Wayne Edwards and inspired by sport touring vehicles.\nThe shoe features lower-foot air grilles, double-overlasted\nPhylon midsole, a carbon fiber shank plate and a seamless\ndiamond-quilted bootie. It came with removable parts that\ncould make the cushioning firm or soft, and had text that\nblacklight.[25]\ncould be seen under a\nThe Air Jordan XXI was introduced on television by the\n""Second Generation"" advertisement."
0,Air Jordan XX2,2007,The XX2 was inspired by the F-22 Raptor.[25] T...
1,Air Jordan XX3,2008,The Air Jordan XX3 was designed by Tinker Hatf...
2,Air Jordan 2009,2009,The Air Jordan 2009 was designed by Jason Mayd...
3,Air Jordan 2010,2010,This was released during the 25th anniversary ...


----------------------------------------------------------------------------------------------------


Unnamed: 0,Air Jordan 2011,2011,"[25]\nThe shoe has interchangeable insoles – a red one for\npower and a blue one for quickness. Four colorways of the\nshoe were released corresponding with the 2011 All Star\nGame: White/Black, White/Red and White/Blue that\nrepresented the East/West Jersey Colors. The ""Year of the\nRabbit"" colorway was a limited release that celebrated\nMichael Jordan's Chinese zodiac sign.\nThe 2011 has a star-constellation pattern that also serves as\nventilation. It uses patent leather wrapped around the shoe.\nThe shoes are hand burnished and crafted. A dress shoe that\nfeels similar to the XI was reportedly the goal.\nThe shoe has not been re-released."
0,Air Jordan 2012,2012,The Air Jordan 2012 offers six customization c...
1,Air Jordan XX8,2013,"The Air Jordan XX8, designed by Tinker Hatfiel..."
2,Air Jordan XX9,2014,"The Air Jordan XX9, also designed by Hatfield,..."
3,Air Jordan XXX,2016,The Air Jordan XXX was again designed by Tinke...


----------------------------------------------------------------------------------------------------


Unnamed: 0,Air Jordan XXXI,2016,"The Air Jordan XXXI is heavily influenced by the Air Jordan\n1s, having a leather upper and swoosh, Jumpman, and\nJordan ""Wings"" logos. Its retail debut was on September 3,\n2016, in the ""Banned"" colorway, for the 30th anniversary of\n[25][46]\nthe NBA banning the Air Jordan 1. Notable\nappearances of the shoe include the ""USA"" colorway worn\nduring the 2016 Olympic basketball tournament by members\nof team USA.[47]"
0,Air Jordan XXXII,2017,The Air Jordan XXXII was influenced by the Air...
1,Air Jordan XXXIII,2018,The Air Jordan XXXIII was released on October ...
2,Air Jordan XXXIV,2019,The Air Jordan XXXIV was released on September...
3,Air Jordan XXXV,2020,The Air Jordan XXXV debuted in the Fall of 202...
4,Air Jordan XXXVI,2021,The Air Jordan XXXVI was first teased by Germa...
5,Air Jordan XXXVII,2022,[53]\nThe Air Jordan XXXVII was released on Ju...
6,Air Jordan XXXVIII,2023,The Air Jordan XXXVIII contains a Cushlon 3.0 ...


----------------------------------------------------------------------------------------------------


Unnamed: 0,None,None.1,"18, 2023, for $200."
0,Air Jordan XXXIX,2024,The Air Jordan XXXIX was released in July 2024.


----------------------------------------------------------------------------------------------------


In [6]:
# Build the prompt describing every non-empty table: its index, the optional
# context found above it, a text rendering of the table, and its column
# count and column types.
full_prompt = ""
for i, df in enumerate(results_multiple_pages[0]['dataframes']):
    if not df.empty:
        df_string = get_input_df(df, n_rows=7)
        column_types = get_column_types(df=df)
        n_columns = len(df.columns)

        prompt_part = f"Table {i}:\n"
        # results_multiple_pages[1] holds the per-table context strings.
        if results_multiple_pages[1][i]:
            prompt_part = f"{prompt_part}Context before table:\n{results_multiple_pages[1][i]}\n\n"
        prompt_part = f"{prompt_part}{df_string}\nNumber of columns: {n_columns}\nColumn types: {column_types}\n\n"

        full_prompt = full_prompt + prompt_part
print(full_prompt)

Table 0:
Context before table:
become one of the most recognizable logos in the athletics industry.[19] Models ​[ edit ]

Models ​[ edit ]=None=None
Model=Year=Notes

Number of columns: 3
Column types: ['object', 'object', 'object']

Table 1:
Air Jordan I=1984=The first Air Jordan was produced for use by Michael Jordan in November 1984. They
Air Jordan II=1986=The success of the Air Jordan I encouraged Nike to release a new Air Jordan
Air Jordan III=1988=The Air Jordan III featured the debut of the Jumpman [25] logo. Jordan Brand reintroduced

Number of columns: 3
Column types: ['object', 'object', 'object']

Table 2:
Air Jordan IV=1989=In December 1988, Nike released the Air Jordan IV to the public. Designed by Tinker
Air Jordan V=1990=The Air Jordan V was released in February 1990 and designed by Hatfield. Inspired by
Air Jordan VI=1991=Designed after a German sports car, Michael Jordan wore [25] the VI for his first
Air Jordan VII=1992="The Air Jordan VII introduced ""huarache"" tec

In [7]:
# Send the prompt to generate(); the result is reused as `concat_json` below.
json_concat = generate(full_prompt)
# Bare expression so the notebook displays the returned value.
json_concat

{'concatable_tables': [{'table_index': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}],
 'has_headers': [0],
 'headers_info': [{'table_index': [0],
   'headers': ['Model', 'Year', 'Notes'],
   'is_header_guessed': False},
  {'table_index': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
   'headers': ['Column 1', 'Column 2', 'Column 3'],
   'is_header_guessed': True}],
 'input_tokens': CountTokensResponse(total_tokens=3377, cached_content_token_count=None),
 'output_tokens': CountTokensResponse(total_tokens=253, cached_content_token_count=None)}

In [8]:
# Combine the extracted dataframes according to the concatenation plan
# returned by generate(); "test" is the source label attached to the results.
processed_tables_with_pages = main_concatenation_logic(
    extracted_data=ExtractedDataType(
        dataframes=results_multiple_pages[0]['dataframes'],
        page_numbers=results_multiple_pages[0]['page_numbers']
    ),
    concat_json=json_concat,
    source_name="test"
)

In [9]:
# Display the dataframe of every processed table entry.
for result in processed_tables_with_pages:
    df = result['dataframe']
    # Optional cleanup, currently disabled: map placeholder values to NA and
    # drop rows that are entirely empty.
    # df_example = df.replace(['<NA>', "None", "None ", None, ""], pd.NA)
    # df_example = df_example.dropna(how='all')
    display(df)

Unnamed: 0,Model,Year,Notes
0,Model,Year,Notes
1,Air Jordan I,1984,The first Air Jordan was produced for use by M...
2,Air Jordan II,1986,The success of the Air Jordan I encouraged Nik...
3,Air Jordan III,1988,The Air Jordan III featured the debut of the J...
4,Air Jordan IV,1989,"In December 1988, Nike released the Air Jordan..."
5,Air Jordan V,1990,The Air Jordan V was released in February 1990...
6,Air Jordan VI,1991,"Designed after a German sports car, Michael Jo..."
7,Air Jordan VII,1992,"The Air Jordan VII introduced ""huarache"" techn..."
8,,,Olympic color combo of the Air Jordan VII mode...
9,Air Jordan VIII,1993,The Air Jordan VIII was released to coincide w...


In [56]:
# Check that the source label passed to main_concatenation_logic is kept on the entry.
processed_tables_with_pages[0]['source']

'test'