In [5]:
import csv
import io
import json
import os

# Directory that the conversion loop scans for input .json table files.
filepath = "raw/"
# Directory that receives one .csv file per converted table.
outpath = "output/"

# Create the output directory up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(outpath, exist_ok=True)

In [None]:
# A good example json to look at: 3_totto391-2.json

# In the GitHub repo, it says: "If a cell is merged, only
# its core cell (the top-left) will be non-empty"

In [13]:

def check_hierarchy(data):
    """
    Report whether the table's header tree is hierarchical.

    Looks at every node directly under ``data["top_root"]`` and returns True
    as soon as one of them has a ``"children"`` value other than the empty
    list; returns False when none does (including when there are no nodes).
    """
    first_level = data["top_root"]["children"]
    return any(child["children"] != [] for child in first_level)

def convert_plain_csv(data):
    """
    Convert the JSON table to CSV text as-is, without expanding merged cells.

    Args:
        data: Parsed table JSON; ``data["texts"]`` is a list of rows, each a
            list of cell values.

    Returns:
        The table as a single string. Every field is double-quoted and every
        row ends with a newline. Double quotes inside a cell are escaped by
        doubling them, so the result stays parseable as CSV.
    """
    buffer = io.StringIO()
    # QUOTE_ALL keeps the "every field quoted" layout of the original output;
    # the csv module takes care of escaping embedded quotes.
    writer = csv.writer(buffer, quoting=csv.QUOTE_ALL, lineterminator="\n")
    writer.writerows(data["texts"])
    return buffer.getvalue()

def convert_expanded_csv(data):
    """
    Convert the JSON table to CSV text, expanding the content of merged cells.

    For every entry of ``data["merged_regions"]`` the value of the region's
    top-left (core) cell is copied into every cell the region covers. Regions
    that extend past the table are clipped to the rows/columns that exist.

    Args:
        data: Parsed table JSON with ``"texts"`` (list of rows of cell values)
            and ``"merged_regions"`` (list of dicts with ``first_row``,
            ``last_row``, ``first_column`` and ``last_column``, all inclusive).
            The input is not modified.

    Returns:
        The expanded table as a single string. Every field is double-quoted,
        embedded double quotes are doubled, and every row ends with a newline.
    """
    # Work on a row-by-row copy so the caller's data["texts"] stays untouched.
    table = [list(row) for row in data["texts"]]

    for region in data["merged_regions"]:
        r1 = region["first_row"]
        c1 = region["first_column"]
        # Clip the region to the rows that actually exist.
        r2 = min(region["last_row"], len(table) - 1)

        # Nothing to expand when the core cell itself lies outside the table
        # (this also covers an empty table).
        if r1 > r2 or c1 >= len(table[r1]):
            continue
        core = table[r1][c1]

        for r in range(r1, r2 + 1):
            # Clip per row so ragged rows never get indexed past their end.
            c2 = min(region["last_column"], len(table[r]) - 1)
            for c in range(c1, c2 + 1):
                table[r][c] = core

    buffer = io.StringIO()
    writer = csv.writer(buffer, quoting=csv.QUOTE_ALL, lineterminator="\n")
    writer.writerows(table)
    return buffer.getvalue()
    

# Convert every .json table found in `filepath` into a .csv file in `outpath`.
files = os.listdir(filepath)

for file in files:
    if not file.endswith('.json'):
        continue

    # Read the table, then release the input file before converting/writing.
    with open(os.path.join(filepath, file), 'r', encoding='utf-8') as f:
        data = json.load(f)

    output = convert_expanded_csv(data)

    # Swap only the extension; str.replace would also rewrite any other
    # ".json" occurring inside the file name.
    csv_name = os.path.splitext(file)[0] + '.csv'

    # newline='' writes the CSV text verbatim, so newlines embedded in cells
    # are not translated by the platform.
    with open(os.path.join(outpath, csv_name), 'w', encoding='utf-8', newline='') as out_f:
        out_f.write(output)

In [None]:
# Fetching 100 random tables and relevant queries

import os
import random
import json
import shutil

# Fixed seed so that re-running the cell selects the same tables.
SAMPLE_SEED = 0
SAMPLE_SIZE = 100

filepath = "output/"
# Keep only the converted tables (skips e.g. checkpoint directories) and sort
# them: os.listdir returns entries in arbitrary order, which would otherwise
# make the seeded sample differ between machines.
files = sorted(name for name in os.listdir(filepath) if name.endswith(".csv"))
sample = random.Random(SAMPLE_SEED).sample(files, SAMPLE_SIZE)
sample_names = set(sample)  # constant-time membership test per query

queries = []

with open("dev_samples.jsonl", "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            continue  # tolerate blank lines
        try:
            record = json.loads(line)
        except json.JSONDecodeError as exc:
            # Re-raise with the file line number so a malformed line can be located.
            raise json.JSONDecodeError(
                f"dev_samples.jsonl line {line_no}: {exc.msg}", exc.doc, exc.pos
            ) from exc
        if record["table_id"] + ".csv" in sample_names:
            queries.append(record)

with open("sample100_queries.jsonl", "w", encoding="utf-8") as f:
    for query in queries:
        f.write(json.dumps(query) + "\n")

tablespath = "sample100_tables/"
os.makedirs(tablespath, exist_ok=True)
for file in sample:
    # Byte-for-byte copy: no dependence on the platform's default text
    # encoding and no newline translation.
    shutil.copyfile(os.path.join(filepath, file), os.path.join(tablespath, file))


JSONDecodeError: Extra data: line 1 column 7 (char 6)