# Step Three: 3. Clean and Integrate GPT-processed Tables

## From customized XML/HTML to customized key-value pairs
#### 1. Run the Table-Processing section powered by GPT (fill in your own OpenAI key and base_url, prompts, and examples directly in the code; fill in the examples for GPT in the CSV)
#### 2. Check GPT output Tables.
#### 3. Sort, Clean and Integrate GPT-processed Tables (Remove File Name Folder, Table-Clean and integrate table1, table2, table3 ... into a JSON);
#### Note: Examples for E are saved in FewShot-XML.csv, and examples for RSC and SN are saved in FewShot-HTML.csv
#### The customized key-value pairs are separated by "###", for example:

In [14]:
"""###<Title of the Table><Corresponding Row Header><Corresponding Column Header>=<Entry Value>###"""

'###<Title of the Table><Corresponding Row Header><Corresponding Column Header>=<Entry Value>###'

## Table Clean

In [2]:
""" Remove "```json", "```", and unnecessary strip """

' Remove "```json", "```", and unnecessary strip '

In [5]:
import os
import re
import json
from concurrent.futures import ThreadPoolExecutor

# Path
# root_dir: directory passed to get_files() when this cell runs as a script; its
# top-level *.json files are handed to process_file().
# clear_json_dir: directory under which process_file() writes each cleaned file,
# at the same path the source file has relative to root_dir.
# NOTE(review): the integration step further down reads from 'Data/Table-Cleaned',
# whereas this value is 'Table-Cleaned' relative to the working directory — confirm
# the cleaned files end up where the integration step looks for them.
root_dir = r"Data/GPT-CheckOut/OutputTable"
clear_json_dir = r"Table-Cleaned"

# Strip markdown code-fence markers from a GPT response
def clean_json_content(content):
    """Return *content* without markdown fence markers or outer whitespace.

    Every occurrence of '```json' or '```' is deleted, after which leading
    and trailing whitespace is stripped from what remains.
    """
    fence_pattern = re.compile(r"```json|```")
    without_fences = fence_pattern.sub("", content)
    return without_fences.strip()

# Load a text file
def read_file(file_path):
    """Return the full contents of *file_path*, decoded as UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        text = handle.read()
    return text

# Write cleaned text to disk and report it
def save_cleaned_content(cleaned_content, save_path):
    """Write *cleaned_content* to *save_path* as UTF-8 and print a confirmation.

    An existing file at *save_path* is overwritten.
    """
    with open(save_path, mode='w', encoding='utf-8') as out_file:
        out_file.write(cleaned_content)
        # Reported while the file is still open, right after the write call.
        print(f"Cleaned and saved successfully: {save_path}")

# Clean one GPT output file and store the result under clear_json_dir
def process_file(file_path):
    """Read *file_path*, strip markdown fences, and save the cleaned text.

    The cleaned text is not parsed; it is serialised as one JSON string and
    written below ``clear_json_dir`` at the same path the input has relative
    to ``root_dir``. Any exception is caught and reported with ``print``
    rather than propagated.
    """
    try:
        raw_text = read_file(file_path)
        stripped_text = clean_json_content(raw_text)

        # Mirror the input's position relative to root_dir under clear_json_dir.
        target_path = os.path.join(
            clear_json_dir,
            os.path.relpath(file_path, root_dir),
        )

        # Make sure the destination folder is there before writing.
        target_dir = os.path.dirname(target_path)
        os.makedirs(target_dir, exist_ok=True)

        serialized = json.dumps(stripped_text, ensure_ascii=False, indent=4)
        save_cleaned_content(serialized, target_path)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

# Fan the per-file cleaning out over a small thread pool
def get_files(directory):
    """Submit every ``.json`` entry of *directory* to ``process_file``.

    Only names directly inside *directory* are considered. Work runs on a pool
    of five threads, and the function returns once the pool has shut down.
    """
    with ThreadPoolExecutor(max_workers=5) as pool:
        json_names = (
            name for name in os.listdir(directory) if name.endswith('.json')
        )
        for name in json_names:
            pool.submit(process_file, os.path.join(directory, name))

# Entry point: clean every .json file found directly inside root_dir.
if __name__ == "__main__":
    get_files(root_dir)

Cleaned and saved successfully: Table-Cleaned/10.1039-b811775f-table3.json
Cleaned and saved successfully: Table-Cleaned/10.1007-s11665-019-04152-6-table3(Fail2).json
Cleaned and saved successfully: Table-Cleaned/10.1039-b811775f-table2.json
Cleaned and saved successfully: Table-Cleaned/10.1016-j.jallcom.2012.07.110-table5(Fail1).json
Cleaned and saved successfully: Table-Cleaned/10.1016-j.msea.2018.12.103-table3.json
Cleaned and saved successfully: Table-Cleaned/10.1016-j.msea.2018.12.103-table2.json
Cleaned and saved successfully: Table-Cleaned/10.1007-s00170-019-04167-2-table1.json
Cleaned and saved successfully: Table-Cleaned/10.1016-j.msea.2018.12.103-table1.json
Cleaned and saved successfully: Table-Cleaned/10.1007-s00170-019-04167-2-table2.json
Cleaned and saved successfully: Table-Cleaned/10.1016-j.jallcom.2018.02.271-table1.json
Cleaned and saved successfully: Table-Cleaned/10.1007-s00170-019-04167-2-table3.json
Cleaned and saved successfully: Table-Cleaned/10.1039-b811775f-ta

## Integrate GPT-processed Tables

In [8]:
import os
import json
import re

# Path settings
path1 = 'Data/Table-Cleaned'  # Input: directory of per-table files that main() lists and groups
path2 = 'Data/Table-Combined'  # Output: directory where main() writes one <file_id>.json per group

def main(input_dir=None, output_dir=None):
    """Combine the per-table files of each article into one JSON file.

    Entries of the input directory are grouped by the part of their name that
    precedes "-table" and ordered by the integer that follows "table". For
    each group a file ``<file_id>.json`` is written to the output directory
    containing ``{"Tables": [...]}``, where every list item is the raw text of
    one table file.

    Entries that are not regular files, or whose name has no ``table<digits>``
    part, are skipped instead of aborting the run.

    Args:
        input_dir: Directory holding the per-table files. Defaults to the
            module-level ``path1``.
        output_dir: Directory that receives the combined files; created when
            missing. Defaults to the module-level ``path2``.
    """
    if input_dir is None:
        input_dir = path1
    if output_dir is None:
        output_dir = path2

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    table_number = re.compile(r'table(\d+)')
    data = {}

    # Group files by file_id, remembering each table number for sorting
    for file in os.listdir(input_dir):
        if ".DS_Store" in file:
            continue

        file_path = os.path.join(input_dir, file)
        if not os.path.isfile(file_path):
            # Sub-directories cannot be read as a table file.
            continue

        match = table_number.search(file)
        if match is None:
            # Without a table number the file cannot be ordered; report and
            # move on rather than failing on ``None.group``.
            print(f"Skipped (no table number in name): {file_path}")
            continue

        file_id = file.split("-table")[0]
        data.setdefault(file_id, []).append((int(match.group(1)), file_path))

    # Sort each group and write it to its own output file
    for file_id, table_files in data.items():
        # Sorting the (number, path) tuples orders by table number first and
        # breaks ties by path, so the result does not depend on the order in
        # which os.listdir happened to return the names.
        table_files.sort()

        # NOTE(review): file contents are stored verbatim as text, not parsed.
        # If the inputs are themselves JSON-encoded strings, the combined file
        # nests that encoding — confirm this is what downstream steps expect.
        combined_content = {"Tables": []}
        for _, file_path in table_files:
            with open(file_path, "r", encoding="utf-8") as f:
                combined_content["Tables"].append(f.read())

        # Write combined content to output file
        file_output_path = os.path.join(output_dir, file_id + ".json")
        with open(file_output_path, "w", encoding="utf-8") as f:
            json.dump(combined_content, f, indent=4, ensure_ascii=False)
        print(f"Processed and saved: {file_output_path}")

# Entry point: run the integration step with the module-level path settings.
if __name__ == "__main__":
    main()

Processed and saved: Data/Table-Combined/10.1039-b811775f.json
Processed and saved: Data/Table-Combined/10.1007-s11665-019-04152-6.json
Processed and saved: Data/Table-Combined/10.1016-j.msea.2018.12.103.json
Processed and saved: Data/Table-Combined/10.1016-j.jallcom.2012.07.110.json
Processed and saved: Data/Table-Combined/10.1007-s00170-019-04167-2.json
Processed and saved: Data/Table-Combined/10.1016-j.jallcom.2018.02.271.json
Processed and saved: Data/Table-Combined/10.1016-j.energy.2017.03.162.json
Processed and saved: Data/Table-Combined/10.1007-s11665-018-3478-4.json
