# Step Three: 1. Table-Processing

## From customized XML/HTML to customized key-value pairs
#### 1. Run the Table-Processing section powered by GPT (fill in your own OpenAI key, base_url, prompts and examples directly in the code; provide the few-shot examples for GPT in a CSV file)
#### 2. Check GPT output Tables.
#### 3. Sort, Clean and Integrate GPT-processed Tables (Remove File Name Folder, Table-Clean and integrate table1, table2, table3 ... into a JSON);
#### Note: Examples for E are saved in FewShot-XML.csv, and examples for RSC and SN are saved in FewShot-HTML.csv
#### Each customized key-value pair is delimited by "###", for example:

In [6]:
"""###<Title of the Table><Corresponding Row Header><Corresponding Column Header>=<Entry Value>###"""

'###<Title of the Table><Corresponding Row Header><Corresponding Column Header>=<Entry Value>###'

## 1. Table-Processing

In [1]:
import csv
import os
import json
from platform import system
import time
import chardet
from tqdm import tqdm
from openai import OpenAI
import openai
from utils import *


# -- global var start --
# Credentials and endpoint handed to the OpenAI client created below.
api_key = "..." ### Fill in your own OpenAI key
base_url = "..." ### Fill in base_url
input_csv = 'Prompt+Example/Example/FewShot-HTML.csv'  # Few-shot examples for GPT; create_user_content() reads the 'Input' and 'Output' columns of this CSV
input_data_path = 'Data/SN-Table'  # Directory containing input table data
output_path = 'Data/SN-GPT-Table'  # Directory to save the GPT-processed tables
n_folder_path = 'Data/Failed-Table'  # Files whose content contains neither 'Table' nor 'table' are moved here instead of being sent to GPT
# System prompt placed first in the message list of every request.
# The text below is only an example; replace it with the instruction for your own task.
system_content = """ This task is to take the simplified table webpage as input and convert each cell in the table body to customised format and 
output it as the json document. Please directly give the answer without any analysis.""" ### System instruction of your task, here is an example.
system_instructions = {
    "role": "system",
    "content": system_content
}

# API client shared by every chat-completion request issued in this notebook.
client = OpenAI(
    api_key=api_key,
    base_url=base_url
)
# -- global var end --


# -- functions start --
# 1. create user content
def create_user_content(csv_path=None):
    """Build the few-shot user prompt: task description plus the CSV examples.

    Args:
        csv_path: Path of the examples CSV. It must provide an ``Input`` and
            an ``Output`` column. When omitted, the module-level
            ``input_csv`` is used.

    Returns:
        The task description followed by one ``### Example N:`` section per
        CSV row, numbered from 1.

    Raises:
        KeyError: If a row has no ``Input`` or ``Output`` column.
        OSError: If the CSV file cannot be opened.
    """
    if csv_path is None:
        csv_path = input_csv
    example_content = '''The customised format is <Table Title><Row Header><Column Header>=<Entry Value>, 
    and if the table has multilayered header, extract corresponding header from outside to inside. 
    2 examples are given, just help you to understand the customised format. Webpage that are slightly different in symbols need you to process.
    Note:  1.colspan and rowspan mean the cell occupies multiple columns or rows, 
    2.morerows=x or morecols=x mean occupies (x+1) columns or rows,  
    3.<entry>, <entry:&nbsp>, or no element exists after “:” mean a blank space should be leaved in the cell. ''' ### Your detailed task, here is an example.
    sections = [example_content]
    with open(csv_path, mode='r') as file:
        # Collect the I/O pairs from the CSV, one prompt section per row.
        for example_num, row in enumerate(csv.DictReader(file), start=1):
            # Plain strings on purpose: a trailing comma here would create a
            # 1-tuple and leak "('...',)" with escaped newlines into the prompt.
            example_input = row["Input"]
            example_output = row["Output"]
            sections.append(
                f"""### Example {example_num}:\nInput:\n{example_input}\nOutput:\n{example_output}\n"""
            )
    return "".join(sections)


# 2. detect_file_encoding
def detect_file_encoding(file_path):
    """Return the encoding name chardet reports for the bytes of *file_path*.

    The whole file is read in binary mode and passed to ``chardet.detect``;
    the ``'encoding'`` entry of its result is returned unchanged.
    """
    with open(file_path, 'rb') as handle:
        guess = chardet.detect(handle.read())
    return guess['encoding']
# 3. get root dirs
def get_dirs(root_dir):
    """List the entries of *root_dir*, leaving out the macOS ``.DS_Store`` file.

    Args:
        root_dir: Directory whose entries are listed.

    Returns:
        The names returned by ``os.listdir(root_dir)`` in the same order,
        minus ``.DS_Store``.

    Raises:
        OSError: If *root_dir* cannot be listed.
    """
    # Filter instead of list.remove(): remove() raises ValueError when the
    # folder has no .DS_Store (any non-macOS machine or a clean folder).
    return [name for name in os.listdir(root_dir) if name != '.DS_Store']

# 4. detect_file_exist
def detect_file_exist(file_path):
    """Tell whether *file_path* exists on disk (True) or not (False)."""
    return os.path.exists(file_path)

# -- functions end --
def _request_completion(messages, model="gpt-4-1106-preview", temperature=0.3, max_retries=3):
    """Send *messages* to the chat-completions endpoint and return the reply text.

    The call is retried on ``openai.InternalServerError`` with an exponential
    back-off delay (2, 4, ... seconds).

    Args:
        messages: Chat messages (dicts with ``role`` and ``content``).
        model: Model name passed to the API.
        temperature: Sampling temperature passed to the API.
        max_retries: Number of attempts before giving up.

    Returns:
        The stripped text of the first choice ("" if the reply has no content).

    Raises:
        Exception: "Maximized Retry times. Overloaded" once every attempt
            failed with ``openai.InternalServerError``.
    """
    for attempt in range(1, max_retries + 1):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
            )
        except openai.InternalServerError as e:
            print(f"Omit：{e}。Retrying... ({attempt}/{max_retries})")
            if attempt == max_retries:
                raise Exception("Maximized Retry times. Overloaded") from e
            time.sleep(2 ** attempt)  # exponential back-off delay
        else:
            return (response.choices[0].message.content or "").strip()


def few_shot_from_csv(input_csv, input_data_path, output_path, api_key, n_folder_path, max_omit_rounds=5):
    """Convert every table file in *input_data_path* with GPT and save the answers.

    For each file: skip it if an output of the same name already exists; move
    it to *n_folder_path* if its content contains neither 'Table' nor 'table';
    otherwise send system prompt + few-shot prompt + file content to the model.
    While the answer contains "..." (taken as a sign that the model omitted
    cells) the model is asked again, at most *max_omit_rounds* times.

    Args:
        input_csv: Unused here; the few-shot prompt comes from
            ``create_user_content()``, which reads the module-level setting.
        input_data_path: Folder holding the table files to convert.
        output_path: Folder where answers are written under the input file name.
        api_key: Unused here; the module-level ``client`` carries the key.
        n_folder_path: Folder receiving files that contain no table.
        max_omit_rounds: Upper bound on "do not omit" re-asks per file.

    Raises:
        Exception: Propagated from ``_request_completion`` when the API keeps
            failing with an internal server error.
    """
    file_list = os.listdir(input_data_path)  # Load input data
    user_instructions = None  # built on first use, then shared by all files

    for index, file in enumerate(file_list, start=1):
        print(f"current file: {file},index:{index}")
        saved_path = os.path.join(output_path, file)
        if detect_file_exist(saved_path):
            print(f" {saved_path} Exist")
            continue

        # Read file content with the detected encoding
        input_file_path = os.path.join(input_data_path, file)
        encodeing = detect_file_encoding(input_file_path)
        with open(input_file_path, 'r', encoding=encodeing) as input_file:
            text = input_file.read()

        # Files without any table are set aside instead of being sent to GPT
        if 'Table' not in text and 'table' not in text:
            os.makedirs(n_folder_path, exist_ok=True)
            os.rename(input_file_path, os.path.join(n_folder_path, file))
            print(f"File {file} moved to N folder because it doesn't contain 'Table'.")
            continue  # Skip to the next file

        print(f"Processing file: {file}")
        if user_instructions is None:
            user_instructions = {"role": "user", "content": create_user_content()}

        # System instruction first, then the few-shot prompt, then the table itself
        base_messages = [
            system_instructions,
            user_instructions,
            {"role": "user", "content": text},
        ]
        print(f"submited {file}")
        prediction = _request_completion(base_messages)
        print(f"predicted {prediction}")

        send_num = 1
        while "..." in prediction:
            if send_num > max_omit_rounds:
                # Stop instead of looping forever, e.g. when the table itself contains "..."
                print(f"Output for {file} still contains '...' after {max_omit_rounds} re-asks; "
                      "saving the last answer, please check it manually.")
                break
            send_num += 1
            print(f"Omit,ask again : {send_num}")
            # Rebuild from base_messages every round: the context stays bounded
            # and the input table is never trimmed out of the conversation.
            # NOTE(review): only the latest reply is kept, earlier partial
            # replies are not concatenated — confirm this is the intended result.
            messages = base_messages + [
                {"role": "assistant", "content": prediction},
                {"role": "user",
                 "content": "Do not omit and Continue your output, each cell should be converted !!!"},
            ]
            prediction = _request_completion(messages)

        os.makedirs(output_path, exist_ok=True)
        # Explicit UTF-8 so symbols in the answer cannot fail on a legacy default codec
        with open(saved_path, "w", encoding="utf-8") as f:
            f.write(prediction)
        print("：Completed，Save it as:", saved_path)


def main():
    """
    Prepare the output folders, then run the few-shot conversion once per
    sub-folder found in the input data directory.
    """
    # Both the result folder and the failed-table folder must exist up front
    for folder in (output_path, n_folder_path):
        if not os.path.exists(folder):
            os.makedirs(folder)

    sub_folders = get_dirs(input_data_path)
    print("input_json_dirs:",sub_folders)
    for sub_folder in sub_folders:
        print("input_dir:",sub_folder)
        # Mirror the input sub-folder name on the output side
        few_shot_from_csv(
            input_csv,
            os.path.join(input_data_path, sub_folder),
            os.path.join(output_path, sub_folder),
            api_key,
            n_folder_path,
        )


# Entry point when the file is executed as a script
if __name__ == "__main__":
    main()

input_json_dirs: ['10.1007-s00170-019-04167-2']
input_dir: 10.1007-s00170-019-04167-2
current file: .DS_Store,index:1
File .DS_Store moved to N folder because it doesn't contain 'Table'.
current file: table1.json,index:1
Processing file: table1.json
submited table1.json
predicted {"Table 1: "The chemical composition of the considered AA6061 aluminum alloy."\n###<Material><AA6061><Composition (weight %)><Al>=<Base>###, \n###<Material><AA6061><Composition (weight %)><Si>=<0.73>###, \n###<Material><AA6061><Composition (weight %)><Fe>=<0.33>###, \n###<Material><AA6061><Composition (weight %)><Cu>=<0.26>###, \n###<Material><AA6061><Composition (weight %)><Mn>=<0.04>###, \n###<Material><AA6061><Composition (weight %)><Mg>=<0.88>###, \n###<Material><AA6061><Composition (weight %)><Cr>=<0.17>###, \n###<Material><AA6061><Composition (weight %)><Zn>=<0.05>###, \n###<Material><AA6061><Composition (weight %)><Ti>=<0.02>###, \n###<Material><AA6061><Composition (weight %)><Others>=<0.02>###}
Data/SN

## Remove File Name Folder

In [3]:
import os

# Define the base directory (replace with your actual base path)
base_dir = r'Data/RSC-GPT-Table'  # Put Your Own Table directory 


def flatten_doi_folders(base_dir):
    """Move the JSON files of every sub-folder up into *base_dir*.

    Each moved file is renamed to ``<folder name>-<original file name>``,
    where the folder name is the last path component of the folder that held
    it. Files already located directly in *base_dir* are left untouched, so
    the function can be run again without prefixing them a second time.

    Args:
        base_dir: Directory that contains one sub-folder per article.
    """
    base_abs = os.path.abspath(base_dir)
    # Walk through all directories and files in the base directory
    for root, dirs, files in os.walk(base_dir):
        if os.path.abspath(root) == base_abs:
            # Already flattened files live here; renaming them again would
            # prepend the base folder's own name.
            continue
        # Get the DOI folder name
        doi_folder_name = os.path.basename(root)
        for file in files:
            # Only JSON files are moved
            if not file.endswith(".json"):
                continue
            file_path = os.path.join(root, file)

            # Define new file name: DOI_foldername + original file name
            new_file_name = f"{doi_folder_name}-{file}"
            new_file_path = os.path.join(base_dir, new_file_name)

            print(f"Renaming {file_path} to {new_file_path}")
            if os.path.exists(new_file_path):
                # os.rename would silently replace the target on POSIX systems
                print(f"Failed to rename {file_path}: {new_file_path} already exists")
                continue
            try:
                # Rename the JSON file
                os.rename(file_path, new_file_path)
            except OSError as e:
                print(f"Failed to rename {file_path}: {e}")
            else:
                print(f"Successfully renamed: {file_path} to {new_file_path}")


flatten_doi_folders(base_dir)

Renaming Data/RSC-GPT-Table/10.1039-b811775f/table1.json to Data/RSC-GPT-Table/10.1039-b811775f-table1.json
Successfully renamed: Data/RSC-GPT-Table/10.1039-b811775f/table1.json to Data/RSC-GPT-Table/10.1039-b811775f-table1.json
Renaming Data/RSC-GPT-Table/10.1039-b811775f/table2.json to Data/RSC-GPT-Table/10.1039-b811775f-table2.json
Successfully renamed: Data/RSC-GPT-Table/10.1039-b811775f/table2.json to Data/RSC-GPT-Table/10.1039-b811775f-table2.json
Renaming Data/RSC-GPT-Table/10.1039-b811775f/table3.json to Data/RSC-GPT-Table/10.1039-b811775f-table3.json
Successfully renamed: Data/RSC-GPT-Table/10.1039-b811775f/table3.json to Data/RSC-GPT-Table/10.1039-b811775f-table3.json
