# JSON structured data from tables in scientific papers

## OpenAI model

In [1]:
import os
import json
from typing import List

from IPython.display import HTML
from langchain.llms import OpenAI
from pydantic import BaseModel, Field
from pydantic.schema import schema

# Provide the OpenAI credential through the environment.
# setdefault only fills in the placeholder when no key is exported yet, so a real
# key that is already set in the shell is not overwritten by re-running this cell.
os.environ.setdefault("OPENAI_API_KEY", "your_openai_api_key")

In [4]:
# Settings for the text-davinci-003 completion model used throughout the notebook.
# NOTE(review): temperature=0.9 presumably makes sampling non-deterministic, so the
# outputs shown below may not reproduce exactly on a re-run — confirm this is intended.
davinci_settings = {
    "model_name": "text-davinci-003",
    "temperature": 0.9,
    "max_tokens": 1200,
}
model_davinci = OpenAI(**davinci_settings)

In [5]:
# Opening instruction of every extraction prompt; the table's HTML is appended
# directly after it when the prompt is assembled.
prompt_template = (
    "\n"
    "Generate a JSON object extracting the information from this table in html code:\n"
)

In [6]:
def prompt(prompt_template, html_table, json_schema):
    """Assemble the full extraction prompt sent to the language model.

    Args:
        prompt_template: Leading instruction text placed before the table.
        html_table: HTML source of the table to extract.
        json_schema: Schema the answer must follow. A string is inserted
            unchanged; any other object (e.g. a dict) is serialized as JSON.

    Returns:
        str: template + table + output instruction + schema text.
    """
    prompt_output = "Generate the JSON result with the following JSON schema and give only the JSON as output: "

    if isinstance(json_schema, str):
        schema_text = json_schema
    else:
        # str(dict) yields a Python repr (single quotes), which is not JSON;
        # ensure_ascii=False keeps keys such as "Temperature (°C)" readable and
        # default=str keeps the call from failing on non-JSON-native values.
        schema_text = json.dumps(json_schema, ensure_ascii=False, default=str)

    # Keep the instruction from being glued onto the closing tag of the table.
    separator = "" if html_table.endswith("\n") else "\n"
    return prompt_template + html_table + separator + prompt_output + schema_text

### Example: Catalysts for CO<sub>2</sub>  Fischer–Tropsch conversion to liquid fuels

In [8]:
# Load the HTML source of the catalyst table. The with-block closes the file on
# exit, so no explicit close() is needed. The encoding is pinned because the
# table contains non-ASCII characters (e.g. en dashes) and the platform default
# is not the same everywhere.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open("html_tables/html_table_catalystCO2.txt", "r", encoding="utf-8") as file:
    html_table_catalystCO2 = file.read()

In [9]:
# Show the raw HTML string that was read from the file (bare last expression -> cell output).
html_table_catalystCO2

'<table class="table "><colgroup><col align="left"><col align="left"><col align="left"><col align="char" char="."><col align="char" char="."><col align="char" char="."><col align="char" char="."><col align="char" char="."><col align="char" char="."><col align="char" char="."><col align="left"></colgroup><thead><tr valign="top" class="colsep0"><th class="colsep0 rowsep0" scope="col" align="center">catalyst</th><th class="colsep0 rowsep0" scope="col" align="center">synthesis method</th><th class="colsep0 rowsep0" scope="col" align="center">promoter incorporation</th><th class="colsep0 rowsep0" scope="col" align="center" char="."><i>X</i>&nbsp;CO<sub>2</sub> (%)</th><th class="colsep0 rowsep0" scope="col" align="center" char="."><i>S</i>&nbsp;CO (%)</th><th class="colsep0 rowsep0" scope="col" align="center" char="."><i>S</i>&nbsp;C<sub>1</sub> (%)</th><th class="colsep0 rowsep0" scope="col" align="center" char="."><i>S</i>&nbsp;C<sub>2</sub>–C<sub>4</sub> (%)</th><th class="colsep0 rowsep

In [12]:
# Load the JSON schema describing the expected structure of the extracted table.
# The with-block closes the file on exit, so no explicit close() is needed; the
# encoding is pinned because the schema keys contain non-ASCII characters
# ("–", "°").
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open("json_schemas/json_schema_catalystCO2.json", encoding="utf-8") as file:
    json_schema_catalystCO2 = json.load(file)

In [13]:
# Show the loaded schema dict (bare last expression -> cell output).
json_schema_catalystCO2

{'type': 'object',
 'properties': {'Carbon-supported catalysts CO2': {'type': 'array',
   'items': {'type': 'object',
    'properties': {'Catalyst': {'type': 'string'},
     'Synthesis method': {'type': 'string'},
     'Promoter incorporation': {'type': 'string'},
     'X CO2 (%)': {'type': 'number'},
     'S CO (%)': {'type': 'number'},
     'S C1 (%)': {'type': 'number'},
     'S C2–C4 (%)': {'type': 'number'},
     'S C5+ (%)': {'type': 'number'},
     'Temperature (°C)': {'type': 'number'},
     'P (MPa)': {'type': 'number'}}}}}}

In [14]:
# Build the extraction prompt for the catalyst table using the schema loaded from disk.
prompt_catalystCO2 = prompt(
    prompt_template=prompt_template,
    html_table=html_table_catalystCO2,
    json_schema=json_schema_catalystCO2,
)

In [15]:
# Send the assembled prompt to the davinci model; the returned completion is
# parsed with json.loads further down, so it is expected to be JSON text.
davinci_catalystCO2 = model_davinci(prompt_catalystCO2)

In [21]:
# Parse the completion BEFORE opening the output file: opening in 'w' mode
# truncates immediately, so parsing inside the with-block would leave an empty
# file behind (or wipe an earlier result) whenever the model returns invalid JSON.
davinci_catalystCO2_parsed = json.loads(davinci_catalystCO2)
with open("structured_openai_results/davinci_catalystCO2.json", 'w') as file:
    json.dump(davinci_catalystCO2_parsed, file)

In [22]:
# Reload the saved result from disk. The with-block closes the file on exit,
# so the explicit close() was redundant and has been dropped.
with open("structured_openai_results/davinci_catalystCO2.json", encoding="utf-8") as file:
    davinci_catalystCO2_json = json.load(file)

In [23]:
# Pretty-print the reloaded result; ensure_ascii=False keeps non-ASCII characters
# (e.g. "–", "°") readable instead of \u escapes.
davinci_catalystCO2_pretty = json.dumps(
    davinci_catalystCO2_json,
    indent=4,
    ensure_ascii=False,
)
print(davinci_catalystCO2_pretty)

[
    {
        "Catalyst": "Fe–Cu–K",
        "Synthesis method": "triple incipient wetness impregnation",
        "Promoter incorporation": "impregnation",
        "X CO2 (%)": 15,
        "S CO (%)": 50,
        "S C1 (%)": 11.0,
        "S C2–C4 (%)": 21.0,
        "S C5+ (%)": 18.0,
        "Temperature (°C)": 350,
        "P (MPa)": 8.6
    },
    {
        "Catalyst": "Fe–PYL",
        "Synthesis method": "hydrothermal",
        "Promoter incorporation": "hydrothermal",
        "X CO2 (%)": 21.6,
        "S CO (%)": 29.2,
        "S C1 (%)": 36.8,
        "S C2–C4 (%)": 46.0,
        "S C5+ (%)": 17.2,
        "Temperature (°C)": 300,
        "P (MPa)": 1
    },
    {
        "Catalyst": "Fe–K",
        "Synthesis method": "templated synthesis",
        "Promoter incorporation": "ultrasonic-assisted melt infiltration",
        "X CO2 (%)": 50.6,
        "S CO (%)": 8.2,
        "S C1 (%)": 15.42,
        "S C2–C4 (%)": 31.95,
        "S C5+ (%)": 44.52,
        "Temperature (°C)

#### Change the top-level object name in the schema

In [25]:
# Same row layout as the schema loaded from disk, but the array of rows is
# exposed under the shorter top-level key "Catalysts".
# Column names are listed in table order; the first group holds text values,
# the second group numeric values.
_text_columns = ("Catalyst", "Synthesis method", "Promoter incorporation")
_numeric_columns = (
    "X CO2 (%)",
    "S CO (%)",
    "S C1 (%)",
    "S C2–C4 (%)",
    "S C5+ (%)",
    "Temperature (°C)",
    "P (MPa)",
)

_row_properties = {column: {"type": "string"} for column in _text_columns}
_row_properties.update({column: {"type": "number"} for column in _numeric_columns})

json_schema_catalystCO2_new_schema = {
    "type": "object",
    "properties": {
        "Catalysts": {
            "type": "array",
            "items": {"type": "object", "properties": _row_properties},
        },
    },
}

In [26]:
# Rebuild the prompt with the renamed-schema variant.
# NOTE: this rebinds prompt_catalystCO2, replacing the prompt built earlier
# with the schema loaded from disk.
prompt_catalystCO2 = prompt(
    prompt_template=prompt_template,
    html_table=html_table_catalystCO2,
    json_schema=json_schema_catalystCO2_new_schema,
)

In [27]:
# Query the davinci model with the prompt that embeds the renamed schema; the
# returned completion is parsed with json.loads further down.
davinci_catalystCO2_new_schema = model_davinci(prompt_catalystCO2)

In [30]:
# Parse the completion BEFORE opening the output file: opening in 'w' mode
# truncates immediately, so parsing inside the with-block would leave an empty
# file behind (or wipe an earlier result) whenever the model returns invalid JSON.
davinci_catalystCO2_new_schema_parsed = json.loads(davinci_catalystCO2_new_schema)
with open("structured_openai_results/davinci_catalystCO2_new_schema.json", 'w') as file:
    json.dump(davinci_catalystCO2_new_schema_parsed, file)

In [31]:
# Reload the saved result from disk. The with-block closes the file on exit,
# so the explicit close() was redundant and has been dropped.
with open("structured_openai_results/davinci_catalystCO2_new_schema.json", encoding="utf-8") as file:
    davinci_catalystCO2_new_schema_json = json.load(file)

In [32]:
# Pretty-print the reloaded result; ensure_ascii=False keeps non-ASCII characters
# (e.g. "–", "°") readable instead of \u escapes.
davinci_catalystCO2_new_schema_pretty = json.dumps(
    davinci_catalystCO2_new_schema_json,
    indent=4,
    ensure_ascii=False,
)
print(davinci_catalystCO2_new_schema_pretty)

{
    "Catalysts": [
        {
            "Catalyst": "Fe–Cu–K",
            "Synthesis method": "triple incipient wetness impregnation",
            "Promoter incorporation": "impregnation",
            "X CO2 (%)": 15,
            "S CO (%)": 50,
            "S C1 (%)": 11.0,
            "S C2–C4 (%)": 21.0,
            "S C5+ (%)": 18.0,
            "Temperature (°C)": 350,
            "P (MPa)": 8.6
        },
        {
            "Catalyst": "Fe–PYL",
            "Synthesis method": "hydrothermal",
            "Promoter incorporation": "hydrothermal",
            "X CO2 (%)": 21.6,
            "S CO (%)": 29.2,
            "S C1 (%)": 36.8,
            "S C2–C4 (%)": 46.0,
            "S C5+ (%)": 17.2,
            "Temperature (°C)": 300,
            "P (MPa)": 1
        },
        {
            "Catalyst": "Fe–K",
            "Synthesis method": "templated synthesis",
            "Promoter incorporation": "ultrasonic-assisted melt infiltration",
            "X CO2 (%)": 50.6,
