# JSON structured data from tables in scientific papers

## OpenAI model

In [1]:
import os
import json
from typing import List

from IPython.display import HTML
from langchain.llms import OpenAI
from pydantic import BaseModel, Field
from pydantic.schema import schema

# Keep a key that is already exported in the environment; the placeholder below
# is only a fallback and has to be replaced with a real key (kept out of
# version control) before the model can be queried.
os.environ.setdefault("OPENAI_API_KEY", "your_openai_api_key")

In [4]:
# OpenAI completion client that the cells below call with the assembled prompt.
# NOTE(review): temperature=0.9 presumably makes completions vary between
# runs -- confirm this is intended for a structured-extraction task.
model_davinci = OpenAI(model_name="text-davinci-003", temperature=0.9, max_tokens=1200)

In [5]:
# Opening instruction of the prompt. It starts and ends with a newline; the
# HTML of the table is concatenated directly after it.
prompt_template = (
    "\n"
    "Generate a JSON object extracting the information from this table in html code:"
    "\n"
)

In [6]:
def prompt(prompt_template, html_table, json_schema):
    """Assemble the complete extraction prompt for the language model.

    The pieces are concatenated, with no separator added between them, in this
    order: ``prompt_template``, ``html_table``, a fixed instruction sentence,
    and the serialized ``json_schema``.

    Args:
        prompt_template: Text placed at the very start of the prompt.
        html_table: HTML source of the table to extract information from.
        json_schema: Schema the answer should follow. A ``str`` is inserted
            unchanged; any other object is serialized with ``json.dumps`` so
            the model is shown real JSON (double quotes, ``true``/``null``)
            instead of the Python ``repr`` that ``str(dict)`` would produce.

    Returns:
        The assembled prompt as a single string.
    """
    import json  # local import keeps the function usable on its own

    prompt_output = "Generate the JSON result with the following JSON schema and give only the JSON as output: "

    if isinstance(json_schema, str):
        # Already text (for example a pre-serialized schema): use it verbatim.
        schema_text = json_schema
    else:
        try:
            # ensure_ascii=False keeps characters such as "°" or "–" readable
            # instead of turning them into \uXXXX escapes.
            schema_text = json.dumps(json_schema, ensure_ascii=False)
        except (TypeError, ValueError):
            # Not JSON-serializable: fall back to the plain string form.
            schema_text = str(json_schema)

    return prompt_template + html_table + prompt_output + schema_text

### Example with the wrong schema: table_MOFproperties / schema_catalystCO2

In [7]:
# Read the HTML source of the MOF-properties table into a string.
# The `with` statement closes the file on exit, so no explicit close() call is
# needed. The encoding is pinned so the result does not depend on the platform
# default (assumes the file is stored as UTF-8 -- TODO confirm).
with open("html_tables/html_table_MOFproperties.txt", "r", encoding="utf-8") as file:
    html_table_MOFproperties = file.read()

In [8]:
# Echo the raw HTML string that was read from the file.
html_table_MOFproperties

'<table class="table" border="0"><colgroup><col align="left"><col align="char" char="."><col align="char" char="."><col align="char" char="."><col align="char" char="."><col align="char" char="."><col align="char" char="."><col align="char" char="."></colgroup><thead><tr valign="top" class="colsep0"><th class="colsep0 rowsep0" scope="col" align="center">&nbsp;</th><th class="rowsep1 colsep0" colspan="2" scope="col" align="center" char=".">BET (m<sup>2</sup>/g)</th><th class="rowsep1 colsep0" colspan="3" scope="col" align="center" char="."><i>V</i><sub>pore</sub> (cc/g)</th><th class="colsep0 rowsep0" scope="col" align="center" char=".">&nbsp;</th><th class="colsep0 rowsep0" scope="col" align="center" char=".">&nbsp;</th></tr><tr valign="top" class="colsep0"><th class="colsep0 rowsep0" scope="col" align="center">MOFs</th><th class="colsep0 rowsep0" scope="col" align="center" char=".">N<sub>2</sub></th><th class="colsep0 rowsep0" scope="col" align="center" char=".">calcd</th><th class="c

In [19]:
# Render the same HTML string as a rich notebook output.
HTML(html_table_MOFproperties)

Unnamed: 0_level_0,BET (m2/g),BET (m2/g),Vpore (cc/g),Vpore (cc/g),Vpore (cc/g),Unnamed: 6_level_0,Unnamed: 7_level_0
MOFs,N2,calcd,N2,CH4,calcd,ρ (g/cm3),metal (mmol/cc)
HKUST-1,1850,2064,0.78,0.78,0.78,0.883,4.38
Ni-MOF-74,1350,1240,0.51,0.52,0.49,1.206,7.74
PCN-14,2000,2170,0.85,0.78,0.76,0.829,2.59
UTSA-20,1620,1960,0.66,0.66,0.69,0.909,3.61
NU-125,3120,3680,1.29,1.23,1.32,0.578,1.82
NU-111,4930,4650,2.09,2.12,2.03,0.409,1.36


In [21]:
# Read the HTML source of the CO2-catalyst table into a string.
# The `with` statement closes the file on exit, so no explicit close() call is
# needed. The encoding is pinned so the result does not depend on the platform
# default (assumes the file is stored as UTF-8 -- TODO confirm).
with open("html_tables/html_table_catalystCO2.txt", "r", encoding="utf-8") as file:
    html_table_catalystCO2 = file.read()

In [22]:
# Render the CO2-catalyst table as a rich notebook output.
HTML(html_table_catalystCO2)

catalyst,synthesis method,promoter incorporation,X CO2 (%),S CO (%),S C1 (%),S C2–C4 (%),S C5+ (%),temperature (°C),P (MPa),reference
Fe–Cu–K,triple incipient wetness impregnation,impregnation,15.0,50.0,11.0,21.0,18.0,350,8.6,(64)
Fe–PYL,hydrothermal,hydrothermal,21.6,29.2,36.8,46.0,17.2,300,1.0,(65)
Fe–K,templated synthesis,ultrasonic-assisted melt infiltration,50.6,8.2,15.42,31.95,44.52,300,2.5,(66)
MnFe–N2,dry impregnation,impregnation,25.2,35.6,39.8,21.92,1.0,360,2.5,(67)
Fe2O3,precipitation,cellulose templated,40.0,15.0,11.9,36.98,36.04,300,1.5,(68)


In [23]:
# Load the JSON schema written for the CO2-catalyst table.
# The `with` statement closes the file on exit, so no explicit close() call is
# needed. UTF-8 is requested explicitly because the schema keys contain
# non-ASCII characters ("°", "–") and the platform default encoding may differ.
with open("json_schemas/json_schema_catalystCO2.json", encoding="utf-8") as file:
    json_schema_catalystCO2 = json.load(file)

In [24]:
# Inspect the schema object parsed from the JSON file.
json_schema_catalystCO2

{'type': 'object',
 'properties': {'Carbon-supported catalysts CO2': {'type': 'array',
   'items': {'type': 'object',
    'properties': {'Catalyst': {'type': 'string'},
     'Synthesis method': {'type': 'string'},
     'Promoter incorporation': {'type': 'string'},
     'X CO2 (%)': {'type': 'number'},
     'S CO (%)': {'type': 'number'},
     'S C1 (%)': {'type': 'number'},
     'S C2–C4 (%)': {'type': 'number'},
     'S C5+ (%)': {'type': 'number'},
     'Temperature (°C)': {'type': 'number'},
     'P (MPa)': {'type': 'number'}}}}}}

In [25]:
# Mismatched pairing for the "wrong schema" example: the MOF-properties table
# is combined with the schema that belongs to the CO2-catalyst table.
prompt_MOFproperties_wrong_schema = prompt(
    prompt_template=prompt_template,
    html_table=html_table_MOFproperties,
    json_schema=json_schema_catalystCO2,
)

In [26]:
# Send the assembled prompt to the model and keep its answer; the answer is
# parsed as JSON text in a later cell.
# NOTE(review): the model is configured with temperature=0.9, so re-running
# this cell presumably returns a different completion -- confirm.
davinci_MOFproperties_wrong_schema = model_davinci(prompt_MOFproperties_wrong_schema)

In [30]:
# Parse the completion BEFORE opening the output file: mode "w" truncates the
# target as soon as it is opened, so parsing inside the `with` block would
# leave an empty file behind whenever the model's answer is not valid JSON.
davinci_MOFproperties_wrong_schema_parsed = json.loads(davinci_MOFproperties_wrong_schema)

# Create the results directory when it is missing so the cell also works on a
# fresh checkout.
os.makedirs("structured_openai_results", exist_ok=True)
with open("structured_openai_results/davinci_MOFproperties_wrong_schema.json", "w", encoding="utf-8") as file:
    json.dump(davinci_MOFproperties_wrong_schema_parsed, file)

In [31]:
# Reload the saved result from disk so the cells below work from the stored file.
# The `with` statement closes the file on exit, so no explicit close() call is
# needed; the encoding is pinned instead of relying on the platform default.
with open("structured_openai_results/davinci_MOFproperties_wrong_schema.json", encoding="utf-8") as file:
    davinci_MOFproperties_wrong_schema_json = json.load(file)

In [32]:
# Pretty-print the stored result with 4-space indentation; ensure_ascii=False
# keeps non-ASCII characters such as "ρ" as they are instead of \u escapes.
print(
    json.dumps(
        davinci_MOFproperties_wrong_schema_json,
        ensure_ascii=False,
        indent=4,
    )
)

[
    {
        "MOFs": "HKUST-1",
        "N2": 1850,
        "calcd": 2064,
        "VporeN2": 0.78,
        "VporeCH4": 0.78,
        "calcdVpore": 0.78,
        "ρg/cm3": 0.883,
        "metalmmol/cc": 4.38
    },
    {
        "MOFs": "Ni-MOF-74",
        "N2": 1350,
        "calcd": 1240,
        "VporeN2": 0.51,
        "VporeCH4": 0.52,
        "calcdVpore": 0.49,
        "ρg/cm3": 1.206,
        "metalmmol/cc": 7.74
    },
    {
        "MOFs": "PCN-14",
        "N2": 2000,
        "calcd": 2170,
        "VporeN2": 0.85,
        "VporeCH4": 0.78,
        "calcdVpore": 0.76,
        "ρg/cm3": 0.829,
        "metalmmol/cc": 2.59
    },
    {
        "MOFs": "UTSA-20",
        "N2": 1620,
        "calcd": 1960,
        "VporeN2": 0.66,
        "VporeCH4": 0.66,
        "calcdVpore": 0.69,
        "ρg/cm3": 0.909,
        "metalmmol/cc": 3.61
    },
    {
        "MOFs": "NU-125",
        "N2": 3120,
        "calcd": 3680,
        "VporeN2": 1.29,
        "VporeCH4": 1.23,
       