# JSON structured data from tables in scientific papers

## OpenAI model

In [1]:
import os
import json
from typing import List

from IPython.display import HTML
from langchain.llms import OpenAI
from pydantic import BaseModel, Field
from pydantic.schema import schema

# Placeholder credential: replace the value locally and never commit a real key.
# setdefault() only fills the variable when it is missing, so a key that is
# already exported in the environment is not overwritten by the placeholder.
os.environ.setdefault("OPENAI_API_KEY", "your_openai_api_key")

In [4]:
# Completion model used for the table-to-JSON extraction below.
# temperature=0 keeps sampling as deterministic as the API allows: the task is
# to copy values out of a table, so randomness only adds the risk of altered
# numbers and makes runs non-repeatable.
# max_tokens bounds the length of the generated completion.
model_davinci = OpenAI(
    model_name="text-davinci-003",
    temperature=0,
    max_tokens=1200,
)

In [5]:
# Leading instruction of every prompt; prompt() appends the table's HTML
# directly after it, hence the trailing newline.
prompt_template = (
    "\n"
    "Generate a JSON object extracting the information from this table in html code:"
    "\n"
)

In [6]:
def prompt(prompt_template, html_table, json_schema):
    """Build the full extraction prompt for one HTML table.

    Args:
        prompt_template: Instruction text placed before the table.
        html_table: HTML source of the table to extract data from.
        json_schema: Schema the generated JSON must follow. A ``str`` is
            embedded verbatim; any other value is serialized with
            ``json.dumps``.

    Returns:
        The prompt string: template, table, output instruction, schema.
    """
    prompt_output = "Generate the JSON result with the following JSON schema and give only the JSON as output: "
    if isinstance(json_schema, str):
        schema_text = json_schema
    else:
        # str() on a dict yields a Python repr (single quotes, True/None),
        # which is not valid JSON; json.dumps emits a real JSON document.
        # ensure_ascii=False keeps non-ASCII keys such as "ρ" readable.
        schema_text = json.dumps(json_schema, ensure_ascii=False)
    # The newline keeps the instruction from being glued onto the table markup.
    return prompt_template + html_table + "\n" + prompt_output + schema_text

### Example: MOFs properties

In [7]:
# Read the table's HTML source as text. The encoding is stated explicitly so the
# result does not depend on the platform default, and the `with` block already
# closes the file, so no explicit close() is needed.
with open("html_tables/html_table_MOFproperties.txt", "r", encoding="utf-8") as file:
    html_table_MOFproperties = file.read()

In [8]:
html_table_MOFproperties

'<table class="table" border="0"><colgroup><col align="left"><col align="char" char="."><col align="char" char="."><col align="char" char="."><col align="char" char="."><col align="char" char="."><col align="char" char="."><col align="char" char="."></colgroup><thead><tr valign="top" class="colsep0"><th class="colsep0 rowsep0" scope="col" align="center">&nbsp;</th><th class="rowsep1 colsep0" colspan="2" scope="col" align="center" char=".">BET (m<sup>2</sup>/g)</th><th class="rowsep1 colsep0" colspan="3" scope="col" align="center" char="."><i>V</i><sub>pore</sub> (cc/g)</th><th class="colsep0 rowsep0" scope="col" align="center" char=".">&nbsp;</th><th class="colsep0 rowsep0" scope="col" align="center" char=".">&nbsp;</th></tr><tr valign="top" class="colsep0"><th class="colsep0 rowsep0" scope="col" align="center">MOFs</th><th class="colsep0 rowsep0" scope="col" align="center" char=".">N<sub>2</sub></th><th class="colsep0 rowsep0" scope="col" align="center" char=".">calcd</th><th class="c

In [11]:
# Load the target JSON schema. It contains a non-ASCII key ("ρ"), so the file is
# decoded explicitly as UTF-8 rather than with the platform default encoding.
# The `with` block closes the file on exit; no explicit close() is needed.
with open("json_schemas/json_schema_MOFproperties.json", encoding="utf-8") as file:
    json_schema_MOFproperties = json.load(file)

In [12]:
json_schema_MOFproperties

{'type': 'object',
 'properties': {'MOFs': {'type': 'array',
   'items': {'type': 'object',
    'properties': {'MOF': {'type': 'string'},
     'N2_BET': {'type': 'number'},
     'calcd_BET': {'type': 'number'},
     'N2_Vpore': {'type': 'number'},
     'CH4_Vpore': {'type': 'number'},
     'calcd_Vpore': {'type': 'number'},
     'ρ': {'type': 'number'},
     'metal': {'type': 'number'}}}}}}

In [20]:
# Assemble the MOF-properties prompt from the instruction text, the table HTML
# and the target schema.
prompt_MOFproperties = prompt(
    prompt_template=prompt_template,
    html_table=html_table_MOFproperties,
    json_schema=json_schema_MOFproperties,
)

In [21]:
# Run the assembled prompt through the davinci model. The result is later passed
# to MOFsList.parse_raw and json.loads, so it is expected to be a JSON string.
davinci_MOFproperties = model_davinci(prompt_MOFproperties)

#### Pydantic JSON validation

In [31]:
# One extracted table row: a single MOF and its numeric properties.
# Field names mirror the item keys of json_schema_MOFproperties, so the model's
# JSON output can be validated against this class directly.
class MOFs(BaseModel):
    MOF: str  # MOF name, e.g. "HKUST-1"
    N2_BET: float  # "N2" sub-column under the "BET (m2/g)" header
    calcd_BET: float  # "calcd" sub-column under the "BET (m2/g)" header
    # The three *_Vpore fields presumably map to the three sub-columns under
    # "Vpore (cc/g)"; those sub-headers are cut off in the displayed HTML.
    N2_Vpore: float
    CH4_Vpore: float
    calcd_Vpore: float
    # NOTE(review): the headers for the last two columns are not visible in the
    # displayed HTML excerpt; confirm their meaning and units against the table.
    ρ: float
    metal: float
        
# Top-level wrapper for the whole extracted table.
class MOFsList(BaseModel):
    # Filled from the "MOFs" key of the input JSON (via the alias) and exposed
    # on the instance as `.mofs`.
    mofs: List[MOFs] = Field(alias = "MOFs")

In [32]:
# Parse the raw model completion into the pydantic models defined above.
# The value is not stored; the notebook only displays the parsed MOFsList.
MOFsList.parse_raw(davinci_MOFproperties)

MOFsList(mofs=[MOFs(MOF='HKUST-1', N2_BET=1850.0, calcd_BET=2064.0, N2_Vpore=0.78, CH4_Vpore=0.78, calcd_Vpore=0.78, ρ=0.883, metal=4.38), MOFs(MOF='Ni-MOF-74', N2_BET=1350.0, calcd_BET=1240.0, N2_Vpore=0.51, CH4_Vpore=0.52, calcd_Vpore=0.49, ρ=1.206, metal=7.74), MOFs(MOF='PCN-14', N2_BET=2000.0, calcd_BET=2170.0, N2_Vpore=0.85, CH4_Vpore=0.78, calcd_Vpore=0.76, ρ=0.829, metal=2.59), MOFs(MOF='UTSA-20', N2_BET=1620.0, calcd_BET=1960.0, N2_Vpore=0.66, CH4_Vpore=0.66, calcd_Vpore=0.69, ρ=0.909, metal=3.61), MOFs(MOF='NU-125', N2_BET=3120.0, calcd_BET=3680.0, N2_Vpore=1.29, CH4_Vpore=1.23, calcd_Vpore=1.32, ρ=0.578, metal=1.82), MOFs(MOF='NU-111', N2_BET=4930.0, calcd_BET=4650.0, N2_Vpore=2.09, CH4_Vpore=2.12, calcd_Vpore=2.03, ρ=0.409, metal=1.36)])

In [34]:
# Parse the completion *before* opening the output file: mode "w" truncates the
# file as soon as it is opened, so parsing inside the `with` block would wipe a
# previously saved result whenever the model returns invalid JSON.
davinci_MOFproperties_parsed = json.loads(davinci_MOFproperties)
with open("structured_openai_results/davinci_MOFproperties.json", "w", encoding="utf-8") as file:
    json.dump(davinci_MOFproperties_parsed, file)

In [35]:
# Reload the saved result from disk. The `with` block closes the file on exit,
# so no explicit close() is needed; the encoding is stated explicitly so the
# read does not depend on the platform default.
with open("structured_openai_results/davinci_MOFproperties.json", encoding="utf-8") as file:
    davinci_MOFproperties_json = json.load(file)

In [36]:
# Pretty-print the reloaded result. ensure_ascii=False prints non-ASCII keys
# such as "ρ" as-is instead of as \uXXXX escapes.
print(json.dumps(davinci_MOFproperties_json, indent=4, ensure_ascii=False))

{
    "MOFs": [
        {
            "MOF": "HKUST-1",
            "N2_BET": 1850,
            "calcd_BET": 2064,
            "N2_Vpore": 0.78,
            "CH4_Vpore": 0.78,
            "calcd_Vpore": 0.78,
            "ρ": 0.883,
            "metal": 4.38
        },
        {
            "MOF": "Ni-MOF-74",
            "N2_BET": 1350,
            "calcd_BET": 1240,
            "N2_Vpore": 0.51,
            "CH4_Vpore": 0.52,
            "calcd_Vpore": 0.49,
            "ρ": 1.206,
            "metal": 7.74
        },
        {
            "MOF": "PCN-14",
            "N2_BET": 2000,
            "calcd_BET": 2170,
            "N2_Vpore": 0.85,
            "CH4_Vpore": 0.78,
            "calcd_Vpore": 0.76,
            "ρ": 0.829,
            "metal": 2.59
        },
        {
            "MOF": "UTSA-20",
            "N2_BET": 1620,
            "calcd_BET": 1960,
            "N2_Vpore": 0.66,
            "CH4_Vpore": 0.66,
            "calcd_Vpore": 0.69,
            "ρ": 0.909,
    