# JSON structured data from tables in scientific papers

## OpenAI model

In [1]:
import os
import json
from typing import List

from IPython.display import HTML
from langchain.llms import OpenAI
from pydantic import BaseModel, Field
from pydantic.schema import schema

# Only fall back to the placeholder when no key is exported already, so a real
# key coming from the shell / kernel environment is never overwritten.
# The placeholder itself is not a working key and must be replaced before any
# request is sent; avoid committing a real key into the notebook.
os.environ.setdefault("OPENAI_API_KEY", "your_openai_api_key")

In [4]:
# Completion model used for the extraction requests below.
# NOTE(review): temperature=0.9 is a high sampling temperature, so repeated
# runs may return different JSON for the same table — confirm this is intended.
model_davinci = OpenAI(model_name="text-davinci-003", temperature=0.9, max_tokens=1200)

In [5]:
# Leading instruction of the prompt: a newline, the sentence, and a trailing newline.
prompt_template = (
    "\n"
    "Generate a JSON object extracting the information from this table in html code:\n"
)

In [6]:
def prompt(prompt_template, html_table, json_schema):
    """Assemble the full extraction prompt as one string.

    The pieces are concatenated without extra separators, in this order:
    ``prompt_template``, ``html_table``, a fixed instruction asking for
    JSON-only output, and the schema text.

    Args:
        prompt_template: Leading instruction text.
        html_table: HTML source of the table to extract.
        json_schema: Schema the answer should follow. A ``str`` is inserted
            unchanged; any other value is serialised with ``json.dumps`` so
            the prompt shows valid JSON (double quotes, ``true``/``false``/
            ``null``) instead of a Python ``repr``. Values that cannot be
            serialised fall back to ``str(json_schema)``.

    Returns:
        The assembled prompt string.
    """
    prompt_output = "Generate the JSON result with the following JSON schema and give only the JSON as output: "
    if isinstance(json_schema, str):
        schema_text = json_schema
    else:
        try:
            # ensure_ascii=False keeps non-ASCII characters in keys as-is
            # instead of turning them into \uXXXX escapes.
            schema_text = json.dumps(json_schema, ensure_ascii=False)
        except (TypeError, ValueError):
            # Not JSON-serialisable: keep the previous behaviour.
            schema_text = str(json_schema)
    return prompt_template + html_table + prompt_output + schema_text

### Example: Ni-doped ceria anode materials for SOFCs

In [8]:
# Read the table's HTML source into a string. The `with` statement closes the
# file on exit, so no explicit close() is needed.
# NOTE(review): the file contains non-ASCII symbols; UTF-8 is assumed here
# rather than relying on the platform default — confirm the file's encoding.
with open("html_table_anodesSOFCs.txt", 'r', encoding="utf-8") as file:
    html_table_anodesSOFCs = file.read()

In [9]:
# Display the raw HTML string as the cell output.
html_table_anodesSOFCs

'<table class="table">\n                <thead>\n                        <tr>\n                                <th>\nNi-doped ceria NPs</th>\n                                <th>\nCrystalline size <em>D</em><sub>XRD</sub> (nm)</th>\n                                <th>\nDislocation density, <em>δ</em> (nm<sup>−2</sup>)</th>\n                                <th>\nLattice strain, <em>ε</em> = <em>β</em>/4\u2006tan\u2006<em>θ</em> (×10<sup><sup>−3</sup></sup>)</th>\n                                <th>\nLattice constant <em>A</em> (Å)</th>\n                        </tr>\n                </thead>\n                            <tbody>\n                        <tr>\n                                <td>\nNDC1</td>\n                                <td>\n33.03</td>\n                                <td>\n9.161 × 10<sup>−4</sup></td>\n                                <td>\n3.08</td>\n                                <td>\n5.413</td>\n                        </tr>\n                        <tr>\n     

In [12]:
# Parse the JSON schema for this table from disk. The `with` statement closes
# the file on exit, so no explicit close() is needed; JSON text is read as
# UTF-8 instead of the platform default encoding.
with open("json_schemas/json_schema_anodesSOFCs.json", encoding="utf-8") as file:
    json_schema_anodesSOFCs = json.load(file)

In [13]:
# Display the parsed schema as the cell output.
json_schema_anodesSOFCs

{'type': 'object',
 'properties': {'Ce-doped SOFC anodes': {'type': 'array',
   'items': {'type': 'object',
    'properties': {'Ni-doped ceria NP': {'type': 'string'},
     'Crystalline size DXRD (nm)': {'type': 'number'},
     'Dislocation density, δ (nm−2)': {'type': 'number'},
     'Lattice strain, ε = β/4\u2006tan\u2006θ (×10−3)': {'type': 'number'},
     'Lattice constant A (Å)': {'type': 'number'}}}}}}

In [14]:
# Combine the instruction, the table HTML and the schema into one prompt string.
prompt_anodesSOFCs = prompt(
    prompt_template=prompt_template,
    html_table=html_table_anodesSOFCs,
    json_schema=json_schema_anodesSOFCs,
)

In [15]:
# Send the prompt to the model and keep its reply; the reply is later parsed
# with json.loads, so it is expected to be JSON text.
davinci_anodesSOFCs = model_davinci(prompt_anodesSOFCs)

In [24]:
# Parse the model reply BEFORE opening the output file: opening with 'w'
# truncates the file immediately, so parsing inside the `with` block would wipe
# a previously saved result whenever the reply is not valid JSON.
davinci_anodesSOFCs_parsed = json.loads(davinci_anodesSOFCs)
with open("structured_openai_results/davinci_anodesSOFCs.json", 'w') as file:
    json.dump(davinci_anodesSOFCs_parsed, file)

In [22]:
# Reload the saved result from disk. The `with` statement closes the file on
# exit, so no explicit close() is needed; JSON text is read as UTF-8.
with open("structured_openai_results/davinci_anodesSOFCs.json", encoding="utf-8") as file:
    davinci_anodesSOFCs_json = json.load(file)

In [23]:
# Pretty-print the result; ensure_ascii=False keeps non-ASCII characters in the
# keys readable instead of printing \uXXXX escapes.
print(json.dumps(davinci_anodesSOFCs_json, indent=4, ensure_ascii=False))

[
    {
        "Ni-doped ceria NP": "NDC1",
        "Crystalline size DXRD (nm)": 33.03,
        "Dislocation density, δ (nm−2)": 0.0009161,
        "Lattice strain, ε = β/4 tan θ (×10−3)": 3.08,
        "Lattice constant A (Å)": 5.413
    },
    {
        "Ni-doped ceria NP": "NDC2",
        "Crystalline size DXRD (nm)": 31.29,
        "Dislocation density, δ (nm−2)": 0.001021,
        "Lattice strain, ε = β/4 tan θ (×10−3)": 3.22,
        "Lattice constant A (Å)": 5.406
    },
    {
        "Ni-doped ceria NP": "NDC3",
        "Crystalline size DXRD (nm)": 31.23,
        "Dislocation density, δ (nm−2)": 0.001025,
        "Lattice strain, ε = β/4 tan θ (×10−3)": 3.7,
        "Lattice constant A (Å)": 5.403
    },
    {
        "Ni-doped ceria NP": "NDC4",
        "Crystalline size DXRD (nm)": 33.02,
        "Dislocation density, δ (nm−2)": 0.0009154,
        "Lattice strain, ε = β/4 tan θ (×10−3)": 3.24,
        "Lattice constant A (Å)": 5.407
    }
]


#### Change the object name in the schema

In [163]:
# Schema variant whose top-level array property is named "Ni-doped ceria NPs".
# Each row object has one string label followed by four numeric columns; the
# numeric entries are generated so that every key gets its own {"type": "number"}.
json_schema_anodesSOFCs_new_schema = {
    "type": "object",
    "properties": {
        "Ni-doped ceria NPs": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "Ni-doped ceria NP": {"type": "string"},
                    **{
                        column: {"type": "number"}
                        for column in (
                            "Crystalline size DXRD (nm)",
                            "Dislocation density, δ (nm−2)",
                            "Lattice strain, ε = β/4 tan θ (×10−3)",
                            "Lattice constant A (Å)",
                        )
                    },
                },
            },
        },
    },
}

In [167]:
# Rebuild the prompt with the renamed schema; this rebinds prompt_anodesSOFCs.
prompt_anodesSOFCs = prompt(
    prompt_template=prompt_template,
    html_table=html_table_anodesSOFCs,
    json_schema=json_schema_anodesSOFCs_new_schema,
)

In [168]:
# Send the prompt built with the renamed schema to the model and keep its reply.
davinci_anodesSOFCs_new_schema = model_davinci(prompt_anodesSOFCs)

In [25]:
# Read a saved raw reply from disk. This rebinds davinci_anodesSOFCs_new_schema,
# so any value assigned to that name earlier in the session is replaced by the
# file's text.
# NOTE(review): no visible cell writes this .txt file — confirm where it is
# produced. UTF-8 is assumed for its encoding.
# The `with` statement closes the file on exit, so no explicit close() is needed.
with open("JSON_davinci/davinci_anodesSOFCs_new_schema.txt", 'r', encoding="utf-8") as file:
    davinci_anodesSOFCs_new_schema = file.read()

In [26]:
# Parse the reply BEFORE opening the output file: opening with 'w' truncates
# the file immediately, so parsing inside the `with` block would wipe a
# previously saved result whenever the reply is not valid JSON.
davinci_anodesSOFCs_new_schema_parsed = json.loads(davinci_anodesSOFCs_new_schema)
with open("structured_openai_results/davinci_anodesSOFCs_new_schema.json", 'w') as file:
    json.dump(davinci_anodesSOFCs_new_schema_parsed, file)

In [27]:
# Reload the saved result from disk. The `with` statement closes the file on
# exit, so no explicit close() is needed; JSON text is read as UTF-8.
with open("structured_openai_results/davinci_anodesSOFCs_new_schema.json", encoding="utf-8") as file:
    davinci_anodesSOFCs_new_schema_json = json.load(file)

In [28]:
# Pretty-print the result; ensure_ascii=False keeps non-ASCII characters in the
# keys readable instead of printing \uXXXX escapes.
print(json.dumps(davinci_anodesSOFCs_new_schema_json, indent=4, ensure_ascii=False))

{
    "Ni-doped ceria NPs": [
        {
            "Ni-doped ceria NP": "NDC1",
            "Crystalline size DXRD (nm)": 33.03,
            "Dislocation density, δ (nm−2)": 0.0009161,
            "Lattice strain, ε = β/4 tan θ (×10−3)": 3.08,
            "Lattice constant A (Å)": 5.413
        },
        {
            "Ni-doped ceria NP": "NDC2",
            "Crystalline size DXRD (nm)": 31.29,
            "Dislocation density, δ (nm−2)": 0.001021,
            "Lattice strain, ε = β/4 tan θ (×10−3)": 3.22,
            "Lattice constant A (Å)": 5.406
        },
        {
            "Ni-doped ceria NP": "NDC3",
            "Crystalline size DXRD (nm)": 31.23,
            "Dislocation density, δ (nm−2)": 0.001025,
            "Lattice strain, ε = β/4 tan θ (×10−3)": 3.7,
            "Lattice constant A (Å)": 5.403
        },
        {
            "Ni-doped ceria NP": "NDC4",
            "Crystalline size DXRD (nm)": 33.02,
            "Dislocation density, δ (nm−2)": 0.0009154,
       