# JSON structured data from tables in scientific papers

## Jsonformer non-token approach for OpenAI
#### (based on: https://github.com/1rgs/jsonformer, https://github.com/martinezpl/jsonformer/tree/add-openai)

In [1]:
import os
import pandas as pd
import json
import time

import openai
from IPython.display import IFrame
from IPython.display import HTML

from LLMsTablesToJson.html_table import get_driver, extract_tableSource, extract_table, quit_driver
from LLMsTablesToJson.jsonformer_non_tokens import JsonformerNoTokens, OpenAIModel, highlight_values

# Folder expected to contain local resources; the ChromeDriver path is built from it.
DATA_DIR = "your_path"
chromedriver_path = os.path.join(DATA_DIR, "chromedriver.exe")

# Prefer the OPENAI_API_KEY environment variable so the real key never has to be
# stored in the notebook; the placeholder is kept only as a fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", "your_openai_api_key")

## Example: Ni-doped ceria anode materials for SOFCs

### Get table HTML code from DOI

In [76]:
# DOI of the paper to process; later cells use it to build the dx.doi.org URL
# and pass it to get_driver.
doi = "10.1039/D3NJ00316G"

In [77]:
# Embed the article page (resolved from the DOI through dx.doi.org) in the notebook output.
IFrame(src=f"https://dx.doi.org/{doi}", width=900, height=380)

In [78]:
# Create the web driver for this DOI using the ChromeDriver binary at chromedriver_path.
# NOTE(review): get_driver is a project helper; presumably it also navigates to the
# article page — verify in LLMsTablesToJson.html_table.
driver = get_driver(doi, chromedriver_path)
# Fixed 2-second pause before querying the page — presumably to let it finish loading;
# TODO confirm whether an explicit wait would be more reliable.
time.sleep(2)
# Collect the table elements from the driver; the captured cell output shows a list of
# Selenium WebElement objects.
tableSource = extract_tableSource(driver)

[<selenium.webdriver.remote.webelement.WebElement (session="427f36180aac02dcffedb8b93bff17be", element="2F15F09D44EC762FBA9C6AD1C1B8500E_element_108")>, <selenium.webdriver.remote.webelement.WebElement (session="427f36180aac02dcffedb8b93bff17be", element="2F15F09D44EC762FBA9C6AD1C1B8500E_element_109")>, <selenium.webdriver.remote.webelement.WebElement (session="427f36180aac02dcffedb8b93bff17be", element="2F15F09D44EC762FBA9C6AD1C1B8500E_element_110")>, <selenium.webdriver.remote.webelement.WebElement (session="427f36180aac02dcffedb8b93bff17be", element="2F15F09D44EC762FBA9C6AD1C1B8500E_element_111")>, <selenium.webdriver.remote.webelement.WebElement (session="427f36180aac02dcffedb8b93bff17be", element="2F15F09D44EC762FBA9C6AD1C1B8500E_element_112")>, <selenium.webdriver.remote.webelement.WebElement (session="427f36180aac02dcffedb8b93bff17be", element="2F15F09D44EC762FBA9C6AD1C1B8500E_element_113")>, <selenium.webdriver.remote.webelement.WebElement (session="427f36180aac02dcffedb8b93bff

In [79]:
# Which table to pull out of tableSource.
# NOTE(review): whether extract_table counts from 0 or 1 is not visible here — confirm
# against LLMsTablesToJson.html_table.
table_num = 3
try:
    html_table_anodesSOFCs = extract_table(tableSource, table_num)
finally:
    # Always call quit_driver, even when extraction raises, so the driver created
    # earlier is not left open.
    quit_driver(driver)

In [80]:
html_table_anodesSOFCs

'<table class="table">\n                <thead>\n                        <tr>\n                                <th>\nNi-doped ceria NPs</th>\n                                <th>\nCrystalline size <em>D</em><sub>XRD</sub> (nm)</th>\n                                <th>\nDislocation density, <em>δ</em> (nm<sup>−2</sup>)</th>\n                                <th>\nLattice strain, <em>ε</em> = <em>β</em>/4\u2006tan\u2006<em>θ</em> (×10<sup><sup>−3</sup></sup>)</th>\n                                <th>\nLattice constant <em>A</em> (Å)</th>\n                        </tr>\n                </thead>\n                            <tbody>\n                        <tr>\n                                <td>\nNDC1</td>\n                                <td>\n33.03</td>\n                                <td>\n9.161 × 10<sup>−4</sup></td>\n                                <td>\n3.08</td>\n                                <td>\n5.413</td>\n                        </tr>\n                        <tr>\n     

In [81]:
# Save the raw table HTML to disk. The markup contains non-ASCII characters
# (δ, ε, U+2006 thin spaces, ...), so the encoding is pinned to UTF-8 instead of
# relying on the platform default, which can raise UnicodeEncodeError (e.g. cp1252).
with open("html_table_anodesSOFCs.txt", "w", encoding="utf-8") as file:
    file.write(html_table_anodesSOFCs)

### Show table

In [84]:
HTML(html_table_anodesSOFCs)

Ni-doped ceria NPs,Crystalline size DXRD (nm),"Dislocation density, δ (nm−2)","Lattice strain, ε = β/4 tan θ (×10−3)",Lattice constant A (Å)
NDC1,33.03,9.161 × 10−4,3.08,5.413
NDC2,31.29,1.021 × 10−3,3.22,5.406
NDC3,31.23,1.025 × 10−3,3.7,5.403
NDC4,33.02,9.154 × 10−4,3.24,5.407


### Convert HTML table to JSON format with OpenAI LLMs

In [86]:
# JSON schema describing the object to generate: one array whose items hold the
# five columns of the table (one string label plus four numeric fields).
# The lattice-strain key contains two U+2006 (six-per-em) spaces, copied from the
# table header; they are written as explicit \u2006 escapes because the raw
# characters are indistinguishable from ordinary spaces in an editor.
json_schema_anodesSOFCs = {
    "type": "object",
    "properties": {
        "Ce-doped SOFC anodes": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "Ni-doped ceria NP": {"type": "string"},
                    "Crystalline size DXRD (nm)": {"type": "number"},
                    "Dislocation density, δ (nm−2)": {"type": "number"},
                    "Lattice strain, ε = β/4\u2006tan\u2006θ (×10−3)": {"type": "number"},
                    "Lattice constant A (Å)": {"type": "number"},
                },
            },
        },
    },
}

In [87]:
# Persist the schema as JSON text so a later cell can reload it from disk.
with open('json_schema_anodesSOFCs.json', 'w') as file:
    file.write(json.dumps(json_schema_anodesSOFCs))

In [88]:
# Reload the schema from disk. The `with` block closes the file on exit, so no
# explicit close() call is needed.
with open("json_schema_anodesSOFCs.json", encoding="utf-8") as file:
    json_schema_anodesSOFCs = json.load(file)

In [89]:
json_schema_anodesSOFCs

{'type': 'object',
 'properties': {'Ce-doped SOFC anodes': {'type': 'array',
   'items': {'type': 'object',
    'properties': {'Ni-doped ceria NP': {'type': 'string'},
     'Crystalline size DXRD (nm)': {'type': 'number'},
     'Dislocation density, δ (nm−2)': {'type': 'number'},
     'Lattice strain, ε = β/4\u2006tan\u2006θ (×10−3)': {'type': 'number'},
     'Lattice constant A (Å)': {'type': 'number'}}}}}}

#### standard prompt

In [96]:
# Standard prompt: ask the model to fill the schema from the raw HTML table.
builder = JsonformerNoTokens(
    # What to extract, and from which text.
    json_schema=json_schema_anodesSOFCs,
    text=html_table_anodesSOFCs,
    prompt="Generate an object with the following schema extracting the information from the provided table in html code:",
    # Model and sampling settings.
    model=OpenAIModel("text-davinci-003", debug=False),
    temperature=0.5,
    # Generation limits; the table shown above has four data rows.
    max_array_length=4,
    max_string_token_length=10,
    # With debug=True the captured output lists each "[generate_*] prompt/response" pair.
    debug=True,
)

# Calling the builder runs the generation; the returned object is displayed and saved below.
print("Generating...")
result_anodesSOFCs = builder()

Generating...
[generate_string] prompt Generate a object with the following schema extracting the information from the provided table in html code:
<table class="table">
                <thead>
                        <tr>
                                <th>
Ni-doped ceria NPs</th>
                                <th>
Crystalline size <em>D</em><sub>XRD</sub> (nm)</th>
                                <th>
Dislocation density, <em>δ</em> (nm<sup>−2</sup>)</th>
                                <th>
Lattice strain, <em>ε</em> = <em>β</em>/4 tan <em>θ</em> (×10<sup><sup>−3</sup></sup>)</th>
                                <th>
Lattice constant <em>A</em> (Å)</th>
                        </tr>
                </thead>
                            <tbody>
                        <tr>
                                <td>
NDC1</td>
                                <td>
33.03</td>
                                <td>
9.161 × 10<sup>−4</sup></td>
                                <td>
3.08</td>
    

[generate_number] response  3.08, "L
[generate_number] prompt Generate a object with the following schema extracting the information from the provided table in html code:
<table class="table">
                <thead>
                        <tr>
                                <th>
Ni-doped ceria NPs</th>
                                <th>
Crystalline size <em>D</em><sub>XRD</sub> (nm)</th>
                                <th>
Dislocation density, <em>δ</em> (nm<sup>−2</sup>)</th>
                                <th>
Lattice strain, <em>ε</em> = <em>β</em>/4 tan <em>θ</em> (×10<sup><sup>−3</sup></sup>)</th>
                                <th>
Lattice constant <em>A</em> (Å)</th>
                        </tr>
                </thead>
                            <tbody>
                        <tr>
                                <td>
NDC1</td>
                                <td>
33.03</td>
                                <td>
9.161 × 10<sup>−4</sup></td>
                            

[generate_number] response  31.29, "D
[generate_number] prompt Generate a object with the following schema extracting the information from the provided table in html code:
<table class="table">
                <thead>
                        <tr>
                                <th>
Ni-doped ceria NPs</th>
                                <th>
Crystalline size <em>D</em><sub>XRD</sub> (nm)</th>
                                <th>
Dislocation density, <em>δ</em> (nm<sup>−2</sup>)</th>
                                <th>
Lattice strain, <em>ε</em> = <em>β</em>/4 tan <em>θ</em> (×10<sup><sup>−3</sup></sup>)</th>
                                <th>
Lattice constant <em>A</em> (Å)</th>
                        </tr>
                </thead>
                            <tbody>
                        <tr>
                                <td>
NDC1</td>
                                <td>
33.03</td>
                                <td>
9.161 × 10<sup>−4</sup></td>
                           

[generate_number] response  5.406}, {"Ni
[generate_string] prompt Generate a object with the following schema extracting the information from the provided table in html code:
<table class="table">
                <thead>
                        <tr>
                                <th>
Ni-doped ceria NPs</th>
                                <th>
Crystalline size <em>D</em><sub>XRD</sub> (nm)</th>
                                <th>
Dislocation density, <em>δ</em> (nm<sup>−2</sup>)</th>
                                <th>
Lattice strain, <em>ε</em> = <em>β</em>/4 tan <em>θ</em> (×10<sup><sup>−3</sup></sup>)</th>
                                <th>
Lattice constant <em>A</em> (Å)</th>
                        </tr>
                </thead>
                            <tbody>
                        <tr>
                                <td>
NDC1</td>
                                <td>
33.03</td>
                                <td>
9.161 × 10<sup>−4</sup></td>
                        

[generate_number] response  1.025, "L
[generate_number] prompt Generate a object with the following schema extracting the information from the provided table in html code:
<table class="table">
                <thead>
                        <tr>
                                <th>
Ni-doped ceria NPs</th>
                                <th>
Crystalline size <em>D</em><sub>XRD</sub> (nm)</th>
                                <th>
Dislocation density, <em>δ</em> (nm<sup>−2</sup>)</th>
                                <th>
Lattice strain, <em>ε</em> = <em>β</em>/4 tan <em>θ</em> (×10<sup><sup>−3</sup></sup>)</th>
                                <th>
Lattice constant <em>A</em> (Å)</th>
                        </tr>
                </thead>
                            <tbody>
                        <tr>
                                <td>
NDC1</td>
                                <td>
33.03</td>
                                <td>
9.161 × 10<sup>−4</sup></td>
                           

[generate_string] response  "NDC4", "Crystalline size
[generate_number] prompt Generate a object with the following schema extracting the information from the provided table in html code:
<table class="table">
                <thead>
                        <tr>
                                <th>
Ni-doped ceria NPs</th>
                                <th>
Crystalline size <em>D</em><sub>XRD</sub> (nm)</th>
                                <th>
Dislocation density, <em>δ</em> (nm<sup>−2</sup>)</th>
                                <th>
Lattice strain, <em>ε</em> = <em>β</em>/4 tan <em>θ</em> (×10<sup><sup>−3</sup></sup>)</th>
                                <th>
Lattice constant <em>A</em> (Å)</th>
                        </tr>
                </thead>
                            <tbody>
                        <tr>
                                <td>
NDC1</td>
                                <td>
33.03</td>
                                <td>
9.161 × 10<sup>−4</sup></td>
           

[generate_number] response  3.24, "L
[generate_number] prompt Generate a object with the following schema extracting the information from the provided table in html code:
<table class="table">
                <thead>
                        <tr>
                                <th>
Ni-doped ceria NPs</th>
                                <th>
Crystalline size <em>D</em><sub>XRD</sub> (nm)</th>
                                <th>
Dislocation density, <em>δ</em> (nm<sup>−2</sup>)</th>
                                <th>
Lattice strain, <em>ε</em> = <em>β</em>/4 tan <em>θ</em> (×10<sup><sup>−3</sup></sup>)</th>
                                <th>
Lattice constant <em>A</em> (Å)</th>
                        </tr>
                </thead>
                            <tbody>
                        <tr>
                                <td>
NDC1</td>
                                <td>
33.03</td>
                                <td>
9.161 × 10<sup>−4</sup></td>
                            

In [97]:
highlight_values(result_anodesSOFCs)

{
  Ce-doped SOFC anodes: [
    {
      Ni-doped ceria NP: [94m"NDC1"[0m,
      Crystalline size DXRD (nm): [94m33.03[0m,
      Dislocation density, δ (nm−2): [94m9.161[0m,
      Lattice strain, ε = β/4 tan θ (×10−3): [94m3.08[0m,
      Lattice constant A (Å): [94m5.413[0m
    },
    {
      Ni-doped ceria NP: [94m"NDC2"[0m,
      Crystalline size DXRD (nm): [94m31.29[0m,
      Dislocation density, δ (nm−2): [94m1.021[0m,
      Lattice strain, ε = β/4 tan θ (×10−3): [94m3.22[0m,
      Lattice constant A (Å): [94m5.406[0m
    },
    {
      Ni-doped ceria NP: [94m"NDC3"[0m,
      Crystalline size DXRD (nm): [94m31.23[0m,
      Dislocation density, δ (nm−2): [94m1.025[0m,
      Lattice strain, ε = β/4 tan θ (×10−3): [94m3.7[0m,
      Lattice constant A (Å): [94m5.403[0m
    },
    {
      Ni-doped ceria NP: [94m"NDC4"[0m,
      Crystalline size DXRD (nm): [94m33.02[0m,
      Dislocation density, δ (nm−2): [94m9.154[0m,
      Lattice strain, ε = β/4 tan

In [134]:
# Store the standard-prompt result under a "_wrong" file name: in the displayed result
# the dislocation densities carry only the mantissa (e.g. 9.161 where the table
# gives 9.161 × 10−4).
with open('json_table_anodesSOFCs_wrong.json', 'w') as file:
    file.write(json.dumps(result_anodesSOFCs))

#### improved prompt for finding numbers with specific notation

In [127]:
# Improved prompt: same request as the standard prompt, plus a hint telling the model
# how to read numbers written as "a × 10<sup>b</sup>".
builder = JsonformerNoTokens(
    # What to extract, and from which text.
    json_schema=json_schema_anodesSOFCs,
    text=html_table_anodesSOFCs,
    # Adjacent string literals are joined by Python into a single prompt string.
    prompt=(
        "Generate an object with the following schema extracting the information from the provided table in html code "
        "(if you find numbers as 1.025 × 10<sup>−3</sup>, this means 1.025e-3):"
    ),
    # Model and sampling settings.
    model=OpenAIModel("text-davinci-003", debug=False),
    temperature=0.5,
    # Generation limits; the table shown above has four data rows.
    max_array_length=4,
    max_string_token_length=10,
    # With debug=True the captured output lists each "[generate_*] prompt/response" pair.
    debug=True,
)

# Calling the builder runs the generation; the returned object is displayed and saved below.
print("Generating...")
result_anodesSOFCs = builder()

Generating...
[generate_string] prompt Generate a object with the following schema extracting the information from the provided table in html code (If you find numbers as 1.025 × 10<sup>−3</sup>, this means 1.025e-3):
<table class="table">
                <thead>
                        <tr>
                                <th>
Ni-doped ceria NPs</th>
                                <th>
Crystalline size <em>D</em><sub>XRD</sub> (nm)</th>
                                <th>
Dislocation density, <em>δ</em> (nm<sup>−2</sup>)</th>
                                <th>
Lattice strain, <em>ε</em> = <em>β</em>/4 tan <em>θ</em> (×10<sup><sup>−3</sup></sup>)</th>
                                <th>
Lattice constant <em>A</em> (Å)</th>
                        </tr>
                </thead>
                            <tbody>
                        <tr>
                                <td>
NDC1</td>
                                <td>
33.03</td>
                                <td>
9.161 × 10

RateLimitError: You exceeded your current quota, please check your plan and billing details.

In [95]:
highlight_values(result_anodesSOFCs)

{
  Ce-doped SOFC anodes: [
    {
      Ni-doped ceria NP: [94m"NDC1"[0m,
      Crystalline size DXRD (nm): [94m33.03[0m,
      Dislocation density, δ (nm−2): [94m0.0009161[0m,
      Lattice strain, ε = β/4 tan θ (×10−3): [94m3.08[0m,
      Lattice constant A (Å): [94m5.413[0m
    },
    {
      Ni-doped ceria NP: [94m"NDC2"[0m,
      Crystalline size DXRD (nm): [94m31.29[0m,
      Dislocation density, δ (nm−2): [94m0.0011021[0m,
      Lattice strain, ε = β/4 tan θ (×10−3): [94m3.22[0m,
      Lattice constant A (Å): [94m5.406[0m
    },
    {
      Ni-doped ceria NP: [94m"NDC3"[0m,
      Crystalline size DXRD (nm): [94m31.23[0m,
      Dislocation density, δ (nm−2): [94m0.0011025[0m,
      Lattice strain, ε = β/4 tan θ (×10−3): [94m3.7[0m,
      Lattice constant A (Å): [94m5.403[0m
    },
    {
      Ni-doped ceria NP: [94m"NDC4"[0m,
      Crystalline size DXRD (nm): [94m33.02[0m,
      Dislocation density, δ (nm−2): [94m0.0009154[0m,
      Lattice str

In [131]:
# Save the result generated with the improved prompt. The generated object is held in
# result_anodesSOFCs; no variable named json_table_anodesSOFCs is defined in this
# notebook, so dumping that name would raise NameError on a fresh kernel.
with open('json_table_anodesSOFCs.json', 'w') as file:
    json.dump(result_anodesSOFCs, file)