# JSON structured data from tables in scientific papers

## OpenAI model

In [1]:
import os
import json
from typing import List

from IPython.display import HTML
from langchain.llms import OpenAI
from pydantic import BaseModel, Field
from pydantic.schema import schema

# Put the OpenAI API key into the process environment.
# The value below is a placeholder and must be replaced with a real key.
os.environ.update({"OPENAI_API_KEY": "your_openai_api_key"})

In [4]:
# LLM wrapper that is called below with the assembled extraction prompt.
model_davinci = OpenAI(
    model_name="text-davinci-003",
    max_tokens=1200,
    # NOTE(review): 0.9 is a high sampling temperature for a structured
    # extraction task — confirm this is intended.
    temperature=0.9,
)

In [5]:
# Opening instruction of every prompt; `prompt()` appends the HTML table
# directly after it.
prompt_template = (
    "\n"
    "Generate a JSON object extracting the information from this table in html code:"
    "\n"
)

In [6]:
def prompt(prompt_template, html_table, json_schema):
    """Assemble the full extraction prompt sent to the language model.

    The result is the concatenation, in this order, of ``prompt_template``,
    ``html_table``, a fixed instruction asking for JSON-only output, and the
    JSON schema.

    Args:
        prompt_template: Leading instruction text.
        html_table: HTML markup of the table to extract data from.
        json_schema: Target JSON schema. A ``str`` is inserted verbatim; any
            other value (e.g. the ``dict`` returned by ``json.load``) is
            serialized with ``json.dumps``.

    Returns:
        The complete prompt as a single string.
    """
    prompt_output = "Generate the JSON result with the following JSON schema and give only the JSON as output: "
    if isinstance(json_schema, str):
        schema_text = json_schema
    else:
        try:
            # str(dict) would embed a Python repr (single quotes, True/None),
            # which is not valid JSON; emit real JSON instead.
            schema_text = json.dumps(json_schema, ensure_ascii=False)
        except (TypeError, ValueError):
            # Not JSON-serializable: keep the previous str() behaviour.
            schema_text = str(json_schema)
    return prompt_template + html_table + prompt_output + schema_text

### Example: Supercapacitor performance of 2D nanocomposite materials

In [7]:
# Read the raw HTML of the supercapacitor table into a string.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("html_tables/html_table_supercapacitor.txt", 'r') as file:
    html_table_supercapacitor = file.read()

In [8]:
# Echo the loaded HTML so the notebook shows the raw table markup.
html_table_supercapacitor

'<table><thead><tr class="rowsep-1 valign-top"><th scope="col">Sl no</th><th scope="col">Electrodes</th><th scope="col">Specific capacitance (F/g)</th><th scope="col">Electrolyte</th><th scope="col">Current density (A/g)</th><th scope="col">Ref.</th></tr></thead><tbody><tr><td class="align-char">1</td><td>FG</td><td class="align-char">276</td><td>1&nbsp;M H<sub>2</sub>SO<sub>4</sub></td><td>0.1</td><td><a class="anchor workspace-trigger u-display-inline anchor-paragraph" href="#bb0425" name="bbb0425"><span class="anchor-text">[85]</span></a></td></tr><tr><td class="align-char">2</td><td>Ti<sub>3</sub>C<sub>2</sub>T<sub>x</sub></td><td class="align-char">140</td><td>1&nbsp;M KOH</td><td>5&nbsp;mV/s</td><td><a class="anchor workspace-trigger u-display-inline anchor-paragraph" href="#bb0430" name="bbb0430"><span class="anchor-text">[86]</span></a></td></tr><tr><td class="align-char">3</td><td>EDA- Ti<sub>3</sub>C<sub>2</sub>T<sub>x</sub></td><td class="align-char">486.2</td><td>1&nbsp;M H

In [9]:
# Load the target JSON schema as a Python dict.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("json_schemas/json_schema_supercapacitor.json", encoding="utf-8") as file:
    json_schema_supercapacitor = json.load(file)

In [10]:
# Echo the parsed schema so the notebook shows its structure.
json_schema_supercapacitor

{'type': 'object',
 'properties': {'Nanocomposites': {'type': 'array',
   'items': {'type': 'object',
    'properties': {'Electrode': {'type': 'string'},
     'Specific capacitance (F/g)': {'type': 'number'},
     'Electrolyte': {'type': 'string'},
     'Current density (A/g)': {'type': 'string'}}}}}}

In [30]:
# Build the full prompt: instruction + HTML table + output instruction + schema.
prompt_supercapacitor = prompt(
    prompt_template=prompt_template,
    html_table=html_table_supercapacitor,
    json_schema=json_schema_supercapacitor,
)

In [31]:
# Send the assembled prompt to the model; the returned completion is parsed
# and validated as JSON in the cells below.
davinci_supercapacitor = model_davinci(prompt_supercapacitor)

#### Pydantic JSON validation

In [59]:
class Nanocomposites(BaseModel):
    # One entry of the "Nanocomposites" array in the extracted JSON.
    # Aliases map JSON keys that contain spaces and units onto valid Python
    # attribute names. (Plain comments are used instead of a docstring so the
    # generated model schema is left unchanged.)
    Electrode: str
    Specific_capacitance: float = Field(..., alias="Specific capacitance (F/g)")
    Electrolyte: str
    # Declared as str: values such as "5 mV/s" appear in the parsed output.
    Current_density: str = Field(..., alias="Current density (A/g)")


class NanocompositesList(BaseModel):
    # Top-level document of the form {"Nanocomposites": [...]}; the field name
    # matches the JSON key and holds one Nanocomposites model per entry.
    Nanocomposites: List[Nanocomposites]

In [63]:
# Parse the model's raw JSON text and validate it against the pydantic models;
# the notebook displays the resulting NanocompositesList instance.
NanocompositesList.parse_raw(davinci_supercapacitor)

NanocompositesList(Nanocomposites=[Nanocomposites(Electrode='FG', Specific_capacitance=276.0, Electrolyte='1 M H2SO4', Current_density='0.1'), Nanocomposites(Electrode='Ti3C2Tx', Specific_capacitance=140.0, Electrolyte='1 M KOH', Current_density='5 mV/s'), Nanocomposites(Electrode='EDA- Ti3C2Tx', Specific_capacitance=486.2, Electrolyte='1 M H2SO4', Current_density='2 mV/s'), Nanocomposites(Electrode='MoS2 NW', Specific_capacitance=122.0, Electrolyte='1 M Na2SO4', Current_density='0.5'), Nanocomposites(Electrode='MoS2 NS', Specific_capacitance=138.0, Electrolyte='1 M Na2SO4', Current_density='1'), Nanocomposites(Electrode='MoS2 NW', Specific_capacitance=142.0, Electrolyte='1 M KCL', Current_density='0.59'), Nanocomposites(Electrode='NiCo-LDHs', Specific_capacitance=1187.2, Electrolyte='6 M KOH', Current_density='1'), Nanocomposites(Electrode='CoSx/Ni-Co LDH', Specific_capacitance=1562.0, Electrolyte='6 M KOH', Current_density='1')])

In [64]:
# Persist the model output: decode the completion text as JSON, re-serialize
# it with the default json settings, and write the result to disk.
with open("structured_openai_results/davinci_supercapacitor.json", 'w') as file:
    file.write(json.dumps(json.loads(davinci_supercapacitor)))

In [65]:
# Reload the saved result from disk as a Python object.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("structured_openai_results/davinci_supercapacitor.json", encoding="utf-8") as file:
    davinci_supercapacitor_json = json.load(file)

In [66]:
# Pretty-print the reloaded result with 4-space indentation;
# ensure_ascii=False leaves non-ASCII characters unescaped in the output.
print(json.dumps(davinci_supercapacitor_json, indent=4, ensure_ascii=False))

{
    "Nanocomposites": [
        {
            "Electrode": "FG",
            "Specific capacitance (F/g)": 276,
            "Electrolyte": "1 M H2SO4",
            "Current density (A/g)": "0.1"
        },
        {
            "Electrode": "Ti3C2Tx",
            "Specific capacitance (F/g)": 140,
            "Electrolyte": "1 M KOH",
            "Current density (A/g)": "5 mV/s"
        },
        {
            "Electrode": "EDA- Ti3C2Tx",
            "Specific capacitance (F/g)": 486.2,
            "Electrolyte": "1 M H2SO4",
            "Current density (A/g)": "2 mV/s"
        },
        {
            "Electrode": "MoS2 NW",
            "Specific capacitance (F/g)": 122,
            "Electrolyte": "1 M Na2SO4",
            "Current density (A/g)": "0.5"
        },
        {
            "Electrode": "MoS2 NS",
            "Specific capacitance (F/g)": 138,
            "Electrolyte": "1 M Na2SO4",
            "Current density (A/g)": "1"
        },
        {
            "Electrode"