# 
# JSON structured data from tables in scientific papers

## OpenAI model

In [1]:
import os
import json
from typing import List

from IPython.display import HTML
from langchain.llms import OpenAI
from pydantic import BaseModel, Field
from pydantic.schema import schema

# Keep a key that is already exported in the environment; the placeholder is
# only used when none is set, so a real key never has to be typed into the
# notebook (and the placeholder can no longer overwrite a real one).
os.environ.setdefault("OPENAI_API_KEY", "your_openai_api_key")

In [4]:
# Completion model shared by the extraction requests below.
# NOTE(review): temperature=0.9 is a high sampling temperature for a task whose
# reply is later parsed with json.loads — confirm this is intentional.
model_davinci = OpenAI(max_tokens=1200, temperature=0.9, model_name="text-davinci-003")

In [5]:
# Instruction placed in front of the table markup; it starts and ends with a
# newline, exactly like the triple-quoted form it replaces.
prompt_template = (
    "\n"
    "Generate a JSON object extracting the information from this table in html code:"
    "\n"
)

In [6]:
def prompt(prompt_template, html_table, json_schema):
    """Assemble the complete extraction prompt for one HTML table.

    The result is, in order: the instruction template, the table markup, a
    line break, a second instruction asking for JSON-only output, and the
    target schema.

    Args:
        prompt_template: Instruction text placed before the table.
        html_table: HTML source of the table, as a string.
        json_schema: Schema the answer must follow. A string is inserted
            unchanged; any other object is serialized with ``json.dumps`` so
            the prompt contains real JSON (double-quoted) instead of a Python
            ``repr``. Objects that cannot be serialized fall back to ``str``.

    Returns:
        The complete prompt string.
    """
    prompt_output = "Generate the JSON result with the following JSON schema and give only the JSON as output: "

    if isinstance(json_schema, str):
        schema_text = json_schema
    else:
        try:
            # ensure_ascii=False keeps non-ASCII property names readable
            # instead of turning them into \uXXXX escapes.
            schema_text = json.dumps(json_schema, ensure_ascii=False)
        except (TypeError, ValueError):
            schema_text = str(json_schema)

    # The line break keeps the second instruction from being glued onto the
    # last tag of the table markup.
    return prompt_template + html_table + "\n" + prompt_output + schema_text

### Example: Perovskite-structured cathode materials for SOFCs 

In [15]:
# Read the table markup as text. The encoding is explicit so the result does
# not depend on the platform default, and the `with` block already closes the
# file on exit, so no separate close() call is needed.
with open("html_table_perovskiteSOFCs.txt", 'r', encoding="utf-8") as file:
    html_table_perovskiteSOFCs = file.read()

In [16]:
# Show the loaded HTML string as this cell's output.
html_table_perovskiteSOFCs

'<table><thead><tr class="rowsep-1 valign-top"><th scope="col">Composition</th><th scope="col"><span class="math"><span class="MathJax_Preview" style=""></span><span class="MathJax_SVG" id="MathJax-Element-17-Frame" tabindex="0" data-mathml="<math xmlns=&quot;http://www.w3.org/1998/Math/MathML&quot;><mrow is=&quot;true&quot;><msub is=&quot;true&quot;><mrow is=&quot;true&quot;><mi mathvariant=&quot;bold-italic&quot; is=&quot;true&quot;>&amp;#x3C3;</mi></mrow><mrow is=&quot;true&quot;><mi mathvariant=&quot;bold-italic&quot; is=&quot;true&quot;>e</mi></mrow></msub></mrow></math>" role="presentation" style="font-size: 90%; display: inline-block; position: relative;"><svg xmlns:xlink="http://www.w3.org/1999/xlink" width="2.737ex" height="1.724ex" viewBox="0 -513 1178.6 742.1" role="img" focusable="false" style="vertical-align: -0.532ex;" aria-hidden="true"><g stroke="currentColor" fill="currentColor" stroke-width="0" transform="matrix(1 0 0 -1 0 0)"><g is="true"><g is="true"><g is="true"><g

In [17]:
# Load the target schema. UTF-8 is requested explicitly so the platform
# default encoding is not used for this file; the `with` block closes the file
# on exit.
with open("json_schema_perovskiteSOFCs.json", encoding="utf-8") as file:
    json_schema_perovskiteSOFCs = json.load(file)

In [18]:
# Show the parsed schema as this cell's output.
json_schema_perovskiteSOFCs

{'type': 'object',
 'properties': {'Perovskite SOFC cathodes': {'type': 'array',
   'items': {'type': 'object',
    'properties': {'Composition': {'type': 'string'},
     '𝝈𝒆  (Scm−1)': {'type': 'string'},
     '𝝈𝒊  (Scm−1)': {'type': 'string'},
     'CTE (10−6K−1)': {'type': 'string'},
     'References': {'type': 'string'}}}}}}

In [19]:
# Build the request text from the instruction template, the table markup and
# the schema loaded above.
prompt_perovskiteSOFCs = prompt(
    prompt_template=prompt_template,
    html_table=html_table_perovskiteSOFCs,
    json_schema=json_schema_perovskiteSOFCs,
)

In [20]:
# Send the assembled prompt to the model and keep its reply; a later cell
# passes this value to json.loads.
davinci_perovskiteSOFCs = model_davinci(prompt_perovskiteSOFCs)

In [106]:
# Parse before opening the output file: if the model reply is not valid JSON,
# json.loads raises here and a previously saved result is left untouched
# instead of being truncated by the 'w' open.
davinci_perovskiteSOFCs_parsed = json.loads(davinci_perovskiteSOFCs)

# Create the output folder on a fresh checkout; no-op when it already exists.
os.makedirs('structured_openai_results', exist_ok=True)

with open('structured_openai_results/davinci_perovskiteSOFCs.json', 'w', encoding="utf-8") as file:
    json.dump(davinci_perovskiteSOFCs_parsed, file)

In [107]:
# Reload the saved result from disk. The encoding is explicit, and the `with`
# block closes the file on exit, so no separate close() call is needed.
with open("structured_openai_results/davinci_perovskiteSOFCs.json", encoding="utf-8") as file:
    davinci_perovskiteSOFCs_json = json.load(file)

In [108]:
# Pretty-print the reloaded result with 4-space indentation; ensure_ascii=False
# prints non-ASCII characters as-is instead of as \uXXXX escapes.
print(json.dumps(davinci_perovskiteSOFCs_json, indent=4, ensure_ascii=False))

{
    "Perovskite SOFC cathodes": [
        {
            "Composition": "La1-xSrxMnO3",
            "𝝈𝒆  (Scm−1)": "130–300",
            "𝝈𝒊  (Scm−1)": "5.93 × 10−7",
            "CTE (10−6K−1)": "11–13",
            "References": "[90]"
        },
        {
            "Composition": "La1-xSrxCoO3",
            "𝝈𝒆  (Scm−1)": "1200–1600",
            "𝝈𝒊  (Scm−1)": "0.22",
            "CTE (10−6K−1)": "19–20",
            "References": "[91], [92]"
        },
        {
            "Composition": "La1-xSrxFeO3",
            "𝝈𝒆  (Scm−1)": "129–369",
            "𝝈𝒊  (Scm−1)": "0.205–5.6 × 10−3",
            "CTE (10−6K−1)": "12.2–16.3",
            "References": "[93], [94], [95]"
        },
        {
            "Composition": "La1-xSrxCoFeO3",
            "𝝈𝒆  (Scm−1)": "87–1050",
            "𝝈𝒊  (Scm−1)": "0.058–8 × 10−3",
            "CTE (10−6K−1)": "14.8–21.4",
            "References": "[96], [97]"
        },
        {
            "Composition": "Pr1-xSrxCo1-yFeyO3",
        