# JSON structured data from tables in scientific papers

## OpenAI model

In [1]:
import os
import json
from typing import List

from IPython.display import HTML
from langchain.llms import OpenAI
from pydantic import BaseModel, Field
from pydantic.schema import schema

# Fall back to the placeholder only when no key is configured, so a real
# OPENAI_API_KEY already present in the environment is not overwritten.
os.environ.setdefault("OPENAI_API_KEY", "your_openai_api_key")

In [4]:
# Completion model used for the extraction calls in this notebook.
# NOTE(review): temperature=0.9 makes completions vary from run to run;
# confirm this is intended for a data-extraction task.
model_davinci = OpenAI(model_name="text-davinci-003", temperature=0.9, max_tokens=1500)

In [5]:
# Instruction placed in front of the HTML table; the text begins and ends
# with a line break.
prompt_template = (
    "\n"
    "Generate a JSON object extracting the information from this table in html code:"
    "\n"
)

In [6]:
def prompt(prompt_template, html_table, json_schema):
    """Assemble the extraction prompt sent to the language model.

    Args:
        prompt_template: Instruction text placed before the table.
        html_table: HTML source of the table to extract data from.
        json_schema: Schema the generated JSON must follow. A string is
            inserted unchanged; any other JSON-serialisable object (e.g. a
            ``dict``) is rendered as JSON text.

    Returns:
        The full prompt: template, table, a line break, the output
        instruction and the schema.
    """
    prompt_output = "Generate the JSON result with the following JSON schema and give only the JSON as output: "
    if isinstance(json_schema, str):
        schema_text = json_schema
    else:
        try:
            # str() on a dict yields a Python repr (single quotes), which is
            # not valid JSON; serialise it properly instead.
            schema_text = json.dumps(json_schema, ensure_ascii=False)
        except TypeError:
            # Keep accepting objects that json cannot serialise.
            schema_text = str(json_schema)
    # The line break keeps the instruction from running into the last
    # characters of the table markup.
    return prompt_template + html_table + "\n" + prompt_output + schema_text

### Example: Biomass properties

In [7]:
# Read the raw HTML of the biomass table. The `with` block closes the file
# on exit, so no explicit close() is needed.
with open("html_tables/html_table_biomass.txt", 'r') as file:
    html_table_biomass = file.read()

In [11]:
# Show the table text with its last 1363 characters cut off.
# NOTE(review): 1363 is a hard-coded character count; presumably it trims
# the table to shorten the prompt — confirm and consider naming it.
html_table_biomass[:-1363]

'<table><thead><tr class="valign-top"><th scope="col" class="align-left rowsep-1" rowspan="3">Sample</th><th scope="col" class="align-left" colspan="5">Ultimate analysis</th><th scope="col" class="align-left rowsep-1" colspan="4">Proximate analysis</th><th scope="col" class="align-left">HHV</th><th scope="col" class="align-left">H/O</th><th scope="col" class="align-left">He density</th></tr><tr class="valign-top"><th scope="col" class="align-left rowsep-1" colspan="5">(wt%, db)</th><th scope="col" class="align-left">(wt%)</th><th scope="col" class="align-left rowsep-1" colspan="3">(wt%, db)</th><th scope="col" class="align-left">(MJ/kg, db)</th><th scope="col" class="align-left">ratio</th><th scope="col" class="align-left">(g/cm<sup>3</sup>)<a class="anchor workspace-trigger u-display-inline anchor-paragraph" href="#tblfn2" name="btblfn2"><span class="anchor-text"><sup>b</sup></span></a></th></tr><tr class="rowsep-1 valign-top"><th scope="col" class="align-left">C</th><th scope="col" c

In [9]:
# Load the JSON schema for the biomass table. The `with` block closes the
# file on exit, so no explicit close() is needed.
with open("json_schemas/json_schema_biomass.json") as file:
    json_schema_biomass = json.load(file)

In [10]:
# Display the loaded schema as the cell output.
json_schema_biomass

{'type': 'object',
 'properties': {'Biomass type': {'type': 'array',
   'items': {'type': 'object',
    'properties': {'Sample': {'type': 'string'},
     'Ultimate Analysis (wt%, db)': {'type': 'object',
      'properties': {'C': {'type': 'number'},
       'N': {'type': 'number'},
       'H': {'type': 'number'},
       'S': {'type': 'number'},
       'O': {'type': 'number'}}},
     'Proximate Analysis (wt%, db)': {'type': 'object',
      'properties': {'MC (wt%)': {'type': 'number'},
       'Ash': {'type': 'number'},
       'VM': {'type': 'number'},
       'FC': {'type': 'number'}}},
     'HHV (MJ/kg, db)': {'type': 'number'},
     'H/O': {'type': 'number'},
     'He density (g/cm3)': {'type': 'number'}}}}}}

In [60]:
# Build the full prompt for the biomass table, passing the table text
# without its last 1363 characters.
prompt_biomass = prompt(
    prompt_template=prompt_template,
    html_table=html_table_biomass[:-1363],
    json_schema=json_schema_biomass,
)

In [61]:
# Send the prompt to the model; the returned completion is parsed with
# json.loads in a later cell.
davinci_biomass = model_davinci(prompt_biomass)

In [81]:
# Parse first: if the completion is not valid JSON, json.loads raises before
# open(..., 'w') can create or truncate the results file.
davinci_biomass_parsed = json.loads(davinci_biomass)
with open("structured_openai_results/davinci_biomass.json", 'w') as file:
    json.dump(davinci_biomass_parsed, file)

In [82]:
# Reload the saved result. The `with` block closes the file on exit, so no
# explicit close() is needed.
with open("structured_openai_results/davinci_biomass.json") as file:
    davinci_biomass_json = json.load(file)

In [84]:
# Pretty-print the result with 4-space indentation; ensure_ascii=False
# leaves non-ASCII characters unescaped.
print(json.dumps(davinci_biomass_json, indent=4, ensure_ascii=False))

{
    "Biomass type": [
        {
            "Sample": "AS",
            "Ultimate Analysis (wt%, db)": {
                "C": 49.44,
                "N": 0.31,
                "H": 5.85,
                "S": 0.05,
                "O": 42.9
            },
            "Proximate Analysis (wt%, db)": {
                "MC (wt%)": 6.5,
                "Ash": 1.45,
                "VM": 78.9,
                "FC": 19.6
            },
            "HHV (MJ/kg, db)": 19.565,
            "H/O": 2.16,
            "He density (g/cm3)": 1.252
        },
        {
            "Sample": "CHE",
            "Ultimate Analysis (wt%, db)": {
                "C": 50.22,
                "N": 0.34,
                "H": 5.55,
                "S": 0.01,
                "O": 43.41
            },
            "Proximate Analysis (wt%, db)": {
                "MC (wt%)": 8.4,
                "Ash": 0.47,
                "VM": 81.2,
                "FC": 18.3
            },
            "HHV (MJ/kg, db)": 19.109