# JSON structured data from tables in scientific papers

## OpenAI model

In [1]:
import os
import json
from typing import List

from IPython.display import HTML
from langchain.llms import OpenAI
from pydantic import BaseModel, Field
from pydantic.schema import schema

# Keep a key that is already exported in the environment; the placeholder
# below is only a fallback and must be replaced with a real key before use.
os.environ.setdefault("OPENAI_API_KEY", "your_openai_api_key")

In [4]:
# Completion model used for the table-to-JSON extraction.
# temperature=0 keeps sampling as deterministic as the API allows: the task
# is a faithful transcription of the table into the given schema, so random
# variation (e.g. renamed or reordered keys) is unwanted.
model_davinci = OpenAI(
    model_name="text-davinci-003",
    temperature=0,
    max_tokens=1200,
)

In [5]:
# Instruction text placed in front of the table HTML when the prompt is
# assembled by prompt().
prompt_template = """
Generate a JSON object extracting the information from this table in html code:
"""

In [6]:
def prompt(prompt_template, html_table, json_schema):
    """Assemble the extraction prompt sent to the language model.

    Args:
        prompt_template: Instruction text placed before the table.
        html_table: HTML markup of the table to extract data from.
        json_schema: Schema the answer must follow. A ``str`` is inserted
            verbatim; any other object is serialised with ``json.dumps``.

    Returns:
        One string made of the template, the table, the output instruction
        and the schema, in that order.
    """
    prompt_output = "Generate the JSON result with the following JSON schema and give only the JSON as output: "

    if isinstance(json_schema, str):
        schema_text = json_schema
    else:
        try:
            # str(dict) would give a Python repr (single quotes, True/None),
            # which is not JSON; the model should see the schema as real JSON.
            schema_text = json.dumps(json_schema, ensure_ascii=False)
        except (TypeError, ValueError):
            # Not JSON-serialisable: fall back to the plain string form
            # rather than failing.
            schema_text = str(json_schema)

    # The newline keeps the instruction from being glued to the last tag of
    # the table markup.
    return prompt_template + html_table + "\n" + prompt_output + schema_text

### Example: Properties of activated carbons for CO<sub>2</sub> adsorption

In [7]:
# Read the table markup into a string. The `with` block closes the file on
# exit, so no explicit close() is needed.
# The table contains non-ASCII characters (e.g. "∘"), so the encoding is set
# explicitly instead of relying on the platform default -- assumes the file
# is saved as UTF-8; confirm.
with open("html_tables/html_table_carbonsCO2ads.txt", 'r', encoding="utf-8") as file:
    html_table_carbonsCO2ads = file.read()

In [8]:
html_table_carbonsCO2ads

'<table><thead><tr class="rowsep-1"><th scope="col">Feedstock</th><th scope="col">Temperature (<sup>∘</sup>C)</th><th scope="col">Surface area (m<sup>2</sup>/g)</th><th scope="col">Total Pore Volume (cm<sup>3</sup>/g)</th><th scope="col">CO<sub>2</sub> capacity (mg/g)</th><th scope="col">Ref.</th></tr></thead><tbody><tr><th scope="row">Rambutan peel</th><td class="align-char">500</td><td class="align-char">7.80</td><td>0.011</td><td class="align-char">27.83</td><td><a class="anchor workspace-trigger u-display-inline anchor-paragraph" href="#bib100" name="bbib100"><span class="anchor-text">[100]</span></a></td></tr><tr><th scope="row">Rambutan peel</th><td class="align-char">700</td><td class="align-char">175.84</td><td>0.111</td><td class="align-char">56.61</td><td><a class="anchor workspace-trigger u-display-inline anchor-paragraph" href="#bib100" name="bbib100"><span class="anchor-text">[100]</span></a></td></tr><tr><th scope="row">Rambutan peel</th><td class="align-char">900</td><td

In [9]:
# Load the JSON schema the model output must follow. The `with` block closes
# the file on exit, so no explicit close() is needed.
# The encoding is explicit because the schema keys contain non-ASCII
# characters (e.g. "∘") and the platform default is not always UTF-8.
with open("json_schemas/json_schema_carbonsCO2ads.json", encoding="utf-8") as file:
    json_schema_carbonsCO2ads = json.load(file)

In [10]:
json_schema_carbonsCO2ads

{'type': 'object',
 'properties': {'Activated carbons': {'type': 'array',
   'items': {'type': 'object',
    'properties': {'Feedstock': {'type': 'string'},
     'Temperature (∘C)': {'type': 'number'},
     'Surface area (m2/g)': {'type': 'number'},
     'Total Pore Volume (cm3/g)': {'type': 'number'},
     'CO2 capacity (mg/g)': {'type': 'number'}}}}}}

In [18]:
# Build the prompt from the shared template, the table HTML and the schema.
# NOTE(review): [:-5573] drops the last 5573 characters of the HTML --
# presumably to keep the prompt within the model's context limit; confirm
# the value and that the cut does not land in the middle of a table row.
prompt_carbonsCO2ads = prompt(prompt_template, html_table_carbonsCO2ads[:-5573], json_schema_carbonsCO2ads)

In [19]:
# Send the prompt to the model and keep its reply; later cells parse this
# value as a JSON document.
davinci_carbonsCO2ads = model_davinci(prompt_carbonsCO2ads)

#### Pydantic JSON validation

In [23]:
# One entry of the "Activated carbons" array. Attribute names are plain
# Python identifiers; each alias is the JSON key the value is read from.
class ActivatedCarbons(BaseModel):
    Feedstock: str
    Temperature: float = Field(alias = "Temperature (∘C)")
    Surface_area: float = Field(alias = "Surface area (m2/g)")
    # NOTE(review): declared as str although the JSON schema loaded in this
    # notebook types this property as "number". The recorded model output
    # holds "NA" for some rows, which is presumably why -- confirm.
    Total_Pore_Volume: str = Field(alias = "Total Pore Volume (cm3/g)")
    # NOTE(review): this alias is "Capacity CO2 (mg/g)", while the schema key
    # is "CO2 capacity (mg/g)". It matches the key found in the recorded
    # model output, not the schema -- confirm which one is intended.
    Capacity_CO2: float = Field(alias = "Capacity CO2 (mg/g)")
        
# Top-level document: the list of ActivatedCarbons entries stored under the
# JSON key "Activated carbons".
class CarbonsList(BaseModel):
    activated_carbons: List[ActivatedCarbons] = Field(alias = "Activated carbons")

In [24]:
# Parse the raw model reply with the CarbonsList model; the cell echoes the
# validated object. NOTE(review): expected to raise a pydantic validation
# error when the reply does not fit the model -- confirm.
CarbonsList.parse_raw(davinci_carbonsCO2ads)

CarbonsList(activated_carbons=[ActivatedCarbons(Feedstock='Rambutan peel', Temperature=500.0, Surface_area=7.8, Total_Pore_Volume='0.011', Capacity_CO2=27.83), ActivatedCarbons(Feedstock='Rambutan peel', Temperature=700.0, Surface_area=175.84, Total_Pore_Volume='0.111', Capacity_CO2=56.61), ActivatedCarbons(Feedstock='Rambutan peel', Temperature=900.0, Surface_area=569.64, Total_Pore_Volume='0.313', Capacity_CO2=68.74), ActivatedCarbons(Feedstock='Hickory wood', Temperature=300.0, Surface_area=0.1, Total_Pore_Volume='NA', Capacity_CO2=34.48), ActivatedCarbons(Feedstock='Hickory wood', Temperature=450.0, Surface_area=12.9, Total_Pore_Volume='NA', Capacity_CO2=44.96), ActivatedCarbons(Feedstock='Hickory wood', Temperature=600.0, Surface_area=401.0, Total_Pore_Volume='NA', Capacity_CO2=61.0), ActivatedCarbons(Feedstock='Sugarcane baggasse', Temperature=300.0, Surface_area=5.2, Total_Pore_Volume='NA', Capacity_CO2=38.72), ActivatedCarbons(Feedstock='Sugarcane baggasse', Temperature=450.0, 

In [12]:
# Parse the reply BEFORE opening the output file: mode 'w' truncates the
# file on open, so parsing inside the `with` block would wipe a previously
# saved result whenever the reply is not valid JSON.
davinci_carbonsCO2ads_parsed = json.loads(davinci_carbonsCO2ads)
with open("structured_openai_results/davinci_carbonsCO2ads.json", 'w', encoding="utf-8") as file:
    json.dump(davinci_carbonsCO2ads_parsed, file)

In [13]:
# Reload the saved result from disk. The `with` block closes the file on
# exit, so no explicit close() is needed; the encoding is explicit so the
# read does not depend on the platform default.
with open("structured_openai_results/davinci_carbonsCO2ads.json", encoding="utf-8") as file:
    davinci_carbonsCO2ads_json = json.load(file)

In [14]:
# Pretty-print the reloaded result; ensure_ascii=False leaves non-ASCII
# characters such as "∘" unescaped in the printed text.
print(json.dumps(davinci_carbonsCO2ads_json, indent=4, ensure_ascii=False))

{
    "Activated carbons": [
        {
            "Feedstock": "Rambutan peel",
            "Temperature (∘C)": 500,
            "Surface area (m2/g)": 7.8,
            "Total Pore Volume (cm3/g)": 0.011,
            "Capacity CO2 (mg/g)": 27.83
        },
        {
            "Feedstock": "Rambutan peel",
            "Temperature (∘C)": 700,
            "Surface area (m2/g)": 175.84,
            "Total Pore Volume (cm3/g)": 0.111,
            "Capacity CO2 (mg/g)": 56.61
        },
        {
            "Feedstock": "Rambutan peel",
            "Temperature (∘C)": 900,
            "Surface area (m2/g)": 569.64,
            "Total Pore Volume (cm3/g)": 0.313,
            "Capacity CO2 (mg/g)": 68.74
        },
        {
            "Feedstock": "Hickory wood",
            "Temperature (∘C)": 300,
            "Surface area (m2/g)": 0.1,
            "Total Pore Volume (cm3/g)": "NA",
            "Capacity CO2 (mg/g)": 34.48
        },
        {
            "Feedstock": "Hickory wood",
