In [None]:
import base64
import concurrent.futures
import glob
import json
import os
import pickle
import threading

import pandas as pd
import requests
from dotenv import load_dotenv
from langchain import PromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import JsonOutputParser
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from tqdm import tqdm

In [None]:
class TableSchema(BaseModel):
    # Shape of one extracted table; this class is handed to ChatOpenAI as the
    # `response_format` argument when the model client is created.
    header: list[str]         # presumably the table's column titles, in order
    content: list[list[str]]  # presumably the table body, one inner list per row

In [None]:
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string.

    The file is read in binary mode; the base64 bytes are decoded as UTF-8 so
    that the caller receives a ``str`` rather than ``bytes``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
  
def get_pic_name_by_path(path):
    """Return the file name of ``path`` without its directory or extension.

    Only the final extension is removed, so dots inside the name survive:
    ``"/imgs/PMC123.xml_tab_1.png"`` -> ``"PMC123.xml_tab_1"``. Cutting at the
    first dot instead would give every such image of one paper the same name,
    and therefore the same cached JSON file.

    Args:
        path: Path to an image file.

    Returns:
        The base name of ``path`` with its last extension stripped.
    """
    stem, _extension = os.path.splitext(os.path.basename(path))
    return stem

In [None]:
# Read the project's .env file, then look the OpenAI key up in the process
# environment; OPENAI_API_KEY is None when the variable is not set.
load_dotenv("/workspaces/CORD19_Plus/.env")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [None]:
# Name of the OpenAI chat model used for the extraction.
# Alternative that was tried: model = "gpt-4o"
model = "gpt-4o-mini"

# Directory prefix of the table images. A file-name stem and "*" are appended
# directly to this string when globbing, so the trailing slash is required.
input_path = "/workspaces/CORD19_Plus/data/clean/tab_img2/"

# Glob pattern for the publication JSON files, and the directory in which
# process_image stores one JSON file per image.
root_parse_path = "/workspaces/CORD19_Plus/data/clean/pub_json2/*.json"
table_root_path = "/workspaces/CORD19_Plus/data/clean/tab_json2"
json_paths = sorted(glob.glob(root_parse_path))  # matching paths, sorted

# TableSchema is passed as `response_format`; presumably this makes the model
# reply with JSON that follows the schema -- TODO confirm for the installed
# langchain_openai version.
llm = ChatOpenAI(model = model, response_format=TableSchema)
# Used by process_image to turn the model's reply text into Python objects.
parser = JsonOutputParser()

In [None]:
# Gather the table images that belong to each publication JSON: every file
# under `input_path` whose name starts with the JSON's file-name stem.
input_pics = []
for path in tqdm(json_paths):
    # ".../<stem>.json" -> "<stem>"
    file_name = path.rsplit("/", 1)[-1].replace(".json", "")
    matching_pics = glob.glob(f"{input_path}{file_name}*")
    input_pics.extend(matching_pics)

In [None]:
def process_image(pic, llm, parser, prompt, path_to_safe = table_root_path):
    """Extract the table shown in image ``pic`` and cache the result as JSON.

    The result lives at ``<path_to_safe>/<name>.json`` where ``<name>`` is
    ``get_pic_name_by_path(pic)``. When that file already holds valid JSON it
    is returned directly and the model is not called.

    Args:
        pic: Path to the image file.
        llm: Chat model; ``llm.invoke(messages)`` must return an object whose
            ``content`` attribute is the reply text.
        parser: Object whose ``parse(text)`` turns the reply text into
            JSON-serialisable data.
        prompt: Instruction text sent together with the image.
        path_to_safe: Directory holding the cached JSON files; it is created
            if it does not exist yet.

    Returns:
        The parsed extraction, loaded from the cache or freshly produced.

    Raises:
        Whatever ``encode_image``, ``llm.invoke``, ``parser.parse`` or the
        file operations raise. A failed write leaves no partial cache file.
    """
    name = get_pic_name_by_path(pic)
    out_path = f'{path_to_safe}/{name}.json'

    # Reuse an earlier extraction if there is one.
    if os.path.exists(out_path):
        try:
            with open(out_path, 'r') as json_file:
                return json.load(json_file)
        except json.JSONDecodeError:
            # Unreadable cache entry (e.g. truncated by an interrupted run):
            # treat it as a miss and extract again instead of failing forever.
            pass

    # Make sure the result can be stored *before* paying for the model call.
    os.makedirs(path_to_safe, exist_ok=True)

    base64_image = encode_image(pic)
    res = llm.invoke(
        [
            HumanMessage(
                content=[
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            # NOTE(review): the data URL is always labelled
                            # image/jpeg, whatever the file's real format is.
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        }
                    }
                ]
            )
        ]
    )
    parsed_res = parser.parse(res.content)

    # Write to a private temp file and rename it into place, so that a reader
    # (or a later run) never sees a half-written JSON file. The pid/thread id
    # keeps concurrent workers from sharing a temp file.
    tmp_path = f'{out_path}.{os.getpid()}.{threading.get_ident()}.tmp'
    try:
        with open(tmp_path, 'w') as json_file:
            json.dump(parsed_res, json_file, indent=4)
        os.replace(tmp_path, out_path)
    finally:
        # Only still present when the dump or the rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    return parsed_res

In [None]:
# Run process_image for every picture on a thread pool and collect the outcomes.
results = []       # parsed tables, in completion order
failed_pics = []   # (pic, exception) for every picture that could not be processed
max_workers = 60   # number of concurrent requests; lower it if the API rate-limits

prompt = """
You are an expert at extracting textual and tabular data from an image. 
You will be given an image that originates from a research paper, and you should parse all information from the table without surrounding text.
Return all information to the given structure.
"""

with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:

    # Schedule one call per picture and remember which picture each future
    # belongs to, so failures can be attributed below.
    futures = {executor.submit(process_image, pic, llm, parser, prompt): pic for pic in input_pics}
    print(f"{len(futures)} tasks started")

    with tqdm(total=len(futures)) as pbar:
        # Aggregate the results as the workers finish.
        for future in concurrent.futures.as_completed(futures):
            pic = futures[future]

            try:
                result = future.result()
                results.append(result)
            except Exception as e:
                # Keep going with the remaining pictures, but record which one
                # failed so it can be inspected or retried afterwards.
                failed_pics.append((pic, e))
                print(f"There was an error processing {pic}: {e}")

            pbar.update()