In [None]:
# Generate a new folder with cleaned publication JSONs and filtered table images
from papermage import Document
import os
from pathlib import Path
import warnings
import json
from cord19_plus.utils import image_from_box
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import groupby
import pandas as pd
import shutil
import glob
from multiprocessing import Manager
from tqdm import tqdm
import pymupdf
from concurrent.futures import ProcessPoolExecutor
import logging
# Use the root logger and raise its level to one step above CRITICAL, so that
# messages sent through it at any of the standard levels are filtered out.
logger = logging.getLogger()
logger.setLevel(1 + logging.CRITICAL)

from cord19_plus.data_model.helpers.caption import (get_caption_for_box,
                                                    get_cleaned_captions,
                                                    text_matches_pattern)


# Define the paths
# Glob pattern selecting the publication JSON files to process.
# (An earlier assignment to data/rel_pdfs_out was overwritten on the very next
# line and therefore never used; it has been dropped.)
# NOTE(review): this pattern points into the same directory as `output_pub`,
# so processed files are written back over their inputs -- confirm this is intended.
input_folder = "/workspaces/CORD19_Plus/data/clean/pub_json2/*.json"

# Directory holding the source PDFs, looked up as "<doc_id>.pdf".
input_pdf_path = "/workspaces/CORD19_Plus/data/rel_pdfs"
# Destination for the cleaned publication JSON files.
output_pub = "/workspaces/CORD19_Plus/data/clean/pub_json2"
# Destination for the exported table images.
output_tab_img = "/workspaces/CORD19_Plus/data/clean/tab_img2"
# Destination for copies of the PDFs that had tables exported.
output_pub_pdf = "/workspaces/CORD19_Plus/data/clean/pub_pdf3"

In [None]:
# Expand the glob pattern, order the matches lexicographically, and keep only
# the first 100 paths for this run.
matched_paths = glob.glob(input_folder)
pub_jsons = sorted(matched_paths)[:100]

In [None]:
def fix_tables(doc):
    """Return the tables of ``doc`` that pass the "tables" pattern check.

    A table is kept when either its caption (looked up with
    ``get_caption_for_box`` from the table's first box) or the table's own
    text satisfies ``text_matches_pattern(..., "tables")``.

    Args:
        doc: Parsed document exposing ``tables`` and ``captions``
            (presumably a papermage ``Document`` -- confirm against callers).

    Returns:
        list: The kept table entities, in the order they appear in ``doc.tables``.
    """
    healthy_tables = []

    # The cleaned caption ids are derived from the document alone, so they are
    # computed once instead of once per table (assumes get_cleaned_captions has
    # no per-call side effects). The lookup stays lazy so that a document
    # without tables never triggers it, exactly as before.
    t_caps_cleaned = None

    for table in doc.tables:
        if t_caps_cleaned is None:
            t_caps_cleaned = get_cleaned_captions(doc)

        t_caption = get_caption_for_box(box=table.boxes[0],
                                        captions=doc.captions,
                                        caption_ids=t_caps_cleaned["tables"])

        # Either signal is enough: a matching caption or matching table text.
        if text_matches_pattern(t_caption, "tables") or text_matches_pattern(table.text, "tables"):
            healthy_tables.append(table)

    return healthy_tables

In [None]:
def save_clean_tables(path, shared_dict):

    #check if output already exists
    file_name = path.split("/")[-1]
    #if os.path.exists(f"{output_pub}/{file_name}"):
    #    return

    try:
        with open(path, "r") as f:
            doc_json = json.load(f)
            current_doc = Document.from_json(doc_json)
        

        with shared_dict['lock']:  # Ensure shared access is thread-safe
            shared_dict['all_cnt'] += len(current_doc.tables)
            shared_dict['all_tables'] += [{path.split("/")[-1].replace(".json", "") : len(current_doc.tables)}]
        
            
        current_doc.tables.entities = fix_tables(current_doc)
        
        with shared_dict['lock']:
            shared_dict['healthy_cnt'] += len(current_doc.tables)
            shared_dict['healthy_tables'] += [{path.split("/")[-1].replace(".json", "") : len(current_doc.tables)}]

        
        with open(f"{output_pub}/{file_name}", "w") as _f:
            json.dump(current_doc.to_json(), _f)
        
        #save images
        entities = list(current_doc.get_layer("tables"))
        
        entities = sorted(entities, key=lambda x: x.boxes[0].page)
        for key, group in groupby(entities, key=lambda x: x.boxes[0].page):
            for table_id, table in enumerate(list(group)):
                box = table.boxes[0]
                page = box.page
                doc_id = file_name.split(".")[0]
                scale = pymupdf.Matrix(2, 2)

                im_path = output_tab_img / Path(f"{doc_id}_{page}_{table_id}.png")

                try:
                    shutil.copy(f"{input_pdf_path}/{doc_id}.pdf", f"{output_pub_pdf}/{doc_id}.pdf")
                    image_from_box(box, f"{input_pdf_path}/{doc_id}.pdf", im_path, scale)
                    logger.info(
                        f"For {doc_id} : {input_pdf_path}/{doc_id}.pdf exported table nr {table_id} from page {box.page} to {im_path}"
                    )
                except ValueError as e:
                    logger.error(
                        f"Failed for {doc_id} table {table_id} from page {box.page}, pdf path {input_pdf_path}/{doc_id}.pdf. Because of ValueError: {e}"
                    )
                    continue
                except pymupdf.FileNotFoundError as f:
                    logger.error(f"PDF file not found for {doc_id},{input_pdf_path}/{doc_id}.pdf ")
    except:
        print(f"something wrong with {path}")

In [None]:
max_workers = 10

# Declared up front so the results remain available after the manager has shut down.
all_cnt = healthy_cnt = all_tables = healthy_tables = None

# The executor is entered first and the manager second, so the manager is
# closed before the pool is shut down.
with ProcessPoolExecutor(max_workers=max_workers) as executor, Manager() as manager:
    shared_dict = manager.dict()
    shared_dict['lock'] = manager.Lock()  # guards every update of the entries below
    shared_dict['all_cnt'] = 0            # running total of tables before filtering
    shared_dict['healthy_cnt'] = 0        # running total of tables after filtering
    shared_dict['all_tables'] = []        # one {doc_id: count} entry per document
    shared_dict['healthy_tables'] = []    # one {doc_id: count} entry per document

    futures = []
    for json_path in pub_jsons:
        futures.append(executor.submit(save_clean_tables, json_path, shared_dict))

    # Wait for the jobs in submission order, with a progress bar.
    for future in tqdm(futures, total=len(futures), desc="Total PDFs", leave=True):
        future.result()  # re-raises anything the worker let escape

    # Read the results out while the manager is still alive.
    all_cnt = shared_dict['all_cnt']
    healthy_cnt = shared_dict['healthy_cnt']
    all_tables = shared_dict['all_tables']
    healthy_tables = shared_dict['healthy_tables']

In [None]:
# Collapse each list of single-document {doc_id: count} dicts into one flat
# mapping; if a doc_id occurs more than once, the last entry wins.
# NOTE: this rebinds the names, so the cell is meant to run once per pool run.
all_tables = dict(item for entry in all_tables for item in entry.items())
healthy_tables = dict(item for entry in healthy_tables for item in entry.items())

In [None]:
# One row per document: its id and the table count before / after filtering.
all_df = pd.DataFrame({
    "docno": list(all_tables.keys()),
    "all_tables": list(all_tables.values()),
})
healthy_df = pd.DataFrame({
    "docno": list(healthy_tables.keys()),
    "healthy_tables": list(healthy_tables.values()),
})

In [None]:
# Left join: every document from all_df is kept; documents without a
# healthy-table entry get NaN in 'healthy_tables'.
table_df = all_df.merge(healthy_df, how='left', on='docno')

In [None]:
# Load the qrels table; later cells rely on its 'docno' and 'qid' columns.
qrels_path = "/workspaces/CORD19_Plus/data/qrels.csv"
qrels_df = pd.read_csv(qrels_path)

In [None]:


count_cols = ['all_tables', 'healthy_tables']

# Attach the per-document counts to every qrels row; documents without counts
# contribute zero tables.
merged_df = pd.merge(qrels_df, table_df, on='docno', how='left')
merged_df[count_cols] = merged_df[count_cols].fillna(0)

# Total both counts per 'qid', ordered by 'qid' for a readable x-axis.
grouped = (
    merged_df.groupby('qid')[count_cols]
    .sum()
    .reset_index()
    .sort_values('qid')
)

# Long format for seaborn: one row per (qid, table type).
melted = grouped.melt(
    id_vars='qid',
    value_vars=count_cols,
    var_name='Table Type',
    value_name='Count',
)

sns.set(style="whitegrid")

# Grouped bar chart: all tables vs. healthy tables for each qid.
fig, ax = plt.subplots(figsize=(20, 10))
sns.barplot(x='qid', y='Count', hue='Table Type', data=melted, ax=ax)

ax.set_title('Number of Tables and Healthy Tables per QID', fontsize=16)
ax.set_xlabel('QID', fontsize=14)
ax.set_ylabel('Number of Tables', fontsize=14)
ax.legend(title='Table Type', fontsize=12, title_fontsize=12)

fig.tight_layout()
plt.show()