In [1]:
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

import pandas as pd
import random

import pyterrier as pt
#if not pt.started():
#   pt.init()

from cord19_plus.data_model.model import Table
from cord19_plus.data_model.model_gold import GoldTable

from dotenv import dotenv_values
from cord19_plus.data_model.database_setup import create_engine

from cord19_plus.data_model.model_gold import Base

# Database connection settings are read from the project's .env file; the cells
# below use its USER, PASSWORD, ADDRESS, PORT and DB entries.
# NOTE(review): this is an absolute path — adjust it when running elsewhere.
env_file = "/workspaces/CORD19_Plus/src/cord19_plus/data_model/cord19.env"
db_vals = dotenv_values(env_file)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Session on the source database that the Table rows are queried from.
# `create_engine` here is the one imported from
# cord19_plus.data_model.database_setup: it is imported after sqlalchemy's
# function of the same name and therefore shadows it.
source_db_url = (
    f"postgresql+psycopg2://{db_vals['USER']}:{db_vals['PASSWORD']}"
    f"@{db_vals['ADDRESS']}:{db_vals['PORT']}/{db_vals['DB']}"
)
engine = create_engine(source_db_url, echo=False)
session = Session(engine)

In [2]:
# Session on the "gold_tables" database: same server and credentials as the
# source database, but with a fixed database name.
gold_db_url = (
    f"postgresql+psycopg2://{db_vals['USER']}:{db_vals['PASSWORD']}"
    f"@{db_vals['ADDRESS']}:{db_vals['PORT']}/gold_tables"
)
engine_gold = create_engine(gold_db_url, echo=False)
# Schema creation is left disabled:
#Base.metadata.create_all(engine_gold)

session_gold = Session(engine_gold)


In [None]:
# One plain dict per Table row. "docno" carries the table-level id (ir_tab_id)
# as a string; an earlier revision of this cell used the row's `id` instead.
result = session.query(Table)
result_dict = [
    {
        "docno": str(row.ir_tab_id),
        "ir_id": row.ir_id,
        "content": row.content,
        "content_json": row.content_json,
        "caption": row.caption,
    }
    for row in result
]

In [None]:
# Relevance judgements of the TREC-COVID collection, restricted to query id "1".
dataset = pt.get_dataset('irds:cord19/fulltext/trec-covid')
qrels = dataset.get_qrels()
is_first_topic = qrels['qid'] == "1"
qrels = qrels.loc[is_first_topic]

In [None]:
# Ids of all documents that own at least one table row.
avail_ids = {entry['ir_id'] for entry in result_dict}
# Distinct table ids; kept as a list because annotation_set() shuffles it and
# removes the ids it picks, in place.
avail_tab_ids = list({entry['docno'] for entry in result_dict})
# Number of tables per document, counted in a single pass over result_dict
# (previously result_dict was rescanned once for every document id).
num_tables = dict.fromkeys(avail_ids, 0)
for entry in result_dict:
    num_tables[entry['ir_id']] += 1
# Keep only the judgements whose document has tables available.
qrels = qrels[qrels['docno'].isin(avail_ids)]

In [None]:
# One row per document: its id (named "docno" so it lines up with qrels) and
# the number of tables found for it.
count_df = pd.DataFrame(
    {
        "docno": list(num_tables),
        "count": list(num_tables.values()),
    }
)

In [None]:
# Attach the per-document table count to every judgement. With no `on=`
# argument the merge joins on the column names both frames share, using
# pandas' default inner join.
qrels = pd.merge(qrels, count_df)

In [None]:
def annotation_set(avail_tab_ids, qrels, size=10):
    """
    Draw up to ``size`` random table ids for each relevance label 0, 1 and 2.

    The label of a table is the label of its document: the part of the table id
    before the first ``"_"`` is looked up in ``qrels['docno']``.

    Args:
        avail_tab_ids (list): Table ids still available for sampling. Mutated in
            place: the list is shuffled and every id that gets picked is
            removed, so a later call draws from what is left.
        qrels: Frame with ``docno`` and ``label`` columns. If a docno occurs
            more than once, the label of its last row is used.
        size (int): Maximum number of table ids collected per label.

    Returns:
        dict: ``{0: [...], 1: [...], 2: [...]}`` with the picked table ids.

    Table ids whose document has no judgement, or whose label is not 0, 1 or 2,
    are never picked and stay in ``avail_tab_ids``. The number of available ids
    is printed before and after the draw.
    """
    print(len(avail_tab_ids))
    random.shuffle(avail_tab_ids)

    # docno -> label, built once instead of filtering the frame for every id.
    label_by_docno = dict(zip(qrels['docno'], qrels['label']))

    selected = {0: [], 1: [], 2: []}
    remaining = []
    # Picked ids are collected separately and the input list is rewritten once
    # at the end. Removing from the list while looping over it made the loop
    # skip the element that followed every pick.
    for tab_id in avail_tab_ids:
        label = label_by_docno.get(tab_id.split("_")[0])
        bucket = selected.get(label)
        if bucket is not None and len(bucket) < size:
            bucket.append(tab_id)
        else:
            remaining.append(tab_id)

    # Slice assignment keeps the caller's list object and only swaps its contents.
    avail_tab_ids[:] = remaining
    print(len(avail_tab_ids))
    return selected

In [None]:
# Two consecutive draws from the same pool: every call removes the ids it picked
# from avail_tab_ids, so the second set is sampled from what the first left.
a1_annotation_set, a2_annotation_set = (
    annotation_set(avail_tab_ids, qrels) for _ in range(2)
)

In [None]:
#tables_to_commit = session.query(Table).filter(Table.ir_tab_id.in_(set(bjoern_annotation_set[0]))).all()

In [None]:
#tables_to_commit

In [None]:
def commit_gold_tables(annotation_set, name, session, session_gold):
    """
    Copy the selected tables into the gold database for one annotator.

    All ids found in the lists of ``annotation_set`` are looked up as
    ``Table.ir_tab_id`` through ``session``. Every matching row is copied field
    by field into a new ``GoldTable``; all of them are added to ``session_gold``
    and committed together.

    Args:
        annotation_set (dict): Maps a key to a list of table ids. Only the
            lists are read; the keys are ignored.
        name: Value stored as ``annotator_name`` on every created GoldTable.
        session: Session used to query the source ``Table`` rows.
        session_gold: Session the new ``GoldTable`` rows are added to and
            committed on.

    Raises:
        Exception: Any error raised while adding or committing is re-raised
            after ``session_gold`` has been rolled back.
    """
    annotation_ids = {
        tab_id
        for sub_list in annotation_set.values()
        for tab_id in sub_list
    }

    tables_to_commit = session.query(Table).filter(Table.ir_tab_id.in_(annotation_ids)).all()

    gold_tables = [
        GoldTable(
            content=table.content,
            ir_tab_id=table.ir_tab_id,
            ir_id=table.ir_id,
            content_json=table.content_json,
            caption=table.caption,
            position_left=table.position_left,
            position_top=table.position_top,
            position_page=table.position_page,
            width=table.width,
            height=table.height,
            # proxy references
            references=["d1", "d2", "d3"],
            annotator_name=name,
        )
        for table in tables_to_commit
    ]

    try:
        session_gold.add_all(gold_tables)
        session_gold.commit()
    except Exception:
        # session_gold is reused by later cells; after a failed flush/commit it
        # cannot be used again until it has been rolled back.
        session_gold.rollback()
        raise

In [None]:
# Write the first draw to the gold database under the name "Annotator 1".
commit_gold_tables(
    annotation_set=a1_annotation_set,
    name="Annotator 1",
    session=session,
    session_gold=session_gold,
)

In [None]:
# Write the second draw to the gold database under the name "Annotator 2".
commit_gold_tables(
    annotation_set=a2_annotation_set,
    name="Annotator 2",
    session=session,
    session_gold=session_gold,
)

In [None]:
# Get PDFs: list the PDF files belonging to the documents in the gold database.

# This repeats the gold-database connection setup from earlier in the notebook
# and rebinds engine_gold / session_gold to newly created objects.
gold_db_url = (
    f"postgresql+psycopg2://{db_vals['USER']}:{db_vals['PASSWORD']}"
    f"@{db_vals['ADDRESS']}:{db_vals['PORT']}/gold_tables"
)
engine_gold = create_engine(gold_db_url, echo=False)
#Base.metadata.create_all(engine_gold)

session_gold = Session(engine_gold)

In [3]:
# Unfiltered query over GoldTable in the gold database; it is iterated in the
# next cell.
tables = session_gold.query(GoldTable)

In [5]:
# Document id (ir_id) of every gold table; a document with several gold tables
# appears several times.
t_ids = []
for gold_table in tables:
    t_ids.append(gold_table.ir_id)

In [9]:
# One "<ir_id>.pdf" filename per distinct document; the set removes duplicate
# ids and gives no guaranteed order.
unique_doc_ids = set(t_ids)
files = ["{}.pdf".format(doc_id) for doc_id in unique_doc_ids]

In [11]:
# Display the derived PDF filenames.
files

['kjovtgua.pdf',
 'w7ej6jfg.pdf',
 'k2juhyex.pdf',
 'zk44e4qy.pdf',
 'lxakf79k.pdf',
 '1aal6njl.pdf',
 '51w1fe7k.pdf',
 '8ow952d8.pdf',
 'hib30ct6.pdf',
 'qele28zk.pdf',
 'w0pbk3kv.pdf',
 'cyp9fbw0.pdf',
 'zdv0ilti.pdf',
 '4ihv80au.pdf',
 'd3rrnjz2.pdf',
 '8arwlhf0.pdf',
 'eav5gr3y.pdf',
 'v08cs51n.pdf',
 '8hiurkho.pdf',
 '50oy9qqy.pdf',
 'e6jt8yhs.pdf',
 'aeyf0yu1.pdf',
 'd8n9711b.pdf',
 'rmmp3gms.pdf',
 'qyg8hn56.pdf',
 'kzk4i2j2.pdf',
 'tsje2x90.pdf',
 'x9bxnrtn.pdf',
 'b2znv6pa.pdf',
 'dbowa5bt.pdf',
 'bojfc3q0.pdf',
 '08ds967z.pdf',
 'qeehgxa1.pdf',
 'unvabosp.pdf',
 'jgwvjkbj.pdf',
 'ik15f074.pdf',
 'zqf351sv.pdf',
 'uexahhdr.pdf',
 '1s6dlcer.pdf',
 'jljjqs6m.pdf',
 'a56u5e2o.pdf',
 'zu46bdpu.pdf',
 '9slpoyz7.pdf',
 'i758v1vb.pdf',
 'iy4knx7j.pdf',
 'fcmzdcuh.pdf']