In [1]:
import pandas as pd
from pathlib import Path
from figure import Figure, Label, Observation
import numpy as np
from document import Cord19Document

In [2]:

vil_files_path = Path('/Users/jtrell2/data/biocuration/vil-al-interface/files/cord19')
all_path = vil_files_path / 'all.parquet'

df_all = pd.read_parquet(all_path)

### 1. Import every dataset that does not come from tinman or CORD19
These datasets should be the easy ones as there is no associated document to match

In [3]:
all_parquet_file = 'cord19_higher-modality_v1.parquet'

tax_parquet_files = [
  'cord19_experimental_v1.parquet',
  'cord19_gel_v1.parquet',
  'cord19_graphics_v1.parquet',
  'cord19_microscopy_v1.parquet',
  'cord19_molecular_v1.parquet',
  'cord19_radiology_v1.parquet',
  'cord19_electron_v1.parquet'
]

In [4]:
vil_files_path = Path('/Users/jtrell2/data/biocuration/vil-al-interface/files/cord19')
df_all_images = pd.read_parquet(vil_files_path / all_parquet_file)

# while exploring radiology images, I found some cases that were not in high modality.parquet
# this is a mistake, so we need to fusion these images with the full_parquet data
df_missing_from_full = pd.read_parquet('./missing_rad_ang.parquet')
df_missing_from_full = df_missing_from_full[['img', 'label', 'caption', 'width', 'height', 'prediction', 'img_path']]

df_all_images = pd.concat([df_all_images, df_missing_from_full])

df_tinman = df_all_images[df_all_images.source == 'tinman']
df_cord19 = df_all_images[df_all_images.source == 'cord19']
df_others = df_all_images[~df_all_images.source.isin(['tinman', 'cord19'])]
df_full_labels = pd.read_parquet(vil_files_path / 'all.parquet')
print(df_full_labels.shape)

df_full_labels = pd.concat([df_full_labels, df_missing_from_full])
print(df_full_labels.shape)

(524139, 7)
(527091, 7)


In [5]:
import psycopg
from dotenv import dotenv_values

def connect(host: str, port: int, dbname: str, user: str, password: str) -> psycopg.Connection:
  conn_str = f"host={host} port={port} dbname={dbname} user={user} password={password}"
  return psycopg.connect(conn_str)

env_file = '../../.env'
config = dotenv_values(env_file)
conn = connect(**config)

In [6]:
import time

STATUS_LABELED = 0
STATUS_UNLABELED = 1
STATUS_LABELED_EXTERNALLY = 2

TYPE_FIGURE = 0
TYPE_SUBFIGURE = 1

def insert_figure(conn, figure: Figure):
  with conn.cursor() as cur:
    sql_string = "INSERT INTO dev.figures (name,caption,num_panes,fig_type,doc_id,status,uri,parent_id,width,height,coordinates,last_update_by,owner,migration_key,notes,labels,source) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) RETURNING id;"
    cur.execute(sql_string, figure.to_tuple())
    conn.commit()
    id = cur.fetchone()[0]
    return id

def insert_figures(db_params: dict, figures: list[Figure]):
    try:
      conn = connect(**db_params)
      with conn.cursor() as cur:
        with cur.copy("COPY dev.figures (name,caption,num_panes,fig_type,doc_id,status,uri,parent_id,width,height,coordinates,last_update_by,owner,migration_key,notes,labels,source) FROM STDIN") as copy:
          for f in figures:
            copy.write_row(f.to_tuple())
      conn.commit()
    except Exception as e:
      print(e)
    finally:
      conn.close()


def get_full_labels(df) -> dict[str,str]:
  labels = {}
  for idx, el in df.iterrows():
    labels[el.img_path] = el.label
  return labels

def insert_images_from_external_sources(config, df, full_labels):
  figures = []

  for idx, el in df.iterrows():
    # df_all has the more detailed labels
    label = full_labels[el.img_path]
    figure = Figure(
      id=None,
      status=STATUS_LABELED_EXTERNALLY,
      uri=el.img_path,
      width=el.width,
      height=el.height,
      type=TYPE_SUBFIGURE,
      name=el.name,
      caption=None if el.caption == "" else el.caption,
      num_panes=1,
      doc_id=None,
      parent_id=None,
      coordinates=None,
      last_update_by=None,
      owner=None,
      migration_key=None,
      notes=None,
      labels=[label],
      source=el.source
    )
    figures.append(figure)
  insert_figures(config, figures)


def build_pmc_to_id_doc_dic(db_params) -> dict[str,str]:
  """ Get a dictionary [pmcid, doc_id] to match figures to their corresponding
      source documents """
  try:
    conn = connect(**db_params)
    with conn.cursor() as cur:
      cur.execute("select id, pmcid from dev.documents where pmcid != ''")
      rows = cur.fetchall()
      return {r[1]: r[0] for r in rows}
  except Exception as e:
    print(e)
    raise Exception(e)
  finally:
    conn.close()


def extract_pmcid(img_path):
  els = img_path.split('/')
  return els[1]

def insert_images_from_cord19(
  config,
  df,
  full_labels: dict[str,str],
  pmc2id_dict: dict[str, str]):

  figures = []
  for idx, el in df.iterrows():
    label = full_labels[el.img_path]
    pmcid = extract_pmcid(el.img_path)
    doc_id = pmc2id_dict.get(pmcid, None)

    if doc_id is None:
      print(el.img_path)
    
    figure = Figure(
      id=None,
      status=STATUS_UNLABELED,
      uri=el.img_path,
      width=el.width,
      height=el.height,
      type=el.type,
      name=el.name,
      caption=None if el.caption == "" else el.caption,
      num_panes=1,
      doc_id=doc_id,
      parent_id=el.parent_id,
      coordinates=el.coordinates,
      last_update_by=None,
      owner=None,
      migration_key=None,
      notes=None,
      labels=None, # all this data is unlabeled,
      source=el.source
    )
    figures.append(figure)    
  insert_figures(config, figures)


In [7]:
def get_map_fig_uri_2_db_id(db_params: dict) -> dict[str, int]:
  """ 
    Query the database for the insert figures and create a dictionary that 
    matches the figure path to the database id. We can just this match to 
    populate the figure id for the labels table.
  """
  try:
    conn = connect(**db_params)
    with conn.cursor() as cur:
      cur.execute("select id, uri from dev.figures")
      rows = cur.fetchall()
      return {r[1]: r[0] for r in rows}
  except Exception as e:
    print(e)
    raise Exception(e)
  finally:
    conn.close()


def insert_labels_per_classifier(db_params: dict, labels: list[Label]):
  try:
    conn = connect(**db_params)
    with conn.cursor() as cur:
      with cur.copy("COPY dev.labels_cord19 (figure_id,classifier,label,prediction,features,pred_probs,margin_sample,entropy,split_set) FROM STDIN") as copy:
        for l in labels:
          copy.write_row(l.to_tuple())
    conn.commit()
  except Exception as e:
    print(e)
    raise Exception(e)
  finally:
    conn.close()  


def insert_labels(db_params: dict, parquet_files: list[Path]):
  uri2id = get_map_fig_uri_2_db_id(db_params)

  for parquet_file in parquet_files:
    classifier = parquet_file.name.split('_')[1]
    print(f"processing {classifier}")
    df = pd.read_parquet(parquet_file)
    df = df[~df.source.isin(['tinman'])]

    if df.shape[0] > 0:
      df = df.replace({np.nan: None})
      df["figure_id"] = df.apply(lambda x: uri2id[x.img_path], axis=1)

      if 'en_metric' not in df.columns:
        df['en_metric'] = None
      if 'ms_metric' not in df.columns:
        df['ms_metric'] = None
      
      labels = []
      for idx, el in df.iterrows():
        label = Label(classifier=classifier,
                      label=el.label,
                      features=el.features,
                      entropy=el.en_metric,
                      figure_id=el.figure_id,
                      margin_sample=el.ms_metric,
                      pred_probs=el.pred_probs,
                      prediction=el.prediction,
                      split_set=el.split_set)
        labels.append(label)
      print(f"inserting labels for {classifier} {len(labels)} rows")
      insert_labels_per_classifier(config, labels)
    else:
      print(f"Nothing to insert for {classifier}")


In [63]:
high_modality_filename = 'cord19_higher-modality_v1.parquet'
children_filenames = tax_parquet_files.copy()
children_filenames.append(high_modality_filename)
parquet_files = [vil_files_path / x for x in children_filenames]

full_labels = get_full_labels(df_full_labels)
# insert_images_from_external_sources(config, df_others, full_labels)
# insert_labels(config, parquet_files)

In [68]:
insert_images_from_external_sources(config, df_others, full_labels)

## Insert CORD19 figures

In [64]:
# from the uic dataset, figures have length 4 in image path and subfigures have length 5
from typing import Optional

def is_figure(splits: int) -> int:
  if splits == 3:
    return TYPE_FIGURE
  if splits == 4:
    return TYPE_FIGURE
  elif splits == 5:
    return TYPE_SUBFIGURE
  else:
    return -1

def calc_img_path_length(img_path: str) -> int:
  # supplementary material processed have an additional level to the structure
  splits = img_path.split('/')
  return len(splits) if 'supplementary' not in splits else len(splits) - 1

def get_parent_image(img_path: str, type: int) -> str:
  if type == TYPE_FIGURE:
    return None
  else:
    if 'figsplit' in img_path:
      return f"{str(Path(img_path).parent)}.jpg".replace("figsplit_", "")
    else:
      return f"{str(Path(img_path).parent)}.jpg"

df_cord19['path_length'] = df_cord19.apply(lambda x: calc_img_path_length(x.img_path), axis=1)
df_cord19['type'] = df_cord19.apply(lambda x: is_figure(x.path_length), axis=1)
df_cord19['parent_id'] = df_cord19.apply(lambda x: get_parent_image(x.img_path, x.type), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cord19['path_length'] = df_cord19.apply(lambda x: calc_img_path_length(x.img_path), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cord19['type'] = df_cord19.apply(lambda x: is_figure(x.path_length), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cord19['parent_id'] = df_co

In [65]:
# get the dictionary of documents for foreigin key
pmc2docid_dict = build_pmc_to_id_doc_dic(config)
# insert first the figures
df_figures = df_cord19[df_cord19.type == TYPE_FIGURE]
df_figures['coordinates'] = None
insert_images_from_cord19(config, df_figures, full_labels, pmc2docid_dict)
# map figure ids to replace the parent_id column
img_path_2_id = get_map_fig_uri_2_db_id(config)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_figures['coordinates'] = None


In [67]:
def get_coordinate_mapping(df, base_path: Path):
  # match coordinates to subfigure img_path
  subfigure_paths = list(set([f"{Path(x).parent}" for x in df.img_path]))

  img_path_2_coordinates = {}
  fails = []
  for p in subfigure_paths:
    filename = f"{Path(p).name}.jpg.txt"
    try:
      with open(base_path / p / filename, 'r') as f:
        # ignore first and second lines
        lines = f.readlines()[2:]
        for idx, line in enumerate(lines):
          line = line.replace("    ", " ").replace("\n", "").strip()
          line = line.split(' ')
          line = [float(x) for x in line if x != '']
          name = f"{p}/{str(idx+1).zfill(3)}.jpg"
          img_path_2_coordinates[name] = line
    except FileNotFoundError as error:
      fails.append(p)
  return img_path_2_coordinates


base_path = '/Users/jtrell2/data/biocuration/'
df_subfigures = df_cord19[df_cord19.type == TYPE_SUBFIGURE]
coordinate_mapping = get_coordinate_mapping(df_subfigures, Path(base_path))
df_subfigures.parent_id = df_subfigures.apply(lambda x: img_path_2_id[x.parent_id], axis=1)
df_subfigures['coordinates'] = df_subfigures.apply(lambda x: coordinate_mapping.get(x.img_path, None), axis=1)
insert_images_from_cord19(config, df_subfigures, full_labels, pmc2docid_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subfigures.parent_id = df_subfigures.apply(lambda x: img_path_2_id[x.parent_id], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subfigures['coordinates'] = df_subfigures.apply(lambda x: coordinate_mapping.get(x.img_path, None), axis=1)


## Insert tinman images

In [9]:
# all of tinman are subfigures...

# insert documents first
from os import listdir, path
from pathlib import Path

tinman_base_path = Path('/Users/jtrell2/data/biocuration/tinman')

tinman_folders = [x for x in listdir(tinman_base_path) if path.isdir(tinman_base_path / x) ]
tinman_docs = [str(Path('tinman') / x / x[1:]) + ".pdf" for x in tinman_folders]
tinman_docs[:4]


['tinman/pIDRD_27_1765432/IDRD_27_1765432.pdf',
 'tinman/p32203189/32203189.pdf',
 'tinman/pPMC6957273/PMC6957273.pdf',
 'tinman/p32218151/32218151.pdf']

In [36]:
from time import sleep
import requests
from datetime import datetime

def get_metadata(id, uri, dictionary, is_pmc):
  data = dictionary['result'][id]
  authors = [x['name'] for x in data['authors']]
  try:
    publication_date = datetime.strptime(data['epubdate'], '%Y %b %d')
  except:
    try:
      publication_date = datetime.strptime(data['pubdate'], '%Y %b')
    except:
      publication_date = datetime.strptime(data['pubdate'], '%Y %b %d')
  journal = data['fulljournalname']
  title = data['title']

  pubmed_id = None
  pmcid = None
  doi = None
  for articleid in data["articleids"]:
    if articleid['idtype'] == 'pmid':
      pubmed_id = articleid['value']
    if is_pmc:
      if articleid['idtype'] == 'pmcid':
        pmcid = articleid['value']
    else:
      if articleid['idtype'] == 'pmc':
        pmcid = articleid['value']
    if articleid['idtype'] == 'doi':
      doi = articleid['value']
  project = 'animo'
  license = None
  uri = uri

  return {
    'authors': authors,
    'publication_date': publication_date,
    'journal': journal,
    'title': title,
    'abstract': None,
    'project': project,
    'license': None,
    'uri': str(uri),
    'pmcid': pmcid,
    'pubmed_id': pubmed_id,
    'doi': doi,
    'cord_uid': None,
    'notes': None,
    'status': 'IMPORTED',
    'modalities': None
  }


tinman_docs_to_insert = []
pubmed_ids = []
pubmed_paths = []
pmcids = []
pmcs_paths = []
others = []
other_paths = []

for doc in tinman_docs:
  doc_path = Path(doc)
  doc_name = doc_path.name[:-4]
  if 'PMC' in doc_name:
    pmcids.append(doc_name[3:])
    pmcs_paths.append(doc_path)
  elif len(doc_name) == 8:
    pubmed_ids.append(doc_name)
    pubmed_paths.append(doc_path)
  else:
    others.append(doc_name)
    other_paths.append(doc_path)

concat_pubmed_ids = ','.join(pubmed_ids)
concat_pmcids = ','.join(pmcids)
res_pubmedids = requests.get(f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id={concat_pubmed_ids}&retmode=json')
sleep(3)
res_pmcids = requests.get(f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pmc&id={concat_pmcids}&retmode=json')

dict_pubmed = res_pubmedids.json()
dict_pmc = res_pmcids.json()

documents = []
for id, doc_path in zip(pubmed_ids, pubmed_paths):
  try:
    metadata = get_metadata(id, doc_path, dict_pubmed, False)
    document = Cord19Document(**metadata)
    if document.pmcid is not None:
      document.pmcid = document.pmcid[3:]
    documents.append(document)
  except Exception as e:
    print(id)
    raise e

for id, doc_path in zip(pmcids, pmcs_paths):
  metadata = get_metadata(id, doc_path, dict_pmc, True)
  metadata['pmcid'] = int(metadata['pmcid'][3:]) 
  document = Cord19Document(**metadata)
  documents.append(document)

for id, doc_path in zip(others, other_paths):
  document = Cord19Document(
    abstract=None,
    authors=None,
    journal=None,
    title=id,
    cord_uid=None,
    doi=None,
    license=None,
    modalities=None,
    notes=None,
    pmcid=None,
    publication_date=None,
    project='animo',
    pubmed_id=None,
    repository=None,
    uri=str(doc_path),
    status='IMPORTED'
  )
  documents.append(document)




In [37]:
def insert_documents_to_db(db_params: dict, documents: list[Cord19Document]):
  try:
    conn = connect(**db_params)
    with conn.cursor() as cur:
      with cur.copy("COPY dev.documents (title, authors, abstract, publication_date, pmcid, pubmed_id, journal, repository, project, license, status, uri, doi, notes) FROM STDIN") as copy:
        for d in documents:
          copy.write_row(d.to_tuple())
    conn.commit()
  except Exception as e:
    print(e)
  finally:
    conn.close()

insert_documents_to_db(config, documents)

In [26]:
','.join(pubmed_ids)

'32203189,32218151,15525531,32057769,31752552,25874557,12826015,26028575,32201080,12702662,11110798,16854972,25479419,10655217,25869670,32065055,26028574,25671546,25474681,25330189,25850673,25443298,25330323,25232734,24732978,24929033,25474470,24968003,25054285,24810406,25594177,25978500,24918786,25167143,25101958,32238078,32198713,11060233,17223323,32209118,25982859,32747622,25181289,32231345,25357003,31856610,25527285,25644702,11546739,19158391,11091073,24844228,25144461,25372608,32758183,24684930,10444073,12783803,32229605,24684931,24999833,32466694,26121959,32210742,32105468,32249956,25635455,12847081,10015631,25493563,25264253,25467982,25340742,32181901,25448701,25423491,32142938,25195067,25273556,25808955,24901837,16831832,25913400,26083785,32167166,25788288,32178970,25154398,12054525,32723915,25053664,25723162,25981666,25907097,24836561,25261697,32243911,32092911,32660153,32221306,26044593,26061275,10209262,26079877,32200634,32178593,32255491,18050498,26057124,32155444,25527286,

In [None]:
def insert_images_from_tinman(
  config,
  df,
  full_labels: dict[str,str]):

  figures = []
  for idx, el in df.iterrows():
    label = full_labels[el.img_path]
    pmcid = extract_pmcid(el.img_path)

    if doc_id is None:
      print(el.img_path)
    
    figure = Figure(
      id=None,
      status=STATUS_UNLABELED,
      uri=el.img_path,
      width=el.width,
      height=el.height,
      type=el.type,
      name=el.name,
      caption=None if el.caption == "" else el.caption,
      num_panes=1,
      doc_id=doc_id,
      parent_id=el.parent_id,
      coordinates=el.coordinates,
      last_update_by=None,
      owner=None,
      migration_key=None,
      notes=None,
      labels=None, # all this data is unlabeled,
      source=el.source
    )
    figures.append(figure)    
  insert_figures(config, figures)

In [26]:
tinman_figures = {}
for idx, row in df_tinman.iterrows():
  figure_path = f"{Path(row.img_path).parent}.jpg"
  w, h = Image.open(base_path / figure_path).size
  if figure_path not in tinman_figures:
    tinman_figures[figure_path] = {
      'img': figure_path.split('/')[-1],
      'source': 'tinman',
      'fig_type': TYPE_FIGURE,
      'img_path': figure_path,
      'label': None,
      'split_set': None,
      'features': None,
      'prediction': None,
      'width': w,
      'height': h,
      'caption': row.caption,
      'ms_metric': None,
      'en_metric': None,
      'pred_probs': None
    }

tinman_figures = [tinman_figures[x] for x in tinman_figures]

df_tinman_figures = pd.DataFrame(tinman_figures)
df_tinman_subfigures = df_tinman.copy()



Unnamed: 0,img,source,fig_type,img_path,label,split_set,features,prediction,width,height,caption,ms_metric,en_metric,pred_probs
0,5_1.jpg,tinman,0,tinman/pP31412244/P31412244/5_1.jpg,,,,,860,860,Figure 2. Atg2A Localizes to MAM upon Autophag...,,,
1,9_1.jpg,tinman,0,tinman/pP31412244/P31412244/9_1.jpg,,,,,748,1264,no caption extracted for this image,,,
2,16_1.jpg,tinman,0,tinman/pP31412244/P31412244/16_1.jpg,,,,,1025,1123,no caption extracted for this image,,,
3,7_1.jpg,tinman,0,tinman/pP31412244/P31412244/7_1.jpg,,,,,843,1058,Figure 3. The MLD of Atg2A Is Responsible for ...,,,
4,17_1.jpg,tinman,0,tinman/pP31412244/P31412244/17_1.jpg,,,,,1024,1239,no caption extracted for this image,,,


In [10]:
from PIL import Image

base_path = Path('/Users/jtrell2/data/biocuration')
img = Image.open(base_path / 'tinman/pP31412244/P31412244/5_1.jpg')
print(img.size)

(860, 860)


In [73]:
df_tinman.head()

Unnamed: 0,img,source,img_path,label,caption,split_set,features,prediction,width,height,pred_probs,ms_metric,en_metric
0,5dfba084f5d15939fd4cade2,tinman,tinman/pP31412244/P31412244/5_1/001.jpg,mic,Figure 2. Atg2A Localizes to MAM upon Autophag...,TRAIN,"[0.28341508, 0.0, 2.1752567, 4.174899, 0.79845...",exp,314,196,"[0.9546151161193848, 0.005383739247918129, 0.0...",,
1,5dfba084f5d15939fd4cade3,tinman,tinman/pP31412244/P31412244/5_1/004.jpg,mic,Figure 2. Atg2A Localizes to MAM upon Autophag...,TRAIN,"[0.059746925, 0.0, 2.9038484, 2.5127983, 0.080...",exp,542,531,"[0.5600640773773193, 0.016546688973903656, 0.3...",,
2,5dfba084f5d15939fd4cade4,tinman,tinman/pP31412244/P31412244/5_1/003.jpg,gra,Figure 2. Atg2A Localizes to MAM upon Autophag...,TRAIN,"[0.14466317, 1.8963909, 1.6162982, 0.9271355, ...",gra,209,310,"[0.019948463886976242, 0.9020611047744751, 0.0...",,
3,5dfba084f5d15939fd4cade8,tinman,tinman/pP31412244/P31412244/9_1/007.jpg,gra,no caption extracted for this image,TRAIN,"[0.3676869, 4.7407584, 0.52103585, 0.18581437,...",gra,248,251,"[5.356776711096245e-08, 0.9984208345413208, 3....",,
4,5dfba084f5d15939fd4cadea,tinman,tinman/pP31412244/P31412244/9_1/006.jpg,gra,no caption extracted for this image,TRAIN,"[1.0687255, 4.2336006, 0.37320167, 0.09526259,...",gra,253,286,"[1.3379103380728452e-10, 0.9999992847442627, 9...",,


# Insert labels after inserting all figures

In [70]:
insert_labels(config, parquet_files)

processing experimental
inserting labels for experimental 28491 rows
processing gel
Nothing to insert for gel
processing graphics
inserting labels for graphics 259044 rows
processing microscopy
inserting labels for microscopy 20997 rows
processing molecular
inserting labels for molecular 8788 rows
processing radiology
inserting labels for radiology 4615 rows
processing electron
inserting labels for electron 789 rows
processing higher-modality
inserting labels for higher-modality 520811 rows


In [96]:
df_cord19['document'] = df_cord19.apply(lambda x: x.img_path.split('/')[1], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cord19['document'] = df_cord19.apply(lambda x: x.img_path.split('/')[1], axis=1)


In [152]:
# sanity check: parent figures exist
df_subfigures = df_cord19[df_cord19.type==1]

base_path = Path('/Users/jtrell2/data/biocuration/')

not_exist = []
for idx, row in df_subfigures.iterrows():
  p = base_path / row.parent_id
  if not p.exists():
    not_exist.append(p)

assert len(not_exist) == 0


In [108]:
# tinman is a bit different as the information for the PDF lies somewhere else...
# i'd to insert those documents first... 
df_tinman.head()

Unnamed: 0,img,source,img_path,label,caption,split_set,features,prediction,width,height,pred_probs,ms_metric,en_metric
0,5dfba084f5d15939fd4cade2,tinman,tinman/pP31412244/P31412244/5_1/001.jpg,mic,Figure 2. Atg2A Localizes to MAM upon Autophag...,TRAIN,"[0.28341508, 0.0, 2.1752567, 4.174899, 0.79845...",exp,314,196,"[0.9546151161193848, 0.005383739247918129, 0.0...",,
1,5dfba084f5d15939fd4cade3,tinman,tinman/pP31412244/P31412244/5_1/004.jpg,mic,Figure 2. Atg2A Localizes to MAM upon Autophag...,TRAIN,"[0.059746925, 0.0, 2.9038484, 2.5127983, 0.080...",exp,542,531,"[0.5600640773773193, 0.016546688973903656, 0.3...",,
2,5dfba084f5d15939fd4cade4,tinman,tinman/pP31412244/P31412244/5_1/003.jpg,gra,Figure 2. Atg2A Localizes to MAM upon Autophag...,TRAIN,"[0.14466317, 1.8963909, 1.6162982, 0.9271355, ...",gra,209,310,"[0.019948463886976242, 0.9020611047744751, 0.0...",,
3,5dfba084f5d15939fd4cade8,tinman,tinman/pP31412244/P31412244/9_1/007.jpg,gra,no caption extracted for this image,TRAIN,"[0.3676869, 4.7407584, 0.52103585, 0.18581437,...",gra,248,251,"[5.356776711096245e-08, 0.9984208345413208, 3....",,
4,5dfba084f5d15939fd4cadea,tinman,tinman/pP31412244/P31412244/9_1/006.jpg,gra,no caption extracted for this image,TRAIN,"[1.0687255, 4.2336006, 0.37320167, 0.09526259,...",gra,253,286,"[1.3379103380728452e-10, 0.9999992847442627, 9...",,
