In [1]:
from dotenv import dotenv_values

In [2]:
env_file = '../../.env'

In [3]:
config = dotenv_values(env_file)

In [4]:
dict(config)

{'dbname': 'biocuration',
 'user': 'biocurator',
 'password': '20050326',
 'port': '5432',
 'host': 'localhost'}

In [5]:
import psycopg

In [6]:
def connect(host: str, port: int, dbname: str, user: str, password: str) -> psycopg.Connection:
  conn_str = f"host={host} port={port} dbname={dbname} user={user} password={password}"
  return psycopg.connect(conn_str)

In [13]:
conn = connect(**config)

In [14]:
conn

<psycopg.Connection [IDLE] (host=localhost user=biocurator database=biocuration) at 0x7ff4d89f60e0>

In [15]:
conn.close()

In [7]:
PATH_TO_METADATA = '/Users/jtrell2/data/biocuration/cord19_datasets/2021-01-12/metadata.csv'

import csv
from document import Cord19Document
from datetime import datetime
from os import listdir
from pathlib import Path

PROJECT = 'cord19'
ROOT_UIC_DOCS = Path('/Users/jtrell2/data/biocuration/cord19-uic')

documents = []

uic_pmc_ids = listdir(ROOT_UIC_DOCS)

with open(PATH_TO_METADATA) as f_in:
  reader = csv.DictReader(f_in)

  for row in reader:
    if len(row['publish_time']) == 0:
      publication_date = None
    elif len(row['publish_time']) == 4:
      publication_date = datetime(int(row['publish_time']), 1, 1)
    else:
      publication_date = datetime.strptime(row['publish_time'], "%Y-%m-%d")

    uri = None
    if row['pmcid'] in uic_pmc_ids:
      uri = f"{ROOT_UIC_DOCS.stem}/{row['pmcid']}"

    pubmed_id = int(row['pubmed_id']) if row['pubmed_id'].isdecimal() else None

    authors = row['authors'].split('; ')
    if len(authors[0]) == 0:
      authors = None
    else:
      for author in authors:
        author = author.replace("\x00", '')

    document = Cord19Document(title=row['title'],
                             abstract=row['abstract'],
                             authors=authors,
                             modalities=None,
                             publication_date=publication_date,
                             pmcid=row['pmcid'],
                             pubmed_id=pubmed_id,
                             license=row['license'],
                             journal=row['journal'],
                             doi=row['doi'],
                             cord_uid=row['cord_uid'],
                             repository=row['source_x'],
                             uri=uri,
                             status='IMPORTED',
                             project=PROJECT,
                             notes=None)
    documents.append(document)

In [10]:
conn = connect(**config)

with conn.cursor() as cur:
  with cur.copy("COPY dev.documents (title, authors, abstract, publication_date, pmcid, pubmed_id, journal, repository, project, license, status, uri, doi, notes) FROM STDIN") as copy:
    for d in documents:
      copy.write_row(d.to_tuple())
conn.commit()



In [9]:
max_len_abstract = 0
t = ""
for d in documents:
  if len(d.journal) > max_len_abstract:
    max_len_abstract = len(d.journal)
    t = d.journal
t

'Decima Novena Conferencia Iberoamericana en Sistemas, Cibernetica e Informatica, CISCI 2020, Decimo Septimo Simposium Iberoamericano en Educacion, Cibernetica e Informatica, SIECI 2020 - 19th Ibero-American Conference on Systems, Cybernetics and Informatics, CISCI 2020, 17th Ibero-American Symposium on Education, Cybernetics and Informatics, SIECI 2020'

In [35]:
datetime(2008, 1, 1)

datetime.datetime(2008, 1, 1, 0, 0)

In [50]:
from pathlib import Path

ROOT_UIC_DOCS = '/Users/jtrell2/data/biocuration/cord19-uic'
Path(ROOT_UIC_DOCS).stem

'cord19-uic'

In [12]:
s = "1123213"
s.isdecimal()

True

In [17]:
from dataclasses import dataclass, field, fields
from typing import Optional

@dataclass
class Test:
  something: Optional(int)

  def __post_init__(self):
    for field in fields(self):
        value = getattr(self, field.name)
        if not isinstance(value, field.type):
            raise ValueError(f'Expected {field.name} to be {field.type}, '
                             f'got {repr(value)}')

a = Test()
a

TypeError: Cannot instantiate typing.Optional

In [18]:
root = Path('/Users/jtrell2/data/biocuration/')
pdf_location_uic = 'cord19-uic'
pdf_location_udel = 'CORD19/batch_udel/PDFs'
metadata_path = '/Users/jtrell2/data/biocuration/cord19_datasets/2022-06-02/metadata.csv'

existing_pdfs_uic = {x: f"{pdf_location_uic}/{x}/main.pdf" for x in listdir(root / pdf_location_uic)}
existing_pdfs_udel = {x: f"{pdf_location_udel}/{x}.pdf" for x in listdir(root / pdf_location_udel)}
existing_pdfs = existing_pdfs_uic.update(existing_pdfs_udel)

In [19]:
existing_pdfs

In [21]:
existing_pdfs_udel

{'1awau7hm.pdf': 'CORD19/batch_udel/PDFs/1awau7hm.pdf.pdf',
 '24lz9tf1.pdf': 'CORD19/batch_udel/PDFs/24lz9tf1.pdf.pdf',
 '093yih5n.pdf': 'CORD19/batch_udel/PDFs/093yih5n.pdf.pdf',
 '1fsih7l6.pdf': 'CORD19/batch_udel/PDFs/1fsih7l6.pdf.pdf',
 '0yfrux89.pdf': 'CORD19/batch_udel/PDFs/0yfrux89.pdf.pdf',
 '1tr9hhsf.pdf': 'CORD19/batch_udel/PDFs/1tr9hhsf.pdf.pdf',
 '0wskg7v1.pdf': 'CORD19/batch_udel/PDFs/0wskg7v1.pdf.pdf',
 '1o2is40t.pdf': 'CORD19/batch_udel/PDFs/1o2is40t.pdf.pdf',
 '03vk6enj.pdf': 'CORD19/batch_udel/PDFs/03vk6enj.pdf.pdf',
 '0bm2ke3k.pdf': 'CORD19/batch_udel/PDFs/0bm2ke3k.pdf.pdf',
 '0m32ecnu.pdf': 'CORD19/batch_udel/PDFs/0m32ecnu.pdf.pdf',
 '13bvkj2t.pdf': 'CORD19/batch_udel/PDFs/13bvkj2t.pdf.pdf',
 '261q3ywu.pdf': 'CORD19/batch_udel/PDFs/261q3ywu.pdf.pdf',
 '1c26qrp7.pdf': 'CORD19/batch_udel/PDFs/1c26qrp7.pdf.pdf',
 '0v8ltunv.pdf': 'CORD19/batch_udel/PDFs/0v8ltunv.pdf.pdf',
 '2740rfql.pdf': 'CORD19/batch_udel/PDFs/2740rfql.pdf.pdf',
 '1go3jjeu.pdf': 'CORD19/batch_udel/PDFs