In [4]:
import json
import requests

In [2]:
# Load the records stored in gxd.json. The encoding is given explicitly so the
# file is decoded as UTF-8 regardless of the platform's default locale encoding.
with open('gxd.json', 'r', encoding='utf-8') as handler:
    data = json.load(handler)

In [3]:
# Display the first loaded record to see which fields an entry provides.
data[0]

{'pmid': '27072135',
 'jaxid': '5792639',
 'relevant': 'relevant',
 'year': '2016',
 'title': 'MicroRNA-127 Promotes Mesendoderm Differentiation of Mouse Embryonic Stem Cells by Targeting Left-Right Determination Factor 2.',
 'abstract': 'Specification of the three germ layers is a fundamental process and is essential for the establishment of organ rudiments. Multiple genetic and epigenetic factors regulate this dynamic process; however, the function of specific microRNAs in germ layer differentiation remains unknown. In this study, we established that microRNA-127 (miR-127) is related to germ layer specification via microRNA array analysis of isolated three germ layers of E7.5 mouse embryos and was verified through differentiation of mouse embryonic stem cells. miR-127 is highly expressed in endoderm and primitive streak. Overexpression of miR-127 increases and inhibition of miR-127 decreases the expression of mesendoderm markers. We further show that miR-127 promotes mesendoderm diff

In [12]:
from dataclasses import dataclass
from datetime import datetime
from dataclasses import field
from time import sleep
from typing import Optional
from pathlib import Path
from os import listdir, path
import csv
import requests


@dataclass
class Cord19Document:
    """Metadata record for a single document, convertible to a database row.

    ``title``, ``project`` and ``status`` are required; every other attribute
    is optional and defaults to ``None``.
    """

    # Required attributes.
    title: str
    project: str
    status: str

    # Optional attributes.
    abstract: Optional[str] = None
    authors: Optional[list[str]] = None
    publication_date: Optional[datetime] = None
    modalities: Optional[list[str]] = None
    pmcid: Optional[str] = None
    pubmed_id: Optional[int] = None
    journal: Optional[str] = None
    repository: Optional[str] = None
    cord_uid: Optional[str] = None
    license: Optional[str] = None
    uri: Optional[str] = None
    doi: Optional[str] = None
    notes: Optional[str] = None

    def to_tuple(self):
        """Return the record as a tuple suitable for a database insert.

        Text values are truncated: title to 200 characters, abstract to 2000,
        journal to 100 and each author name to 100. ``modalities`` and
        ``cord_uid`` are not part of the tuple.
        """

        def clip(text, limit):
            # Pass None through untouched; otherwise keep the first `limit` chars.
            return None if text is None else text[:limit]

        if self.authors is None:
            author_names = None
        else:
            author_names = [name[:100] for name in self.authors]

        row = (
            clip(self.title, 200),
            author_names,
            clip(self.abstract, 2000),
            self.publication_date,
            self.pmcid,
            self.pubmed_id,
            clip(self.journal, 100),
            self.repository,
            self.project,
            self.license,
            self.status,
            self.uri,
            self.doi,
            self.notes,
        )
        return row

In [17]:
import numpy as np
from datetime import datetime

def get_metadata(id, gxd_entry, pubmed_dict):
    """Build a ``Cord19Document`` for one GXD entry from a PubMed ESummary payload.

    Args:
        id: PubMed id; used as the key into ``pubmed_dict["result"]`` and stored
            unchanged as the document's ``pubmed_id``.
        gxd_entry: GXD record; its ``'title'``, ``'abstract'`` and ``'jaxid'``
            values are read.
        pubmed_dict: Decoded ESummary JSON. ``pubmed_dict["result"][id]`` must
            provide ``authors``, ``sortpubdate``, ``fulljournalname`` and
            ``articleids``.

    Returns:
        A ``Cord19Document`` with ``project="gxd"``, ``status="IMPORTED"`` and
        ``repository='pubmed'``; ``uri`` is ``gxd/<jaxid>.pdf``.

    Raises:
        KeyError: if ``id`` or one of the expected fields is missing.
        ValueError: if the first ten characters of ``sortpubdate`` are not a
            ``YYYY/MM/DD`` date.
    """
    data = pubmed_dict["result"][id]

    authors = [author["name"] for author in data["authors"]]
    # Only the leading date part of 'sortpubdate' is parsed; anything after the
    # first ten characters is ignored.
    publication_date = datetime.strptime(data["sortpubdate"][:10], "%Y/%m/%d")
    journal = data["fulljournalname"]

    # Index the article identifiers by type (a repeated type keeps its last value).
    ids_by_type = {entry["idtype"]: entry["value"] for entry in data["articleids"]}
    doi = ids_by_type.get("doi")
    # Both "pmc" and "pmcid" entries can describe the PMC id. Pick one
    # deterministically instead of depending on the order of the list.
    # NOTE(review): "pmc" is preferred because it is believed to hold the bare
    # accession (e.g. "PMC1234567") while "pmcid" may be wrapped in extra text;
    # confirm against a live ESummary response.
    pmcid = ids_by_type.get("pmc", ids_by_type.get("pmcid"))

    uri = f"gxd/{gxd_entry['jaxid']}.pdf"

    # NOTE(review): Cord19Document annotates pubmed_id as Optional[int], but the
    # id is stored exactly as received; confirm which type the database expects.
    return Cord19Document(
        title=gxd_entry["title"],
        abstract=gxd_entry["abstract"],
        authors=authors,
        modalities=None,
        publication_date=publication_date,
        pmcid=pmcid,
        pubmed_id=id,
        license=None,
        journal=journal,
        doi=doi,
        cord_uid=None,
        repository="pubmed",
        uri=uri,
        status="IMPORTED",
        project="gxd",
        notes=None,
    )
    

def load_from_gxd(gxd: list[dict], batch_size: int = 100, delay: float = 2.5):
    """Fetch PubMed summaries for GXD entries and build one document per entry.

    The PubMed ids are sent to the ESummary endpoint in batches of at most
    ``batch_size`` ids, pausing ``delay`` seconds between consecutive requests.

    Args:
        gxd: GXD records; each must have a ``'pmid'`` key plus the keys read by
            ``get_metadata``.
        batch_size: Maximum number of ids per request (must be >= 1).
        delay: Seconds to sleep between two requests.

    Returns:
        A list of documents in the same order as ``gxd``; an empty list when
        ``gxd`` is empty.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
        requests.HTTPError: if a request returns an error status.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    gxd_dict = {entry['pmid']: entry for entry in gxd}
    gxd_pubmed_ids = [entry['pmid'] for entry in gxd]

    documents = []
    # Plain slicing yields full batches followed by one shorter final batch, and
    # works for any number of ids (including fewer than batch_size or none).
    for start in range(0, len(gxd_pubmed_ids), batch_size):
        batch = gxd_pubmed_ids[start:start + batch_size]
        if start:
            # Pause before every request except the first one.
            sleep(delay)

        concat_pmids = ",".join(batch)
        response = requests.get(
            f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id={concat_pmids}&retmode=json",
            timeout=30,
        )
        # Fail loudly on an HTTP error rather than on a confusing parse/KeyError later.
        response.raise_for_status()
        # Decode the payload once per batch, not once per id.
        summaries = response.json()

        for pmid in batch:
            documents.append(get_metadata(pmid, gxd_dict[pmid], summaries))

    return documents

# Build the documents for every record loaded from gxd.json (this issues the PubMed requests).
documents = load_from_gxd(data)

In [18]:
# Display the first two documents as a quick sanity check.
documents[:2]

[Cord19Document(title='MicroRNA-127 Promotes Mesendoderm Differentiation of Mouse Embryonic Stem Cells by Targeting Left-Right Determination Factor 2.', project='gxd', status='IMPORTED', abstract='Specification of the three germ layers is a fundamental process and is essential for the establishment of organ rudiments. Multiple genetic and epigenetic factors regulate this dynamic process; however, the function of specific microRNAs in germ layer differentiation remains unknown. In this study, we established that microRNA-127 (miR-127) is related to germ layer specification via microRNA array analysis of isolated three germ layers of E7.5 mouse embryos and was verified through differentiation of mouse embryonic stem cells. miR-127 is highly expressed in endoderm and primitive streak. Overexpression of miR-127 increases and inhibition of miR-127 decreases the expression of mesendoderm markers. We further show that miR-127 promotes mesendoderm differentiation through the nodal pathway, a d