In [1]:
import json

# Location of the GXD JSON input, relative to the notebook's working directory.
gxd_data = '../sample_data/gxd.json'

# Lookup table from the integer code found in a subfigure's 'type' field to a
# human-readable modality label.
code2modality = {
  0: "histogram",
  1: "line chart",
  2: "other diagram",
  3: "macromolecule sequence",
  4: "3D structure",
  5: "fluorescence",
  6: "gel/blot",
  7: "plate",
  8: "light microscopy",
  9: "other",
  10: "residual",
  11: "compound"
}

# Decode explicitly as UTF-8 instead of relying on the platform's default
# locale encoding, which can mis-decode or reject non-ASCII characters.
with open(gxd_data, 'r', encoding='utf-8') as f:
  data = json.load(f)

In [2]:
def parseEntry(entry: dict) -> dict:
  """Flatten one GXD entry into a single tabular record.

  Each subfigure of each figure contributes one modality label, obtained by
  looking up the subfigure's 'type' in ``code2modality``. The labels are kept
  in encounter order (duplicates included) and joined with ';'.

  Args:
    entry: mapping that provides 'jaxid', 'title', 'abstract', 'year' and
      'figures' (each figure holding a 'subfigures' list).

  Returns:
    A dict with the keys cord_uid, source_x, title, abstract, publish_time,
    journal, authors, url, pmcid and modalities.

  Raises:
    KeyError: if a required key is missing or a subfigure type has no entry
      in ``code2modality``.
  """
  labels = [
    code2modality[sub['type']]
    for fig in entry['figures']
    for sub in fig['subfigures']
  ]

  # Only the year is available, so the date is pinned to January 1st.
  publish_time = "{}-01-01".format(entry['year'])

  record = {
    "cord_uid": entry['jaxid'],
    "source_x": 'gxd',
    "title": entry['title'],
    "abstract": entry['abstract'],
    "publish_time": publish_time,
    # The remaining metadata columns are emitted as empty strings.
    "journal": "",
    "authors": "",
    "url": "",
    "pmcid": "",
    "modalities": ";".join(labels),
  }
  return record


In [3]:
# Flatten every loaded entry into its tabular record, preserving input order.
parsed_entries = [parseEntry(item) for item in data]

In [4]:
import pandas as pd 
# One row per parsed entry; columns follow the key order of the record dicts.
df = pd.DataFrame(parsed_entries)

In [5]:
# Preview the first rows of the flattened table.
df.head()

Unnamed: 0,cord_uid,source_x,title,abstract,publish_time,journal,authors,url,pmcid,modalities
0,5792639,gxd,MicroRNA-127 Promotes Mesendoderm Differentiat...,Specification of the three germ layers is a fu...,2016-01-01,,,,,line chart;line chart;line chart;line chart;li...
1,5754472,gxd,The number of X chromosomes influences protect...,AIM: Sex differences in coronary heart disease...,2014-01-01,,,,,compound;line chart;line chart;line chart;line...
2,5430698,gxd,Regulated Expression of Chromobox Homolog 5 Re...,"The gene-trap lacZ reporter insertion, ROSA11,...",2012-01-01,,,,,line chart;histogram;other diagram;fluorescenc...
3,5758794,gxd,White spotting phenotype induced by targeted R...,Neural crest cells (NCCs) emerge from the dors...,2015-01-01,,,,,compound;residual;residual;residual;residual;r...
4,5553023,gxd,Transcriptional activation of hypoxia-inducibl...,Emerging evidence indicates that myeloid cells...,2014-01-01,,,,,residual;line chart;line chart;other diagram;o...


In [36]:
# Optional export of the table to Parquet; left disabled, uncomment to write the file.
# output_path = "../sample_data/gxd.parquet"
# df.to_parquet(output_path, index=None)

In [15]:
from collections import Counter

def parseEntryJson(entry: dict) -> dict:
  """Convert one GXD entry into a record whose modalities are tallied.

  Each subfigure of each figure is mapped to a modality label via
  ``code2modality``. The record stores a plain dict from label to the number
  of subfigures carrying it, with keys in first-encounter order.

  Args:
    entry: mapping that provides 'jaxid', 'title', 'abstract', 'year' and
      'figures' (each figure holding a 'subfigures' list).

  Returns:
    A dict with the keys cord_uid, source_x, title, abstract, publish_time,
    journal, authors, url, pmcid and modalities (label -> count).

  Raises:
    KeyError: if a required key is missing or a subfigure type has no entry
      in ``code2modality``.
  """
  tally = Counter(
    code2modality[sub['type']]
    for fig in entry['figures']
    for sub in fig['subfigures']
  )

  # Only the year is available, so the date is pinned to January 1st.
  publish_time = "{}-01-01".format(entry['year'])

  record = {
    "cord_uid": entry['jaxid'],
    "source_x": 'gxd',
    "title": entry['title'],
    "abstract": entry['abstract'],
    "publish_time": publish_time,
    # The remaining metadata fields are emitted as empty strings.
    "journal": "",
    "authors": "",
    "url": "",
    "pmcid": "",
    # Hand back a plain dict rather than the Counter itself.
    "modalities": dict(tally),
  }
  return record

In [16]:
# Build the count-based record for every loaded entry, preserving input order.
parsed_entries_json = [parseEntryJson(item) for item in data]

In [18]:
# Inspect the first count-based record.
parsed_entries_json[0]

{'cord_uid': '5792639',
 'source_x': 'gxd',
 'title': 'MicroRNA-127 Promotes Mesendoderm Differentiation of Mouse Embryonic Stem Cells by Targeting Left-Right Determination Factor 2.',
 'abstract': 'Specification of the three germ layers is a fundamental process and is essential for the establishment of organ rudiments. Multiple genetic and epigenetic factors regulate this dynamic process; however, the function of specific microRNAs in germ layer differentiation remains unknown. In this study, we established that microRNA-127 (miR-127) is related to germ layer specification via microRNA array analysis of isolated three germ layers of E7.5 mouse embryos and was verified through differentiation of mouse embryonic stem cells. miR-127 is highly expressed in endoderm and primitive streak. Overexpression of miR-127 increases and inhibition of miR-127 decreases the expression of mesendoderm markers. We further show that miR-127 promotes mesendoderm differentiation through the nodal pathway, a

In [22]:
# Re-key the records by their 'cord_uid' so a single entry can be looked up
# directly. Records sharing a 'cord_uid' collapse to the last one seen.
gxd_dict = {x['cord_uid']: x for x in parsed_entries_json}
json_string = json.dumps(gxd_dict)

# Write with an explicit encoding so the output does not depend on the
# platform's default locale.
# NOTE(review): the file name says "gdx" while the rest of the notebook uses
# "gxd" -- possibly a typo; left unchanged in case other code reads this path.
with open('../sample_data/gdx_dict.json', 'w', encoding='utf-8') as outfile:
  outfile.write(json_string)

In [23]:
# Spot-check the keyed dict by fetching one record by its identifier.
gxd_dict['5645828']

{'cord_uid': '5645828',
 'source_x': 'gxd',
 'title': "Dopaminergic control of autophagic-lysosomal function implicates Lmx1b in Parkinson's disease.",
 'abstract': "The role of developmental transcription factors in maintenance of neuronal properties and in disease remains poorly understood. Lmx1a and Lmx1b are key transcription factors required for the early specification of ventral midbrain dopamine (mDA) neurons. Here we show that conditional ablation of Lmx1a and Lmx1b after mDA neuron specification resulted in abnormalities that show striking resemblance to early cellular abnormalities seen in Parkinson's disease. We found that Lmx1b was required for the normal execution of the autophagic-lysosomal pathway and for the integrity of dopaminergic nerve terminals and long-term mDA neuronal survival. Notably, human LMX1B expression was decreased in mDA neurons in brain tissue affected by Parkinson's disease. Thus, these results reveal a sustained and essential requirement of Lmx1b for