## Get PFOCR Data

We could get it from the API, but for now, we'll just go ahead and download the entire JSON file we gave to BTE.
This notebook processes the PFOCR JSON file, creates data frames with the relevant information, and saves them to Dropbox as CSV files.

In [1]:
#load packages
import requests
import json
import pandas as pd

In [2]:
# Download the PFOCR NDJSON export from Dropbox.
# For access to the Dropbox folder below, contact the Bioinformatics core,
# Gladstone Institutes (bioinformatics@gladstone.ucsf.edu).
pfocr_url = "https://www.dropbox.com/s/bs04audl3g4frit/bte_chemicals_diseases_genes.ndjson?dl=1"
# Always pass a timeout: without one, requests waits indefinitely on a stalled
# connection. Values are (connect, read) in seconds; the read value bounds the
# wait between received bytes, not the duration of the whole download.
pfocr_request = requests.get(pfocr_url, timeout=(10, 120))
print(f"status_code: {pfocr_request.status_code}")
if pfocr_request.status_code != 200:
    # Show the response body that came back with the non-200 status.
    print(pfocr_request.text)
else:
    print("the request has succeeded")

status_code: 200
the request has succeeded


In [3]:
# Create a data frame with one row per PFOCR figure, holding the sets of
# chemical, disease and gene CURIEs mentioned in that figure.
# Expected size of the final data frame (as of Jan 2023): 77719 rows × 4 columns.
# PFOCR represents chemicals, diseases and genes by MeSH ids, MeSH ids and
# NCBI Gene ids respectively.
figures_df_data = []

# Split on "\n" only: str.splitlines() also breaks on U+0085/U+2028/U+2029,
# which may appear unescaped inside a JSON string and would cut a record in two.
# A trailing "\r" (CRLF files) is harmless because json.loads ignores
# surrounding whitespace.
for line in pfocr_request.text.split("\n"):
    if not line.strip():
        # Blank lines (e.g. after the final newline) carry no record.
        continue
    pfocr_result = json.loads(line)
    figure_id = pfocr_result["_id"]
    mentions = pfocr_result["associatedWith"]["mentions"]

    # Sets collapse identifiers that are listed more than once for a figure.
    chemical_curies = {"MESH:" + identifier for identifier in mentions["chemicals"]["mesh"]}
    disease_curies = {"MESH:" + identifier for identifier in mentions["diseases"]["mesh"]}
    gene_curies = {"NCBIGene:" + identifier for identifier in mentions["genes"]["ncbigene"]}

    figures_df_data.append({
        "figure_id": figure_id,
        "chemical_curie_set": chemical_curies,
        "disease_curie_set": disease_curies,
        "gene_curie_set": gene_curies,
    })

figures_df = pd.DataFrame.from_records(figures_df_data)

# Save the data frame to Dropbox.
# NOTE(review): the *_curie_set columns hold Python sets, so the CSV stores
# their string form and the element order may differ between runs.
figures_df.to_csv('/Users/aagrawal/Dropbox (Gladstone)/pfocr2bte/figures_dataframe.csv', index=False)

In [4]:
# Create two tables for all PFOCR figures:
#   * figure metadata (URL and title), indexed by figure id
#   * a long-format table with one row per (figure, CURIE) mention
figure_metadata_records = []
figure_curie_df_data = []

# (category label, key under "mentions", id key, CURIE prefix), listed in the
# order in which rows are emitted for each figure.
curie_sources = [
    ("chemical", "chemicals", "mesh", "MESH:"),
    ("disease", "diseases", "mesh", "MESH:"),
    ("gene", "genes", "ncbigene", "NCBIGene:"),
]

# Split on "\n" only: str.splitlines() also breaks on U+0085/U+2028/U+2029,
# which may appear unescaped inside a JSON string and would cut a record in two.
# A trailing "\r" (CRLF files) is harmless because json.loads ignores
# surrounding whitespace.
for line in pfocr_request.text.split("\n"):
    if not line.strip():
        # Blank lines (e.g. after the final newline) carry no record.
        continue
    pfocr_result = json.loads(line)
    figure_id = pfocr_result["_id"]
    associated = pfocr_result["associatedWith"]

    # NOTE(review): identifiers are not de-duplicated here, so an id listed
    # twice for a figure yields two rows — confirm whether that is intended.
    for category, mention_key, id_key, prefix in curie_sources:
        for identifier in associated["mentions"][mention_key][id_key]:
            figure_curie_df_data.append({
                "figure_id": figure_id,
                "curie": prefix + identifier,
                "category": category,
            })

    figure_metadata_records.append({
        "figure_id": figure_id,
        "figure_url": associated["figureUrl"],
        "figure_title": associated["title"],
    })

figure_metadata_df = pd.DataFrame.from_records(figure_metadata_records, index='figure_id')
# Save the metadata data frame to Dropbox (figure_id is written as the index column).
figure_metadata_df.to_csv('/Users/aagrawal/Dropbox (Gladstone)/pfocr2bte/figures_metadata.csv', index=True)


figure_curie_df = pd.DataFrame.from_records(figure_curie_df_data)
# Save the figure-to-CURIE data frame to Dropbox.
figure_curie_df.to_csv('/Users/aagrawal/Dropbox (Gladstone)/pfocr2bte/figures_curie.csv', index=False)