# Get Figures

In [1]:
import json
import os
import time
from pathlib import Path

import magic
import numpy as np
import pandas as pd
import requests
from IPython.display import Image

## rpy2

In [2]:
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr

In [3]:
from functools import partial

from rpy2.ipython import html

# Rebind rpy2's notebook HTML renderer for R data frames so that every call
# gets table_class="docutils" preset.
# NOTE: re-running this cell wraps the already-wrapped function in another
# partial; the keyword is the same each time, so the nesting only adds layers.
html.html_rdataframe = partial(html.html_rdataframe, table_class="docutils")

In [4]:
# Turn on rpy2's pandas conversion for the whole session.
# NOTE(review): the load cell below also wraps its conversion in an explicit
# localconverter, and recent rpy2 releases discourage global activation in
# favour of local converters — confirm against the pinned rpy2 version.
pandas2ri.activate()
# Handle to R's "base" package (not referenced in the cells visible here).
base = importr("base")
# R's readRDS function, callable from Python to load .rds files.
readRDS = ro.r["readRDS"]



In [5]:
# Snapshot to load; also used by later cells to build output paths.
target_date = "20210513"

# Read the R data frame of figure metadata for this snapshot.
pmc_r_df = readRDS(
    f"../data/imagesdocsum_pathway_queries/{target_date}/pmc.df.all.rds"
)

# R column name -> name used in the rest of this notebook.
# "pmcid" and "caption" are deliberately left under their original names.
column_renames = {
    "figid": "pfocr_id",
    "filename": "figure_filename",
    "number": "figure_number",
    "figtitle": "figure_title",
    "papertitle": "paper_title",
    "figlink": "relative_figure_page_url",
    "reftext": "reference_text",
}

with localconverter(ro.default_converter + pandas2ri.converter):
    pmc_converted_df = ro.conversion.rpy2py(pmc_r_df)

# Build the absolute URLs in one chain, then drop the relative link they
# were derived from. Column order matches appending them one at a time.
pmc_df = (
    pmc_converted_df.rename(columns=column_renames)
    .assign(
        paper_link=lambda df: (
            "https://www.ncbi.nlm.nih.gov/pmc/articles/" + df["pmcid"]
        ),
        figure_page_url=lambda df: (
            "https://www.ncbi.nlm.nih.gov" + df["relative_figure_page_url"]
        ),
        figure_thumbnail_url=lambda df: (
            "https://www.ncbi.nlm.nih.gov/pmc/articles/"
            + df["pmcid"]
            + "/bin/"
            + df["figure_filename"]
        ),
    )
    .drop(columns=["relative_figure_page_url"])
)

pmc_df

Unnamed: 0,pfocr_id,pmcid,figure_filename,figure_number,figure_title,paper_title,caption,reference_text,paper_link,figure_page_url,figure_thumbnail_url
1,PMC7970325__gr12.jpg,PMC7970325,gr12.jpg,Figure 12,Average PAL values for 36 differential DNA rep...,DNA repair pathway activation features in foll...,Average PAL values for 36 differential DNA rep...,"Uliana Vladimirova, et al. Heliyon. 2021 Mar;7...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...
2,PMC7847494__ijcep0014-0097-f3.jpg,PMC7847494,ijcep0014-0097-f3.jpg,Figure 3,GSEA enrichment analysis of PDK2-4 in breast c...,Expression and clinical significance of PDK fa...,GSEA enrichment analysis of PDK2-4 in breast c...,"Jian Xu, et al. Int J Clin Exp Pathol. 2021;14...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...
3,PMC6137129__fimmu-09-02055-g0006.jpg,PMC6137129,fimmu-09-02055-g0006.jpg,Figure 6,Overrepresented KEGG metabolic and signaling p...,Molecular and Cellular Response to Experimenta...,Overrepresented KEGG metabolic and signaling p...,"Ivana Bušelić, et al. Front Immunol. 2018;9:2055.",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...
4,PMC6721529__cells-08-00806-g003.jpg,PMC6721529,cells-08-00806-g003.jpg,Figure 3,Canonical pathway analysis of proteins identif...,Impact of Heat Shock Protein 90 Inhibition on ...,Canonical pathway analysis of proteins identif...,"Ángela Marrugal, et al. Cells. 2019 Aug;8(8):806.",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...
5,PMC7970325__gr17.jpg,PMC7970325,gr17.jpg,Figure 17,PAL distribution for 38 DNA repair pathways in...,DNA repair pathway activation features in foll...,PAL distribution for 38 DNA repair pathways in...,"Uliana Vladimirova, et al. Heliyon. 2021 Mar;7...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...
...,...,...,...,...,...,...,...,...,...,...,...
124560,PMC7352181__cancers-12-01457-g002.jpg,PMC7352181,cancers-12-01457-g002.jpg,Figure 2,Angiotensin-associated pathways associated wit...,Renin-Angiotensin System in Lung Tumor and Mic...,Angiotensin-associated pathways associated wit...,"Maria Joana Catarata, et al. Cancers (Basel). ...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...
124561,PMC6952829__bloodBLD2019001438f2.jpg,PMC6952829,bloodBLD2019001438f2.jpg,Figure 2,Concurrent activation of NF-κB and Notch signa...,Coactivation of NF-κB and Notch signaling is s...,Concurrent activation of NF-κB and Notch signa...,"Yan Xiu, et al. Blood. 2020 Jan 9;135(2):108-120.",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...
124562,PMC7777525__gr2.jpg,PMC7777525,gr2.jpg,Fig. 2,Schematic representation of type I to type VI ...,Secrete or perish: The role of secretion syste...,Schematic representation of type I to type VI ...,"Cristina E. Alvarez-Martinez, et al. Comput St...",https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...
124563,PMC6092289__TJP-596-3469-g003.jpg,PMC6092289,TJP-596-3469-g003.jpg,Figure 2,,The negotiated equilibrium model of spinal cor...,"A, left: soleus EMG is monitored 24 h per day ...",Jonathan R. Wolpaw. J Physiol. 2018 Aug 15;596...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...


In [6]:
# Directory that receives the downloaded figures for this snapshot date.
images_dir = Path(f"../data/images/{target_date}")
# Create it (and any missing parents) up front; a no-op if it already exists.
images_dir.mkdir(parents=True, exist_ok=True)

In [8]:
# Download every figure thumbnail listed in pmc_df into images_dir.
#
# The loop is resumable: a figure whose target file already exists is skipped,
# so re-running the cell only fetches what is still missing. Problems (network
# errors, non-200 responses, payloads that are not JPEG) are appended to
# log_file_path, one entry per line.
wait_sec = 0.25  # pause between requests, in seconds
request_timeout_sec = 30  # abort a stalled request instead of hanging the run

downloaded_images_count_path = Path(
    f"../data/images/{target_date}/downloaded_images_count.log"
)
log_file_path = "../data/dead_links1.log"

# Loop-invariant, so built once rather than on every iteration.
headers = {
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:86.0) Gecko/20100101 Firefox/86.0"
}
total_figures = len(pmc_df)


def _append_to_log(message):
    """Append ``message`` to the dead-links log as its own line."""
    with open(log_file_path, "a") as f:
        f.write(f"{message}\n")


# A single Session reuses the underlying connection across requests.
with requests.Session() as session:
    for i, pmc_row in pmc_df.iterrows():
        pfocr_id = pmc_row["pfocr_id"]
        figure_thumbnail_url = pmc_row["figure_thumbnail_url"]

        figure_path = images_dir.joinpath(pfocr_id)
        if figure_path.exists():
            # Already fetched on a previous run.
            continue

        try:
            response = session.get(
                figure_thumbnail_url,
                headers=headers,
                timeout=request_timeout_sec,
            )
        except requests.RequestException as exc:
            # Record the failure and keep going; no file is written, so the
            # next run of this cell retries this figure.
            print(f"Request failed for {figure_thumbnail_url}: {exc!r}")
            _append_to_log(
                f"error {type(exc).__name__}: {figure_thumbnail_url}\t{exc}"
            )
            time.sleep(wait_sec)
            continue

        if response.status_code == 200:
            figure_path.write_bytes(response.content)

            filetype = magic.from_buffer(response.content)
            if "JPEG image data" not in filetype:
                # NOTE(review): the payload is still saved under figure_path,
                # so it will be skipped on re-runs even though it is not a
                # JPEG — confirm whether such files should be kept.
                _append_to_log(
                    f"get {response.status_code}: {figure_thumbnail_url}\t{filetype}"
                )
                print(filetype)
                display(Image(filename=figure_path))
                print(response.content)
        else:
            print(f"Got {response.status_code} for {figure_thumbnail_url}")
            print(response.content)
            _append_to_log(
                f"get {response.status_code}: {figure_thumbnail_url}\t{response.content}"
            )

        # Overwrite the progress marker with the index label just processed.
        downloaded_images_count_path.write_text(f"{i} of {total_figures}\n")

        time.sleep(wait_sec)

Got 404 for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7127984/bin/EMS85824-f003.jpg
Got 404 for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7127984/bin/EMS85824-f005.jpg
Got 404 for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7237067/bin/nihms-1584862-f0012.jpg
Got 404 for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6489686/bin/pkz019f1.jpg
Got 404 for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6369880/bin/ct9-10-e00004-g004.jpg
Got 404 for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6489686/bin/pkz019f2.jpg
Got 404 for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6987283/bin/40169_2020_260_Fig3_HTML.jpg
Got 404 for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6039262/bin/emss-77884-f003.jpg
Got 404 for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7381384/bin/EMS117979-f018.jpg
Got 404 for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7923860/bin/gr3_lrg.jpg
Got 404 for https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6987283/bin/40169_2020_260_Fig5_HTML.jpg
Got 404 for https://www