# Chemical Intersections

[GitHub issue](https://github.com/wikipathways/pathway-figure-ocr/issues/25)

In [2]:
import json
import os
import re
import sys
import tempfile
from pathlib import Path
from pprint import pprint

import numpy as np
import openpyxl
import pandas as pd
import requests
import requests_cache
import xlrd

# Install a global requests-cache cache named "pfocr_cache" so that repeated
# downloads made through `requests` in later cells can be served from the
# cache instead of hitting the network again.
requests_cache.install_cache("pfocr_cache")

## Load PFOCR Chemicals

In [3]:
pfocr_chemicals_2020_url = "https://www.dropbox.com/s/08tnkgyty3u8cr4/pfocr_chemical_export_20201130.tsv?dl=1"

# Download the PFOCR chemical export into a temporary file and parse it while
# the file is still open (it is deleted when the outer `with` block exits).
with tempfile.NamedTemporaryFile(suffix=".tsv") as f:
    # Kept so the name stays defined; the path is only valid inside this block.
    pfocr_chemicals_2020_path = f.name
    with requests.get(pfocr_chemicals_2020_url, stream=True, timeout=60) as r:
        # Fail loudly on an HTTP error instead of parsing an error page as TSV.
        r.raise_for_status()
        for chunk in r.iter_content(chunk_size=64 * 1024):
            f.write(chunk)
    # Push buffered bytes to the file and rewind before reading it back.
    f.flush()
    f.seek(0)

    # Parse from the open handle rather than re-opening the file by name:
    # re-opening an open NamedTemporaryFile by name does not work on Windows.
    pfocr_chemicals_2020_df = (
        pd.read_csv(f, sep="\t").rename(
            columns={
                "figure_id": "pfocr_id",
            }
        )
        # TODO: right now, lexicon_alias is either NaN or
        # it's identical to lexicon_term
        # .drop(columns=["lexicon_alias"])
    )

pfocr_chemicals_2020_df

Unnamed: 0,pfocr_id,matched_ocr_text,lexicon_alias,lexicon_term,lexicon_term_source,datasource,identifier,annotations,figure_nobe_count,figure_entrez_count
0,PMC100003__mb2410470011.jpg,bombesin,Bombesin,Bombesin,pubtator,MESH,D001839,neurotransmitter,5,5
1,PMC100008__mb2411709009.jpg,Bradykinin,Bradykinin,Bradykinin,pubtator,MESH,D001920,medication,8,52
2,PMC100008__mb2411709009.jpg,Isoproterenol,Isoproterenol,Isoproterenol,pubtator,MESH,D007545,medication,8,52
3,PMC100028__ii1110348001.jpg,Ethanolamines,Ethanolamines,Ethanolamines,pubtator,MESH,D004983,,0,0
4,PMC100028__ii1110348001.jpg,Fatty acids,Fatty Acids,Fatty Acids,pubtator,MESH,D005227,,0,0
...,...,...,...,...,...,...,...,...,...,...
120257,PMC99957__mb2310813001.jpg,Adenine,Adenine,Adenine,pubtator,MESH,D000225,medication,10,10
120258,PMC99957__mb2310813001.jpg,Guanine,Guanine,Guanine,pubtator,MESH,D006147,,10,10
120259,PMC99957__mb2310813001.jpg,Hypoxanthine,Hypoxanthine,Hypoxanthine,pubtator,MESH,D019271,,10,10
120260,PMC99957__mb2310813011.jpg,Adenine,Adenine,Adenine,pubtator,MESH,D000225,medication,2,2


In [4]:
# Check the claim that `lexicon_alias` is either missing or identical to
# `lexicon_term` by selecting every row where the two columns differ.
# NaN never compares equal to anything, so rows with a missing alias are
# selected too. (The previous version compared `lexicon_alias` with itself,
# which can only ever select the NaN rows.)
pfocr_chemicals_2020_df[
    (
        pfocr_chemicals_2020_df["lexicon_alias"]
        != pfocr_chemicals_2020_df["lexicon_term"]
    )
]

Unnamed: 0,pfocr_id,matched_ocr_text,lexicon_alias,lexicon_term,lexicon_term_source,datasource,identifier,annotations,figure_nobe_count,figure_entrez_count
13,PMC101494__ii1000773003.jpg,D-Ala,,Alanine,pfocr,MESH,D000409,amino_acid|medication,2,2
14,PMC101494__ii1000773003.jpg,L-Ala,,Alanine,pfocr,MESH,D000409,amino_acid|medication,2,2
25,PMC101920__jb1500344001.jpg,Thr,,Threonine,pfocr,MESH,D013912,amino_acid|medication,8,9
30,PMC102201__00-0335f8.jpg,Cys,,Cysteine,pfocr,MESH,D003545,amino_acid|medication,1,1
47,PMC102289__pp0516637001.jpg,Phe,,Phenylalanine,pfocr,MESH,D010649,amino_acid|medication,0,0
...,...,...,...,...,...,...,...,...,...,...
120159,PMC98962__mr0190007003.jpg,L-Ala,,Alanine,pfocr,MESH,D000409,amino_acid|medication,1,1
120160,PMC98962__mr0190007003.jpg,UDP MurNAc L-Ala,,Alanine,pfocr,MESH,D000409,amino_acid|medication,1,1
120161,PMC98962__mr0190007003.jpg,L-Lys,,Lysine,pfocr,MESH,D008239,amino_acid|medication,1,1
120162,PMC98962__mr0190007003.jpg,Lys,,Lysine,pfocr,MESH,D008239,amino_acid|medication,1,1


Get a normalized (really just a rough approximation) set of chemical names from PFOCR:

In [13]:
# Matches any single character that is not an ASCII letter or digit.
not_letters_numbers_re = re.compile("[^a-zA-Z0-9]")


def normalize(t):
    """Return a crude matching key for a chemical name.

    The value is converted with ``str``, case-folded, and then stripped of
    every character that is not an ASCII letter or digit.

    Note that non-string input is stringified first, so a float NaN yields
    the key ``"nan"``.
    """
    folded = str(t).casefold()
    return re.sub(not_letters_numbers_re, "", folded)

In [6]:
# Build the lookup set of normalized PFOCR chemical names from both the raw
# OCR matches and the lexicon terms.
# Missing values are dropped first: `normalize` stringifies its input, so a
# NaN would otherwise add the literal key "nan" to the set.
pfocr_chemicals_2020 = {
    normalize(name)
    for column in ("matched_ocr_text", "lexicon_term")
    for name in pfocr_chemicals_2020_df[column].dropna()
}
# A name consisting only of non-alphanumeric characters normalizes to "",
# which is not a meaningful key to match against.
pfocr_chemicals_2020.discard("")
len(pfocr_chemicals_2020)

15804

## CMap Chemicals

In [7]:
cmap_chemicals_url = (
    "https://www.dropbox.com/s/et5qkgl6gz9p8mn/CMap_Repurposing_Hub.xlsx?dl=1"
)

# Download the CMap Repurposing Hub workbook into a temporary file and parse
# it while the file is still open (it is deleted when the outer `with` exits).
with tempfile.NamedTemporaryFile(suffix=".xlsx") as f:
    # Kept so the name stays defined; the path is only valid inside this block.
    cmap_chemicals_path = f.name
    with requests.get(cmap_chemicals_url, stream=True, timeout=60) as r:
        # Fail loudly on an HTTP error instead of parsing an error page.
        r.raise_for_status()
        for chunk in r.iter_content(chunk_size=64 * 1024):
            f.write(chunk)
    # Push buffered bytes to the file and rewind before reading it back.
    f.flush()
    f.seek(0)

    # Parse from the open handle rather than re-opening the file by name:
    # re-opening an open NamedTemporaryFile by name does not work on Windows.
    cmap_chemicals_df = pd.read_excel(f, sheet_name="Sheet1")

cmap_chemicals_df

Unnamed: 0,Name,MOA,Target,Disease Area,Indication,Id,Phase
0,(R)-(-)-apomorphine,dopamine receptor agonist,"ADRA2A, ADRA2B, ADRA2C, CALY, DRD1, DRD2, DRD3...",neurology/psychiatry,Parkinson's Disease,"BRD-K76022557-003-28-9, BRD-K76022557-003-02-7...",Launched
1,(R)-(-)-rolipram,phosphodiesterase inhibitor,"PDE4A, PDE4B, PDE4C, PDE4D, PDE5A",,,"BRD-K75516118-001-04-1, BRD-K75516118-001-05-9...",Phase 1
2,(R)-baclofen,benzodiazepine receptor agonist,"GABBR1, GABBR2",,,"BRD-K62353271-001-04-7, BRD-K62353271-001-02-1",Phase 3
3,(S)-(+)-rolipram,phosphodiesterase inhibitor,"PDE4B, PDE4D",,,"BRD-K65856711-001-05-9, BRD-K65856711-001-03-6...",Phase 1
4,"[sar9,met(o2)11]-substance-p",tachykinin antagonist,TACR1,,,"BRD-K89787693-001-01-1, BRD-K89787693-001-02-9",Preclinical
...,...,...,...,...,...,...,...
6793,8-M-PDOT,melatonin receptor agonist,"MTNR1A, MTNR1B",,,"BRD-A07232941-001-03-5, BRD-A07232941-001-02-7",Preclinical
6794,80841-78-7,,,,,BRD-K03044000-001-01-8,Preclinical
6795,9-aminoacridine,,,,,"BRD-K00535541-001-04-8, BRD-K00535541-001-05-5...",Preclinical
6796,9-aminocamptothecin,topoisomerase inhibitor,TOP1,,,"BRD-K09291936-001-13-3, BRD-K09291936-001-14-9",Phase 2


In [8]:
# Flag each CMap compound whose normalized name also occurs in PFOCR.
# A missing name is never a match: `normalize` would stringify NaN to "nan",
# which could collide with an unrelated key in the PFOCR set.
cmap_chemicals_df["PFOCR"] = cmap_chemicals_df["Name"].notna() & (
    cmap_chemicals_df["Name"].apply(normalize).isin(pfocr_chemicals_2020)
)
cmap_chemicals_df

Unnamed: 0,Name,MOA,Target,Disease Area,Indication,Id,Phase,PFOCR
0,(R)-(-)-apomorphine,dopamine receptor agonist,"ADRA2A, ADRA2B, ADRA2C, CALY, DRD1, DRD2, DRD3...",neurology/psychiatry,Parkinson's Disease,"BRD-K76022557-003-28-9, BRD-K76022557-003-02-7...",Launched,False
1,(R)-(-)-rolipram,phosphodiesterase inhibitor,"PDE4A, PDE4B, PDE4C, PDE4D, PDE5A",,,"BRD-K75516118-001-04-1, BRD-K75516118-001-05-9...",Phase 1,False
2,(R)-baclofen,benzodiazepine receptor agonist,"GABBR1, GABBR2",,,"BRD-K62353271-001-04-7, BRD-K62353271-001-02-1",Phase 3,False
3,(S)-(+)-rolipram,phosphodiesterase inhibitor,"PDE4B, PDE4D",,,"BRD-K65856711-001-05-9, BRD-K65856711-001-03-6...",Phase 1,False
4,"[sar9,met(o2)11]-substance-p",tachykinin antagonist,TACR1,,,"BRD-K89787693-001-01-1, BRD-K89787693-001-02-9",Preclinical,False
...,...,...,...,...,...,...,...,...
6793,8-M-PDOT,melatonin receptor agonist,"MTNR1A, MTNR1B",,,"BRD-A07232941-001-03-5, BRD-A07232941-001-02-7",Preclinical,False
6794,80841-78-7,,,,,BRD-K03044000-001-01-8,Preclinical,False
6795,9-aminoacridine,,,,,"BRD-K00535541-001-04-8, BRD-K00535541-001-05-5...",Preclinical,False
6796,9-aminocamptothecin,topoisomerase inhibitor,TOP1,,,"BRD-K09291936-001-13-3, BRD-K09291936-001-14-9",Phase 2,False


TODO: I don't know how to export without the index column, so I exported the file and then opened it in Excel to delete that column manually. Is there a better way? (Yes: pass `index=False` to `DataFrame.to_excel`.)

In [9]:
# `index=False` keeps the DataFrame's row index out of the workbook, so the
# exported sheet has the same columns as the input plus "PFOCR" and needs no
# manual clean-up in Excel.
cmap_chemicals_df.to_excel(
    "CMap_Repurposing_Hub.xlsx", sheet_name="Sheet1", index=False
)

## Anke Chemicals

In [14]:
full_library_chemicals_url = "https://www.dropbox.com/s/taia99vu2tah6cy/Full%20Library%20Gladstone%20%28with%20cpd%20names%29_20171106.xlsx?dl=1"

full_library_chemicals_sheet_name = "Dec2016 fullset copy4 platemaps"

# Download the full-library workbook into a temporary file and parse it while
# the file is still open (it is deleted when the outer `with` block exits).
with tempfile.NamedTemporaryFile(suffix=".xlsx") as f:
    # Kept so the name stays defined; the path is only valid inside this block.
    full_library_chemicals_path = f.name
    with requests.get(full_library_chemicals_url, stream=True, timeout=60) as r:
        # Fail loudly on an HTTP error instead of parsing an error page.
        r.raise_for_status()
        for chunk in r.iter_content(chunk_size=64 * 1024):
            f.write(chunk)
    # Push buffered bytes to the file and rewind before reading it back.
    f.flush()
    f.seek(0)

    # Parse from the open handle rather than re-opening the file by name:
    # re-opening an open NamedTemporaryFile by name does not work on Windows.
    full_library_chemicals_df = pd.read_excel(
        f,
        sheet_name=full_library_chemicals_sheet_name,
    )

full_library_chemicals_df

Unnamed: 0,Plate ID,Well,2D barcode,Compound ID,conc (mM),vol (uL),plateset,compound name
0,AD000052,A/01,EMPTY,,10,1,Full GL library,
1,AD000052,A/02,EMPTY,,10,1,Full GL library,
2,AD000052,A/03,205064781,GL03080,10,1,Full GL library,Bortezomib (Velcade)
3,AD000052,A/04,179703632,GL03168,10,1,Full GL library,PLX-4720
4,AD000052,A/05,207363191,GL03088,10,1,Full GL library,Lapatinib Ditosylate (Tykerb)
...,...,...,...,...,...,...,...,...
6907,AD0000100,P/20,EMPTY,,10,10,Full GL library,
6908,AD0000100,P/21,1150964649,GL05462,10,10,Full GL library,KRIBB11
6909,AD0000100,P/22,EMPTY,,10,10,Full GL library,
6910,AD0000100,P/23,EMPTY,,10,10,Full GL library,


In [23]:
# Flag each library compound whose normalized name also occurs in PFOCR.
# Rows without a compound name (e.g. the EMPTY wells) are never a match:
# `normalize` would stringify their NaN to "nan", which could collide with an
# unrelated key in the PFOCR set.
full_library_chemicals_df["PFOCR"] = full_library_chemicals_df[
    "compound name"
].notna() & (
    full_library_chemicals_df["compound name"]
    .apply(normalize)
    .isin(pfocr_chemicals_2020)
)
full_library_chemicals_df

Unnamed: 0,Plate ID,Well,2D barcode,Compound ID,conc (mM),vol (uL),plateset,compound name,PFOCR
0,AD000052,A/01,EMPTY,,10,1,Full GL library,,False
1,AD000052,A/02,EMPTY,,10,1,Full GL library,,False
2,AD000052,A/03,205064781,GL03080,10,1,Full GL library,Bortezomib (Velcade),False
3,AD000052,A/04,179703632,GL03168,10,1,Full GL library,PLX-4720,True
4,AD000052,A/05,207363191,GL03088,10,1,Full GL library,Lapatinib Ditosylate (Tykerb),False
...,...,...,...,...,...,...,...,...,...
6907,AD0000100,P/20,EMPTY,,10,10,Full GL library,,False
6908,AD0000100,P/21,1150964649,GL05462,10,10,Full GL library,KRIBB11,False
6909,AD0000100,P/22,EMPTY,,10,10,Full GL library,,False
6910,AD0000100,P/23,EMPTY,,10,10,Full GL library,,False


TODO: I don't know how to export without the index column, so I exported the file and then opened it in Excel to delete that column manually. Is there a better way? (Yes: pass `index=False` to `DataFrame.to_excel`.)

In [24]:
# `index=False` keeps the DataFrame's row index out of the workbook, so the
# exported sheet has the same columns as the input plus "PFOCR" and needs no
# manual clean-up in Excel.
full_library_chemicals_df.to_excel(
    "Full Library Gladstone (with cpd names)_20171106_PFOCR.xlsx",
    sheet_name=full_library_chemicals_sheet_name,
    index=False,
)