In [None]:
import os
import sys

# Resolve the directory three levels above the current working directory and
# add it to the module search path; the resolved location is echoed.
src_path = os.path.abspath("../../..")
sys.path.append(src_path)
print(src_path)

In [None]:
# Absolute path of the "resource" folder under the current working directory;
# the cells below read inputs from its raw/ subfolder and write CSVs to its
# processed/ subfolder.
resource_directory = os.path.abspath("./resource")
resource_directory

In [None]:
import pandas as pd
import re

## ICD9CM

### base

In [None]:
""" https://bioportal.bioontology.org/ontologies/ICD9CM """

# Build the ICD9CM (diagnosis) code table: code, parent code and name.
raw_data = pd.read_csv(os.path.join(resource_directory, "raw/ICD9CM.csv"))
# "Class ID" and "Parents" are "/"-separated identifiers: the last segment is
# taken as the code and the second-to-last segment as the vocabulary name.
raw_data["code"] = raw_data["Class ID"].apply(lambda x: x.split("/")[-1])
raw_data["vocab"] = raw_data["Class ID"].apply(lambda x: x.split("/")[-2])
# Rows without a parent get an empty string instead of NaN.
raw_data["parent_code"] = raw_data["Parents"].apply(
    lambda x: x.split("/")[-1] if not pd.isna(x) else ""
)
raw_data["parent_vocab"] = raw_data["Parents"].apply(
    lambda x: x.split("/")[-2] if not pd.isna(x) else ""
)
raw_data["name"] = raw_data["Preferred Label"]
# exclude non icd9 codes
raw_data = raw_data[raw_data.vocab == "ICD9CM"]
# exclude icd9proc codes
# icd9cm codes: 001-999.99, icd9proc: 00.00-99.99
# Keep codes whose part before the first "." or "-" is longer than two
# characters. The pattern is a raw string so that "\." is not parsed as an
# (invalid) string escape sequence.
raw_data = raw_data[raw_data.code.apply(lambda x: len(re.split(r"\.|-", x)[0]) > 2)]
data = raw_data[["code", "parent_code", "name"]]
# exclude non ICD9CM parent code
# Any parent that is not itself a remaining code is blanked out.
invalid_parents = list(set(data.parent_code.unique()) - set(data.code.unique()))
data = data.replace({"parent_code": invalid_parents}, "")
data.to_csv(os.path.join(resource_directory, "processed/ICD9CM.csv"), index=False)
data.head()

### to CCSCM

In [None]:
def normalize_icd9cm(code: str):
    """Insert the decimal point into an undotted ICD9CM code.

    Codes starting with "E" keep four characters before the dot, all other
    codes keep three. A code with nothing after that prefix is returned
    unchanged. A code shorter than the prefix fails the assertion.
    """
    prefix_len = 4 if code.startswith("E") else 3
    assert len(code) >= prefix_len
    head, tail = code[:prefix_len], code[prefix_len:]
    return f"{head}.{tail}" if tail else head


normalize_icd9cm("01000")

In [None]:
"""https://www.hcup-us.ahrq.gov/toolssoftware/ccs/ccs.jsp"""

# Parse the ICD9CM -> CCSCM crosswalk by hand: skip the first three lines
# (presumably header lines), split each row on commas and strip the single
# quotes and padding around the first two fields.
with open(os.path.join(resource_directory, "raw/$dxref 2015.csv")) as f:
    rows = f.readlines()[3:]

mapping = {}
for row in rows:
    fields = row.split(",")
    icd9cm_code = fields[0].strip("'").strip()
    ccscm_code = fields[1].strip("'").strip()
    # an ICD9CM code may occur at most once in the file
    assert icd9cm_code not in mapping
    mapping[icd9cm_code] = ccscm_code

data = pd.DataFrame(
    {"ICD9CM": list(mapping.keys()), "CCSCM": list(mapping.values())}
)
# add the decimal point to the ICD9CM codes before exporting
data["ICD9CM"] = data["ICD9CM"].map(normalize_icd9cm)
data.to_csv(
    os.path.join(resource_directory, "processed/ICD9CM_to_CCSCM.csv"), index=False
)
data.head()

## CCSCM

### base

In [None]:
""" https://www.hcup-us.ahrq.gov/toolssoftware/ccs/ccs.jsp """

# Collect CCSCM code -> label from the label file: the first four lines are
# skipped (presumably header lines) and the first two comma-separated fields
# are stripped of single quotes and padding.
labels = {}
with open(os.path.join(resource_directory, "raw/dxlabel 2015.csv")) as f:
    for row in f.readlines()[4:]:
        fields = row.split(",")
        code = fields[0].strip("'").strip()
        name = fields[1].strip("'").strip()
        labels[code] = {"name": name}

# one row per code, with the code moved from the index into a "code" column
data = (
    pd.DataFrame.from_dict(labels, orient="index")
    .rename_axis("code")
    .reset_index()
)
data.to_csv(os.path.join(resource_directory, "processed/CCSCM.csv"), index=False)
data.head()

## ICD9PROC

### base

In [None]:
""" https://bioportal.bioontology.org/ontologies/ICD9CM """

# Build the ICD9PROC (procedure) code table: code, parent code and name.
raw_data = pd.read_csv(os.path.join(resource_directory, "raw/ICD9CM.csv"))
# "Class ID" and "Parents" are "/"-separated identifiers: the last segment is
# taken as the code and the second-to-last segment as the vocabulary name.
raw_data["code"] = raw_data["Class ID"].apply(lambda x: x.split("/")[-1])
raw_data["vocab"] = raw_data["Class ID"].apply(lambda x: x.split("/")[-2])
# Rows without a parent get an empty string instead of NaN.
raw_data["parent_code"] = raw_data["Parents"].apply(
    lambda x: x.split("/")[-1] if not pd.isna(x) else ""
)
raw_data["parent_vocab"] = raw_data["Parents"].apply(
    lambda x: x.split("/")[-2] if not pd.isna(x) else ""
)
raw_data["name"] = raw_data["Preferred Label"]
# exclude non icd9 codes
raw_data = raw_data[raw_data.vocab == "ICD9CM"]
# exclude icd9cm codes
# icd9cm codes: 001-999.99, icd9proc: 00.00-99.99
# Keep codes whose part before the first "." or "-" has at most two
# characters. The pattern is a raw string so that "\." is not parsed as an
# (invalid) string escape sequence.
raw_data = raw_data[raw_data.code.apply(lambda x: len(re.split(r"\.|-", x)[0]) <= 2)]
data = raw_data[["code", "parent_code", "name"]]
# exclude non icd9proc parent code
# Any parent that is not itself a remaining code is blanked out.
invalid_parents = list(set(data.parent_code.unique()) - set(data.code.unique()))
data = data.replace({"parent_code": invalid_parents}, "")
data.to_csv(os.path.join(resource_directory, "processed/ICD9PROC.csv"), index=False)
data.head()

### to CCSPROC

In [None]:
def normalize_icd9proc(code: str):
    """Insert the decimal point into an undotted ICD9PROC code.

    The first two characters go before the dot and the rest after it. A
    two-character code is returned unchanged; shorter input fails the
    assertion.
    """
    assert len(code) >= 2
    major, minor = code[:2], code[2:]
    if not minor:
        return major
    return ".".join((major, minor))


normalize_icd9proc("6111")

In [None]:
"""https://www.hcup-us.ahrq.gov/toolssoftware/ccs/ccs.jsp"""

# Parse the ICD9PROC -> CCSPROC crosswalk by hand: skip the first three lines
# (presumably header lines), split each row on commas and strip the single
# quotes and padding around the first two fields.
with open(os.path.join(resource_directory, "raw/$prref 2015.csv")) as f:
    rows = f.readlines()[3:]

mapping = {}
for row in rows:
    fields = row.split(",")
    icd9proc_code = fields[0].strip("'").strip()
    ccsproc_code = fields[1].strip("'").strip()
    # an ICD9PROC code may occur at most once in the file
    assert icd9proc_code not in mapping
    mapping[icd9proc_code] = ccsproc_code

data = pd.DataFrame(
    {"ICD9PROC": list(mapping.keys()), "CCSPROC": list(mapping.values())}
)
# add the decimal point to the ICD9PROC codes before exporting
data["ICD9PROC"] = data["ICD9PROC"].map(normalize_icd9proc)
data.to_csv(
    os.path.join(resource_directory, "processed/ICD9PROC_to_CCSPROC.csv"), index=False
)
data.head()

## CCSPROC

### base

In [None]:
""" https://www.hcup-us.ahrq.gov/toolssoftware/ccs/ccs.jsp """

# Collect CCSPROC code -> label from the label file: the first four lines are
# skipped (presumably header lines) and the first two comma-separated fields
# are stripped of double quotes and padding.
labels = {}
with open(os.path.join(resource_directory, "raw/prlabel 2014.csv")) as f:
    for row in f.readlines()[4:]:
        fields = row.split(",")
        code = fields[0].strip('"').strip()
        name = fields[1].strip('"').strip()
        labels[code] = {"name": name}

# one row per code, with the code moved from the index into a "code" column
data = (
    pd.DataFrame.from_dict(labels, orient="index")
    .rename_axis("code")
    .reset_index()
)
data.to_csv(os.path.join(resource_directory, "processed/CCSPROC.csv"), index=False)
data.head()

## ICD10CM

### base

In [None]:
""" https://bioportal.bioontology.org/ontologies/ICD10CM """

raw_data = pd.read_csv(os.path.join(resource_directory, "raw/ICD10CM.csv"))

# "Class ID" and "Parents" are "/"-separated identifiers: the last segment is
# taken as the code and the second-to-last segment as the vocabulary name.
for column, position in (("code", -1), ("vocab", -2)):
    raw_data[column] = raw_data["Class ID"].map(
        lambda uri, i=position: uri.split("/")[i]
    )
# a missing parent becomes an empty string
for column, position in (("parent_code", -1), ("parent_vocab", -2)):
    raw_data[column] = raw_data["Parents"].map(
        lambda uri, i=position: "" if pd.isna(uri) else uri.split("/")[i]
    )
raw_data["name"] = raw_data["Preferred Label"]

# exclude non icd10cm codes
raw_data = raw_data.loc[raw_data["vocab"] == "ICD10CM"]
data = raw_data.loc[:, ["code", "parent_code", "name"]]

# exclude non icd10cm parent code: parents that are not remaining codes are blanked
invalid_parents = list(set(data["parent_code"]) - set(data["code"]))
data = data.replace({"parent_code": invalid_parents}, "")
data.to_csv(os.path.join(resource_directory, "processed/ICD10CM.csv"), index=False)
data.head()

### to CCSCM

In [None]:
def normalize_icd10cm(code: str):
    """Insert the decimal point into an undotted ICD10CM code.

    The first three characters go before the dot and the rest after it. A
    three-character code is returned unchanged; shorter input fails the
    assertion.
    """
    assert len(code) >= 3
    category, extension = code[:3], code[3:]
    return category if extension == "" else category + "." + extension


normalize_icd10cm("Y9284")

In [None]:
"""https://www.hcup-us.ahrq.gov/toolssoftware/ccsr/ccsr_archive.jsp#ccsr"""

# Parse the ICD10CM -> CCSCM crosswalk by hand: skip the first line
# (presumably the header), split each row on commas and strip the single
# quotes and padding around the first two fields.
with open(os.path.join(resource_directory, "raw/ccs_dx_icd10cm_2019_1.csv")) as f:
    rows = f.readlines()[1:]

mapping = {}
for row in rows:
    fields = row.split(",")
    icd10cm_code = fields[0].strip("'").strip()
    ccscm_code = fields[1].strip("'").strip()
    # an ICD10CM code may occur at most once in the file
    assert icd10cm_code not in mapping
    mapping[icd10cm_code] = ccscm_code

data = pd.DataFrame(
    {"ICD10CM": list(mapping.keys()), "CCSCM": list(mapping.values())}
)
# add the decimal point to the ICD10CM codes before exporting
data["ICD10CM"] = data["ICD10CM"].map(normalize_icd10cm)
data.to_csv(
    os.path.join(resource_directory, "processed/ICD10CM_to_CCSCM.csv"), index=False
)
data.head()

## ICD10PROC

### base

In [None]:
""" https://bioportal.bioontology.org/ontologies/ICD10PCS """

raw_data = pd.read_csv(os.path.join(resource_directory, "raw/ICD10PCS.csv"))

# "Class ID" and "Parents" are "/"-separated identifiers: the last segment is
# taken as the code and the second-to-last segment as the vocabulary name.
for column, position in (("code", -1), ("vocab", -2)):
    raw_data[column] = raw_data["Class ID"].map(
        lambda uri, i=position: uri.split("/")[i]
    )
# a missing parent becomes an empty string
for column, position in (("parent_code", -1), ("parent_vocab", -2)):
    raw_data[column] = raw_data["Parents"].map(
        lambda uri, i=position: "" if pd.isna(uri) else uri.split("/")[i]
    )
raw_data["name"] = raw_data["Preferred Label"]

# exclude non icd10proc codes
raw_data = raw_data.loc[raw_data["vocab"] == "ICD10PCS"]
data = raw_data.loc[:, ["code", "parent_code", "name"]]

# exclude non icd10proc parent code: parents that are not remaining codes are blanked
invalid_parents = list(set(data["parent_code"]) - set(data["code"]))
data = data.replace({"parent_code": invalid_parents}, "")
data.to_csv(os.path.join(resource_directory, "processed/ICD10PROC.csv"), index=False)
data.head()

### to CCSPROC

In [None]:
"""https://www.hcup-us.ahrq.gov/toolssoftware/ccs10/ccs10.jsp"""

# Parse the ICD10PROC -> CCSPROC crosswalk by hand: skip the first line
# (presumably the header), split each row on commas and strip the single
# quotes and padding around the first two fields.
with open(os.path.join(resource_directory, "raw/ccs_pr_icd10pcs_2019_1.csv")) as f:
    rows = f.readlines()[1:]

mapping = {}
for row in rows:
    fields = row.split(",")
    icd10proc_code = fields[0].strip("'").strip()
    ccsproc_code = fields[1].strip("'").strip()
    # an ICD10PROC code may occur at most once in the file
    assert icd10proc_code not in mapping
    mapping[icd10proc_code] = ccsproc_code

# the codes are exported as read, without inserting a decimal point
data = pd.DataFrame(
    {"ICD10PROC": list(mapping.keys()), "CCSPROC": list(mapping.values())}
)
data.to_csv(
    os.path.join(resource_directory, "processed/ICD10PROC_to_CCSPROC.csv"), index=False
)
data.head()

## NDC

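Download the NDC, RxNorm, and ATC vocabularies from https://athena.ohdsi.org/vocabulary/list and place the resulting `CONCEPT.csv`, `CONCEPT_RELATIONSHIP.csv`, and `CONCEPT_ANCESTOR.csv` files in `resource/raw/NDC_RxNorm_ATC/` (the folder named by `foldername` below).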

In [None]:
# Subfolder of raw/ (inside resource_directory) from which the following cells
# read CONCEPT.csv, CONCEPT_RELATIONSHIP.csv and CONCEPT_ANCESTOR.csv.
foldername = "NDC_RxNorm_ATC"

In [None]:
# Load the tab-separated CONCEPT table, reading the id / vocabulary / class /
# code columns as strings.
concept = pd.read_csv(
    os.path.join(resource_directory, f"raw/{foldername}/CONCEPT.csv"),
    sep="\t",
    dtype={
        column: str
        for column in ("concept_id", "vocabulary_id", "concept_class_id", "concept_code")
    },
)
concept.head()

In [None]:
# Load the tab-separated CONCEPT_RELATIONSHIP table, reading both concept ids
# and the relationship id as strings.
concept_relationship = pd.read_csv(
    os.path.join(resource_directory, f"raw/{foldername}/CONCEPT_RELATIONSHIP.csv"),
    sep="\t",
    dtype={
        column: str
        for column in ("concept_id_1", "concept_id_2", "relationship_id")
    },
)
concept_relationship

In [None]:
# Load the tab-separated CONCEPT_ANCESTOR table, reading the ancestor and
# descendant concept ids as strings.
concept_ancestor = pd.read_csv(
    os.path.join(resource_directory, f"raw/{foldername}/CONCEPT_ANCESTOR.csv"),
    sep="\t",
    dtype={
        column: str
        for column in ("ancestor_concept_id", "descendant_concept_id")
    },
)
concept_ancestor

### base

In [None]:
# Build the NDC code/name table from the concept table.
# The two needed columns are selected and renamed in one chain, which yields a
# new frame; assigning new columns onto the boolean-filtered slice of
# `concept` (as done previously) triggers pandas' SettingWithCopyWarning.
data = (
    concept.loc[concept.vocabulary_id == "NDC", ["concept_code", "concept_name"]]
    .rename(columns={"concept_code": "code", "concept_name": "name"})
    .drop_duplicates()
    .dropna()
)
data.to_csv(os.path.join(resource_directory, "processed/NDC.csv"), index=False)
data.head()

### to RxNorm

In [None]:
# Source and target concept sets.
concept_ndc = concept[concept.vocabulary_id == "NDC"]
concept_rxnorm = concept[concept.vocabulary_id == "RxNorm"]
# Keep only "Maps to" relationships that carry no invalid_reason.
concept_relationship_maps_to = concept_relationship[
    (concept_relationship.relationship_id == "Maps to")
    & concept_relationship.invalid_reason.isna()
]

# NDC concept -> "Maps to" relationship -> RxNorm concept
ndc_rxnorm = (
    concept_ndc.merge(
        concept_relationship_maps_to,
        left_on="concept_id",
        right_on="concept_id_1",
        how="inner",
        suffixes=("_ndc", "_r"),
    )
    .merge(
        concept_rxnorm,
        left_on="concept_id_2",
        right_on="concept_id",
        how="inner",
        suffixes=("_ndc", "_rxnorm"),
    )
    .loc[
        :,
        [
            "concept_id_ndc",
            "concept_name_ndc",
            "concept_code_ndc",
            "concept_id_rxnorm",
            "concept_name_rxnorm",
            "concept_code_rxnorm",
        ],
    ]
)

# Export only the code pair.
data = pd.DataFrame(
    {
        "NDC": ndc_rxnorm["concept_code_ndc"],
        "RxNorm": ndc_rxnorm["concept_code_rxnorm"],
    }
)
data.to_csv(
    os.path.join(resource_directory, "processed/NDC_to_RxNorm.csv"), index=False
)
data.head()

### to ATC

In [None]:
# first convert NDC to RxNorm
concept_ndc = concept[concept.vocabulary_id == "NDC"]
concept_rxnorm = concept[concept.vocabulary_id == "RxNorm"]
# keep only "Maps to" relationships that carry no invalid_reason
concept_relationship_maps_to = concept_relationship[
    (concept_relationship.relationship_id == "Maps to")
    & concept_relationship.invalid_reason.isna()
]

ndc_rxnorm = (
    concept_ndc.merge(
        concept_relationship_maps_to,
        left_on="concept_id",
        right_on="concept_id_1",
        how="inner",
        suffixes=("_ndc", "_r"),
    )
    .merge(
        concept_rxnorm,
        left_on="concept_id_2",
        right_on="concept_id",
        how="inner",
        suffixes=("_ndc", "_rxnorm"),
    )
    .loc[
        :,
        [
            "concept_id_ndc",
            "concept_name_ndc",
            "concept_code_ndc",
            "concept_id_rxnorm",
            "concept_name_rxnorm",
            "concept_code_rxnorm",
        ],
    ]
)

# then convert RxNorm to ATC5
concept_atc5 = concept[
    (concept.vocabulary_id == "ATC") & (concept.concept_class_id == "ATC 5th")
]

# RxNorm concept (as descendant) -> ancestor rows -> ATC 5th-level concept
ndc_rxnorm_atc5 = (
    ndc_rxnorm.merge(
        concept_ancestor,
        left_on="concept_id_rxnorm",
        right_on="descendant_concept_id",
        how="inner",
        suffixes=("_rxnorm", "_r"),
    )
    .merge(
        concept_atc5,
        left_on="ancestor_concept_id",
        right_on="concept_id",
        how="inner",
        suffixes=("_rxnorm", "_atc"),
    )
    .rename(
        columns={
            "concept_id": "concept_id_atc5",
            "concept_name": "concept_name_atc5",
            "concept_code": "concept_code_atc5",
        }
    )
    .loc[
        :,
        [
            "concept_id_ndc",
            "concept_name_ndc",
            "concept_code_ndc",
            "concept_id_rxnorm",
            "concept_name_rxnorm",
            "concept_code_rxnorm",
            "concept_id_atc5",
            "concept_name_atc5",
            "concept_code_atc5",
        ],
    ]
)

# export only the NDC / ATC code pair
data = pd.DataFrame(
    {
        "NDC": ndc_rxnorm_atc5["concept_code_ndc"],
        "ATC": ndc_rxnorm_atc5["concept_code_atc5"],
    }
)
data.to_csv(os.path.join(resource_directory, "processed/NDC_to_ATC.csv"), index=False)
data.head()

## RxNorm

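Download the NDC, RxNorm, and ATC vocabularies from https://athena.ohdsi.org/vocabulary/list and place the resulting `CONCEPT.csv`, `CONCEPT_RELATIONSHIP.csv`, and `CONCEPT_ANCESTOR.csv` files in `resource/raw/NDC_RxNorm_ATC/` (the folder named by `foldername` below).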

In [None]:
# Subfolder of raw/ (inside resource_directory) from which the following cells
# read CONCEPT.csv, CONCEPT_RELATIONSHIP.csv and CONCEPT_ANCESTOR.csv.
foldername = "NDC_RxNorm_ATC"

In [None]:
# Load the tab-separated CONCEPT table, reading the id / vocabulary / class /
# code columns as strings.
concept = pd.read_csv(
    os.path.join(resource_directory, f"raw/{foldername}/CONCEPT.csv"),
    sep="\t",
    dtype={
        column: str
        for column in ("concept_id", "vocabulary_id", "concept_class_id", "concept_code")
    },
)
concept.head()

In [None]:
# Load the tab-separated CONCEPT_RELATIONSHIP table, reading both concept ids
# and the relationship id as strings.
concept_relationship = pd.read_csv(
    os.path.join(resource_directory, f"raw/{foldername}/CONCEPT_RELATIONSHIP.csv"),
    sep="\t",
    dtype={
        column: str
        for column in ("concept_id_1", "concept_id_2", "relationship_id")
    },
)
concept_relationship

In [None]:
# Load the tab-separated CONCEPT_ANCESTOR table, reading the ancestor and
# descendant concept ids as strings.
concept_ancestor = pd.read_csv(
    os.path.join(resource_directory, f"raw/{foldername}/CONCEPT_ANCESTOR.csv"),
    sep="\t",
    dtype={
        column: str
        for column in ("ancestor_concept_id", "descendant_concept_id")
    },
)
concept_ancestor

### base

In [None]:
# Build the RxNorm code/name table from the concept table.
# The two needed columns are selected and renamed in one chain, which yields a
# new frame; assigning new columns onto the boolean-filtered slice of
# `concept` (as done previously) triggers pandas' SettingWithCopyWarning.
data = (
    concept.loc[concept.vocabulary_id == "RxNorm", ["concept_code", "concept_name"]]
    .rename(columns={"concept_code": "code", "concept_name": "name"})
    .drop_duplicates()
    .dropna()
)
data.to_csv(os.path.join(resource_directory, "processed/RxNorm.csv"), index=False)
data.head()

### to ATC

In [None]:
# Source concepts (RxNorm) and target concepts (ATC, 5th level only).
concept_rxnorm = concept[concept.vocabulary_id == "RxNorm"]
concept_atc5 = concept[
    (concept.vocabulary_id == "ATC") & (concept.concept_class_id == "ATC 5th")
]

# RxNorm concept (as descendant) -> ancestor rows -> ATC 5th-level concept
rxnorm_atc5 = (
    concept_rxnorm.merge(
        concept_ancestor,
        left_on="concept_id",
        right_on="descendant_concept_id",
        how="inner",
        suffixes=("_rxnorm", "_r"),
    )
    .merge(
        concept_atc5,
        left_on="ancestor_concept_id",
        right_on="concept_id",
        how="inner",
        suffixes=("_rxnorm", "_atc"),
    )
    .loc[
        :,
        [
            "concept_id_rxnorm",
            "concept_name_rxnorm",
            "concept_code_rxnorm",
            "concept_id_atc",
            "concept_name_atc",
            "concept_code_atc",
        ],
    ]
)

# export only the RxNorm / ATC code pair
data = pd.DataFrame(
    {
        "RxNorm": rxnorm_atc5["concept_code_rxnorm"],
        "ATC": rxnorm_atc5["concept_code_atc"],
    }
)
data.to_csv(
    os.path.join(resource_directory, "processed/RxNorm_to_ATC.csv"), index=False
)
data.head()

## ATC

### base

In [None]:
def get_atc_parent(code: str):
    """Return the parent of an ATC code, found by truncating the code.

    Codes of length 7, 5, 4 and 3 are cut to their first 5, 4, 3 and 1
    characters respectively. Any other length yields an empty string.
    """
    parent_length = {7: 5, 5: 4, 4: 3, 3: 1}.get(len(code))
    return "" if parent_length is None else code[:parent_length]


get_atc_parent("V10XA53")

In [None]:
"""https://bioportal.bioontology.org/ontologies/ATC"""

# Build the ATC table (code, parent_code, name, level) from the raw export.
atc = pd.read_csv(os.path.join(resource_directory, "raw/ATC.csv"))
# .copy() makes the column subset an independent frame, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
atc = atc[["Class ID", "Preferred Label", "ATC LEVEL"]].copy()
# keep only the last "/"-separated segment of the class id as the code
atc["Class ID"] = atc["Class ID"].apply(lambda x: x.split("/")[-1])
atc = atc.dropna().drop_duplicates()
atc["parent_code"] = atc["Class ID"].map(get_atc_parent)
atc = atc.sort_values(by=["ATC LEVEL", "Class ID"])
# Rename by label rather than by position, so the mapping stays correct even
# if the column order above ever changes.
atc = atc.rename(
    columns={"Class ID": "code", "Preferred Label": "name", "ATC LEVEL": "level"}
)
atc = atc[["code", "parent_code", "name", "level"]]
atc.head()

Additional info: attach the DrugBank description, indication, SMILES string, and DrugBank ID to each ATC code.

In [None]:
"""https://go.drugbank.com/releases/latest"""

# Load the DrugBank export; missing values become empty strings.
drugbank = pd.read_csv(
    os.path.join(resource_directory, "raw/drugs_info_5_1_8.csv").replace("\\", "/")
).fillna("")
# atc_codes holds "|"-separated codes: split them and give each code its own row.
drugbank["atc_codes"] = [codes.split("|") for codes in drugbank["atc_codes"]]
drugbank = drugbank.explode("atc_codes")[
    ["drugbank_id", "description", "indication", "atc_codes", "smiles"]
]

# Left join so that ATC codes without a DrugBank entry are kept.
atc = atc.merge(drugbank, left_on="code", right_on="atc_codes", how="left").loc[
    :,
    [
        "code",
        "parent_code",
        "name",
        "level",
        "description",
        "indication",
        "smiles",
        "drugbank_id",
    ],
]
atc.to_csv(os.path.join(resource_directory, "processed/ATC.csv"), index=False)
atc.head()

### to ICD9CM

In [None]:
"""https://www.vumc.org/cpm/cpm-blog/medi-ensemble-medication-indication-resource-0"""

# Build the ATC -> ICD9CM table from the MEDI file.
medi = pd.read_csv(os.path.join(resource_directory, "raw/MEDI_11242015.csv"))
# Keep rows flagged HSP == 1 (presumably the high-precision subset -- confirm
# against the MEDI documentation). .copy() detaches the filtered rows so the
# CODE assignment below does not trigger pandas' SettingWithCopyWarning.
medi = medi[medi.HSP == 1].copy()
# CODE holds "|"-separated codes: split them and give each code its own row.
medi["CODE"] = medi["CODE"].apply(lambda x: x.split("|"))
medi = medi.explode("CODE")
# NOTE(review): dropna/drop_duplicates run on ALL columns, before the
# two-column selection below, so repeated (ATC, ICD9CM) pairs can remain in
# the output -- confirm whether that is intended.
medi = medi.dropna().drop_duplicates().reset_index(drop=True)
# Select and rename by label rather than by position.
medi = medi[["ATC", "CODE"]].rename(columns={"CODE": "ICD9CM"})
medi.to_csv(
    os.path.join(resource_directory, "processed/ATC_to_ICD9CM.csv"), index=False
)
medi.head()

### DDI

In [None]:
"""https://snap.stanford.edu/biodata/datasets/10017/10017-ChChSe-Decagon.html"""
# Load the polypharmacy side-effect table and drop the leading "# " from the
# first column's header.
ddi = pd.read_csv(
    os.path.join(resource_directory, "raw/DDI/ChChSe-Decagon_polypharmacy.csv")
).rename(columns={"# STITCH 1": "STITCH 1"})
ddi.head()

In [None]:
"""http://sideeffects.embl.de/download/"""
# Build a dict mapping each STITCH id to the list of its ATC codes.
stitch_2_atc = pd.read_csv(
    os.path.join(resource_directory, "raw/DDI/drug_atc.tsv"),
    sep="\t",
    header=None,
    names=["STITCH", "ATC"],
)
# http://stitch.embl.de/download/README
# Rewrite "CID1" as "CID0" so the ids can be looked up from the DDI table's
# STITCH columns. regex=False pins literal matching regardless of the pandas
# version's default.
stitch_2_atc.STITCH = stitch_2_atc.STITCH.str.replace("CID1", "CID0", regex=False)
# De-duplicate each id's ATC codes while keeping first-seen order:
# list(set(...)) yields an order that can differ between runs, which made the
# row order of the exported DDI files non-reproducible.
stitch_2_atc = (
    stitch_2_atc.groupby("STITCH").ATC.agg(lambda x: list(dict.fromkeys(x))).to_dict()
)
stitch_2_atc

In [None]:
# Fraction of DDI rows whose first / second STITCH id is a key of stitch_2_atc.
for column in ("STITCH 1", "STITCH 2"):
    print(ddi[column].isin(stitch_2_atc).mean())

In [None]:
# DDI from paper: GAMENet: Graph Augmented MEmory Networks for Recommending Medication Combination
# Only drug pairs linked to one of the k side effects at the tail of the
# frequency ranking are kept.
k = 40
side_effect_counts = ddi["Polypharmacy Side Effect"].value_counts()
bottom_k_se = side_effect_counts.iloc[-k:].index

ddi_bottom_k = (
    ddi.loc[ddi["Polypharmacy Side Effect"].isin(bottom_k_se), ["STITCH 1", "STITCH 2"]]
    .drop_duplicates()
    .reset_index(drop=True)
)
# Replace each STITCH id by its list of ATC codes (NaN when it has none).
for column in ("STITCH 1", "STITCH 2"):
    ddi_bottom_k[column] = ddi_bottom_k[column].map(stitch_2_atc)

# Drop unmapped pairs, expand the code lists into one row per ATC pair and
# label the two columns as ATC codes.
ddi_bottom_k = (
    ddi_bottom_k.dropna()
    .explode("STITCH 1")
    .explode("STITCH 2")
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={"STITCH 1": "ATC i", "STITCH 2": "ATC j"})
)
ddi_bottom_k.to_csv(
    os.path.join(resource_directory, "processed/DDI_GAMENet.csv"), index=False
)
print(ddi_bottom_k.shape)
ddi_bottom_k.head()

In [None]:
# All distinct drug pairs, regardless of side effect.
ddi_all = (
    ddi.loc[:, ["STITCH 1", "STITCH 2"]].drop_duplicates().reset_index(drop=True)
)
# Replace each STITCH id by its list of ATC codes (NaN when it has none).
for column in ("STITCH 1", "STITCH 2"):
    ddi_all[column] = ddi_all[column].map(stitch_2_atc)

# Drop unmapped pairs, expand the code lists into one row per ATC pair and
# label the two columns as ATC codes.
ddi_all = (
    ddi_all.dropna()
    .explode("STITCH 1")
    .explode("STITCH 2")
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={"STITCH 1": "ATC i", "STITCH 2": "ATC j"})
)
ddi_all.to_csv(os.path.join(resource_directory, "processed/DDI.csv"), index=False)
print(ddi_all.shape)
ddi_all.head()