In [3]:
import os
import sys

src_path = os.path.abspath("../../..")
print(src_path)
sys.path.append(src_path)

/Users/zzachw/Projects/13_pyhealth/PyHealth


In [4]:
resource_directory = os.path.abspath("./resource")
resource_directory

'/Users/zzachw/Projects/13_pyhealth/PyHealth/pyhealth/medcode/dev/resource'

In [5]:
import pandas as pd
import re

## ICD9CM

### base

In [4]:
""" https://bioportal.bioontology.org/ontologies/ICD9CM """

raw_data = pd.read_csv(os.path.join(resource_directory, "raw/ICD9CM.csv"))
raw_data["code"] = raw_data["Class ID"].apply(lambda x: x.split("/")[-1])
raw_data["vocab"] = raw_data["Class ID"].apply(lambda x: x.split("/")[-2])
raw_data["parent_code"] = raw_data["Parents"].apply(
    lambda x: x.split("/")[-1] if not pd.isna(x) else ""
)
raw_data["parent_vocab"] = raw_data["Parents"].apply(
    lambda x: x.split("/")[-2] if not pd.isna(x) else ""
)
raw_data["name"] = raw_data["Preferred Label"]
# exclude non icd9 codes
raw_data = raw_data[raw_data.vocab == "ICD9CM"]
# exclude icd9proc codes
# icd9cm codes: 001-999.99, icd9proc: 00.00-99.99
raw_data = raw_data[raw_data.code.apply(lambda x: len(re.split("\.|-", x)[0]) > 2)]
data = raw_data[["code", "parent_code", "name"]]
# exclude non ICD9CM parent code
invalid_parents = list(set(data.parent_code.unique()) - set(data.code.unique()))
data = data.replace({"parent_code": invalid_parents}, "")
data.to_csv(os.path.join(resource_directory, "processed/ICD9CM.csv"), index=False)
data.head()

Unnamed: 0,code,parent_code,name
0,806.11,806.1,Open fracture of C1-C4 level with complete les...
1,642.41,642.4,"Mild or unspecified pre-eclampsia, delivered, ..."
2,647.13,647.1,"Gonorrhea of mother, complicating pregnancy, c..."
3,374.21,374.2,Paralytic lagophthalmos
4,679.0,679.0,Maternal complications from in utero procedure...


### to CCSCM

In [5]:
def normalize_icd9cm(code: str):
    """Normalize ICD9CM code"""
    if code.startswith("E"):
        assert len(code) >= 4
        if len(code) == 4:
            return code
        return code[:4] + "." + code[4:]
    else:
        assert len(code) >= 3
        if len(code) == 3:
            return code
        return code[:3] + "." + code[3:]


normalize_icd9cm("01000")

'010.00'

In [6]:
"""https://www.hcup-us.ahrq.gov/toolssoftware/ccs/ccs.jsp"""

mapping = {}
with open(os.path.join(resource_directory, "raw/$dxref 2015.csv")) as f:
    lines = f.readlines()
    for line in lines[3:]:
        line = line.split(",")
        icd9cm_code = line[0].strip("'").strip()
        ccscm_code = line[1].strip("'").strip()
        assert icd9cm_code not in mapping
        mapping[icd9cm_code] = ccscm_code
data = {"ICD9CM": mapping.keys(), "CCSCM": mapping.values()}
data = pd.DataFrame.from_dict(data)
data.ICD9CM = data.ICD9CM.map(normalize_icd9cm)
data.to_csv(
    os.path.join(resource_directory, "processed/ICD9CM_to_CCSCM.csv"), index=False
)
data.head()

Unnamed: 0,ICD9CM,CCSCM
0,10.0,1
1,10.01,1
2,10.02,1
3,10.03,1
4,10.04,1


## CCSCM

### base

In [127]:
""" https://www.hcup-us.ahrq.gov/toolssoftware/ccs/ccs.jsp """

data = {}
with open(os.path.join(resource_directory, "raw/dxlabel 2015.csv")) as f:
    lines = f.readlines()
    for line in lines[4:]:
        line = line.split(",")
        code = line[0].strip("'").strip()
        name = line[1].strip("'").strip()
        data[code] = {"name": name}
data = (
    pd.DataFrame.from_dict(data, orient="index")
    .reset_index()
    .rename(columns={"index": "code"})
)
data.to_csv(os.path.join(resource_directory, "processed/CCSCM.csv"), index=False)
data.head()

Unnamed: 0,code,name
0,1,Tuberculosis
1,10,Immunizations and screening for infectious dis...
2,100,Acute myocardial infarction
3,101,Coronary atherosclerosis and other heart disease
4,102,Nonspecific chest pain


## ICD9PROC

### base

In [8]:
""" https://bioportal.bioontology.org/ontologies/ICD9CM """

raw_data = pd.read_csv(os.path.join(resource_directory, "raw/ICD9CM.csv"))
raw_data["code"] = raw_data["Class ID"].apply(lambda x: x.split("/")[-1])
raw_data["vocab"] = raw_data["Class ID"].apply(lambda x: x.split("/")[-2])
raw_data["parent_code"] = raw_data["Parents"].apply(
    lambda x: x.split("/")[-1] if not pd.isna(x) else ""
)
raw_data["parent_vocab"] = raw_data["Parents"].apply(
    lambda x: x.split("/")[-2] if not pd.isna(x) else ""
)
raw_data["name"] = raw_data["Preferred Label"]
# exclude non icd9 codes
raw_data = raw_data[raw_data.vocab == "ICD9CM"]
# exclude icd9cm codes
# icd9cm codes: 001-999.99, icd9proc: 00.00-99.99
raw_data = raw_data[raw_data.code.apply(lambda x: len(re.split("\.|-", x)[0]) <= 2)]
data = raw_data[["code", "parent_code", "name"]]
# exclude non icd9proc parent code
invalid_parents = list(set(data.parent_code.unique()) - set(data.code.unique()))
data = data.replace({"parent_code": invalid_parents}, "")
data.to_csv(os.path.join(resource_directory, "processed/ICD9PROC.csv"), index=False)
data.head()

Unnamed: 0,code,parent_code,name
16,94.62,94.6,Alcohol detoxification
17,94.69,94.6,Combined alcohol and drug rehabilitation and d...
18,94.6,94.0,Alcohol and drug rehabilitation and detoxifica...
19,94.61,94.6,Alcohol rehabilitation
20,94.67,94.6,Combined alcohol and drug rehabilitation


### to CCSPROC

In [9]:
def normalize_icd9proc(code: str):
    """Normalize ICD9PROC code"""
    assert len(code) >= 2
    if len(code) == 2:
        return code
    return code[:2] + "." + code[2:]


normalize_icd9proc("6111")

'61.11'

In [10]:
"""https://www.hcup-us.ahrq.gov/toolssoftware/ccs/ccs.jsp"""

mapping = {}
with open(os.path.join(resource_directory, "raw/$prref 2015.csv")) as f:
    lines = f.readlines()
    for line in lines[3:]:
        line = line.split(",")
        icd9proc_code = line[0].strip("'").strip()
        ccsproc_code = line[1].strip("'").strip()
        assert icd9proc_code not in mapping
        mapping[icd9proc_code] = ccsproc_code

data = {"ICD9PROC": mapping.keys(), "CCSPROC": mapping.values()}
data = pd.DataFrame.from_dict(data)
data.ICD9PROC = data.ICD9PROC.map(normalize_icd9proc)
data.to_csv(
    os.path.join(resource_directory, "processed/ICD9PROC_to_CCSPROC.csv"), index=False
)
data.head()

Unnamed: 0,ICD9PROC,CCSPROC
0,1.01,1
1,1.09,1
2,1.21,1
3,1.22,1
4,1.23,1


## CCSPROC

### base

In [129]:
""" https://www.hcup-us.ahrq.gov/toolssoftware/ccs/ccs.jsp """

data = {}
with open(os.path.join(resource_directory, "raw/prlabel 2014.csv")) as f:
    lines = f.readlines()
    for line in lines[4:]:
        line = line.split(",")
        code = line[0].strip('"').strip()
        name = line[1].strip('"').strip()
        data[code] = {"name": name}
data = (
    pd.DataFrame.from_dict(data, orient="index")
    .reset_index()
    .rename(columns={"index": "code"})
)
data.to_csv(os.path.join(resource_directory, "processed/CCSPROC.csv"), index=False)
data.head()

Unnamed: 0,code,name
0,1,"Incision and excision of CNS"""
1,10,"Thyroidectomy; partial or complete"""
2,100,Endoscopy and endoscopic biopsy of the urinary...
3,101,Transurethral excision; drainage; or removal u...
4,102,"Ureteral catheterization"""


## ICD10CM

### base

In [12]:
""" https://bioportal.bioontology.org/ontologies/ICD10CM """

raw_data = pd.read_csv(os.path.join(resource_directory, "raw/ICD10CM.csv"))
raw_data["code"] = raw_data["Class ID"].apply(lambda x: x.split("/")[-1])
raw_data["vocab"] = raw_data["Class ID"].apply(lambda x: x.split("/")[-2])
raw_data["parent_code"] = raw_data["Parents"].apply(
    lambda x: x.split("/")[-1] if not pd.isna(x) else ""
)
raw_data["parent_vocab"] = raw_data["Parents"].apply(
    lambda x: x.split("/")[-2] if not pd.isna(x) else ""
)
raw_data["name"] = raw_data["Preferred Label"]
# # exclude non icd10cm codes
raw_data = raw_data[raw_data.vocab == "ICD10CM"]
data = raw_data[["code", "parent_code", "name"]]
# exclude non icd10cm parent code
invalid_parents = list(set(data.parent_code.unique()) - set(data.code.unique()))
data = data.replace({"parent_code": invalid_parents}, "")
data.to_csv(os.path.join(resource_directory, "processed/ICD10CM.csv"), index=False)
data.head()

Unnamed: 0,code,parent_code,name
0,Z62.811,Z62.81,Personal history of psychological abuse in chi...
1,Z01.81,Z01.8,Encounter for preprocedural examinations
2,Z01.89,Z01.8,Encounter for other specified special examinat...
3,Z01.8,Z01,Encounter for other specified special examinat...
4,Z01.82,Z01.8,Encounter for allergy testing


### to CCSCM

In [13]:
def normalize_icd10cm(code: str):
    """Normalize ICD10CM code"""
    assert len(code) >= 3
    if len(code) == 3:
        return code
    return code[:3] + "." + code[3:]


normalize_icd10cm("Y9284")

'Y92.84'

In [14]:
"""https://www.hcup-us.ahrq.gov/toolssoftware/ccsr/ccsr_archive.jsp#ccsr"""

mapping = {}
with open(os.path.join(resource_directory, "raw/ccs_dx_icd10cm_2019_1.csv")) as f:
    lines = f.readlines()
    for line in lines[1:]:
        line = line.split(",")
        icd10cm_code = line[0].strip("'").strip()
        ccscm_code = line[1].strip("'").strip()
        assert icd10cm_code not in mapping
        mapping[icd10cm_code] = ccscm_code
data = {"ICD10CM": mapping.keys(), "CCSCM": mapping.values()}
data = pd.DataFrame.from_dict(data)
data.ICD10CM = data.ICD10CM.map(normalize_icd10cm)
data.to_csv(
    os.path.join(resource_directory, "processed/ICD10CM_to_CCSCM.csv"), index=False
)
data.head()

Unnamed: 0,ICD10CM,CCSCM
0,A15.0,1
1,A15.4,1
2,A15.5,1
3,A15.6,1
4,A15.7,1


## ICD10PROC

### base

In [15]:
""" https://bioportal.bioontology.org/ontologies/ICD10PCS """

raw_data = pd.read_csv(os.path.join(resource_directory, "raw/ICD10PCS.csv"))
raw_data["code"] = raw_data["Class ID"].apply(lambda x: x.split("/")[-1])
raw_data["vocab"] = raw_data["Class ID"].apply(lambda x: x.split("/")[-2])
raw_data["parent_code"] = raw_data["Parents"].apply(
    lambda x: x.split("/")[-1] if not pd.isna(x) else ""
)
raw_data["parent_vocab"] = raw_data["Parents"].apply(
    lambda x: x.split("/")[-2] if not pd.isna(x) else ""
)
raw_data["name"] = raw_data["Preferred Label"]
# # exclude non icd10proc codes
raw_data = raw_data[raw_data.vocab == "ICD10PCS"]
data = raw_data[["code", "parent_code", "name"]]
# exclude non icd10proc parent code
invalid_parents = list(set(data.parent_code.unique()) - set(data.code.unique()))
data = data.replace({"parent_code": invalid_parents}, "")
data.to_csv(os.path.join(resource_directory, "processed/ICD10PROC.csv"), index=False)
data.head()

Unnamed: 0,code,parent_code,name
0,0Q894Z,0Q894,Medical and Surgical @ Lower Bones @ Division ...
1,0Q894ZZ,0Q894Z,"Division of Left Femoral Shaft, Percutaneous E..."
2,005W3Z,005W3,Medical and Surgical @ Central Nervous System ...
3,005W3ZZ,005W3Z,"Destruction of Cervical Spinal Cord, Percutane..."
4,2W0MX3,2W0MX,Placement @ Anatomical Regions @ Change @ Lowe...


### to CCSPROC

In [16]:
"""https://www.hcup-us.ahrq.gov/toolssoftware/ccs10/ccs10.jsp"""

mapping = {}
with open(os.path.join(resource_directory, "raw/ccs_pr_icd10pcs_2019_1.csv")) as f:
    lines = f.readlines()
    for line in lines[1:]:
        line = line.split(",")
        icd10proc_code = line[0].strip("'").strip()
        ccsproc_code = line[1].strip("'").strip()
        assert icd10proc_code not in mapping
        mapping[icd10proc_code] = ccsproc_code
data = {"ICD10PROC": mapping.keys(), "CCSPROC": mapping.values()}
data = pd.DataFrame.from_dict(data)
data.to_csv(
    os.path.join(resource_directory, "processed/ICD10PROC_to_CCSPROC.csv"), index=False
)
data.head()

Unnamed: 0,ICD10PROC,CCSPROC
0,00800ZZ,1
1,00803ZZ,1
2,00804ZZ,1
3,00870ZZ,1
4,00873ZZ,1


## NDC

Download NDC, RxNorm, ATC from https://athena.ohdsi.org/vocabulary/list

In [17]:
foldername = "NDC_RxNorm_ATC"

In [18]:
concept = pd.read_csv(
    os.path.join(resource_directory, f"raw/{foldername}/CONCEPT.csv"),
    dtype={
        "concept_id": str,
        "vocabulary_id": str,
        "concept_class_id": str,
        "concept_code": str,
    },
    sep="\t",
)
concept.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
0,21600001,ALIMENTARY TRACT AND METABOLISM,Drug,ATC,ATC 1st,C,A,19700101,20991231,
1,21600002,STOMATOLOGICAL PREPARATIONS,Drug,ATC,ATC 2nd,C,A01,19700101,20991231,
2,21600003,STOMATOLOGICAL PREPARATIONS,Drug,ATC,ATC 3rd,C,A01A,19700101,20991231,
3,21600004,Caries prophylactic agents,Drug,ATC,ATC 4th,C,A01AA,19700101,20991231,
4,21600005,"sodium fluoride; oral, local oral (caries prop...",Drug,ATC,ATC 5th,C,A01AA01,19700101,20991231,


In [19]:
concept_relationship = pd.read_csv(
    os.path.join(resource_directory, f"raw/{foldername}/CONCEPT_RELATIONSHIP.csv"),
    dtype={"concept_id_1": str, "concept_id_2": str, "relationship_id": str},
    sep="\t",
)
concept_relationship

Unnamed: 0,concept_id_1,concept_id_2,relationship_id,valid_start_date,valid_end_date,invalid_reason
0,45093654,1154196,Maps to,19700101,20991231,
1,44923643,19035209,Maps to,19700101,20991231,
2,45144409,19039121,Maps to,19700101,20991231,
3,44849505,19133296,Maps to,20090101,20991231,
4,19058667,19047727,Brand name of,20161007,20991231,
...,...,...,...,...,...,...
4773359,40224172,827884,Mapped from,20210509,20991231,
4773360,40224172,829918,Mapped from,20210509,20991231,
4773361,40224166,829917,Mapped from,20210509,20991231,
4773362,40224166,827886,Mapped from,20210509,20991231,


In [20]:
concept_ancestor = pd.read_csv(
    os.path.join(resource_directory, f"raw/{foldername}/CONCEPT_ANCESTOR.csv"),
    dtype={"ancestor_concept_id": str, "descendant_concept_id": str},
    sep="\t",
)
concept_ancestor

Unnamed: 0,ancestor_concept_id,descendant_concept_id,min_levels_of_separation,max_levels_of_separation
0,742267,40172924,2,3
1,703547,40090686,1,2
2,723013,19117335,1,1
3,561425,40233203,1,1
4,711584,1593324,2,3
...,...,...,...,...
3072534,45893526,46221584,2,3
3072535,45893526,46287704,1,1
3072536,45893526,46287705,1,2
3072537,45893526,46287706,2,3


### base

In [21]:
data = concept[concept.vocabulary_id == "NDC"]
data["code"] = data.concept_code
data["name"] = data.concept_name
data = data[["code", "name"]]
data = data.drop_duplicates().dropna()
data.to_csv(os.path.join(resource_directory, "processed/NDC.csv"), index=False)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["code"] = data.concept_code
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["name"] = data.concept_name


Unnamed: 0,code,name
8536,1,ferric pyrophosphate citrate 5.44 MG/ML Inject...
8537,2000001,Ergocalciferol 50000 UNT Oral Capsule [Deltalin]
8538,2000002,Ergocalciferol 50000 UNT Oral Capsule [Deltalin]
8539,2001402,Flurandrenolide 0.004 MG/SQCM Medicated Tape [...
8540,2001801,Capreomycin 500 MG/ML Injectable Solution [Cap...


### to RxNorm

In [22]:
concept_ndc = concept[concept.vocabulary_id == "NDC"]
concept_relationship_maps_to = concept_relationship[
    concept_relationship.relationship_id == "Maps to"
]
concept_relationship_maps_to = concept_relationship_maps_to[
    pd.isna(concept_relationship_maps_to.invalid_reason)
]
concept_rxnorm = concept[concept.vocabulary_id == "RxNorm"]

ndc_rxnorm = concept_ndc.merge(
    concept_relationship_maps_to,
    left_on="concept_id",
    right_on="concept_id_1",
    how="inner",
    suffixes=("_ndc", "_r"),
)
ndc_rxnorm = ndc_rxnorm.merge(
    concept_rxnorm,
    left_on="concept_id_2",
    right_on="concept_id",
    how="inner",
    suffixes=("_ndc", "_rxnorm"),
)
ndc_rxnorm = ndc_rxnorm[
    [
        "concept_id_ndc",
        "concept_name_ndc",
        "concept_code_ndc",
        "concept_id_rxnorm",
        "concept_name_rxnorm",
        "concept_code_rxnorm",
    ]
]

data = pd.DataFrame()
data["NDC"] = ndc_rxnorm.concept_code_ndc
data["RxNorm"] = ndc_rxnorm.concept_code_rxnorm
data.to_csv(
    os.path.join(resource_directory, "processed/NDC_to_RxNorm.csv"), index=False
)
data.head()

Unnamed: 0,NDC,RxNorm
0,2000001,1367414
1,2000002,1367414
2,2001402,797697
3,235870,797697
4,23587024,797697


### to ATC

In [23]:
# first convert NDC to RxNorm
concept_ndc = concept[concept.vocabulary_id == "NDC"]
concept_relationship_maps_to = concept_relationship[
    concept_relationship.relationship_id == "Maps to"
]
concept_relationship_maps_to = concept_relationship_maps_to[
    pd.isna(concept_relationship_maps_to.invalid_reason)
]
concept_rxnorm = concept[concept.vocabulary_id == "RxNorm"]

ndc_rxnorm = concept_ndc.merge(
    concept_relationship_maps_to,
    left_on="concept_id",
    right_on="concept_id_1",
    how="inner",
    suffixes=("_ndc", "_r"),
)
ndc_rxnorm = ndc_rxnorm.merge(
    concept_rxnorm,
    left_on="concept_id_2",
    right_on="concept_id",
    how="inner",
    suffixes=("_ndc", "_rxnorm"),
)
ndc_rxnorm = ndc_rxnorm[
    [
        "concept_id_ndc",
        "concept_name_ndc",
        "concept_code_ndc",
        "concept_id_rxnorm",
        "concept_name_rxnorm",
        "concept_code_rxnorm",
    ]
]

# then convert RxNorm to ATC5
concept_atc5 = concept[concept.vocabulary_id == "ATC"]
concept_atc5 = concept_atc5[concept_atc5.concept_class_id == "ATC 5th"]

ndc_rxnorm_atc5 = ndc_rxnorm.merge(
    concept_ancestor,
    left_on="concept_id_rxnorm",
    right_on="descendant_concept_id",
    how="inner",
    suffixes=("_rxnorm", "_r"),
)
ndc_rxnorm_atc5 = ndc_rxnorm_atc5.merge(
    concept_atc5,
    left_on="ancestor_concept_id",
    right_on="concept_id",
    how="inner",
    suffixes=("_rxnorm", "_atc"),
)
ndc_rxnorm_atc5 = ndc_rxnorm_atc5.rename(
    columns={
        "concept_id": "concept_id_atc5",
        "concept_name": "concept_name_atc5",
        "concept_code": "concept_code_atc5",
    }
)
ndc_rxnorm_atc5 = ndc_rxnorm_atc5[
    [
        "concept_id_ndc",
        "concept_name_ndc",
        "concept_code_ndc",
        "concept_id_rxnorm",
        "concept_name_rxnorm",
        "concept_code_rxnorm",
        "concept_id_atc5",
        "concept_name_atc5",
        "concept_code_atc5",
    ]
]

data = pd.DataFrame()
data["NDC"] = ndc_rxnorm_atc5.concept_code_ndc
data["ATC"] = ndc_rxnorm_atc5.concept_code_atc5
data.to_csv(os.path.join(resource_directory, "processed/NDC_to_ATC.csv"), index=False)
data.head()

Unnamed: 0,NDC,ATC
0,2000001,A11CC01
1,2000002,A11CC01
2,2026002,A11CC01
3,115014000,A11CC01
4,115014001,A11CC01


## RxNorm

Download NDC, RxNorm, ATC from https://athena.ohdsi.org/vocabulary/list

In [24]:
foldername = "NDC_RxNorm_ATC"

In [25]:
concept = pd.read_csv(
    os.path.join(resource_directory, f"raw/{foldername}/CONCEPT.csv"),
    dtype={
        "concept_id": str,
        "vocabulary_id": str,
        "concept_class_id": str,
        "concept_code": str,
    },
    sep="\t",
)
concept.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
0,21600001,ALIMENTARY TRACT AND METABOLISM,Drug,ATC,ATC 1st,C,A,19700101,20991231,
1,21600002,STOMATOLOGICAL PREPARATIONS,Drug,ATC,ATC 2nd,C,A01,19700101,20991231,
2,21600003,STOMATOLOGICAL PREPARATIONS,Drug,ATC,ATC 3rd,C,A01A,19700101,20991231,
3,21600004,Caries prophylactic agents,Drug,ATC,ATC 4th,C,A01AA,19700101,20991231,
4,21600005,"sodium fluoride; oral, local oral (caries prop...",Drug,ATC,ATC 5th,C,A01AA01,19700101,20991231,


In [26]:
concept_relationship = pd.read_csv(
    os.path.join(resource_directory, f"raw/{foldername}/CONCEPT_RELATIONSHIP.csv"),
    dtype={"concept_id_1": str, "concept_id_2": str, "relationship_id": str},
    sep="\t",
)
concept_relationship

Unnamed: 0,concept_id_1,concept_id_2,relationship_id,valid_start_date,valid_end_date,invalid_reason
0,45093654,1154196,Maps to,19700101,20991231,
1,44923643,19035209,Maps to,19700101,20991231,
2,45144409,19039121,Maps to,19700101,20991231,
3,44849505,19133296,Maps to,20090101,20991231,
4,19058667,19047727,Brand name of,20161007,20991231,
...,...,...,...,...,...,...
4773359,40224172,827884,Mapped from,20210509,20991231,
4773360,40224172,829918,Mapped from,20210509,20991231,
4773361,40224166,829917,Mapped from,20210509,20991231,
4773362,40224166,827886,Mapped from,20210509,20991231,


In [27]:
concept_ancestor = pd.read_csv(
    os.path.join(resource_directory, f"raw/{foldername}/CONCEPT_ANCESTOR.csv"),
    dtype={"ancestor_concept_id": str, "descendant_concept_id": str},
    sep="\t",
)
concept_ancestor

Unnamed: 0,ancestor_concept_id,descendant_concept_id,min_levels_of_separation,max_levels_of_separation
0,742267,40172924,2,3
1,703547,40090686,1,2
2,723013,19117335,1,1
3,561425,40233203,1,1
4,711584,1593324,2,3
...,...,...,...,...
3072534,45893526,46221584,2,3
3072535,45893526,46287704,1,1
3072536,45893526,46287705,1,2
3072537,45893526,46287706,2,3


### base

In [28]:
data = concept[concept.vocabulary_id == "RxNorm"]
data["code"] = data.concept_code
data["name"] = data.concept_name
data = data[["code", "name"]]
data = data.drop_duplicates().dropna()
data.to_csv(os.path.join(resource_directory, "processed/RxNorm.csv"), index=False)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["code"] = data.concept_code
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["name"] = data.concept_name


Unnamed: 0,code,name
1299328,1000000,amlodipine 5 MG / hydrochlorothiazide 12.5 MG ...
1299329,1000001,amlodipine 5 MG / hydrochlorothiazide 25 MG / ...
1299330,1000002,Tribenzor 40/5/25
1299331,1000003,amlodipine 5 MG / hydrochlorothiazide 25 MG / ...
1299332,1000004,Amlodipine / Hydrochlorothiazide / Olmesartan ...


### to ATC

In [29]:
concept_rxnorm = concept[concept.vocabulary_id == "RxNorm"]
concept_atc5 = concept[concept.vocabulary_id == "ATC"]
concept_atc5 = concept_atc5[concept_atc5.concept_class_id == "ATC 5th"]

rxnorm_atc5 = concept_rxnorm.merge(
    concept_ancestor,
    left_on="concept_id",
    right_on="descendant_concept_id",
    how="inner",
    suffixes=("_rxnorm", "_r"),
)
rxnorm_atc5 = rxnorm_atc5.merge(
    concept_atc5,
    left_on="ancestor_concept_id",
    right_on="concept_id",
    how="inner",
    suffixes=("_rxnorm", "_atc"),
)
rxnorm_atc5 = rxnorm_atc5[
    [
        "concept_id_rxnorm",
        "concept_name_rxnorm",
        "concept_code_rxnorm",
        "concept_id_atc",
        "concept_name_atc",
        "concept_code_atc",
    ]
]

data = pd.DataFrame()
data["RxNorm"] = rxnorm_atc5.concept_code_rxnorm
data["ATC"] = rxnorm_atc5.concept_code_atc
data.to_csv(
    os.path.join(resource_directory, "processed/RxNorm_to_ATC.csv"), index=False
)
data.head()

Unnamed: 0,RxNorm,ATC
0,1000000,C09DA08
1,1000001,C09DA08
2,1000003,C09DA08
3,1000005,C09DA08
4,10763,C09DA08


## ATC

### base

In [30]:
def get_atc_parent(code: str):
    """Get parent code of ATC code"""
    if len(code) == 7:
        return code[:5]
    elif len(code) == 5:
        return code[:4]
    elif len(code) == 4:
        return code[:3]
    elif len(code) == 3:
        return code[:1]
    else:
        return ""


get_atc_parent("V10XA53")

'V10XA'

In [31]:
"""https://bioportal.bioontology.org/ontologies/ATC"""

atc = pd.read_csv(os.path.join(resource_directory, "raw/ATC.csv"))
atc = atc[["Class ID", "Preferred Label", "ATC LEVEL"]]
atc["Class ID"] = atc["Class ID"].apply(lambda x: x.split("/")[-1])
atc = atc.dropna()
atc = atc.drop_duplicates()
atc["parent_code"] = atc["Class ID"].map(get_atc_parent)
atc = atc.sort_values(by=["ATC LEVEL", "Class ID"])
atc.columns = ["code", "name", "level", "parent_code"]
atc = atc[["code", "parent_code", "name", "level"]]
atc.head()

Unnamed: 0,code,parent_code,name,level
5732,A,,ALIMENTARY TRACT AND METABOLISM DRUGS,1.0
5993,B,,BLOOD AND BLOOD FORMING ORGAN DRUGS,1.0
5477,C,,CARDIOVASCULAR SYSTEM DRUGS,1.0
2143,D,,DERMATOLOGICALS,1.0
6393,G,,GENITO URINARY SYSTEM AND SEX HORMONES,1.0


additional info

In [32]:
"""https://go.drugbank.com/releases/latest"""

drugbank = pd.read_csv(
    os.path.join(resource_directory, "raw/drugs_info_5_1_8.csv").replace("\\", "/")
)
drugbank = drugbank.fillna("")
drugbank.atc_codes = drugbank.atc_codes.apply(lambda x: x.split("|"))
drugbank = drugbank.explode("atc_codes")
drugbank = drugbank[["drugbank_id", "description", "indication", "atc_codes", "smiles"]]
atc = atc.merge(drugbank, left_on="code", right_on="atc_codes", how="left")
atc = atc[
    [
        "code",
        "parent_code",
        "name",
        "level",
        "description",
        "indication",
        "smiles",
        "drugbank_id",
    ]
]
atc.to_csv(os.path.join(resource_directory, "processed/ATC.csv"), index=False)
atc.head()

Unnamed: 0,code,parent_code,name,level,description,indication,smiles,drugbank_id
0,A,,ALIMENTARY TRACT AND METABOLISM DRUGS,1.0,,,,
1,B,,BLOOD AND BLOOD FORMING ORGAN DRUGS,1.0,,,,
2,C,,CARDIOVASCULAR SYSTEM DRUGS,1.0,,,,
3,D,,DERMATOLOGICALS,1.0,,,,
4,G,,GENITO URINARY SYSTEM AND SEX HORMONES,1.0,,,,


### to ICD9CM

In [33]:
"""https://www.vumc.org/cpm/cpm-blog/medi-ensemble-medication-indication-resource-0"""

medi = pd.read_csv(os.path.join(resource_directory, "raw/MEDI_11242015.csv"))
medi = medi[medi.HSP == 1]
medi.CODE = medi.CODE.apply(lambda x: x.split("|"))
medi = medi.explode("CODE")
medi = medi.dropna().drop_duplicates().reset_index(drop=True)
medi = medi[["ATC", "CODE"]]
medi.columns = ["ATC", "ICD9CM"]
medi.to_csv(
    os.path.join(resource_directory, "processed/ATC_to_ICD9CM.csv"), index=False
)
medi.head()

Unnamed: 0,ATC,ICD9CM
0,R05CB05,595.0
1,R05CB05,977.9
2,R05CB05,595.9
3,R05CB05,459.0
4,V03AF01,595.0


### DDI

In [9]:
"""https://snap.stanford.edu/biodata/datasets/10017/10017-ChChSe-Decagon.html"""
ddi = pd.read_csv(
    os.path.join(resource_directory, "raw/DDI/ChChSe-Decagon_polypharmacy.csv")
)
ddi = ddi.rename(columns={"# STITCH 1": "STITCH 1"})
ddi.head()

Unnamed: 0,STITCH 1,STITCH 2,Polypharmacy Side Effect,Side Effect Name
0,CID000002173,CID000003345,C0151714,hypermagnesemia
1,CID000002173,CID000003345,C0035344,retinopathy of prematurity
2,CID000002173,CID000003345,C0004144,atelectasis
3,CID000002173,CID000003345,C0002063,alkalosis
4,CID000002173,CID000003345,C0004604,Back Ache


In [10]:
"""http://sideeffects.embl.de/download/"""
stitch_2_atc = pd.read_csv(
    os.path.join(resource_directory, "raw/DDI/drug_atc.tsv"),
    sep="\t",
    header=None,
    names=["STITCH", "ATC"],
)
# http://stitch.embl.de/download/README
stitch_2_atc.STITCH = stitch_2_atc.STITCH.str.replace("CID1", "CID0")
stitch_2_atc = stitch_2_atc.groupby("STITCH").ATC.agg(lambda x: list(set(x))).to_dict()
stitch_2_atc

{'CID000000085': ['A16AA01'],
 'CID000000119': ['N03AG03', 'L03AA03'],
 'CID000000137': ['L01XD04'],
 'CID000000143': ['V03AF06', 'V03AF04', 'V03AF03'],
 'CID000000158': ['G02AD02'],
 'CID000000159': ['B01AC09'],
 'CID000000160': ['G02AD01'],
 'CID000000175': ['S02AA10', 'G01AD02'],
 'CID000000187': ['S01EB09'],
 'CID000000191': ['C01EB10', 'J05AB03', 'S01AD06'],
 'CID000000206': ['B05CB01',
  'B05CX01',
  'B05XA03',
  'A12CA01',
  'B05XA07',
  'B05XA01',
  'A12BA01',
  'A12AA07',
  'G04BA03',
  'V04CE01',
  'V06DC01',
  'V04CA02'],
 'CID000000214': ['G04BE01', 'C01EA01'],
 'CID000000232': ['B05XB01'],
 'CID000000247': ['A16AA06', 'A09AB02'],
 'CID000000271': ['A07XA03', 'A12AA20'],
 'CID000000298': ['S03AA08',
  'G01AA05',
  'J01BA01',
  'S01AA01',
  'D10AF03',
  'D06AX02',
  'S02AA01'],
 'CID000000303': ['A05AA03'],
 'CID000000311': ['A09AB04'],
 'CID000000312': ['B05XA13', 'A09AB03'],
 'CID000000338': ['S01BC08', 'N02BA12', 'N02BA04', 'D01AE12'],
 'CID000000401': ['J04AB01'],
 'CID0

In [11]:
print(ddi["STITCH 1"].isin(stitch_2_atc).mean())
print(ddi["STITCH 2"].isin(stitch_2_atc).mean())

0.9566012344279667
0.8617562842500851


In [12]:
# DDI from paepr: GAMENet: Graph Augmented MEmory Networks for Recommending Medication Combination
k = 40
bottom_k_se = ddi["Polypharmacy Side Effect"].value_counts().iloc[-k:].index
ddi_bottom_k = ddi[ddi["Polypharmacy Side Effect"].isin(bottom_k_se)]
ddi_bottom_k = (
    ddi_bottom_k[["STITCH 1", "STITCH 2"]].drop_duplicates().reset_index(drop=True)
)
ddi_bottom_k["STITCH 1"] = ddi_bottom_k["STITCH 1"].map(stitch_2_atc)
ddi_bottom_k["STITCH 2"] = ddi_bottom_k["STITCH 2"].map(stitch_2_atc)
ddi_bottom_k = (
    ddi_bottom_k.dropna()
    .explode("STITCH 1")
    .explode("STITCH 2")
    .drop_duplicates()
    .reset_index(drop=True)
)
ddi_bottom_k.columns = ["ATC i", "ATC j"]
ddi_bottom_k.to_csv(
    os.path.join(resource_directory, "processed/DDI_GAMENet.csv"), index=False
)
print(ddi_bottom_k.shape)
ddi_bottom_k.head()

(1011, 2)


Unnamed: 0,ATC i,ATC j
0,N03AE01,C03AA03
1,J05AG03,J05AF05
2,N01AX10,J02AC03
3,J02AC02,N01AX10
4,N05AD01,N06AX16


In [13]:
ddi_all = ddi[["STITCH 1", "STITCH 2"]].drop_duplicates().reset_index(drop=True)
ddi_all["STITCH 1"] = ddi_all["STITCH 1"].map(stitch_2_atc)
ddi_all["STITCH 2"] = ddi_all["STITCH 2"].map(stitch_2_atc)
ddi_all = (
    ddi_all.dropna()
    .explode("STITCH 1")
    .explode("STITCH 2")
    .drop_duplicates()
    .reset_index(drop=True)
)
ddi_all.columns = ["ATC i", "ATC j"]
ddi_all.to_csv(os.path.join(resource_directory, "processed/DDI.csv"), index=False)
print(ddi_all.shape)
ddi_all.head()

(132662, 2)


Unnamed: 0,ATC i,ATC j
0,S01AA19,N01AH01
1,S01AA19,N02AB03
2,J01CA01,N01AH01
3,J01CA01,N02AB03
4,N01AB08,R03DA05
