In [1]:
!pip install pandas



In [2]:
import pandas as pd

In [3]:
# SAB values to keep; the CODEs and CUI-CUIs tables are both filtered down to
# rows whose `SAB` column is in this list.
unique_dccs = [
    'EFO',
    'UBERON',
    'UNIPROTKB',
    'SNOMEDCT_US',
    'EXPBINS',
    'PUBCHEM',
    'GTEXEXP',
    'HGNC',
]

# CodeID -> record dict; filled row by row through add_to_dict.
codes_ = dict()

In [4]:
def add_to_dict(row):
    """Register one CODEs row in the module-level ``codes_`` lookup.

    The entry is stored under ``row["CodeID:ID"]``. Source columns are copied
    under shorter keys, and ``label``, ``label_synonyms`` and ``cuis`` are
    initialised to the placeholders ``""``, ``None`` and ``None``.

    Args:
        row: Mapping-like record (for example the row handed over by
            ``DataFrame.apply(..., axis=1)``) exposing the columns
            ``CodeID:ID``, ``SAB``, ``CODE``, ``value:float``,
            ``lowerbound:float``, ``upperbound:float`` and ``unit``.

    Returns:
        None. The function works purely by mutating ``codes_``.
    """
    # output key -> source column, in the order the keys appear in the record
    column_for = {
        "SAB": "SAB",
        "Code": "CODE",
        "value": "value:float",
        "lowerbound": "lowerbound:float",
        "upperbound": "upperbound:float",
        "unit": "unit",
    }
    entry = {out_key: row[column] for out_key, column in column_for.items()}
    entry.update(label="", label_synonyms=None, cuis=None)
    codes_[row["CodeID:ID"]] = entry

In [5]:
# Load the code table (CODE forced to str), keep only the rows whose SAB is in
# unique_dccs, then register every remaining row in `codes_` via add_to_dict.
codes = (
    pd.read_csv('neo4j/import/CODEs.csv', dtype={'CODE': str})
    .loc[lambda frame: frame['SAB'].isin(unique_dccs)]
)

codes.apply(add_to_dict, axis=1)

6           None
32          None
33          None
44          None
56          None
            ... 
20026065    None
20026066    None
20026067    None
20026068    None
20026393    None
Length: 2368974, dtype: object

In [6]:
def add_end_ids(row):
    """Append the row's ``:START_ID`` to the list kept for its ``:END_ID``.

    Mutates the module-level ``end_ids`` dict, creating an empty list the first
    time a given ``:END_ID`` is seen.

    Args:
        row: Mapping-like record exposing ``:START_ID`` and ``:END_ID``.

    Returns:
        None.
    """
    end_ids.setdefault(row[":END_ID"], []).append(row[":START_ID"])

In [7]:
# :END_ID -> list of :START_ID values, filled by add_end_ids. The enrichment
# loop later looks entries up by CodeID and treats the lists as CUIs.
end_ids = dict()

cui_codes = pd.read_csv("neo4j/import/CUI-CODEs.csv")
cui_codes.apply(add_end_ids, axis=1)

0           None
1           None
2           None
3           None
4           None
            ... 
17253063    None
17253064    None
17253065    None
17253066    None
17253067    None
Length: 17253068, dtype: object

In [8]:
def add_to_cui_sui(row):
    """Append the row's ``:END_ID`` to the list kept for its ``:START_ID``.

    Mutates the module-level ``cui_suis`` dict; a ``:START_ID`` seen for the
    first time gets a fresh list.

    Args:
        row: Mapping-like record exposing ``:START_ID`` and ``:END_ID``.

    Returns:
        None.
    """
    terms = cui_suis.setdefault(row[":START_ID"], [])
    terms.append(row[":END_ID"])

In [9]:
# :START_ID -> list of :END_ID values, filled by add_to_cui_sui. The enrichment
# loop later looks entries up by CUI and uses the lists as synonym terms.
cui_suis = dict()

cui_sui = pd.read_csv("neo4j/import/CUI-SUIs.csv")
cui_sui.apply(add_to_cui_sui, axis=1)

0          None
1          None
2          None
3          None
4          None
           ... 
7953807    None
7953808    None
7953809    None
7953810    None
7953811    None
Length: 7953812, dtype: object

In [10]:
# NOTE(review): `final_dict` is not referenced by any other visible cell; it is
# kept only so the name stays defined.
final_dict = {}

# Enrich every code record with:
#   cuis           - the CUIs mapped to the code, joined with " | "
#   label_synonyms - all terms reachable through those CUIs, joined with " | "
#   label          - the first such term, or "" when there is none
for key, record in codes_.items():
    # A code without any CUI mapping gets empty fields instead of aborting the
    # whole loop with a KeyError.
    cuis = end_ids.get(key, [])

    synonyms = []
    for cui in cuis:
        synonyms.extend(cui_suis.get(cui, []))

    # Assign the joined text outright instead of accumulating with `+=`:
    # re-running this cell then yields the same values rather than appending
    # the same CUIs/synonyms a second time. `str` keeps the join safe if a
    # term was parsed as a non-string value (e.g. NaN).
    record["label_synonyms"] = " | ".join(map(str, synonyms)) if synonyms else None
    record["cuis"] = " | ".join(map(str, cuis)) if cuis else None
    record["label"] = synonyms[0] if synonyms else ""

# Peek at the first record without materialising the full key list.
first = next(iter(codes_))
print(first, codes_[first])


SNOMEDCT_US:28677006 {'SAB': 'SNOMEDCT_US', 'Code': '28677006', 'value': nan, 'lowerbound': nan, 'upperbound': nan, 'unit': nan, 'label': 'Aborted Fetus (structure)', 'label_synonyms': 'Aborted Fetus (structure)', 'cuis': 'C0000781'}


In [11]:
# One row per code record. The CodeID keys are discarded in favour of a plain
# 0..n-1 index; the record fields become the columns.
final_df = (
    pd.DataFrame
    .from_dict(codes_, orient="index")
    .reset_index(drop=True)
)
final_df.head(20)

Unnamed: 0,SAB,Code,value,lowerbound,upperbound,unit,label,label_synonyms,cuis
0,SNOMEDCT_US,28677006,,,,,Aborted Fetus (structure),Aborted Fetus (structure),C0000781
1,SNOMEDCT_US,14423008,,,,,Adhesive bandage,Adhesive bandage,C0001512
2,SNOMEDCT_US,55603005,,,,,Adipose tissue,Adipose tissue,C0001527
3,SNOMEDCT_US,15890002,,,,,Albinism,Albinism,C0001916
4,SNOMEDCT_US,714291009,,,,,aminacrine,aminacrine,C0002503
5,SNOMEDCT_US,53800008,,,,,anthralin,anthralin,C0003166
6,SNOMEDCT_US,1217427007,,,,,Aquacobalamin,Aquacobalamin,C0003663
7,SNOMEDCT_US,96149000,,,,,bambermycins,bambermycins,C0004717
8,SNOMEDCT_US,117497006,,,,,Biogenic Amines,Biogenic Amines,C0005496
9,SNOMEDCT_US,13746004,,,,,Bipolar Disorder,Bipolar Disorder,C0005586


In [12]:
# Sanity check: (rows, columns) of the node table.
final_df.shape

(2368974, 9)

In [13]:
# Write the node table as tab-separated text. `index` is left at its default,
# so the frame's row index is written as an extra, unnamed leading column.
# NOTE(review): confirm consumers of nodes.tsv expect that column; if not, pass index=False.
final_df.to_csv("nodes.tsv", sep = "\t")

# Edges

In [14]:
# Load the CUI-CUI relationship table; the evidence class column is read as text.
cui_cui = pd.read_csv(
    'neo4j/import/CUI-CUIs.csv',
    dtype={"evidence_class:string": str},
)

In [15]:
# NOTE(review): `columns` is not used by any other visible cell; it looks like
# the intended edge-file header (the frame is never renamed to it) - confirm.
columns = ['source', 'target', 'relation', 'source_label', 'target_label', 'evidence', 'SAB']

# Keep only relationships whose SAB is in unique_dccs. The explicit .copy()
# makes the result an independent frame, so adding columns to it later does not
# raise SettingWithCopyWarning.
cui_cui = cui_cui[cui_cui['SAB'].isin(unique_dccs)].copy()

In [16]:
def cui_to_label(row):
    """Store the row's ``:END_ID`` as the label for its ``CUI``.

    Mutates the module-level ``cui_labels`` dict; a later row with the same
    ``CUI`` overwrites the earlier value.

    Args:
        row: Mapping-like record exposing ``CUI`` and ``:END_ID``.

    Returns:
        None.
    """
    label = row[":END_ID"]
    cui_labels[row["CUI"]] = label

In [17]:
# CUI -> label, filled by cui_to_label.
cui_labels = dict()

# Keep only the first row for each CUI, so each CUI gets exactly one label.
code_sui = (
    pd.read_csv("neo4j/import/CODE-SUIs.csv")
    .drop_duplicates(subset=["CUI"])
)
code_sui.apply(cui_to_label, axis=1)

0           None
5           None
6           None
12          None
15          None
            ... 
14464723    None
14464749    None
14465239    None
14465293    None
14465515    None
Length: 7772507, dtype: object

In [18]:
def add_start_end_label(row):
    """Look up the labels for both endpoints of one relationship row.

    Args:
        row: Mapping-like record exposing ``:START_ID`` and ``:END_ID``.

    Returns:
        A two-item list ``[start_label, end_label]`` read from the module-level
        ``cui_labels`` dict; an endpoint without an entry yields ``""``.
    """
    return [
        cui_labels.get(row[":START_ID"], ""),
        cui_labels.get(row[":END_ID"], ""),
    ]

In [19]:
# Attach a label to both endpoints of every relationship. This uses the same
# rule as add_start_end_label (a CUI missing from cui_labels gets ""), but as
# two element-wise Series.map calls instead of a row-wise DataFrame.apply, which
# avoids building a Series object for every row. `assign` returns a new frame,
# so nothing is written into a frame that may be a filtered slice, and
# re-running the cell simply overwrites the two columns.
cui_cui = cui_cui.assign(
    source_label=cui_cui[":START_ID"].map(lambda cui: cui_labels.get(cui, "")),
    target_label=cui_cui[":END_ID"].map(lambda cui: cui_labels.get(cui, "")),
)

In [20]:
# Preview the first ten relationships, including the source/target label columns.
cui_cui.head(10)

Unnamed: 0,:START_ID,:END_ID,:TYPE,SAB,evidence_class:string,source_label,target_label
15,C0190619,C0000726,has_direct_procedure_site,SNOMEDCT_US,,Embolectomy with catheter of mesenteric artery...,"Abdomen, NOS"
21,C0152096,C0000768,isa,SNOMEDCT_US,,Edwards,Birth defect
32,C0198693,C0000833,has_direct_morphology,SNOMEDCT_US,,Incision and drainage of nonobstetrical perine...,Abscess NOS
33,C2960791,C0000833,has_direct_morphology,SNOMEDCT_US,,Incision and drainage of abscess of head and/o...,Abscess NOS
34,C1861233,C0000846,has_associated_morphology,SNOMEDCT_US,,Tetramelic Monodactyly,agenesis
60,C1274717,C0001144,mapped_to,SNOMEDCT_US,,Superficial acne vulgaris (diagnosis),Acne Vulgaris
65,C4543842,C0001168,has_associated_morphology,SNOMEDCT_US,,Acute occlusion of artery of lower limb,Obturation
72,C5688388,C0001309,mapped_to,SNOMEDCT_US,,Perennial allergic conjunctivitis of left eye ...,Acute atopic conjunctivitis
73,C0151588,C0001314,isa,SNOMEDCT_US,,"Hearing Loss, Transitory",Acute diseases
121,C0193714,C0001625,has_direct_procedure_site,SNOMEDCT_US,,Exploration of adrenal gland with biopsy by tr...,Adrenal Glands


In [21]:
# Write the edge table as tab-separated text. `index` is left at its default,
# so the row labels that survived the SAB filter (15, 21, 32, ... in the
# preview) are written as an extra, unnamed leading column.
# NOTE(review): confirm consumers of edges.tsv expect that column; if not, pass index=False.
cui_cui.to_csv("edges.tsv", sep= "\t")