In [1]:
import csv
import os

# Add Disease Hierarchy
## Generate disease node & disease hierarchy edge

In [2]:
# Build the disease nodes and the parent -> child ("subclass") edges from the
# tab-indented hierarchy export, then write both tables to CSV under output/.
# Each usable line has the shape "<tabs>Name (Code)"; the number of leading
# tabs is the depth of the node in the tree.

# Read the file
with open("data/evs.nci.nih.gov_ftp1_NCI_Thesaurus_Neoplasm_Neoplasm_Core_Hierarchy_By_Neoplastic_Status.txt", "r") as file:
    lines = file.readlines()

# Data structures
disease_data = []            # one record per unique (name, code) pair
disease_subclass_data = []   # one record per unique parent -> child pair
node_stack = []              # ancestors of the line being processed
combo_to_id = {}             # (name, code) -> disease_id
seen_edges = set()           # (higher_disease, lower_disease) pairs already emitted

# Processing the tree structure
for line in lines:
    stripped_line = line.lstrip('\t')  # Remove leading tabs
    current_level = len(line) - len(stripped_line)  # Number of leading tabs

    # Split on the LAST " (" so names that themselves contain parentheses
    # stay intact; lines without that separator (e.g. blank lines) are skipped.
    parts = stripped_line.rsplit(" (", 1)
    if len(parts) != 2:
        continue
    name, code = parts
    code = code.replace(")", "").strip()  # drops the closing ")" and the newline

    # Reuse the id when this (name, code) pair was already seen; otherwise
    # allocate the next sequential id (ids start at 1).
    disease_id = combo_to_id.get((name, code))
    if disease_id is None:
        disease_id = len(disease_data) + 1
        combo_to_id[(name, code)] = disease_id
        disease_data.append({
            "disease_id": disease_id,
            "disease_name": name,
            "disease_nci_code": code
        })

    # Trim the stack to the current depth so that its last entry (if any)
    # is the parent of this line.
    del node_stack[current_level:]

    # If this isn't a top-level node, add subclass data. The same pair can be
    # reached again when a node is listed more than once in the file, so each
    # (parent, child) pair is recorded only the first time it is seen.
    if node_stack:
        edge = (node_stack[-1]["disease_id"], disease_id)
        if edge not in seen_edges:
            seen_edges.add(edge)
            disease_subclass_data.append({
                "higher_disease": edge[0],
                "lower_disease": edge[1]
            })

    # Push this node onto the stack
    node_stack.append({
        "disease_name": name,
        "disease_nci_code": code,
        "disease_id": disease_id
    })

# Make sure the output folder exists before the first write.
os.makedirs("output", exist_ok=True)

# Write the CSV files
with open("output/disease_node.csv", "w", newline='') as csvfile:
    fieldnames = ["disease_id", "disease_name", "disease_nci_code"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(disease_data)

with open("output/disease_subclass_edge.csv", "w", newline='') as csvfile:
    fieldnames = ["higher_disease", "lower_disease"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(disease_subclass_data)

# Molecular Edge

In [3]:
# Build the molecular nodes and the disease -> molecular edges from the
# relationship export, adding any disease that is referenced there but is
# not yet present in output/disease_node.csv.

# Placeholder for attributes that are unknown for a newly added disease;
# kept identical to the placeholder used when merging Neoplasm_Core.csv.
MISSING_VALUE = "N/A"

# Step 1: Read and preprocess the Neoplasm_Core_Rels_NCIt_Molecular.csv file
with open("data/Neoplasm_Core_Rels_NCIt_Molecular.csv", "r") as file:
    reader = csv.DictReader(file)
    relationships = [row for row in reader]

# Rename columns so the disease side and the molecular side are unambiguous
for rel in relationships:
    rel["Disease Code"] = rel.pop("Code")
    rel["Disease Preferred Term"] = rel.pop("Preferred Term")
    rel["Molecular Code"] = rel.pop("Code2")
    rel["Molecular Term"] = rel.pop("Preferred Term2")

# Step 2: Extract unique molecular nodes
molecular_nodes = {}       # molecular code -> term (first occurrence wins)
molecular_id_lookup = {}   # molecular code -> molecular_id
for rel in relationships:
    code = rel["Molecular Code"]
    if code not in molecular_nodes:
        molecular_nodes[code] = rel["Molecular Term"]
        # Number nodes 1..N in order of first appearance, the same way
        # disease ids are assigned, instead of reusing the row index of the
        # relationship (which leaves gaps between ids).
        molecular_id_lookup[code] = len(molecular_id_lookup) + 1

# Save to molecular_node.csv
with open("output/molecular_node.csv", "w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["molecular_id", "molecular_nci_code", "molecular_description"])
    for code, description in molecular_nodes.items():
        writer.writerow([molecular_id_lookup[code], code, description])

# Step 3: Create the edges and update disease_node.csv if necessary
disease_node_data = []
disease_id_lookup = {}   # disease code -> disease_id
with open("output/disease_node.csv", "r") as file:
    reader = csv.DictReader(file)
    for row in reader:
        disease_node_data.append(row)
        disease_id_lookup[row["disease_nci_code"]] = row["disease_id"]

edges = []
for rel in relationships:
    disease_code = rel["Disease Code"]
    if disease_code not in disease_id_lookup:
        # Add missing disease to disease_node.csv
        new_id = len(disease_node_data) + 1
        disease_node_data.append({
            "disease_id": new_id,
            "disease_name": rel["Disease Preferred Term"],
            "disease_nci_code": disease_code,
            "Synonyms": MISSING_VALUE,
            "Definition": MISSING_VALUE,
            "Neoplastic Status": MISSING_VALUE
        })
        disease_id_lookup[disease_code] = new_id

    edges.append({
        "disease_id": disease_id_lookup[disease_code],
        "molecular_id": molecular_id_lookup[rel["Molecular Code"]],
        "Relationship": rel["Relationship"]
    })

# Save the updated disease_node.csv. Rows read back from the file that lack
# the three extra columns are written with empty cells for them (DictWriter's
# default fill value).
with open("output/disease_node.csv", "w", newline='') as csvfile:
    fieldnames = ["disease_id", "disease_name", "disease_nci_code", "Synonyms", "Definition", "Neoplastic Status"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(disease_node_data)

# Save the edges to disease_molecular_edge.csv
with open("output/disease_molecular_edge.csv", "w", newline='') as csvfile:
    fieldnames = ["disease_id", "molecular_id", "Relationship"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(edges)


# Merge hierarchy with more info
## Synonyms, Definition, Neoplastic Status

In [4]:
# Enrich every row of output/disease_node.csv with the Synonyms, Definition
# and Neoplastic Status columns from data/Neoplasm_Core.csv, matched on the
# NCI code, and write the result back to the same file.

# Step 1: Load both inputs.
# Neoplasm_Core.csv is indexed by its "Code" column (with any ")" removed and
# surrounding whitespace stripped); if a code repeats, the last row wins.
with open("data/Neoplasm_Core.csv", "r") as file:
    neoplasm_core_data = {
        core_row["Code"].replace(")", "").strip(): core_row
        for core_row in csv.DictReader(file)
    }

# Current disease nodes, in file order.
with open("output/disease_node.csv", "r") as file:
    disease_data = list(csv.DictReader(file))

# Step 2 & 3: Merge the information.
# Only these three columns are copied over; "Preferred Term" is deliberately
# not taken from Neoplasm_Core.csv.
extra_columns = ("Synonyms", "Definition", "Neoplastic Status")

merged_data = []
for disease in disease_data:
    neoplasm = neoplasm_core_data.get(disease["disease_nci_code"])
    if neoplasm is None:
        # Code not present in Neoplasm_Core.csv: keep the node, mark the
        # extra columns as not available.
        extras = {column: "N/A" for column in extra_columns}
    else:
        extras = {column: neoplasm[column] for column in extra_columns}
    merged_data.append({**disease, **extras})

# Step 4: Write the merged data back to disease_node.csv
with open("output/disease_node.csv", "w", newline='') as csvfile:
    writer = csv.DictWriter(
        csvfile,
        fieldnames=["disease_id", "disease_name", "disease_nci_code", "Synonyms", "Definition", "Neoplastic Status"],
    )
    writer.writeheader()
    writer.writerows(merged_data)


## Add source to all generated files

In [5]:
import os

output_folder = "output"


def add_source_column(path, source='NCI_Thesaurus'):
    """Append a "source" column to the CSV file at ``path``, rewriting it in place.

    Every data row gets ``source`` as the value of the new column.

    The file is left untouched when it has no header row (e.g. it is empty)
    or when it already has a "source" column, so calling this repeatedly on
    the same file does not keep adding columns.

    Args:
        path: Path of the CSV file to update.
        source: Value written into the "source" column of every row.

    Returns:
        True if the file was rewritten, False if it was left untouched.
    """
    # Read the existing CSV; fieldnames must be read while the file is open.
    with open(path, 'r', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        rows = list(reader)
        header = reader.fieldnames

    # header is None for a file without any rows at all.
    if not header or 'source' in header:
        return False

    fieldnames = list(header) + ['source']  # Add "source" to the fieldnames

    # Write the new CSV with the added "source" column
    with open(path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in rows:
            row['source'] = source  # Add "source" to each row
            writer.writerow(row)
    return True


if os.path.isdir(output_folder):
    # List all CSV files in the output directory (sorted for a stable order)
    files = sorted(f for f in os.listdir(output_folder) if f.endswith('.csv'))

    for filename in files:
        add_source_column(os.path.join(output_folder, filename))

    print("Added 'source' column to all CSV files in the 'output' folder.")
else:
    files = []
    print(f"Folder '{output_folder}' not found; no CSV files were updated.")

Added 'source' column to all CSV files in the 'output' folder.
