In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
# read in the input tsv file
df = pd.read_csv('data/database/cgi_biomarkers_per_variant.tsv', sep='\t')

## Data pre-processing

In [3]:
# remove NA
df = df.fillna('').iloc[640:660]

# replace synonyms
%run 'helper_funcs.ipynb'
df['Primary Tumor type full name'] = df['Primary Tumor type full name'].apply(find_preferred_term_by_synonym)
df['Primary Tumor type full name']

# extract variant from non-empty invididual mutation column
df['variant'] = df['individual_mutation'].apply(lambda x: x.split(':')[-1] if x else '')

The preferred term for 'acute Monoblastic Leukemia (FAB M5a)' is 'Acute Monoblastic Leukemia'.


In [4]:
print(df['Primary Tumor type full name'])

640                 Any cancer type
641                          Glioma
642           Breast adenocarcinoma
643         Prostate Adenocarcinoma
644           Breast adenocarcinoma
645                    Head an neck
646                           Renal
647                    Head an neck
648                         Bladder
649            Mantle Cell Lymphoma
650                           Renal
651                           Renal
652                 Any cancer type
653                           Renal
654                 Any cancer type
655                 Any cancer type
656                 Any cancer type
657                 Any cancer type
658                 Any cancer type
659    Chronic lymphocytic leukemia
Name: Primary Tumor type full name, dtype: object


## Update output/disease_node.csv

In [5]:
# Ensure the existing DataFrame is read in
disease_node_df = pd.read_csv('output/disease_node.csv')

# Function to check and append new diseases, ignoring case
def append_new_diseases(df, existing_df, start_id):
    new_entries = []
    # Create a set for faster lookups, converting existing disease names to lowercase
    existing_disease_names = set(existing_df['disease_name'].str.lower())
    
    for disease in df['Primary Tumor type full name'].unique():
        # Convert the disease to lowercase before the check
        disease_lower = disease.lower()
        if disease_lower not in existing_disease_names:
            new_entries.append({
                'disease_name': disease,  # Keep the original case for the name
                'disease_id': start_id,
                'source': 'cgi'
            })
            existing_disease_names.add(disease_lower)  # Add the new name in lowercase for future checks
            start_id += 1
            
    # Append new entries to the existing DataFrame
    return existing_df.append(new_entries, ignore_index=True), start_id

# Find the next disease_id to use
next_disease_id = disease_node_df['disease_id'].max() + 1

# Update the disease_node_df with new diseases from df
disease_node_df, next_disease_id = append_new_diseases(df, disease_node_df, next_disease_id)

# Save the updated DataFrame
disease_node_df.to_csv('output/disease_node.csv', index=False)


  return existing_df.append(new_entries, ignore_index=True), start_id


## Create or update output/snv.csv

In [6]:
# Check if snv.csv exists and create or append to it
snv_file_path = 'output/snv.csv'

if os.path.isfile(snv_file_path):
    snv_df = pd.read_csv(snv_file_path)
else:
    snv_df = pd.DataFrame(columns=['gene', 'variant', 'snv_id', 'source'])

# Generate snv_id starting from the last used ID if the file exists
next_snv_id = snv_df['snv_id'].max() + 1 if not snv_df.empty else 1

# Function to add new SNVs
def add_new_snvs(snv_df, df, start_id):
    for index, row in df.iterrows():
        if not ((snv_df['gene'] == row['Gene']) & (snv_df['variant'] == row['variant'])).any():
            snv_df = snv_df.append({
                'gene': row['Gene'],
                'variant': row['variant'],
                'snv_id': start_id,
                'source': 'cgi'
            }, ignore_index=True)
            start_id += 1
    return snv_df

# Update or create snv_df with new SNVs from df
snv_df = add_new_snvs(snv_df, df, next_snv_id)

# Save the snv DataFrame
snv_df.to_csv(snv_file_path, index=False)

  snv_df = snv_df.append({
  snv_df = snv_df.append({
  snv_df = snv_df.append({
  snv_df = snv_df.append({
  snv_df = snv_df.append({
  snv_df = snv_df.append({
  snv_df = snv_df.append({
  snv_df = snv_df.append({
  snv_df = snv_df.append({
  snv_df = snv_df.append({
  snv_df = snv_df.append({
  snv_df = snv_df.append({
  snv_df = snv_df.append({
  snv_df = snv_df.append({
  snv_df = snv_df.append({
  snv_df = snv_df.append({
  snv_df = snv_df.append({


## Create or update output/drug.csv

In [7]:
# Check if drug.csv exists and create or append to it
drug_file_path = 'output/drug.csv'

if os.path.isfile(drug_file_path):
    drug_df = pd.read_csv(drug_file_path)
else:
    drug_df = pd.DataFrame(columns=['drug_name', 'drug_family', 'drug_status', 'evidencel_level', 'drug_id', 'source'])

# Generate drug_id starting from the last used ID if the file exists
next_drug_id = drug_df['drug_id'].max() + 1 if not drug_df.empty else 1

# Function to add new drugs
def add_new_drugs(drug_df, df, start_id):
    for index, row in df.iterrows():
        if not ((drug_df['drug_name'] == row['Drug']) & (drug_df['drug_family'] == row['Drug family'])).any():
            drug_df = drug_df.append({
                'drug_name': row['Drug'],
                'drug_family': row['Drug family'],
                'drug_status': row['Drug status'],
                'evidencel_level': row['Evidence level'],
                'drug_id': start_id,
                'source': 'cgi'
            }, ignore_index=True)
            start_id += 1
    return drug_df

# Update or create drug_df with new drugs from df
drug_df = add_new_drugs(drug_df, df, next_drug_id)

# Save the drug DataFrame
drug_df.to_csv(drug_file_path, index=False)


  drug_df = drug_df.append({
  drug_df = drug_df.append({
  drug_df = drug_df.append({
  drug_df = drug_df.append({
  drug_df = drug_df.append({
  drug_df = drug_df.append({
  drug_df = drug_df.append({
  drug_df = drug_df.append({
  drug_df = drug_df.append({
  drug_df = drug_df.append({
  drug_df = drug_df.append({
  drug_df = drug_df.append({


## Create or update output/statement.csv

In [8]:
# Check if statement.csv exists and create or append to it
statement_file_path = 'output/statement.csv'

if os.path.isfile(statement_file_path):
    statement_df = pd.read_csv(statement_file_path)
else:
    statement_df = pd.DataFrame(columns=['statement_id', 'drug_id', 'disease_id', 'snv_id', 'association', 'source'])

# Generate statement_id starting from the last used ID if the file exists
next_statement_id = statement_df['statement_id'].max() + 1 if not statement_df.empty else 1


def add_new_statements(statement_df, df, disease_node_df, snv_df, drug_df, start_id):
    for index, row in df.iterrows():
        # Convert names to lowercase for case-insensitive matching
        primary_tumor_lower = row['Primary Tumor type full name'].lower()
        gene_lower = row['Gene'].lower()
        variant_lower = row['variant'].lower()
        drug_lower = row['Drug'].lower()
        drug_family_lower = row['Drug family'].lower()

        # Lookup IDs in the respective dataframes
        disease_match = disease_node_df[disease_node_df['disease_name'].str.lower() == primary_tumor_lower]
        if not disease_match.empty:
            disease_id = disease_match['disease_id'].values[0]
        else:
            # Handle the case where there is no match
            continue  # Skip this iteration

        snv_match = snv_df[(snv_df['gene'].str.lower() == gene_lower) & (snv_df['variant'].str.lower() == variant_lower)]
        if not snv_match.empty:
            snv_id = snv_match['snv_id'].values[0]
        else:
            # Handle the case where there is no match
            continue  # Skip this iteration

        drug_match = drug_df[(drug_df['drug_name'].str.lower() == drug_lower) & (drug_df['drug_family'].str.lower() == drug_family_lower)]
        if not drug_match.empty:
            drug_id = drug_match['drug_id'].values[0]
        else:
            # Handle the case where there is no match
            continue  # Skip this iteration

        # Check for unique association and add a new statement if not found
        if not ((statement_df['drug_id'] == drug_id) & (statement_df['disease_id'] == disease_id) & (statement_df['snv_id'] == snv_id)).any():
            statement_df = statement_df.append({
                'statement_id': start_id,
                'drug_id': drug_id,
                'disease_id': disease_id,
                'snv_id': snv_id,
                'association': row['Association'],
                'source': 'cgi'
            }, ignore_index=True)
            start_id += 1
    return statement_df


# Update or create statement_df with new statements from df
statement_df = add_new_statements(statement_df, df, disease_node_df, snv_df, drug_df, next_statement_id)

# Save the statement DataFrame
statement_df.to_csv(statement_file_path, index=False)

  statement_df = statement_df.append({
  statement_df = statement_df.append({
  statement_df = statement_df.append({
  statement_df = statement_df.append({
  statement_df = statement_df.append({
  statement_df = statement_df.append({
  statement_df = statement_df.append({
  statement_df = statement_df.append({
  statement_df = statement_df.append({
  statement_df = statement_df.append({
  statement_df = statement_df.append({
  statement_df = statement_df.append({
  statement_df = statement_df.append({
  statement_df = statement_df.append({
  statement_df = statement_df.append({
  statement_df = statement_df.append({
  statement_df = statement_df.append({
  statement_df = statement_df.append({
  statement_df = statement_df.append({
  statement_df = statement_df.append({
