In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
# read in the input tsv file
df = pd.read_csv('data/database/01-Jan-2023-ClinicalEvidenceSummaries.tsv', sep='\t')

## Data pre-processing

In [3]:
# remove NA
df = df.fillna('').iloc[430:440]

# replace synonyms
%run 'helper_funcs.ipynb'
df['disease'] = df['disease'].apply(find_preferred_term_by_synonym)
df['disease']


The preferred term for 'acute Monoblastic Leukemia (FAB M5a)' is 'Acute Monoblastic Leukemia'.


430                      Breast Carcinoma
431                      Breast Carcinoma
432                  Mantle Cell Lymphoma
433          Chronic Lymphocytic Leukemia
434                   Plasma Cell Myeloma
435    Thyroid Gland Follicular Carcinoma
436    Thyroid Gland Follicular Carcinoma
437                  Renal Cell Carcinoma
438                      Breast Carcinoma
439                      Breast Carcinoma
Name: disease, dtype: object

## Update output/disease_node.csv

In [4]:
# Load or initialize disease_node.csv
disease_node_path = 'output/disease_node.csv'
disease_node_df = pd.read_csv(disease_node_path) if os.path.isfile(disease_node_path) else pd.DataFrame(columns=['disease_id', 'disease_name', 'source'])

# Update diseases
df['disease_lower'] = df['disease'].str.lower()
disease_node_df['disease_name_lower'] = disease_node_df['disease_name'].str.lower()
# Find new diseases not in disease_node_df
new_diseases = ~df['disease_lower'].isin(disease_node_df['disease_name_lower'])
new_diseases_df = df.loc[new_diseases, 'disease'].drop_duplicates().reset_index(drop=True)
new_diseases_df = pd.DataFrame({
    'disease_id': range(disease_node_df['disease_id'].max() + 1, disease_node_df['disease_id'].max() + 1 + len(new_diseases_df)),
    'disease_name': new_diseases_df,
    'source': 'civic'
})
disease_node_df = pd.concat([disease_node_df, new_diseases_df], ignore_index=True).drop(columns=['disease_name_lower'])
disease_node_df.to_csv(disease_node_path, index=False)

## Create or update output/snv.csv

In [5]:
# Load or initialize snv.csv
snv_path = 'output/snv.csv'
if os.path.isfile(snv_path):
    snv_df = pd.read_csv(snv_path)
    # Ensure that new columns exist in the dataframe
    if 'variant_summary' not in snv_df.columns:
        snv_df['variant_summary'] = ''
    if 'variant_origin' not in snv_df.columns:
        snv_df['variant_origin'] = ''
else:
    # Define columns including new ones
    snv_df = pd.DataFrame(columns=['gene', 'variant', 'snv_id', 'source', 'variant_summary', 'variant_origin'])

# Create a lowercase gene-variant combination column for case-insensitive comparison
df['gene_variant'] = df['gene'].str.lower() + '_' + df['variant'].str.lower()
snv_df['gene_variant'] = snv_df['gene'].str.lower() + '_' + snv_df['variant'].str.lower()

# Update existing records
for idx, row in df.iterrows():
    match = snv_df['gene_variant'] == row['gene_variant']
    if match.any():
        snv_df.loc[match, 'variant_summary'] = row['variant_summary']
        snv_df.loc[match, 'variant_origin'] = row['variant_origin']
        snv_df.loc[match, 'source'] = snv_df.loc[match, 'source'].apply(lambda x: x if 'civic' in x else f"{x}+civic")

# Add new records
new_snv_records = df.loc[~df['gene_variant'].isin(snv_df['gene_variant'])]
for _, new_row in new_snv_records.iterrows():
    next_snv_id = snv_df['snv_id'].max() + 1
    new_record = {
        'gene': new_row['gene'],
        'variant': new_row['variant'],
        'snv_id': next_snv_id,
        'source': 'civic',  # Assign the source for the new entry
        'variant_summary': new_row['variant_summary'],
        'variant_origin': new_row['variant_origin'],
        'gene_variant': new_row['gene_variant']  # Include the gene_variant for later removal
    }
    snv_df = snv_df.append(new_record, ignore_index=True)

# Drop the temporary 'gene_variant' column used for matching
snv_df.drop(columns=['gene_variant'], inplace=True)

# Save the updated snv_df to the CSV
snv_df.to_csv(snv_path, index=False)


  snv_df = snv_df.append(new_record, ignore_index=True)
  snv_df = snv_df.append(new_record, ignore_index=True)
  snv_df = snv_df.append(new_record, ignore_index=True)
  snv_df = snv_df.append(new_record, ignore_index=True)
  snv_df = snv_df.append(new_record, ignore_index=True)
  snv_df = snv_df.append(new_record, ignore_index=True)
  snv_df = snv_df.append(new_record, ignore_index=True)
  snv_df = snv_df.append(new_record, ignore_index=True)
  snv_df = snv_df.append(new_record, ignore_index=True)
  snv_df = snv_df.append(new_record, ignore_index=True)


## Create or update output/drug.csv

In [6]:
# Load or initialize drug.csv
drug_path = 'output/drug.csv'
if os.path.isfile(drug_path):
    drug_df = pd.read_csv(drug_path)
    # Ensure that new columns exist in the dataframe
    if 'drug_interaction_type' not in drug_df.columns:
        drug_df['drug_interaction_type'] = ''
else:
    # Define columns including new ones
    drug_df = pd.DataFrame(columns=['drug_name', 'drug_family', 'drug_status', 'evidencel_level', 'drug_id', 'source', 'drug_interaction_type'])

# Create a lowercase drug names column for case-insensitive comparison
df['drugs_lower'] = df['drugs'].str.lower()
drug_df['drug_name_lower'] = drug_df['drug_name'].str.lower()

# Update existing records with 'drug_interaction_type' and 'source'
for idx, row in df.iterrows():
    match = drug_df['drug_name_lower'] == row['drugs_lower']
    if match.any():
        # Update the 'drug_interaction_type' for existing drugs
        drug_df.loc[match, 'drug_interaction_type'] = row['drug_interaction_type']
        # Update source by appending '+civic' if necessary
        drug_df.loc[match, 'source'] = drug_df.loc[match, 'source'].astype(str).apply(lambda x: f"{x}+civic" if 'civic' not in x else x)

# Find new drugs not in drug_df
new_drugs = ~df['drugs_lower'].isin(drug_df['drug_name_lower'])
if new_drugs.any():
    # Drop duplicates based on the 'drugs' column, which is the original column name
    new_drugs_df = df.loc[new_drugs, ['drugs', 'drug_interaction_type']].drop_duplicates(subset=['drugs'])
    new_drugs_df['drug_id'] = range(drug_df['drug_id'].max() + 1, drug_df['drug_id'].max() + 1 + len(new_drugs_df))
    new_drugs_df['source'] = 'civic'
    # Rename columns to match those in drug_df
    new_drugs_df.rename(columns={'drugs': 'drug_name'}, inplace=True)
    # Select only the relevant columns to append to drug_df
    new_drugs_df = new_drugs_df[['drug_id', 'drug_name', 'source', 'drug_interaction_type']]
    # Append the new entries
    drug_df = pd.concat([drug_df, new_drugs_df], ignore_index=True)

# Now, update 'drug_interaction_type' for existing records, and concatenate '+civic' if necessary
for idx, row in df.iterrows():
    drug_idx = drug_df['drug_name'].str.lower() == row['drugs'].lower()
    if drug_idx.any():
        drug_df.loc[drug_idx, 'drug_interaction_type'] = row['drug_interaction_type']
        drug_df.loc[drug_idx, 'source'] = drug_df.loc[drug_idx, 'source'].apply(lambda x: x if 'civic' in x else f"{x}+civic")

# Save the updated drug_df to the CSV
drug_df.drop(columns=['drug_name_lower'], inplace=True)
drug_df.to_csv(drug_path, index=False)



## Create or update output/statement.csv

In [7]:
# Check if statement.csv exists and create or append to it
statement_file_path = 'output/statement.csv'

if os.path.isfile(statement_file_path):
    statement_df = pd.read_csv(statement_file_path)
    # Ensure that the 'source' column is of type string
    statement_df['source'] = statement_df['source'].astype(str)
else:
    statement_df = pd.DataFrame(columns=['statement_id', 'drug_id', 'disease_id', 'snv_id', 'association', 'source', 'clinical_significance'])

# Generate statement_id starting from the last used ID if the file exists
next_statement_id = statement_df['statement_id'].max() + 1 if not statement_df.empty else 1

def add_new_statements(statement_df, df, disease_node_df, snv_df, drug_df, start_id):
    # Creating lowercase columns for case insensitive comparison
    disease_node_df['disease_name_lower'] = disease_node_df['disease_name'].str.lower()
    snv_df['gene_variant_lower'] = snv_df['gene'].str.lower() + '_' + snv_df['variant'].str.lower()
    drug_df['drug_name_lower'] = drug_df['drug_name'].str.lower()
    
    # Iterate over the rows of the new data
    for index, row in df.iterrows():
        # Lookup IDs in the respective dataframes
        disease_id = disease_node_df.loc[disease_node_df['disease_name_lower'] == row['disease'].lower(), 'disease_id'].values[0]
        snv_id = snv_df.loc[snv_df['gene_variant_lower'] == (row['gene'].lower() + '_' + row['variant'].lower()), 'snv_id'].values[0]
        drug_id = drug_df.loc[drug_df['drug_name_lower'] == row['drugs'].lower(), 'drug_id'].values[0]

        # Check for unique association and add a new statement if not found
        if not ((statement_df['drug_id'] == drug_id) & (statement_df['disease_id'] == disease_id) & (statement_df['snv_id'] == snv_id)).any():
            new_entry = {
                'statement_id': start_id,
                'drug_id': drug_id,
                'disease_id': disease_id,
                'snv_id': snv_id,
                'association': "",
                'source': 'civic',  # Set source to 'civic' for new entries
                'clinical_significance': row['clinical_significance']  # Assume this column exists in df
            }
            statement_df = statement_df.append(new_entry, ignore_index=True)
            start_id += 1
        else:
            # If the entry already exists, we update 'clinical_significance' and append '+civic' to 'source' if needed
            match_idx = (statement_df['drug_id'] == drug_id) & (statement_df['disease_id'] == disease_id) & (statement_df['snv_id'] == snv_id)
            statement_df.loc[match_idx, 'clinical_significance'] = row['clinical_significance']  # Assume this column exists in df
            # Update source only if 'civic' is not already there
            statement_df.loc[match_idx, 'source'] = statement_df.loc[match_idx, 'source'].apply(lambda x: x if 'civic' in x else x + '+civic')
    
    # Cleanup the DataFrame
    disease_node_df.drop(columns=['disease_name_lower'], inplace=True)
    snv_df.drop(columns=['gene_variant_lower'], inplace=True)
    drug_df.drop(columns=['drug_name_lower'], inplace=True)
    
    return statement_df

# Update or create statement_df with new statements from df
statement_df = add_new_statements(statement_df, df, disease_node_df, snv_df, drug_df, next_statement_id)

# Save the statement DataFrame
statement_df.to_csv(statement_file_path, index=False)


  statement_df = statement_df.append(new_entry, ignore_index=True)
  statement_df = statement_df.append(new_entry, ignore_index=True)
  statement_df = statement_df.append(new_entry, ignore_index=True)
  statement_df = statement_df.append(new_entry, ignore_index=True)
  statement_df = statement_df.append(new_entry, ignore_index=True)
  statement_df = statement_df.append(new_entry, ignore_index=True)
  statement_df = statement_df.append(new_entry, ignore_index=True)
  statement_df = statement_df.append(new_entry, ignore_index=True)
  statement_df = statement_df.append(new_entry, ignore_index=True)
  statement_df = statement_df.append(new_entry, ignore_index=True)
