# Data preprocessing

The bulk of this paper is dependent upon multiple different manipulations of cancer drug response data. This notebook is designed to generate all required files for experiments.

In [17]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import math
import re
import random
import pubchempy as pcp
from rdkit.Chem import AllChem
from rdkit import Chem
import pickle

## Required files

There are 6 required data files before preprocessing. Please download both DepMap files (items 4 and 5) manually from the hyperlinks. The cell below will download everything else.

(1) CTRPv2 response data

(2) GDSC response data 

(3) gCSI response data 

(4) [DepMap cell line gene expression](https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap%20Public%2024Q4&filename=OmicsExpressionProteinCodingGenesTPMLogp1.csv)

(5) [DepMap manifest](https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap%20Public%2024Q4&filename=Model.csv) 

(6) TCGA gene expression - Obtained individually from [UCSC Xena Browser](https://xenabrowser.net/datapages/?cohort=GDC%20TCGA%20Acute%20Myeloid%20Leukemia%20(LAML)&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443) (GDC hub, STAR TPM). The download cell below fetches a Google Drive archive that contains all patient expression files.

In [None]:
# Download the raw inputs into the current working directory; every later cell reads them via relative paths.
# CTRPv2 expanded dataset (zip), unpacked in place.
# NOTE(review): the CTRP cell below also reads v20.meta.per_cell_line.txt, v20.meta.per_experiment.txt and
# v20.meta.per_compound.txt; they are presumably part of this archive, but the saved unzip log does not list them - confirm.
!wget https://ctd2-data.nci.nih.gov/Public/Broad/CTRPv2.0_2015_ctd2_ExpandedDataset/CTRPv2.0_2015_ctd2_ExpandedDataset.zip #CTRPv2
!unzip CTRPv2.0_2015_ctd2_ExpandedDataset.zip
# GDSC2 fitted dose-response spreadsheet (read later with pd.read_excel).
!wget https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.5/GDSC2_fitted_dose_response_27Oct23.xlsx #GDSC2
# gCSI GR data tarball, unpacked in place.
!wget http://research-pub.gene.com/gCSI_GRvalues2019/gCSI_GRdata_v1.3.tsv.tar.gz #gCSI
!tar -xzvf gCSI_GRdata_v1.3.tsv.tar.gz
# TCGA patient expression archive from Google Drive; the TCGA cell later reads the tcga_expression_data/ folder.
!gdown --id 1wdGLJVAVCtK7Az4qtjsTvw-RJ-PLiIue #TCGA patient expression, please contact authors if this fails. Alternatively, download files manually from XENA browser above.
!unzip tcga_expression_data.zip

--2025-05-28 17:11:15--  https://ctd2-data.nci.nih.gov/Public/Broad/CTRPv2.0_2015_ctd2_ExpandedDataset/CTRPv2.0_2015_ctd2_ExpandedDataset.zip
Resolving ctd2-data.nci.nih.gov (ctd2-data.nci.nih.gov)... 129.43.254.216, 2607:f220:41d:21c1::812b:fed8
Connecting to ctd2-data.nci.nih.gov (ctd2-data.nci.nih.gov)|129.43.254.216|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 342737645 (327M) [application/zip]
Saving to: ‘CTRPv2.0_2015_ctd2_ExpandedDataset.zip’


2025-05-28 17:13:15 (2.72 MB/s) - ‘CTRPv2.0_2015_ctd2_ExpandedDataset.zip’ saved [342737645/342737645]

Archive:  CTRPv2.0_2015_ctd2_ExpandedDataset.zip
  inflating: CTRPv2.0._COLUMNS.xlsx  
  inflating: CTRPv2.0._INFORMER_SET.xlsx  
  inflating: CTRPv2.0._README.docx   
  inflating: MANIFEST.txt            
  inflating: v20._COLUMNS.txt        
  inflating: v20.data.curves_post_qc.txt  
  inflating: v20.data.per_cpd_avg.txt  
  inflating: v20.data.per_cpd_post_qc.txt  
  inflating: v20.data.per_cpd_pre_qc.txt

## Generate response files for each dataset

In [4]:
# Root folder for every generated input file; created once and reused on re-runs.
os.makedirs('input_files', exist_ok=True)

We generate response files where each line is a tab-separated list containing (cell line, drug, response).

### CTRP

In [5]:
# Make sure the CTRP output folder exists before any response file is written into it.
ctrp_dir = 'input_files/ctrp'
if not os.path.isdir(ctrp_dir):
    os.mkdir(ctrp_dir)

# Load the four tab-separated CTRPv2 tables (cell line, experiment and compound metadata, then the
# post-QC curve fits), in the same order as the names on the left-hand side.
cellLine_meta_df, experiment_meta_df, drug_meta_df, experiment_results = (
    pd.read_csv(ctrp_table, sep='\t')
    for ctrp_table in (
        'v20.meta.per_cell_line.txt',
        'v20.meta.per_experiment.txt',
        'v20.meta.per_compound.txt',
        'v20.data.curves_post_qc.txt',
    )
)

In [6]:
# Map ID values to drugs and cell lines, this is how they are represented in experiments.
# The curve table refers to cell lines and compounds by numeric ID; build ID -> name lookups for both.
id2cell = dict(zip(cellLine_meta_df['master_ccl_id'], cellLine_meta_df['ccl_name']))
id2drug = dict(zip(drug_meta_df['master_cpd_id'], drug_meta_df['cpd_name']))

# Curves are reported per experiment, so also map experiment ID -> cell line ID.
exp2cell = dict(zip(experiment_meta_df['experiment_id'], experiment_meta_df['master_ccl_id']))

#### AUC

In [7]:
# Create dict that is the final precursor for our tuple output file
# Key every fitted curve by (experiment ID, compound ID). A repeated key keeps the last AUC seen.
auc_dict = {
    (exp_id, cpd_id): auc
    for exp_id, cpd_id, auc in zip(
        experiment_results.experiment_id,
        experiment_results.master_cpd_id,
        experiment_results.area_under_curve,
    )
}

# Translate the IDs into (cell line name, drug name). Several experiments can share a cell line,
# so a repeated (cell line, drug) pair again keeps only the last value.
auc_named_dict = {
    (id2cell[exp2cell[exp_id]], id2drug[cpd_id]): auc
    for (exp_id, cpd_id), auc in auc_dict.items()
}

# Start from a clean file so re-running the cell never appends to a previous export.
auc_path = 'input_files/ctrp/ctrp_auc.txt'
if os.path.exists(auc_path):
    os.remove(auc_path)

# One tab-separated (cell line, drug, response) record per line.
with open(auc_path, 'a') as f:
    f.writelines(
        f'{cell_name}\t{drug_name}\t{auc}\n'
        for (cell_name, drug_name), auc in auc_named_dict.items()
    )

#### EC50

In [8]:
# Key every fitted curve by (experiment ID, compound ID). A repeated key keeps the last EC50 seen.
ec50_dict = {
    (exp_id, cpd_id): ec50
    for exp_id, cpd_id, ec50 in zip(
        experiment_results.experiment_id,
        experiment_results.master_cpd_id,
        experiment_results.apparent_ec50_umol,
    )
}

# Translate IDs into names, keeping only values inside [-20, 20]: the author found that the extreme
# apparent EC50 values CTRP reports disrupt training (exploding loss). NaN fails both comparisons
# and is therefore dropped as well.
ec50_named_dict = {
    (id2cell[exp2cell[exp_id]], id2drug[cpd_id]): ec50
    for (exp_id, cpd_id), ec50 in ec50_dict.items()
    if -20 <= ec50 <= 20
}

# Start from a clean file so re-running the cell never appends to a previous export.
ec50_path = 'input_files/ctrp/ctrp_ec50.txt'
if os.path.exists(ec50_path):
    os.remove(ec50_path)

# One tab-separated (cell line, drug, response) record per line.
with open(ec50_path, 'a') as f:
    f.writelines(
        f'{cell_name}\t{drug_name}\t{ec50}\n'
        for (cell_name, drug_name), ec50 in ec50_named_dict.items()
    )

### GDSC

GDSC is much easier, it's all in one file.

In [10]:
# Make sure the GDSC output folder exists.
gdsc_dir = 'input_files/gdsc'
if not os.path.isdir(gdsc_dir):
    os.mkdir(gdsc_dir)

gdsc_response_df = pd.read_excel('GDSC2_fitted_dose_response_27Oct23.xlsx')

# Write one headerless (cell line, drug, response) table per metric.
# Note that the file named "ec50" holds the LN_IC50 column.
for response_col, out_path in (
    ('LN_IC50', 'input_files/gdsc/gdsc_ec50.txt'),
    ('AUC', 'input_files/gdsc/gdsc_auc.txt'),
):
    metric_table = gdsc_response_df[['CELL_LINE_NAME', 'DRUG_NAME', response_col]]
    metric_table.to_csv(out_path, sep='\t', header=False, index=False)

### gCSI

In [11]:
# Load the gCSI GR-metrics table.
# The file is looked up relative to the working directory (where the download cell unpacks the gCSI
# tarball) instead of through a machine-specific absolute path, so the notebook runs elsewhere.
# Presumably the tarball unpacks into OUTPUT/ (the cleanup cell removes that folder) - TODO confirm.
# Set the GCSI_METRICS_PATH environment variable to point at a copy stored anywhere else.
gcsi_metrics_candidates = [
    os.environ.get('GCSI_METRICS_PATH'),
    'gCSI_GRmetrics_v1.3.tsv',
    os.path.join('OUTPUT', 'gCSI_GRmetrics_v1.3.tsv'),
]
gcsi_metrics_path = next(
    (candidate for candidate in gcsi_metrics_candidates if candidate and os.path.isfile(candidate)),
    None,
)
if gcsi_metrics_path is None:
    raise FileNotFoundError(
        'gCSI_GRmetrics_v1.3.tsv not found; run the download cell or set GCSI_METRICS_PATH. '
        f'Looked in: {[c for c in gcsi_metrics_candidates if c]}'
    )
gCSI_df = pd.read_csv(gcsi_metrics_path, sep='\t')

In [12]:
if not os.path.isdir('input_files/gcsi'):
    os.mkdir('input_files/gcsi')

# gCSI metric column -> output response file. Each file is a headerless, tab-separated
# (cell line, drug, response) table, like the CTRP and GDSC exports.
gcsi_metric_files = {
    'GR50': 'input_files/gcsi/gcsi_gr50.txt',
    'GR_AOC': 'input_files/gcsi/gcsi_aoc.txt',
    'GRmax': 'input_files/gcsi/gcsi_grmax.txt',
    'Emax': 'input_files/gcsi/gcsi_emax.txt',
    'GRinf': 'input_files/gcsi/gcsi_grinf.txt',
    'GR_05uM_fit': 'input_files/gcsi/gcsi_gr_05um_fit.txt',
}
# Metrics that are natural-log transformed before being written (GR50 is log transformed for prediction).
gcsi_log_metrics = {'GR50'}

# gCSI data has missing and infinite values, and they differ per metric, so each metric is filtered
# on its own column before export.
for metric_col, out_path in gcsi_metric_files.items():
    metric_values = gCSI_df[metric_col]
    usable_rows = metric_values.notna() & (metric_values < np.inf) & (metric_values > -np.inf)
    # .copy() makes the transform below act on an independent frame rather than a slice of gCSI_df.
    temp = gCSI_df.loc[usable_rows, ['Norm_CellLineName', 'Norm_DrugName', metric_col]].copy()
    if metric_col in gcsi_log_metrics:
        temp[metric_col] = np.log(temp[metric_col])
    temp.to_csv(out_path, sep='\t', header=False, index=False)

### Filtering datasets for cell lines with expression values in DepMap

In [16]:
# Map DepMap's stripped cell line names to DepMap model IDs.
index_df = pd.read_csv('Model.csv')
strip_dict = pd.Series(index_df['ModelID'].values,index=index_df['StrippedCellLineName']).to_dict()

# Raw response files written by the cells above, as dataset -> metric suffixes.
# They are listed explicitly instead of walking input_files/: on a re-run that folder also holds
# derived files (e.g. *_expfilt.txt and the DepMap expression table), which must not be filtered
# again and do not all follow the <root>/<dataset>/<file> layout. Add new raw files here.
RAW_RESPONSE_METRICS = {
    'gdsc': ('ec50', 'auc'),
    'gcsi': ('gr50', 'aoc', 'grmax', 'emax', 'grinf', 'gr_05um_fit'),
    'ctrp': ('auc', 'ec50'),
}

# Get all of our input response files
response_files = []
for dataset, metrics in RAW_RESPONSE_METRICS.items():
    for metric in metrics:
        file_path = os.path.join('input_files', dataset, f'{dataset}_{metric}.txt')
        if os.path.isfile(file_path):
            response_files.append(file_path)
        else:
            print(f'Skipping missing response file: {file_path}')

for response_file in response_files:
    print(response_file)
    # Read names as text so that numeric-looking cell line or drug names are never parsed as numbers.
    curr_df = pd.read_csv(response_file, sep='\t', header=None, index_col=None, dtype={0: str, 1: str})
    print(curr_df.shape)
    cell_col = curr_df.columns[0]
    # Normalise names towards DepMap's stripped form (no hyphens, upper case), then map to model IDs.
    # NOTE(review): only '-' is removed; names containing other punctuation or spaces will not
    # match StrippedCellLineName and are dropped below - confirm this is intended.
    curr_df[cell_col] = (
        curr_df[cell_col]
        .str.replace('-', '', regex=False)
        .str.upper()
        .map(strip_dict)
    )
    # Rows whose cell line has no DepMap model ID are removed.
    curr_df = curr_df[curr_df[cell_col].notna()]
    print(curr_df.shape) # print change in df shape for each input file
    curr_df.to_csv(os.path.splitext(response_file)[0] + '_expfilt.txt', sep='\t', header=False, index=False)

input_files/gdsc/gdsc_ec50.txt
(242036, 3)
(234020, 3)
input_files/gdsc/gdsc_auc.txt
(242036, 3)
(234020, 3)
input_files/gcsi/gcsi_grmax.txt
(16304, 3)
(15835, 3)
input_files/gcsi/gcsi_gr50.txt
(10765, 3)
(10446, 3)
input_files/gcsi/gcsi_grinf.txt
(16304, 3)
(15835, 3)
input_files/gcsi/gcsi_gr_05um_fit.txt
(16304, 3)
(15835, 3)
input_files/gcsi/gcsi_emax.txt
(16688, 3)
(16203, 3)
input_files/gcsi/gcsi_aoc.txt
(16304, 3)
(15835, 3)
input_files/ctrp/ctrp_auc.txt
(387130, 3)
(365321, 3)
input_files/ctrp/ctrp_ec50.txt
(282825, 3)
(266709, 3)


### Filter datasets by morgan fingerprint files

We need to ensure that our datasets only contain drugs for which we are able to obtain fingerprints. If you would like to generate morgan fingerprint files manually, a function is included below to do so. Otherwise, there is an option to download the pre-generated fingerprint files from Google Drive.

Synonym file can be downloaded from [GDSC site](https://www.cancerrxgene.org/downloads/bulk_download) under "All compounds screened".

In [None]:
def _smiles_from_pubchem(name):
    """Look up `name` in PubChem and return the SMILES of the first matching CID, or None if nothing matches."""
    ids = pcp.get_cids(name, 'name', list_return='flat')
    if len(ids) == 0:
        return None
    if len(ids) > 1:
        print('More than one ID found for {}. Using best match.'.format(name))
    # The first returned CID is used as the match.
    return pcp.Compound.from_cid(ids[0]).canonical_smiles


def generateMorganFingerprint(response_file, synonym_file, output_location):
    """Write a Morgan fingerprint table for every drug in a response file.

    Parameters
    ----------
    response_file : str
        Headerless tab-separated response file; drug names are read from its second column.
    synonym_file : str
        CSV with DRUG_NAME and SYNONYMS columns (synonyms separated by ', '). Synonyms are
        tried only when the drug name itself has no PubChem match.
    output_location : str
        Path of the tab-separated output: one row per drug (index = drug name as written
        in the response file), one column per fingerprint bit.

    Drugs for which no SMILES is found, or whose SMILES RDKit cannot parse, are reported
    and left out of the output.
    """
    df = pd.read_csv(response_file, sep='\t', header=None)
    # unique() keeps first-appearance order, so the output row order is stable between runs.
    drug_list = df.iloc[:, 1].dropna().unique()

    synonym_df = pd.read_csv(synonym_file)
    drug_name_dict = {
        drug_name: str(synonyms).split(', ')
        for drug_name, synonyms in zip(synonym_df.DRUG_NAME, synonym_df.SYNONYMS)
    }

    # Spellings to query PubChem with when the dataset's own spelling should not be used.
    # Results are still stored under the dataset's spelling so they match the response file.
    query_aliases = {'OligomycinA': 'Oligomycin A'}

    smile_dict = {}
    for item in drug_list:
        query_name = query_aliases.get(item, item)
        smiles = _smiles_from_pubchem(query_name)
        if smiles is None:
            print('No compound found for {}, searching synonyms'.format(item))
            # A drug may be absent from the synonym file; treat that as "no synonyms".
            curr_synonyms = drug_name_dict.get(item) or drug_name_dict.get(query_name, [])
            for synonym in curr_synonyms:
                if synonym == 'nan':  # str() of a missing SYNONYMS entry
                    continue
                smiles = _smiles_from_pubchem(synonym)
                if smiles is not None:
                    print(f'Match found for synonym {synonym}!')
                    break  # keep the first synonym that resolves
                print('No compound found for {}, synonym {}'.format(item, synonym))
        if smiles is None:
            print('No SMILES found for {}; it will be missing from the fingerprint file'.format(item))
            continue
        smile_dict[item] = smiles

    print(smile_dict)

    fpgen = AllChem.GetMorganGenerator(radius=2)
    fp_dict = {}

    for drug,smile_string in smile_dict.items():
        m1 = Chem.MolFromSmiles(smile_string)
        if m1 is None:
            # RDKit returns None for SMILES it cannot parse; skip instead of crashing.
            print('Could not parse SMILES for {}; skipping'.format(drug))
            continue
        fp_dict[drug] = list(fpgen.GetFingerprint(m1))

    print(f'Generated fingerprints for {len(fp_dict)} of {len(drug_list)} drugs')
    # One row per drug, one column per fingerprint bit.
    fp_df = pd.DataFrame.from_dict(fp_dict, orient='index')
    fp_df.to_csv(output_location, sep='\t')

In [17]:
# Download the pre-generated fingerprint files from Google Drive.
# The archive unpacks into fingerprints/ with one <dataset>_fingerprints.txt per dataset
# (ctrp, gcsi, gdsc), which the next cell reads.
!gdown --id 1XBMuz3YeHSRh1mtFdy3Rb-4mqMoF7FMQ
!unzip fingerprints.zip

Downloading...
From: https://drive.google.com/uc?id=1XBMuz3YeHSRh1mtFdy3Rb-4mqMoF7FMQ
To: /research/labs/microbiome/chia/m214779/drug_blind_generalization/fingerprints.zip
100%|█████████████████████████████████████████| 108k/108k [00:00<00:00, 180MB/s]
Archive:  fingerprints.zip
  inflating: fingerprints/ctrp_fingerprints.txt  
  inflating: fingerprints/gcsi_fingerprints.txt  
  inflating: fingerprints/gdsc_fingerprints.txt  


In [18]:
# Collect the expression-filtered response files written by the previous step.
response_files = [
    os.path.join(dirpath, filename)
    for dirpath, dirnames, filenames in os.walk('input_files')
    for filename in filenames
    if filename.endswith('expfilt.txt')
]

for response_file in response_files:
    print(response_file)
    # Path layout is <root>/<dataset>/<metric>_expfilt.txt
    root, dataset, metric = response_file.replace('_expfilt.txt', '').split('/')
    response_df = pd.read_csv(response_file, sep='\t', header=None, index_col=None)
    print(response_df.shape)
    # The fingerprint table is indexed by drug name; keep only responses for those drugs.
    fp_df = pd.read_csv(f'fingerprints/{dataset}_fingerprints.txt', sep='\t', index_col=0)
    filt_drugs = list(fp_df.index)
    has_fingerprint = response_df.iloc[:, 1].isin(filt_drugs)
    response_df = response_df[has_fingerprint]
    print(response_df.shape)
    out_path = f'{root}/{dataset}/{metric}' + '_expfilt_fpfilt.txt'
    response_df.to_csv(out_path, sep='\t', header=False, index=False)

input_files/gdsc/gdsc_ec50_expfilt.txt
(234020, 3)
(194491, 3)
input_files/gdsc/gdsc_auc_expfilt.txt
(234020, 3)
(194491, 3)
input_files/gcsi/gcsi_emax_expfilt.txt
(16203, 3)
(11470, 3)
input_files/gcsi/gcsi_gr_05um_fit_expfilt.txt
(15835, 3)
(11198, 3)
input_files/gcsi/gcsi_grmax_expfilt.txt
(15835, 3)
(11198, 3)
input_files/gcsi/gcsi_grinf_expfilt.txt
(15835, 3)
(11198, 3)
input_files/gcsi/gcsi_aoc_expfilt.txt
(15835, 3)
(11198, 3)
input_files/gcsi/gcsi_gr50_expfilt.txt
(10446, 3)
(7635, 3)
input_files/ctrp/ctrp_auc_expfilt.txt
(365321, 3)
(365321, 3)
input_files/ctrp/ctrp_ec50_expfilt.txt
(266709, 3)
(266709, 3)


### Filter expression data to match with TCGA expression

The model was originally planned for use in a patient few-shot setting, so the cell line expression data is filtered to the genes that are also present in the TCGA expression data.

In [None]:
# Build one TCGA expression table from every file in tcga_expression_data/.
# Each file is read as tab-separated with its first column as the index; the first file read
# defines the index and every later file is joined onto it column-wise.
# os.listdir returns entries in arbitrary order, so the column order of df can differ between runs.
for i,f in enumerate(os.listdir('tcga_expression_data')):
    if i == 0:
        df = pd.read_csv(f'tcga_expression_data/{f}', sep='\t', header=0, index_col=0)
    else:
        temp = pd.read_csv(f'tcga_expression_data/{f}', sep='\t', header=0, index_col=0)
        df = df.join(temp)

# Map ensembleID to gene names so I can align the datasets
# NOTE(review): gencode.v36.annotation.gtf.gene.probemap is not listed under "Required files" and
# is not fetched by the download cell as far as this notebook shows - confirm where it comes from.
depmap_df = pd.read_csv('OmicsExpressionProteinCodingGenesTPMLogp1.csv', index_col=0)
depmap_geneNames = pd.read_csv('gencode.v36.annotation.gtf.gene.probemap', sep='\t', header=0)

# First probemap column -> second probemap column (presumably Ensembl ID -> gene name - verify).
# A TCGA row ID missing from the probemap raises KeyError here.
id2name = dict(zip(depmap_geneNames.iloc[:,0], depmap_geneNames.iloc[:,1]))
tcga_names = [id2name[x] for x in list(df.index)]
df.index = tcga_names

# Remove weird parenthesis things from end of gene names in depmap
# After the transpose the DepMap column labels become the index; the regex deletes every
# parenthesised substring and strip() removes the whitespace left behind.
depmap_df = depmap_df.T
replace_index = [re.sub(r'\([^)]*\)', '', x) for x in list(depmap_df.index)]
replace_index = [x.strip() for x in replace_index]
depmap_df.index = replace_index

# Drop repeat index in both
# Only the first row is kept for each duplicated index label.
depmap_df = depmap_df[~depmap_df.index.duplicated(keep='first')]
df = df[~df.index.duplicated(keep='first')]

# Filter by inner join index, but don't do the actual inner join so we can keep df separate
# This first `temp` (TCGA restricted to the shared index labels) is only used to print its shape;
# it is overwritten below and never saved.
temp = df.loc[list(set(depmap_df.join(df, how='inner').index)),:]
temp = temp.T
print(temp.shape)

# DepMap restricted to the shared index labels, transposed back before being written out.
temp = depmap_df.loc[(depmap_df.join(df, how='inner').index),:]
temp = temp.T
print(temp.shape)
temp.to_csv('input_files/depmap_expression_pt_filtered.txt', sep='\t', header=True, index=True)

We have our three required types of input files for the model:

(1) Response data, in input_files/

(2) Morgan fingerprint drug representations, in fingerprints/

(3) Expression mapping for cell lines, in input_files/

Now, we can clean up intermediate files. Retain GDSC files for future experiments.

In [None]:
!rm -f v20*
!rm -f CTRP*
!rm -f *.zip
!rm -f gCSI*
!rm -f MANIFEST.txt
!rm -r -f OUTPUT/
!rm -r -f tcga_expression_data/
!rm -f Model.csv
!rm -f OmicsExpressionProteinCodingGenesTPMLogp1.csv

## Cancer type dictionaries

I wanted to track performance by the cancer type of each cell line in downstream analyses, so my train method accepts a dictionary mapping cell lines to cancer types. This information is available in Model.csv. The dictionary is optional: train.py also handles the case where no cancer type dictionary is provided.

In [18]:
df = pd.read_csv('Model.csv')
# The CCLE name has the form <cell line>_<cancer type>: split once on the first underscore.
# Names without an underscore (including missing values, which str() turns into 'nan')
# produce a single piece.
cancer_types = [str(ccle_name).split('_', 1) for ccle_name in df['CCLEName']]
# With at most one split there are one or two pieces, so the last piece is the cancer type
# when present and the whole name otherwise.
split_cancer_type = [pieces[-1].upper() for pieces in cancer_types]
print(set(split_cancer_type)) # list of unique cancer types for CCLE cell lines
# Map DepMap model ID -> cancer type and pickle it for the training scripts.
cancer_type_dict = dict(zip(df['ModelID'], split_cancer_type))
with open('cancerType_dict.pkl', 'wb') as f:
    pickle.dump(cancer_type_dict, f)

{'AUTONOMIC_GANGLIA', 'COLON', 'VULVA', 'STOMACH', 'MPNST', 'BRAIN_BONE', 'THYROID', 'LARGE_INTESTINE', 'SKIN', 'KIDNEY', 'ADRENAL_CORTEX', 'SKIN_FV3_RESISTANT', 'SKIN_CJ2_RESISTANT', 'MELANOMA_SKIN', 'UPPER_AERODIGESTIVE_TRACT', 'SOFT_TISSUE', 'BILIARY_TRACT', 'BRAIN', 'URINARY_TRACT', 'MELANOMA_EYE', 'GASTRIC', 'OESOPHAGUS', 'ENDOMETRIUM', 'OSTEOSARCOMA', 'CENTRAL_NERVOUS_SYSTEM', 'BONE', 'PROSTATE', 'SKIN_CJ1_RESISTANT', 'BREAST', 'SALIVARY_GLAND', 'UVEA', 'TESTIS', 'HAEMATOPOIETIC_AND_LYMPHOID_TISSUE', 'EYE', 'SARCOMA', 'OVARY', 'SKIN_FV1_RESISTANT', 'CERVIX', 'SKIN_FV2_RESISTANT', 'LIVER', 'LUNG', 'NAN', 'PLEURA', 'PLACENTA', 'SKIN_CJ3_RESISTANT', 'MATCHED_NORMAL_TISSUE', 'SMALL_INTESTINE', 'PANCREAS', 'FIBROBLAST', 'PRIMARY', 'ENGINEERED'}


## Permutation experiments

We need to generate 3 types of files for each dataset+metric: intradrug permutation, intracell permutation, and one-hot cell line representations.

In [None]:
PERMUTATION_SEED = 0  # change to draw a different, but still reproducible, set of permutations


def shuffle_responses_within_groups(response_df, group_pos, rng):
    """Return a copy of `response_df` with its third column permuted inside each group.

    Parameters
    ----------
    response_df : pd.DataFrame
        (cell line, drug, response) table; columns are addressed by position.
    group_pos : int
        Position of the grouping column: 1 shuffles within each drug, 0 within each cell line.
    rng : random.Random
        Random source used for shuffling; pass a seeded instance for reproducible output.

    Returns
    -------
    pd.DataFrame
        The rows of `response_df`, grouped by the (sorted) group key, with responses shuffled
        among the rows of the same group. `response_df` itself is not modified.
    """
    group_label = response_df.columns[group_pos]
    shuffled_groups = []
    # sort=True fixes the order in which groups consume random numbers.
    for _, group_df in response_df.groupby(group_label, sort=True):
        group_df = group_df.copy()  # work on an independent frame, not a view of response_df
        responses = group_df.iloc[:, 2].tolist()
        rng.shuffle(responses)
        group_df.iloc[:, 2] = responses
        shuffled_groups.append(group_df)
    if not shuffled_groups:
        return response_df.copy()
    return pd.concat(shuffled_groups)


response_files = []
for dirpath, dirnames, filenames in os.walk('input_files'):
    for filename in filenames:
        if filename.endswith('fpfilt.txt'): # get filtered files only
            file_path = os.path.join(dirpath, filename)
            response_files.append(file_path)


for response_file in response_files:
    root, dataset, metric = response_file.replace('_expfilt_fpfilt.txt', '').split('/')
    df = pd.read_csv(response_file, sep='\t', header=None)

    # Seed per file so the result does not depend on the order in which os.walk lists the files.
    rng = random.Random(f'{PERMUTATION_SEED}/{dataset}/{metric}')

    #Generate intradrug shuffle files
    final_df = shuffle_responses_within_groups(df, 1, rng)
    final_df.to_csv(f'{root}/{dataset}/{metric}_intradrug_shuffle.txt', sep='\t', header=None, index=None)

    #Generate intracell shuffle
    final_df = shuffle_responses_within_groups(df, 0, rng)
    final_df.to_csv(f'{root}/{dataset}/{metric}_intracell_shuffle.txt', sep='\t', header=None, index=None)

    # Create one-hot cell line representations - THESE ARE USED AS CELL LINE REPRESENTATION INPUT, NOT RESPONSE
    # Sorted so that the row/column assignment is the same on every run.
    unique_cellLines = sorted(set(df.iloc[:,0]), key=str)
    print(metric, len(unique_cellLines))
    final_df = pd.DataFrame(np.eye(len(unique_cellLines)), index=unique_cellLines)
    final_df.to_csv(f'{root}/{dataset}/{metric}_one_hot.txt')

## Dataset diversity experiments

For these experiments, we must generate a split of the dataset where all unique drugs have at least 500 cell lines. This will remove 14 drugs. We only perform the remaining experiments in GDSC EC50. These experiments are then run using subsets.py and subsets_drug_blind.py.

In [19]:
MIN_CELL_LINES_PER_DRUG = 500  # drugs with fewer response rows than this are left out

df = pd.read_csv('input_files/gdsc/gdsc_ec50_expfilt_fpfilt.txt', sep='\t', header=None)
# Create datasets where drugs have been tested on a minimum of 500 cell lines
# Count the response rows per drug in one pass (value_counts) instead of list.count per drug.
drug_counts = df.iloc[:,1].value_counts().to_dict() # drug name -> number of response rows for that drug
bad_drugs = [k for k,v in drug_counts.items() if v < MIN_CELL_LINES_PER_DRUG] # drugs below the threshold
df = df[~df.iloc[:,1].isin(bad_drugs)] # drop every response row belonging to those drugs
df.to_csv('input_files/gdsc/gdsc_ec50_numCellLineSubset.txt', sep='\t', header=None, index=None)

## Performance prediction

To perform the first performance prediction experiment, we need to generate a constant drug-blind train, test, and validation split for input to training. All models will be trained on the same val/test splits, but the composition of unique drugs in the training set will change for every model. We need to know the drugs in the training set before we make the validation and test sets though. This is the same as creating a normal drug-blind split like in subsets_drug_blind.py but input files must be created before training lots of models on them.

In [19]:
seed = 123 # Change this to whatever you want to generate more replicates for this experiment
random.seed(seed)
df = pd.read_csv('input_files/gdsc/gdsc_ec50_expfilt_fpfilt.txt', sep='\t', header=None)

# we need to presplit into training test and validation for each replicate due to drug-blind condition
# Sort the drug names before sampling: the iteration order of a set of strings changes between
# Python processes (hash randomisation), so sampling from list(set(...)) would give a different
# split on every kernel restart even with a fixed seed.
unique_drugs = sorted(df.iloc[:,1].dropna().unique())
sampled_drugs = random.sample(unique_drugs, int(0.80*len(unique_drugs))) # 80% size training set
train_drug_set = set(sampled_drugs)
val_drugs_presplit = [drug for drug in unique_drugs if drug not in train_drug_set] # 20% held out, before splitting into val/test
val_drugs = random.sample(val_drugs_presplit, int(0.5*len(val_drugs_presplit))) # half of the held-out drugs for validation
val_drug_set = set(val_drugs)
test_drugs = [drug for drug in val_drugs_presplit if drug not in val_drug_set] # the remaining held-out drugs for test

# Drug-blind condition: no drug may appear in more than one split.
assert train_drug_set.isdisjoint(val_drug_set)
assert train_drug_set.isdisjoint(test_drugs)
assert val_drug_set.isdisjoint(test_drugs)

sampled_df = df.loc[df.iloc[:,1].isin(sampled_drugs),:]
val_df = df.loc[df.iloc[:,1].isin(val_drugs),:]
test_df = df.loc[df.iloc[:,1].isin(test_drugs)]

sampled_df.to_csv(f'input_files/gdsc/gdsc_uniqueDrug_drugBlind_train_seed{seed}.txt', sep='\t', index=False, header=False)
val_df.to_csv(f'input_files/gdsc/gdsc_uniqueDrug_drugBlind_val_exp_seed{seed}.txt', sep='\t', index=False, header=False)
test_df.to_csv(f'input_files/gdsc/gdsc_uniqueDrug_drugBlind_test_exp_seed{seed}.txt', sep='\t', index=False, header=False)

These output files are train/val/test inputs for subsets_uniqueDrug.py.

## Mechanism specific datasets

For the mechanism specific training, we have to partition the datasets according to their pathway. 

In [20]:
# GDSC release 8.5 screened-compounds table; the next cell reads its DRUG_NAME and TARGET_PATHWAY columns.
!wget https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.5/screened_compounds_rel_8.5.csv

--2025-06-02 16:14:59--  https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.5/screened_compounds_rel_8.5.csv
Resolving cog.sanger.ac.uk (cog.sanger.ac.uk)... 193.62.203.61, 193.62.203.62, 193.62.203.63
Connecting to cog.sanger.ac.uk (cog.sanger.ac.uk)|193.62.203.61|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 46414 (45K) [text/csv]
Saving to: ‘screened_compounds_rel_8.5.csv’


2025-06-02 16:15:01 (444 KB/s) - ‘screened_compounds_rel_8.5.csv’ saved [46414/46414]



In [31]:
gdsc_moa_df = pd.read_csv('screened_compounds_rel_8.5.csv')
# Keep only drugs that have a target pathway annotation.
has_pathway = gdsc_moa_df['TARGET_PATHWAY'].notna()
gdsc_moa_df = gdsc_moa_df[has_pathway]
# Column 5 is addressed by position here; presumably it is TARGET_PATHWAY - verify against the CSV header.
n_pathways = len(set(gdsc_moa_df.iloc[:, 5]))
print(f'There are {n_pathways} unique pathways in GDSC')
# Drug name -> target pathway
drug_pathways_dict = dict(zip(gdsc_moa_df['DRUG_NAME'], gdsc_moa_df['TARGET_PATHWAY']))
# Fingerprint table, indexed by drug name
fp_df = pd.read_csv('fingerprints/gdsc_fingerprints.txt', sep='\t', index_col=0)

There are 24 unique pathways in GDSC


In [32]:
response_df = pd.read_csv('input_files/gdsc/gdsc_ec50_expfilt_fpfilt.txt', sep='\t', header=None)

wanted_pathways = ['PI3K/MTOR signaling','Mitosis','EGFR signaling','ERK MAPK signaling','DNA replication']

for pw in wanted_pathways:
    # Drugs annotated with this pathway that also have a fingerprint
    curr_drugs = [
        drug_name
        for drug_name, pathway in drug_pathways_dict.items()
        if pathway == pw and drug_name in fp_df.index
    ]
    print(f'{pw}: {curr_drugs}')

    # Responses for those drugs only
    in_pathway = response_df.iloc[:, 1].isin(curr_drugs)
    temp_df = response_df[in_pathway]
    print(temp_df.shape)
    # Make the pathway name safe for a file name: drop '/' and replace spaces with '_'
    pw = pw.replace('/', '').replace(' ', '_')
    print(pw)
    temp_df.to_csv(f'input_files/gdsc/gdsc_{pw}_exp.txt',  sep='\t', index=False, header=False)

PI3K/MTOR signaling: ['Rapamycin', 'AZD6482', 'OSI-027', 'Temsirolimus', 'MK-2206', 'Dactolisib', 'Pictilisib', 'AZD8055', 'PF-4708671', 'AZD2014', 'AZD8186', 'Uprosertib', 'Alpelisib', 'Taselisib', 'CZC24832', 'GSK2110183B', 'Buparlisib', 'Afuresertib', 'Ipatasertib', 'GNE-317', 'AMG-319', 'LJI308', 'AT13148']
(19862, 3)
PI3KMTOR_signaling
Mitosis: ['Paclitaxel', 'Tozasertib', 'Vinorelbine', 'Alisertib', 'Vinblastine', 'Docetaxel', 'ZM447439']
(6241, 3)
Mitosis
EGFR signaling: ['Erlotinib', 'Lapatinib', 'Gefitinib', 'Afatinib', 'Sapitinib', 'AZD3759', 'Osimertinib']
(6517, 3)
EGFR_signaling
ERK MAPK signaling: ['VX-11e', 'Refametinib', 'PLX-4720', 'PD0325901', 'SB590885', 'Selumetinib', 'Trametinib', 'Dabrafenib', 'SCH772984', 'KRAS (G12C) Inhibitor-12', 'Ulixertinib']
(11528, 3)
ERK_MAPK_signaling
DNA replication: ['Gemcitabine', 'Bleomycin', 'Camptothecin', 'Cisplatin', 'Methotrexate', 'Irinotecan', 'Oxaliplatin', 'Temozolomide', 'SN-38', 'Epirubicin', 'Cyclophosphamide', 'Leflunomi