# Data preprocessing

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import math
import re

## Required files

There are 5 required data files before preprocessing. Please download both DepMap files manually from the hyperlinks. The cell below will download everything else.

(1) CTRPv2 response data

(2) GDSC response data 

(3) gCSI response data 

(4) [DepMap cell line gene expression](https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap%20Public%2024Q4&filename=OmicsExpressionProteinCodingGenesTPMLogp1.csv)

(5) [DepMap manifest](https://depmap.org/portal/data_page/?tab=allData&releasename=DepMap%20Public%2024Q4&filename=Model.csv) 

(6) TCGA gene expression - Obtained individually from [UCSC Xena Browser](https://xenabrowser.net/datapages/?cohort=GDC%20TCGA%20Acute%20Myeloid%20Leukemia%20(LAML)&removeHub=https%3A%2F%2Fxena.treehouse.gi.ucsc.edu%3A443) (GDC hub, STAR TPM). Link provided is Google Drive maintained with all patient expression.

In [None]:
!wget https://ctd2-data.nci.nih.gov/Public/Broad/CTRPv2.0_2015_ctd2_ExpandedDataset/CTRPv2.0_2015_ctd2_ExpandedDataset.zip #CTRPv2
!unzip CTRPv2.0_2015_ctd2_ExpandedDataset.zip
!wget https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.5/GDSC2_fitted_dose_response_27Oct23.xlsx #GDSC2
!wget http://research-pub.gene.com/gCSI_GRvalues2019/gCSI_GRdata_v1.3.tsv.tar.gz #gCSI
!tar -xzvf gCSI_GRdata_v1.3.tsv.tar.gz
!gdown --id 1wdGLJVAVCtK7Az4qtjsTvw-RJ-PLiIue #TCGA patient expression, please contact authors if this fails. Alternatively, download files manually from XENA browser above.
!unzip tcga_expression_data.zip

--2025-05-28 17:11:15--  https://ctd2-data.nci.nih.gov/Public/Broad/CTRPv2.0_2015_ctd2_ExpandedDataset/CTRPv2.0_2015_ctd2_ExpandedDataset.zip
Resolving ctd2-data.nci.nih.gov (ctd2-data.nci.nih.gov)... 129.43.254.216, 2607:f220:41d:21c1::812b:fed8
Connecting to ctd2-data.nci.nih.gov (ctd2-data.nci.nih.gov)|129.43.254.216|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 342737645 (327M) [application/zip]
Saving to: ‘CTRPv2.0_2015_ctd2_ExpandedDataset.zip’


2025-05-28 17:13:15 (2.72 MB/s) - ‘CTRPv2.0_2015_ctd2_ExpandedDataset.zip’ saved [342737645/342737645]

Archive:  CTRPv2.0_2015_ctd2_ExpandedDataset.zip
  inflating: CTRPv2.0._COLUMNS.xlsx  
  inflating: CTRPv2.0._INFORMER_SET.xlsx  
  inflating: CTRPv2.0._README.docx   
  inflating: MANIFEST.txt            
  inflating: v20._COLUMNS.txt        
  inflating: v20.data.curves_post_qc.txt  
  inflating: v20.data.per_cpd_avg.txt  
  inflating: v20.data.per_cpd_post_qc.txt  
  inflating: v20.data.per_cpd_pre_qc.txt

## Generate response files for each dataset

In [4]:
if not os.path.isdir('input_files'):
    os.mkdir('input_files')

We generate response files where each line is a tab-separated list containing (cell line, drug, response).

### CTRP

In [5]:
if not os.path.isdir('input_files/ctrp'):
    os.mkdir('input_files/ctrp')
cellLine_meta_df = pd.read_csv('v20.meta.per_cell_line.txt', sep='\t')
experiment_meta_df = pd.read_csv('v20.meta.per_experiment.txt', sep='\t')
drug_meta_df = pd.read_csv('v20.meta.per_compound.txt', sep='\t')
experiment_results = pd.read_csv('v20.data.curves_post_qc.txt', sep='\t')

In [6]:
# Map ID values to drugs and cell lines, this is how they are represented in experiments.
id2cell = dict(zip(cellLine_meta_df.master_ccl_id, cellLine_meta_df.ccl_name))
id2drug = dict(zip(drug_meta_df.master_cpd_id, drug_meta_df.cpd_name))

# Get dict where key = experiment ID, val = cell line ID
exp2cell = dict(zip(experiment_meta_df.experiment_id, experiment_meta_df.master_ccl_id))

#### AUC

In [7]:
# Create dict that is the final precursor for our tuple output file
auc_dict = dict(zip(zip(experiment_results.experiment_id, experiment_results.master_cpd_id), experiment_results.area_under_curve))
auc_named_dict = {}
for k,v in auc_dict.items():
    auc_named_dict[(id2cell[exp2cell[k[0]]], id2drug[k[1]])] = v

# Prevent from appending to file if it's already there
if os.path.exists('input_files/ctrp/ctrp_auc.txt'):
    os.remove('input_files/ctrp/ctrp_auc.txt')

with open('input_files/ctrp/ctrp_auc.txt', 'a') as f:
    for k,v in auc_named_dict.items():
        f.write(f'{k[0]}\t{k[1]}\t{v}\n')

#### EC50

In [8]:
ec50_dict = dict(zip(zip(experiment_results.experiment_id, experiment_results.master_cpd_id), experiment_results.apparent_ec50_umol))
ec50_named_dict = {}
for k,v in ec50_dict.items():
    if v >= -20 and v <= 20: # CTRP reports values in apparent ec50 uM, some of these values are extreme and disrupt training (exploding loss). Filter for realistic values. (20 + -6 = 10^14 M concentration...)
        ec50_named_dict[(id2cell[exp2cell[k[0]]], id2drug[k[1]])] = v

# Prevent from appending to file if it's already there
if os.path.exists('input_files/ctrp/ctrp_ec50.txt'):
    os.remove('input_files/ctrp/ctrp_ec50.txt')

with open('input_files/ctrp/ctrp_ec50.txt', 'a') as f:
    for k,v in ec50_named_dict.items():
        f.write(f'{k[0]}\t{k[1]}\t{v}\n')

### GDSC

GDSC is much easier, it's all in one file.

In [10]:
if not os.path.isdir('input_files/gdsc'):
    os.mkdir('input_files/gdsc')
gdsc_response_df = pd.read_excel('GDSC2_fitted_dose_response_27Oct23.xlsx')
gdsc_response_df[['CELL_LINE_NAME', 'DRUG_NAME', 'LN_IC50']].to_csv('input_files/gdsc/gdsc_ec50.txt', sep='\t', header=False, index=False)
gdsc_response_df[['CELL_LINE_NAME', 'DRUG_NAME', 'AUC']].to_csv('input_files/gdsc/gdsc_auc.txt', sep='\t', header=False, index=False)

### gCSI

In [11]:
gCSI_df = pd.read_csv('/research/labs/microbiome/chia/m214779/gCSI/gCSI_GRmetrics_v1.3.tsv', sep='\t')

In [12]:
if not os.path.isdir('input_files/gcsi'):
    os.mkdir('input_files/gcsi')

# gCSI data has missing values, have to filter for them for each metric as we go

temp = gCSI_df[gCSI_df['GR50'].notna()]
temp = temp[temp['GR50'] < np.inf]
temp = temp[temp['GR50'] > -np.inf]
temp['GR50'] = temp['GR50'].apply(np.log) # log transform GR50 for prediction
temp[['Norm_CellLineName', 'Norm_DrugName', 'GR50']].to_csv('input_files/gcsi/gcsi_gr50.txt', sep='\t', header=False, index=False)

temp = gCSI_df[gCSI_df['GR_AOC'].notna()]
temp = temp[temp['GR_AOC'] < np.inf]
temp = temp[temp['GR_AOC'] > -np.inf]
temp[['Norm_CellLineName', 'Norm_DrugName', 'GR_AOC']].to_csv('input_files/gcsi/gcsi_aoc.txt', sep='\t', header=False, index=False)

temp = gCSI_df[gCSI_df['GRmax'].notna()]
temp = temp[temp['GRmax'] < np.inf]
temp = temp[temp['GRmax'] > -np.inf]
temp[['Norm_CellLineName', 'Norm_DrugName', 'GRmax']].to_csv('input_files/gcsi/gcsi_grmax.txt', sep='\t', header=False, index=False)

temp = gCSI_df[gCSI_df['Emax'].notna()]
temp = temp[temp['Emax'] < np.inf]
temp = temp[temp['Emax'] > -np.inf]
temp[['Norm_CellLineName', 'Norm_DrugName', 'Emax']].to_csv('input_files/gcsi/gcsi_emax.txt', sep='\t', header=False, index=False)

temp = gCSI_df[gCSI_df['GRinf'].notna()]
temp = temp[temp['GRinf'] < np.inf]
temp = temp[temp['GRinf'] > -np.inf]
temp[['Norm_CellLineName', 'Norm_DrugName', 'GRinf']].to_csv('input_files/gcsi/gcsi_grinf.txt', sep='\t', header=False, index=False)

temp = gCSI_df[gCSI_df['GR_05uM_fit'].notna()]
temp = temp[temp['GR_05uM_fit'] < np.inf]
temp = temp[temp['GR_05uM_fit'] > -np.inf]
temp[['Norm_CellLineName', 'Norm_DrugName', 'GR_05uM_fit']].to_csv('input_files/gcsi/gcsi_gr_05um_fit.txt', sep='\t', header=False, index=False)

## Filtering datasets for cell lines with expression values in DepMap

In [16]:
index_df = pd.read_csv('Model.csv')
strip_dict = pd.Series(index_df['ModelID'].values,index=index_df['StrippedCellLineName']).to_dict()

# Get all of our input response files
response_files = []
for dirpath, dirnames, filenames in os.walk('input_files'):
    for filename in filenames:
        file_path = os.path.join(dirpath, filename)
        response_files.append(file_path)

for response_file in response_files:
    print(response_file)
    curr_df = pd.read_csv(response_file, sep='\t', header=None, index_col=None)
    print(curr_df.shape)
    curr_df.iloc[:,0] = curr_df.iloc[:,0].str.replace('-', '')
    curr_df.iloc[:,0] = curr_df.iloc[:,0].str.upper()
    curr_df.iloc[:,0] = curr_df.iloc[:,0].map(strip_dict)
    curr_df = curr_df[curr_df.iloc[:,0].notna()]
    print(curr_df.shape) # print change in df shape for each input file
    root, dataset, metric = response_file.replace('.txt', '').split('/')
    curr_df.to_csv(f'{root}/{dataset}/{metric}'+ '_expfilt.txt', sep='\t', header=False, index=False)

input_files/gdsc/gdsc_ec50.txt
(242036, 3)
(234020, 3)
input_files/gdsc/gdsc_auc.txt
(242036, 3)
(234020, 3)
input_files/gcsi/gcsi_grmax.txt
(16304, 3)
(15835, 3)
input_files/gcsi/gcsi_gr50.txt
(10765, 3)
(10446, 3)
input_files/gcsi/gcsi_grinf.txt
(16304, 3)
(15835, 3)
input_files/gcsi/gcsi_gr_05um_fit.txt
(16304, 3)
(15835, 3)
input_files/gcsi/gcsi_emax.txt
(16688, 3)
(16203, 3)
input_files/gcsi/gcsi_aoc.txt
(16304, 3)
(15835, 3)
input_files/ctrp/ctrp_auc.txt
(387130, 3)
(365321, 3)
input_files/ctrp/ctrp_ec50.txt
(282825, 3)
(266709, 3)


## Filter datasets by morgan fingerprint files

In [17]:
# Download fingerprint files
!gdown --id 1XBMuz3YeHSRh1mtFdy3Rb-4mqMoF7FMQ
!unzip fingerprints.zip

Downloading...
From: https://drive.google.com/uc?id=1XBMuz3YeHSRh1mtFdy3Rb-4mqMoF7FMQ
To: /research/labs/microbiome/chia/m214779/drug_blind_generalization/fingerprints.zip
100%|█████████████████████████████████████████| 108k/108k [00:00<00:00, 180MB/s]
Archive:  fingerprints.zip
  inflating: fingerprints/ctrp_fingerprints.txt  
  inflating: fingerprints/gcsi_fingerprints.txt  
  inflating: fingerprints/gdsc_fingerprints.txt  


In [18]:
response_files = []
for dirpath, dirnames, filenames in os.walk('input_files'):
    for filename in filenames:
        if filename.endswith('expfilt.txt'):
            file_path = os.path.join(dirpath, filename)
            response_files.append(file_path)

for response_file in response_files:
    print(response_file)
    root, dataset, metric = response_file.replace('_expfilt.txt', '').split('/')
    response_df = pd.read_csv(response_file, sep='\t', header=None, index_col=None)
    print(response_df.shape)
    fp_df = pd.read_csv(f'fingerprints/{dataset}_fingerprints.txt', sep='\t', index_col=0)
    filt_drugs = list(fp_df.index)
    response_df = response_df[response_df.iloc[:,1].isin(filt_drugs)]
    print(response_df.shape)
    response_df.to_csv(f'{root}/{dataset}/{metric}'+ '_expfilt_fpfilt.txt', sep='\t', header=False, index=False)

input_files/gdsc/gdsc_ec50_expfilt.txt
(234020, 3)
(194491, 3)
input_files/gdsc/gdsc_auc_expfilt.txt
(234020, 3)
(194491, 3)
input_files/gcsi/gcsi_emax_expfilt.txt
(16203, 3)
(11470, 3)
input_files/gcsi/gcsi_gr_05um_fit_expfilt.txt
(15835, 3)
(11198, 3)
input_files/gcsi/gcsi_grmax_expfilt.txt
(15835, 3)
(11198, 3)
input_files/gcsi/gcsi_grinf_expfilt.txt
(15835, 3)
(11198, 3)
input_files/gcsi/gcsi_aoc_expfilt.txt
(15835, 3)
(11198, 3)
input_files/gcsi/gcsi_gr50_expfilt.txt
(10446, 3)
(7635, 3)
input_files/ctrp/ctrp_auc_expfilt.txt
(365321, 3)
(365321, 3)
input_files/ctrp/ctrp_ec50_expfilt.txt
(266709, 3)
(266709, 3)


## Filter expression data to match with TCGA expression

Original planned utility of model was with patient few-shot example, so expression has been filtered to match up with TCGA

In [21]:
for i,f in enumerate(os.listdir('tcga_expression_data')):
    if i == 0:
        df = pd.read_csv(f'tcga_expression_data/{f}', sep='\t', header=0, index_col=0)
    else:
        temp = pd.read_csv(f'tcga_expression_data/{f}', sep='\t', header=0, index_col=0)
        df = df.join(temp)
    print(df.shape)


# Map ensembleID to gene names so I can align the datasets
depmap_df = pd.read_csv('OmicsExpressionProteinCodingGenesTPMLogp1.csv', index_col=0)
depmap_geneNames = pd.read_csv('gencode.v36.annotation.gtf.gene.probemap', sep='\t', header=0)

id2name = dict(zip(depmap_geneNames.iloc[:,0], depmap_geneNames.iloc[:,1]))
tcga_names = [id2name[x] for x in list(df.index)]
df.index = tcga_names

# Remove weird parenthesis things from end of gene names in depmap
depmap_df = depmap_df.T
replace_index = [re.sub(r'\([^)]*\)', '', x) for x in list(depmap_df.index)]
replace_index = [x.strip() for x in replace_index]
print(replace_index)
depmap_df.index = replace_index

# Drop repeat index in both
depmap_df = depmap_df[~depmap_df.index.duplicated(keep='first')]
df = df[~df.index.duplicated(keep='first')]

# Filter by inner join index, but don't do the actual inner join so we can keep df separate
temp = df.loc[list(set(depmap_df.join(df, how='inner').index)),:]
temp = temp.T
print(temp.shape)

temp = depmap_df.loc[(depmap_df.join(df, how='inner').index),:]
temp = temp.T
print(temp.shape)
temp.to_csv('input_files/depmap_expression_pt_filtered.txt', sep='\t', header=True, index=True)

(60660, 424)
(60660, 1013)
(60660, 1567)
(60660, 1754)
(60660, 2227)
(60660, 2306)
(60660, 2354)
(60660, 2505)
(60660, 3039)
(60660, 3467)
(60660, 3981)
(60660, 4179)
(60660, 4354)
(60660, 4783)
(60660, 5393)
(60660, 5450)
(60660, 6016)
(60660, 6172)
(60660, 6757)
(60660, 7080)
(60660, 7202)
(60660, 7774)
(60660, 7865)
(60660, 8130)
(60660, 8313)
(60660, 8490)
(60660, 8534)
(60660, 8621)
(60660, 8930)
(60660, 9378)
(60660, 9930)
(60660, 11156)


FileNotFoundError: [Errno 2] No such file or directory: 'OmicsExpressionProteinCodingGenesTPMLogp1.csv'