In [32]:
import pandas as pd
import sys, os

# Report the interpreter and pandas versions used to run this notebook.
# Versions at the time of the recorded run: Python 3.12.7, pandas 2.2.2
for label, version in (("Python:", sys.version), ("Pandas:", pd.__version__)):
    print(label, version)

Python: 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:17:27) [MSC v.1929 64 bit (AMD64)]
Pandas: 2.2.2


In [6]:
'''
Data provenance.

All data was downloaded from GDSC's publicly available bulk data downloads:
https://www.cancerrxgene.org/downloads/bulk_download

The datasets to be joined are "GDSC1-dataset", "GDSC2-dataset", and
"Cell-line-annotation".

The GDSC1 and GDSC2 datasets were converted from .xlsx to .csv because they
were too large to read into pandas.
'''

# Load the GDSC1 table from its CSV export.
gdsc1_data = pd.read_csv(
    "GDSC1_fitted_dose_response_27Oct23.csv"
)

# Show the (rows, columns) shape, then the first three rows.
print(
    gdsc1_data.shape,
    "\n",
    gdsc1_data.head(3),
)

(333161, 19) 
   DATASET  NLME_RESULT_ID  NLME_CURVE_ID  COSMIC_ID CELL_LINE_NAME  \
0   GDSC1             342       15580432     684057            ES5   
1   GDSC1             342       15580806     684059            ES7   
2   GDSC1             342       15581198     684062          EW-11   

  SANGER_MODEL_ID     TCGA_DESC  DRUG_ID  DRUG_NAME PUTATIVE_TARGET  \
0       SIDM00263  UNCLASSIFIED        1  Erlotinib            EGFR   
1       SIDM00269  UNCLASSIFIED        1  Erlotinib            EGFR   
2       SIDM00203  UNCLASSIFIED        1  Erlotinib            EGFR   

     PATHWAY_NAME  COMPANY_ID WEBRELEASE  MIN_CONC  MAX_CONC   LN_IC50  \
0  EGFR signaling        1045          Y  0.007813       2.0  3.966813   
1  EGFR signaling        1045          Y  0.007813       2.0  2.692090   
2  EGFR signaling        1045          Y  0.007813       2.0  2.477990   

        AUC      RMSE   Z_SCORE  
0  0.985678  0.026081  1.299144  
1  0.972690  0.110059  0.156076  
2  0.944459  0.08701

In [12]:
# Load the supplementary GDSC workbook that holds metadata for cell lines.
# The GDSC1 and GDSC2 tables are linked to it through COSMIC ID, a numeric
# cell line identifier, so this sheet adds further context for each cell line.
cell_line_data = pd.read_excel(
    "Cell_Lines_Details.xlsx",
    sheet_name="Cell line details",
)
# Optional preview (disabled):
# print(cell_line_data.shape, "\n", cell_line_data.head(3))

  for idx, row in parser.parse():


In [14]:
# Load the GDSC2 table from its CSV export.
gdsc2_data = pd.read_csv(
    "GDSC2_fitted_dose_response_27Oct23.csv"
)

# Show the (rows, columns) shape, then the first three rows.
print(
    gdsc2_data.shape,
    "\n",
    gdsc2_data.head(3),
)

(242036, 19) 
   DATASET  NLME_RESULT_ID  NLME_CURVE_ID  COSMIC_ID CELL_LINE_NAME  \
0   GDSC2             343       15946310     683667         PFSK-1   
1   GDSC2             343       15946548     684052           A673   
2   GDSC2             343       15946830     684057            ES5   

  SANGER_MODEL_ID     TCGA_DESC  DRUG_ID     DRUG_NAME PUTATIVE_TARGET  \
0       SIDM01132            MB     1003  Camptothecin            TOP1   
1       SIDM00848  UNCLASSIFIED     1003  Camptothecin            TOP1   
2       SIDM00263  UNCLASSIFIED     1003  Camptothecin            TOP1   

      PATHWAY_NAME  COMPANY_ID WEBRELEASE  MIN_CONC  MAX_CONC   LN_IC50  \
0  DNA replication        1046          Y    0.0001       0.1 -1.463887   
1  DNA replication        1046          Y    0.0001       0.1 -4.869455   
2  DNA replication        1046          Y    0.0001       0.1 -3.360586   

        AUC      RMSE   Z_SCORE  
0  0.930220  0.089052  0.433123  
1  0.614970  0.111351 -1.421100  
2  0

In [16]:
# gdsc1 and gdsc2 are expected to have identical structures. pd.concat would
# silently take the union of the columns and fill the gaps with NaN if they
# did not, so check the column sets explicitly before stacking.
assert set(gdsc1_data.columns) == set(gdsc2_data.columns), (
    "GDSC1 and GDSC2 column sets differ; concat would add NaN-filled columns"
)

# Stack the two tables row-wise; ignore_index=True renumbers the rows 0..n-1.
gdsc_all = pd.concat([gdsc1_data, gdsc2_data], ignore_index=True)

print(gdsc_all.shape)
# Row counts per value of the DATASET column.
print(gdsc_all['DATASET'].value_counts())

(575197, 19)
DATASET
GDSC1    333161
GDSC2    242036
Name: count, dtype: int64


In [31]:
# Display the first three rows of the combined GDSC1 + GDSC2 frame.
gdsc_all.head(3)

Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,WEBRELEASE,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
0,GDSC1,342,15580432,684057,ES5,SIDM00263,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,3.966813,0.985678,0.026081,1.299144
1,GDSC1,342,15580806,684059,ES7,SIDM00269,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.69209,0.97269,0.110059,0.156076
2,GDSC1,342,15581198,684062,EW-11,SIDM00203,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.47799,0.944459,0.087019,-0.035912


In [18]:
# Prepare cell_line_data for the merge: give its key column the same name as
# in gdsc_all and cast it to the nullable integer dtype (it was float before).
cell_line_data = (
    cell_line_data
    .rename(columns={'COSMIC identifier': 'COSMIC_ID'})
    .astype({'COSMIC_ID': 'Int64'})
)

# Compare the key dtypes on both sides of the upcoming join.
print("gdsc_all COSMIC_ID dtype:", gdsc_all['COSMIC_ID'].dtype)
print("cell_line_data COSMIC_ID dtype:", cell_line_data['COSMIC_ID'].dtype)

# Counts gdsc_all rows (not distinct IDs) whose COSMIC_ID also appears in
# cell_line_data.
print(
    "Shared IDs:",
    gdsc_all['COSMIC_ID'].isin(cell_line_data['COSMIC_ID']).sum(),
)

gdsc_all COSMIC_ID dtype: int64
cell_line_data COSMIC_ID dtype: Int64
Shared IDs: 575197


In [20]:
# Add cell line data to gdsc_all by merging on COSMIC_ID.
# how='left' keeps every gdsc_all row, including any without cell line details.
gdsc_merged = pd.merge(gdsc_all,
                       cell_line_data,
                       on='COSMIC_ID',
                       how='left')

# A left join must not change the row count; extra rows would mean COSMIC_ID
# is duplicated in cell_line_data and dose-response rows were fanned out.
assert len(gdsc_merged) == len(gdsc_all), (
    f"merge changed row count: {len(gdsc_all)} -> {len(gdsc_merged)}"
)

print(gdsc_merged.shape, '\n', gdsc_merged.head(3))

(575197, 31) 
   DATASET  NLME_RESULT_ID  NLME_CURVE_ID  COSMIC_ID CELL_LINE_NAME  \
0   GDSC1             342       15580432     684057            ES5   
1   GDSC1             342       15580806     684059            ES7   
2   GDSC1             342       15581198     684062          EW-11   

  SANGER_MODEL_ID     TCGA_DESC  DRUG_ID  DRUG_NAME PUTATIVE_TARGET  ...  \
0       SIDM00263  UNCLASSIFIED        1  Erlotinib            EGFR  ...   
1       SIDM00269  UNCLASSIFIED        1  Erlotinib            EGFR  ...   
2       SIDM00203  UNCLASSIFIED        1  Erlotinib            EGFR  ...   

  Copy Number Alterations (CNA)  Gene Expression Methylation  Drug\nResponse  \
0                             Y                Y           Y               Y   
1                             Y                Y           Y               Y   
2                             Y                Y           Y               Y   

   GDSC\nTissue descriptor 1  GDSC\nTissue\ndescriptor 2  \
0                 

In [22]:
# List the merged column names. Note: several of the columns added by the
# merge have line breaks embedded in their names (e.g. 'Drug\nResponse').
gdsc_merged.columns
# Export of the merged table is currently disabled:
# gdsc_merged.to_csv("GDSC1and2_w_CellLineData.csv", index=False)

Index(['DATASET', 'NLME_RESULT_ID', 'NLME_CURVE_ID', 'COSMIC_ID',
       'CELL_LINE_NAME', 'SANGER_MODEL_ID', 'TCGA_DESC', 'DRUG_ID',
       'DRUG_NAME', 'PUTATIVE_TARGET', 'PATHWAY_NAME', 'COMPANY_ID',
       'WEBRELEASE', 'MIN_CONC', 'MAX_CONC', 'LN_IC50', 'AUC', 'RMSE',
       'Z_SCORE', 'Sample Name', 'Whole Exome Sequencing (WES)',
       'Copy Number Alterations (CNA)', 'Gene Expression', 'Methylation',
       'Drug\nResponse', 'GDSC\nTissue descriptor 1',
       'GDSC\nTissue\ndescriptor 2', 'Cancer Type\n(matching TCGA label)',
       'Microsatellite \ninstability Status (MSI)', 'Screen Medium',
       'Growth Properties'],
      dtype='object')