In [25]:
from datasets import load_dataset
import random
import pandas as pd
import duckdb
import numpy as np
import pyarrow.parquet as pq
import pyarrow as pa
import os
import sqlite3

GDSC data downloaded from: https://www.cancerrxgene.org/downloads/bulk_download

In [2]:
# Read the two GDSC release spreadsheets, then stack them into one frame with
# the GDSC1 rows first and the GDSC2 rows after them (fresh 0..n-1 index).
df1, df2 = (
    pd.read_excel(xlsx_path)
    for xlsx_path in (
        '/hpc/home/yc583/Tahoe100M_practice/data/GDSC1_fitted_dose_response_27Oct23.xlsx',
        '/hpc/home/yc583/Tahoe100M_practice/data/GDSC2_fitted_dose_response_27Oct23.xlsx',
    )
)
release_frames = [df1, df2]
gdsc = pd.concat(release_frames, ignore_index=True)
gdsc

Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,WEBRELEASE,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
0,GDSC1,342,15580432,684057,ES5,SIDM00263,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,3.966813,0.985678,0.026081,1.299144
1,GDSC1,342,15580806,684059,ES7,SIDM00269,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.692090,0.972690,0.110059,0.156076
2,GDSC1,342,15581198,684062,EW-11,SIDM00203,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.477990,0.944459,0.087019,-0.035912
3,GDSC1,342,15581542,684072,SK-ES-1,SIDM01111,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.033564,0.950758,0.016290,-0.434437
4,GDSC1,342,15581930,687448,COLO-829,SIDM00909,SKCM,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,2.966007,0.954778,0.180255,0.401702
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
575192,GDSC2,343,16188242,1659928,SNU-175,SIDM00216,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,10.127082,0.976746,0.074498,0.156872
575193,GDSC2,343,16188695,1660034,SNU-407,SIDM00214,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,8.576377,0.913378,0.057821,-1.626959
575194,GDSC2,343,16188953,1660035,SNU-61,SIDM00194,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,10.519636,0.975001,0.058090,0.608442
575195,GDSC2,343,16189493,1674021,SNU-C5,SIDM00498,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,10.694579,0.969969,0.101013,0.809684


In [3]:
# Count the (drug name, cell line) pairs that appear on more than one row of
# the combined table, whether repeated inside one release or shared by both.
dup_pairs_any_release = duckdb.query("SELECT DRUG_NAME, CELL_LINE_NAME FROM gdsc GROUP BY DRUG_NAME, CELL_LINE_NAME HAVING COUNT(*)>1").df()
len(dup_pairs_any_release)

99012

In [4]:
# Count the (drug name, cell line) pairs that are repeated inside a single
# release: DATASET is part of the grouping key, so cross-release repeats
# do not count here.
dup_pairs_same_release = duckdb.query("SELECT DRUG_NAME, CELL_LINE_NAME, DATASET FROM gdsc GROUP BY DRUG_NAME, CELL_LINE_NAME, DATASET HAVING COUNT(*)>1").df()
len(dup_pairs_same_release)

24355

For drug–cell-line pairs that appear in both releases, use the IC50 from GDSC2. For pairs duplicated within the same release, use an aggregated IC50: the log of the mean of the exponentiated LN_IC50 values.

In [5]:
def exp_mean(x):
    """Return the log of the arithmetic mean of ``exp(x)``.

    Used to collapse replicate log-scale values (here the LN_IC50 column):
    the values are exponentiated, averaged, and the average is mapped back
    to log scale.

    Parameters
    ----------
    x : pandas.Series or array-like
        Log-scale values belonging to one group.

    Returns
    -------
    float
        ``log(mean(exp(x)))``.
    """
    return np.log(np.exp(x).mean())


pair_keys = ['DRUG_NAME', 'CELL_LINE_NAME']

# Replicates of a pair inside one release all receive the same aggregated
# LN_IC50. `assign` builds a new frame instead of writing into the loaded one.
gdsc = gdsc.assign(
    LN_IC50=gdsc.groupby(pair_keys + ['DATASET'])['LN_IC50'].transform(exp_mean)
)

# Keep one row per pair, preferring GDSC2. The stable sort on DATASET puts
# GDSC2 rows after GDSC1 rows explicitly, so `keep='last'` no longer depends
# on the order in which the two releases happened to be concatenated.
# Row order is unchanged when the frame is already GDSC1-then-GDSC2.
gdsc = (
    gdsc
    .sort_values('DATASET', kind='stable')
    .drop_duplicates(subset=pair_keys, keep='last')
)
gdsc

Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,WEBRELEASE,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
16,GDSC1,342,15596237,688007,NCI-H187,SIDM00767,SCLC,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,3.416401,0.983964,0.038981,0.805580
31,GDSC1,342,15602266,713878,LU-139,SIDM00293,SCLC,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,3.690722,0.979948,0.037594,1.051568
69,GDSC1,342,15632614,753597,NCI-H128,SIDM00650,SCLC,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,4.670882,0.979820,0.031659,1.930496
70,GDSC1,342,15632655,753599,NCI-H1304,SIDM00648,SCLC,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,3.217352,0.981365,0.020940,0.627088
73,GDSC1,342,15635334,753614,TE-15,SIDM00249,ESCA,1,Erlotinib,EGFR,EGFR signaling,1045,Y,0.007813,2.0,0.977622,0.877478,0.077917,-1.381320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
575192,GDSC2,343,16188242,1659928,SNU-175,SIDM00216,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,10.127082,0.976746,0.074498,0.156872
575193,GDSC2,343,16188695,1660034,SNU-407,SIDM00214,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,8.576377,0.913378,0.057821,-1.626959
575194,GDSC2,343,16188953,1660035,SNU-61,SIDM00194,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,10.519636,0.975001,0.058090,0.608442
575195,GDSC2,343,16189493,1674021,SNU-C5,SIDM00498,COREAD,2499,N-acetyl cysteine,Metabolism,Metabolism,1101,Y,2.001054,2000.0,10.694579,0.969969,0.101013,0.809684


In [6]:
# Save the de-duplicated GDSC table as CSV; the pandas index is not written.
gdsc_processed_csv = '/hpc/home/yc583/Tahoe100M_practice/data/GDSC_processed.csv'
gdsc.to_csv(gdsc_processed_csv, index=False)

Down-sample the Tahoe data to 10M and save it as a Parquet dataset.

In [27]:
# Folder that receives the Parquet copy of the down-sampled Tahoe data.
output_dir = '/hpc/home/yc583/Tahoe100M_practice/data/Tahoe_downsampled_parquet'
# Create it together with any missing parents; an existing folder is accepted.
os.makedirs(name=output_dir, exist_ok=True)

In [None]:
# Convert the gzip-compressed, tab-separated Tahoe dump into Parquet files,
# one file per chunk, written into `output_dir`.
#
# NOTE(review): the input path ends in ".../Tahoe_downsampled/txt/gz"; confirm
# this is not a typo for ".../Tahoe_downsampled.txt.gz".

# Rows per chunk, which is also rows per Parquet file. The previous value of
# 100 produced one tiny file for every 100 rows. Lower this if a chunk does
# not fit comfortably in memory.
CHUNK_ROWS = 50_000

# The context manager closes the underlying gzip stream when the loop ends.
with pd.read_csv('/hpc/group/naderilab/eleanor/prose_data/data/Tahoe_downsampled/txt/gz',
                 sep='\t',
                 compression='gzip',
                 chunksize=CHUNK_ROWS,
                 low_memory=False) as chunks:
    for part_idx, chunk in enumerate(chunks):
        table = pa.Table.from_pandas(chunk)
        # Deterministic file names: re-running the cell overwrites the same
        # files instead of adding a second copy of every row under new names.
        pq.write_table(
            table,
            os.path.join(output_dir, f"part-{part_idx:06d}.parquet"),
            compression="snappy",
        )
        

Merge down-sampled Tahoe data with processed GDSC data

In [None]:
# DuckDB relation over every Parquet file found under `output_dir`.
tahoe = duckdb.read_parquet(f"{output_dir}/**/*.parquet", hive_partitioning=True)
# Load the processed GDSC table with pandas. The previous `pl.scan_csv` call
# raised NameError because polars is never imported in this notebook. DuckDB
# can query a pandas DataFrame by its variable name, which is how `gdsc` is
# used in the SQL further down.
gdsc = pd.read_csv('/hpc/home/yc583/Tahoe100M_practice/data/GDSC_processed.csv')

In [6]:
# Build the mapping table between cell-line name and Cellosaurus cell-line ID
# from the Tahoe-100M cell-line metadata, and store a copy in a SQLite file.
cell_line_metadata = load_dataset(
    "vevotx/Tahoe-100M",
    name="cell_line_metadata",
    split="train").to_pandas()

# One row per distinct (cell_name, Cell_ID_Cellosaur) pair, keeping only
# those two columns.
cell_line_mapping = cell_line_metadata.drop_duplicates(subset=['cell_name', 'Cell_ID_Cellosaur'])
cell_line_mapping = cell_line_mapping[['cell_name', 'Cell_ID_Cellosaur']]

db = sqlite3.connect('/hpc/home/yc583/Tahoe100M_practice/data/Tahoe_downsampled_parquet/cell_line_mapping_table.db')
try:
    # Replaces any existing 'cell_line' table. The DataFrame index is written
    # as an extra column, which is the pandas default.
    cell_line_mapping.to_sql('cell_line', db, if_exists='replace')
    db.commit()
finally:
    # Always release the SQLite file handle, even if the write fails.
    db.close()

Unnamed: 0,genes,expressions,canonical_smiles,drug,cell_line_id,cell_name,Cell_ID_Cellosaur
0,"1,5,19,21,31,56,68,77,78,85,99,100,106,107,108...","-2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1...",C1=CC2=C(C(=C1)O)N=CC=C2,8-Hydroxyquinoline,CVCL_0480,PANC-1,CVCL_0480
1,"1,15,19,26,32,35,38,59,70,76,78,109,117,121,13...","-2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1...",C1=CC2=C(C(=C1)O)N=CC=C2,8-Hydroxyquinoline,CVCL_0546,SW480,CVCL_0546
2,"1,10,11,19,23,26,31,35,43,45,56,58,68,70,75,76...","-2.0,1.0,1.0,2.0,3.0,1.0,1.0,1.0,1.0,2.0,5.0,1...",C1=CC2=C(C(=C1)O)N=CC=C2,8-Hydroxyquinoline,CVCL_1717,SW1417,CVCL_1717
3,"1,20,21,31,45,56,68,69,77,88,103,104,124,138,1...","-2.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1...",C1=CC2=C(C(=C1)O)N=CC=C2,8-Hydroxyquinoline,CVCL_1717,SW1417,CVCL_1717
4,"1,10,19,43,56,77,86,95,103,109,112,124,128,137...","-2.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1...",C1=CC2=C(C(=C1)O)N=CC=C2,8-Hydroxyquinoline,CVCL_1056,A498,CVCL_1056
...,...,...,...,...,...,...,...
536319,"1,40,69,214,218,232,252,292,294,334,427,475,51...","-2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1...",CCNC(=O)CCCC=CCC1C(CC(C1C=CC(CCC2=CC=CC=C2)O)O)O,Bimatoprost,CVCL_0099,SNU-1,CVCL_0099
536320,"1,19,33,45,56,70,78,103,156,167,171,200,202,21...","-2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,2...",CCNC(=O)CCCC=CCC1C(CC(C1C=CC(CCC2=CC=CC=C2)O)O)O,Bimatoprost,CVCL_1285,HOP62,CVCL_1285
536321,"1,7,31,76,99,103,106,114,128,130,164,171,174,1...","-2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,2.0,1...",CCNC(=O)CCCC=CCC1C(CC(C1C=CC(CCC2=CC=CC=C2)O)O)O,Bimatoprost,CVCL_0428,MIA PaCa-2,CVCL_0428
536322,"1,26,68,112,113,124,128,134,136,153,161,211,23...","-2.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1...",CCNC(=O)CCCC=CCC1C(CC(C1C=CC(CCC2=CC=CC=C2)O)O)O,Bimatoprost,CVCL_0293,HEC-1-A,CVCL_0293


In [None]:
# Add the cell-line name to every Tahoe row by joining on the Cellosaurus ID.
# The join targets the in-memory `cell_line_mapping` DataFrame: DuckDB resolves
# unknown table names against Python variables, and `cell_line` exists only as
# a table inside the SQLite file, which this DuckDB session cannot see.
tahoe_cell = duckdb.sql(
    """
    SELECT * 
    FROM tahoe t JOIN cell_line_mapping c ON t.cell_line_id = c.Cell_ID_Cellosaur
    """
)

In [None]:
# Inner-join the Tahoe rows (already carrying cell-line names) to the GDSC
# table on exact drug-name and cell-line-name matches, then materialise the
# result as a pandas DataFrame.
tahoe_gdsc_query = """
    SELECT *
    FROM tahoe_cell tc JOIN gdsc g ON tc.drug = g.DRUG_NAME AND tc.cell_name = g.CELL_LINE_NAME
    """
tahoe_gdsc = duckdb.sql(tahoe_gdsc_query).df()

In [None]:
# Both columns hold comma-separated strings. Turn each into a list of string
# tokens and drop the first token.
# NOTE(review): the leading token looks like a marker (gene "1" paired with
# expression "-2.0" in the rows shown) — confirm it is safe to discard.
# The column is named 'expressions' (plural); the previous 'expression'
# lookup raised KeyError.
for list_col in ('genes', 'expressions'):
    tahoe_gdsc[list_col] = tahoe_gdsc[list_col].apply(lambda x: (x.split(','))[1:])

In [14]:
# Save the merged Tahoe + GDSC table as CSV; the pandas index is not written.
tahoe_gdsc_csv = '/hpc/home/yc583/Tahoe100M_practice/data/Tahoe_GDSC_merged.csv'
tahoe_gdsc.to_csv(tahoe_gdsc_csv, index=False)

In [2]:
# Display the tab-separated Tahoe text file as a DataFrame.
pd.read_csv('/hpc/home/yc583/Tahoe100M_practice/data/Tahoe_downsampled.txt', sep='\t')

Unnamed: 0,genes,expressions,canonical_smiles,drug,cell_line_id
0,"1,5,19,21,31,56,68,77,78,85,99,100,106,107,108...","-2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1...",C1=CC2=C(C(=C1)O)N=CC=C2,8-Hydroxyquinoline,CVCL_0480
1,"1,15,19,26,32,35,38,59,70,76,78,109,117,121,13...","-2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1...",C1=CC2=C(C(=C1)O)N=CC=C2,8-Hydroxyquinoline,CVCL_0546
2,"1,10,11,19,23,26,31,35,43,45,56,58,68,70,75,76...","-2.0,1.0,1.0,2.0,3.0,1.0,1.0,1.0,1.0,2.0,5.0,1...",C1=CC2=C(C(=C1)O)N=CC=C2,8-Hydroxyquinoline,CVCL_1717
3,"1,20,21,31,45,56,68,69,77,88,103,104,124,138,1...","-2.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1...",C1=CC2=C(C(=C1)O)N=CC=C2,8-Hydroxyquinoline,CVCL_1717
4,"1,10,19,43,56,77,86,95,103,109,112,124,128,137...","-2.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1...",C1=CC2=C(C(=C1)O)N=CC=C2,8-Hydroxyquinoline,CVCL_1056
5,"1,5,19,32,43,57,77,128,139,146,154,214,227,236...","-2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1...",C1=CC2=C(C(=C1)O)N=CC=C2,8-Hydroxyquinoline,CVCL_0131
6,"1,21,22,26,32,49,112,127,131,138,139,154,212,2...","-2.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,2.0,2.0,3...",C1=CC2=C(C(=C1)O)N=CC=C2,8-Hydroxyquinoline,CVCL_0179
7,"1,11,19,22,32,42,43,56,77,103,107,112,149,164,...","-2.0,1.0,1.0,1.0,3.0,1.0,2.0,1.0,1.0,1.0,1.0,1...",C1=CC2=C(C(=C1)O)N=CC=C2,8-Hydroxyquinoline,CVCL_1056
8,"1,21,26,59,69,70,90,95,106,108,140,190,200,203...","-2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1...",C1=CC2=C(C(=C1)O)N=CC=C2,8-Hydroxyquinoline,CVCL_1478
9,"1,5,9,11,42,43,58,68,90,95,99,103,114,139,143,...","-2.0,1.0,1.0,3.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,2...",C1=CC2=C(C(=C1)O)N=CC=C2,8-Hydroxyquinoline,CVCL_1478
