In [1]:

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)



In [2]:
data = "GDSC1and2_w_CellLineData.csv"  


df = pd.read_csv(data)
print(df.shape)
df.head()


(575197, 24)


Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE,Sample Name,GDSC_Tissue_descriptor_1,GDSC_Tissue_descriptor_2,Cancer_Type_TCGA,Medium,Growth
0,GDSC1,342,15580432,684057,ES5,SIDM00263,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,3.966813,0.985678,0.026081,1.299144,ES5,bone,ewings_sarcoma,,R,Adherent
1,GDSC1,342,15580806,684059,ES7,SIDM00269,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.69209,0.97269,0.110059,0.156076,ES7,bone,ewings_sarcoma,,R,Adherent
2,GDSC1,342,15581198,684062,EW-11,SIDM00203,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.47799,0.944459,0.087019,-0.035912,EW-11,bone,ewings_sarcoma,,R,Adherent
3,GDSC1,342,15581542,684072,SK-ES-1,SIDM01111,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.033564,0.950758,0.01629,-0.434437,SK-ES-1,bone,ewings_sarcoma,,R,Semi-Adherent
4,GDSC1,342,15581930,687448,COLO-829,SIDM00909,SKCM,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.966007,0.954778,0.180255,0.401702,COLO-829,skin,melanoma,SKCM,R,Adherent


In [3]:

print("\nColumn Type:")
for x in df.columns: 
    print(x, df[x].dtype)


print("\nMissing values")
print(df.isna().sum().sort_values(ascending=False))


Column Type:
DATASET object
NLME_RESULT_ID int64
NLME_CURVE_ID int64
COSMIC_ID int64
CELL_LINE_NAME object
SANGER_MODEL_ID object
TCGA_DESC object
DRUG_ID int64
DRUG_NAME object
PUTATIVE_TARGET object
PATHWAY_NAME object
COMPANY_ID int64
MIN_CONC float64
MAX_CONC float64
LN_IC50 float64
AUC float64
RMSE float64
Z_SCORE float64
Sample Name object
GDSC_Tissue_descriptor_1 object
GDSC_Tissue_descriptor_2 object
Cancer_Type_TCGA object
Medium object
Growth object

Missing values
Cancer_Type_TCGA            102030
PUTATIVE_TARGET              30807
TCGA_DESC                     1647
Growth                         361
MAX_CONC                         0
Medium                           0
GDSC_Tissue_descriptor_2         0
GDSC_Tissue_descriptor_1         0
Sample Name                      0
Z_SCORE                          0
RMSE                             0
AUC                              0
LN_IC50                          0
DATASET                          0
NLME_RESULT_ID               

In [4]:

#--for reference--

DATASET object
CELL_LINE_NAME object
SANGER_MODEL_ID object
TCGA_DESC object
DRUG_NAME object
PUTATIVE_TARGET object
PATHWAY_NAME object
Sample Name object
GDSC_Tissue_descriptor_1 object
GDSC_Tissue_descriptor_2 object
Cancer_Type_TCGA object
Medium object
Growth object


NLME_RESULT_ID int64
NLME_CURVE_ID int64
COSMIC_ID int64
DRUG_ID int64
COMPANY_ID int64
MIN_CONC float64
MAX_CONC float64
LN_IC50 float64
AUC float64
RMSE float64
Z_SCORE float64



SyntaxError: invalid syntax (933165928.py, line 3)

In [5]:
#copy df 
df_clean = df.copy()

#standardize categorical columns --data cleaning
cat_cols = [
    "DATASET", "CELL_LINE_NAME", "SANGER_MODEL_ID", "TCGA_DESC",
    "DRUG_NAME", "PUTATIVE_TARGET", "PATHWAY_NAME", "Sample Name",
    "GDSC_Tissue_descriptor_1", "GDSC_Tissue_descriptor_2",
    "Cancer_Type_TCGA", "Medium", "Growth"
]


for c in cat_cols:
    if c in df_clean.columns:
        df_clean[c] = (
            df_clean[c]
            .astype(str)
            .str.strip()
        )


In [6]:
#fill missing columns with 'unknown' so it is not null
for c in ["PUTATIVE_TARGET", "PATHWAY_NAME", "Growth", "TCGA_DESC", "Cancer_Type_TCGA"]:
    if c in df_clean.columns:
        df_clean[c] = df_clean[c].fillna("Unknown").replace({"nan": "Unknown"})


In [7]:
#make sure auc and ln_ic50 exist  and drop null
df_clean = df_clean.dropna(subset=["AUC", "LN_IC50"]).copy()

In [8]:
# keep best 95% using RMSE threshold 
rmse_thr = df_clean["RMSE"].quantile(0.95)
df_clean = df_clean[df_clean["RMSE"] <= rmse_thr].copy()

print("After cleaning:", df_clean.shape)
df_clean.head()

After cleaning: (546438, 24)


Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE,Sample Name,GDSC_Tissue_descriptor_1,GDSC_Tissue_descriptor_2,Cancer_Type_TCGA,Medium,Growth
0,GDSC1,342,15580432,684057,ES5,SIDM00263,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,3.966813,0.985678,0.026081,1.299144,ES5,bone,ewings_sarcoma,Unknown,R,Adherent
1,GDSC1,342,15580806,684059,ES7,SIDM00269,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.69209,0.97269,0.110059,0.156076,ES7,bone,ewings_sarcoma,Unknown,R,Adherent
2,GDSC1,342,15581198,684062,EW-11,SIDM00203,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.47799,0.944459,0.087019,-0.035912,EW-11,bone,ewings_sarcoma,Unknown,R,Adherent
3,GDSC1,342,15581542,684072,SK-ES-1,SIDM01111,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.033564,0.950758,0.01629,-0.434437,SK-ES-1,bone,ewings_sarcoma,Unknown,R,Semi-Adherent
4,GDSC1,342,15581930,687448,COLO-829,SIDM00909,SKCM,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.966007,0.954778,0.180255,0.401702,COLO-829,skin,melanoma,SKCM,R,Adherent


In [9]:
# get rid of bad values, make sure everything is numeric 
df_clean["AUC"] = pd.to_numeric(df_clean["AUC"], errors="coerce")
df_clean["LN_IC50"] = pd.to_numeric(df_clean["LN_IC50"], errors="coerce")
df_clean = df_clean.dropna(subset=["AUC", "LN_IC50"]).copy()


In [10]:
import numpy as np

# params 
eps = 1e-6        
w = 1.0  


#AUC: clip → logit → winsorize → z-score


#clip to (0,1) so logit is defined
df_clean["AUC_clip"] = np.clip(df_clean["AUC"].astype(float), eps, 1 - eps)

#logit for linearity 
df_clean["AUC_logit"] = np.log(df_clean["AUC_clip"] / (1.0 - df_clean["AUC_clip"]))

#winsorize for extreme tails
l, h = np.nanpercentile(df_clean["AUC_logit"], [w, 100 - w])
df_clean["AUC_wz"] = df_clean["AUC_logit"].clip(l, h)

#standardize (z-score)
meanAUC = df_clean["AUC_logit"].mean()
stdAUC = df_clean["AUC_logit"].std(ddof=0)
df_clean["AUC_z"] = (df_clean["AUC_logit"] - meanAUC) / (stdAUC if stdAUC != 0 else 1.0)



In [11]:
#IC50: LN_IC50 → pIC50 → winsorize → z-score


#natural-log IC50 to pIC50 (−log10 IC50)
df_clean["pIC50"] = -df_clean["LN_IC50"].astype(float) / np.log(10.0)

#winsorize pIC50 to reduce outlier extremes 
l2, h2 = np.nanpercentile(df_clean["pIC50"], [w, 100 - w])
df_clean["pIC50_wz"] = df_clean["pIC50"].clip(l2, h2)

#standardize
pic_mu = df_clean["pIC50"].mean()
pic_sd = df_clean["pIC50"].std(ddof=0)
df_clean["pIC50_z"] = (df_clean["pIC50"] - pic_mu) / (pic_sd if pic_sd != 0 else 1.0)


In [12]:
# ICI50 to original units, then also the log 
df_clean["IC50_raw_value"] = np.exp(df_clean["LN_IC50"])    



In [13]:
# no 0 division 
df_clean["AUC_IC50_ratio"] = (
    df_clean["AUC"] / df_clean["LN_IC50"].replace(0, np.nan)
).replace([np.inf, -np.inf], np.nan)


In [14]:
print("AUC (min/max):", float(df_clean["AUC"].min()), float(df_clean["AUC"].max()))
print("AUC_logit mean/std:", float(df_clean["AUC_logit"].mean()), float(df_clean["AUC_logit"].std(ddof=0)))
print("pIC50 mean/std:", float(df_clean["pIC50"].mean()), float(df_clean["pIC50"].std(ddof=0)))

df_clean[["AUC","AUC_clip","AUC_logit","AUC_wz","AUC_z",
          "LN_IC50","pIC50","pIC50_wz","pIC50_z"]].head()

AUC (min/max): 0.005996 0.999552
AUC_logit mean/std: 2.480423667483118 1.4522381181610695
pIC50 mean/std: -1.0774121600577669 1.1578201298874615


Unnamed: 0,AUC,AUC_clip,AUC_logit,AUC_wz,AUC_z,LN_IC50,pIC50,pIC50_wz,pIC50_z
0,0.985678,0.985678,4.231533,4.231533,1.2058,3.966813,-1.722765,-1.722765,-0.557386
1,0.97269,0.97269,3.572812,3.572812,0.752211,2.69209,-1.16916,-1.16916,-0.079242
2,0.944459,0.944459,2.833491,2.833491,0.243119,2.47799,-1.076177,-1.076177,0.001066
3,0.950758,0.950758,2.960513,2.960513,0.330586,2.033564,-0.883166,-0.883166,0.167769
4,0.954778,0.954778,3.049895,3.049895,0.392134,2.966007,-1.28812,-1.28812,-0.181987


In [15]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np

df_encoded = df_clean.copy()
cat_cols = ["GDSC_Tissue_descriptor_1", "Cancer_Type_TCGA"]

# Use sparse_output instead of sparse
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=int)

encoded_arr = ohe.fit_transform(df_encoded[cat_cols])
encoded_cols = ohe.get_feature_names_out(cat_cols)

df_ohe = pd.DataFrame(encoded_arr, columns=encoded_cols, index=df_encoded.index)
df_encoded = pd.concat([df_encoded.drop(columns=cat_cols), df_ohe], axis=1)

print(f"Added {len(encoded_cols)} one-hot columns.")
df_encoded.filter(regex="^(GDSC_Tissue_descriptor_1|Cancer_Type_TCGA)").head()

Added 51 one-hot columns.


Unnamed: 0,GDSC_Tissue_descriptor_1_aero_dig_tract,GDSC_Tissue_descriptor_1_bone,GDSC_Tissue_descriptor_1_breast,GDSC_Tissue_descriptor_1_digestive_system,GDSC_Tissue_descriptor_1_kidney,GDSC_Tissue_descriptor_1_large_intestine,GDSC_Tissue_descriptor_1_leukemia,GDSC_Tissue_descriptor_1_lung,GDSC_Tissue_descriptor_1_lung_NSCLC,GDSC_Tissue_descriptor_1_lung_SCLC,GDSC_Tissue_descriptor_1_lymphoma,GDSC_Tissue_descriptor_1_myeloma,GDSC_Tissue_descriptor_1_nervous_system,GDSC_Tissue_descriptor_1_neuroblastoma,GDSC_Tissue_descriptor_1_pancreas,GDSC_Tissue_descriptor_1_skin,GDSC_Tissue_descriptor_1_soft_tissue,GDSC_Tissue_descriptor_1_thyroid,GDSC_Tissue_descriptor_1_urogenital_system,Cancer_Type_TCGA_ACC,Cancer_Type_TCGA_ALL,Cancer_Type_TCGA_BLCA,Cancer_Type_TCGA_BRCA,Cancer_Type_TCGA_CESC,Cancer_Type_TCGA_CLL,Cancer_Type_TCGA_COAD/READ,Cancer_Type_TCGA_DLBC,Cancer_Type_TCGA_ESCA,Cancer_Type_TCGA_GBM,Cancer_Type_TCGA_HNSC,Cancer_Type_TCGA_KIRC,Cancer_Type_TCGA_LAML,Cancer_Type_TCGA_LCML,Cancer_Type_TCGA_LGG,Cancer_Type_TCGA_LIHC,Cancer_Type_TCGA_LUAD,Cancer_Type_TCGA_LUSC,Cancer_Type_TCGA_MB,Cancer_Type_TCGA_MESO,Cancer_Type_TCGA_MM,Cancer_Type_TCGA_NB,Cancer_Type_TCGA_OV,Cancer_Type_TCGA_PAAD,Cancer_Type_TCGA_PRAD,Cancer_Type_TCGA_SCLC,Cancer_Type_TCGA_SKCM,Cancer_Type_TCGA_STAD,Cancer_Type_TCGA_THCA,Cancer_Type_TCGA_UCEC,Cancer_Type_TCGA_UNABLE TO CLASSIFY,Cancer_Type_TCGA_Unknown
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
