# Notebook for Feature Engineering

This notebook will contain a few steps to get the data ready for model training

1. Data Cleaning (removing nulls, and other bad data)
2. Feature Engineering (creating target features AUC+IC50)
3. Creating Train Test Splits (including leave out drugs and leave out cell lines)

In [7]:
import pyspark
import pandas as pd

# load in the merged dataframe, containing GDSC1, GDSC2, and cell line metadata
gdsc_merged = pd.read_csv("GDSC1and2_w_CellLineData.csv")
df = gdsc_merged
gdsc_merged.head(5)


Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,...,LN_IC50,AUC,RMSE,Z_SCORE,Sample Name,GDSC_Tissue_descriptor_1,GDSC_Tissue_descriptor_2,Cancer_Type_TCGA,Medium,Growth
0,GDSC1,342,15580432,684057,ES5,SIDM00263,UNCLASSIFIED,1,Erlotinib,EGFR,...,3.966813,0.985678,0.026081,1.299144,ES5,bone,ewings_sarcoma,,R,Adherent
1,GDSC1,342,15580806,684059,ES7,SIDM00269,UNCLASSIFIED,1,Erlotinib,EGFR,...,2.69209,0.97269,0.110059,0.156076,ES7,bone,ewings_sarcoma,,R,Adherent
2,GDSC1,342,15581198,684062,EW-11,SIDM00203,UNCLASSIFIED,1,Erlotinib,EGFR,...,2.47799,0.944459,0.087019,-0.035912,EW-11,bone,ewings_sarcoma,,R,Adherent
3,GDSC1,342,15581542,684072,SK-ES-1,SIDM01111,UNCLASSIFIED,1,Erlotinib,EGFR,...,2.033564,0.950758,0.01629,-0.434437,SK-ES-1,bone,ewings_sarcoma,,R,Semi-Adherent
4,GDSC1,342,15581930,687448,COLO-829,SIDM00909,SKCM,1,Erlotinib,EGFR,...,2.966007,0.954778,0.180255,0.401702,COLO-829,skin,melanoma,SKCM,R,Adherent


## Data Cleaning

In [21]:

import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 140)

data = "GDSC1and2_w_CellLineData.csv"  


df = pd.read_csv(data)
print(df.shape)
df.head()


(575197, 24)


Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE,Sample Name,GDSC_Tissue_descriptor_1,GDSC_Tissue_descriptor_2,Cancer_Type_TCGA,Medium,Growth
0,GDSC1,342,15580432,684057,ES5,SIDM00263,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,3.966813,0.985678,0.026081,1.299144,ES5,bone,ewings_sarcoma,,R,Adherent
1,GDSC1,342,15580806,684059,ES7,SIDM00269,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.69209,0.97269,0.110059,0.156076,ES7,bone,ewings_sarcoma,,R,Adherent
2,GDSC1,342,15581198,684062,EW-11,SIDM00203,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.47799,0.944459,0.087019,-0.035912,EW-11,bone,ewings_sarcoma,,R,Adherent
3,GDSC1,342,15581542,684072,SK-ES-1,SIDM01111,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.033564,0.950758,0.01629,-0.434437,SK-ES-1,bone,ewings_sarcoma,,R,Semi-Adherent
4,GDSC1,342,15581930,687448,COLO-829,SIDM00909,SKCM,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.966007,0.954778,0.180255,0.401702,COLO-829,skin,melanoma,SKCM,R,Adherent


In [23]:
#copy df 
df_clean = df.copy()

#standardize categorical columns --data cleaning
cat_cols = [
    "DATASET", "CELL_LINE_NAME", "SANGER_MODEL_ID", "TCGA_DESC",
    "DRUG_NAME", "PUTATIVE_TARGET", "PATHWAY_NAME", "Sample Name",
    "GDSC_Tissue_descriptor_1", "GDSC_Tissue_descriptor_2",
    "Cancer_Type_TCGA", "Medium", "Growth"
]


for c in cat_cols:
    if c in df_clean.columns:
        df_clean[c] = (
            df_clean[c]
            .astype(str)
            .str.strip()
        )

#fill missing columns with 'unknown' so it is not null
for c in ["PUTATIVE_TARGET", "PATHWAY_NAME", "Growth", "TCGA_DESC", "Cancer_Type_TCGA"]:
    if c in df_clean.columns:
        df_clean[c] = df_clean[c].fillna("Unknown").replace({"nan": "Unknown"})


#make sure auc and ln_ic50 exist  and drop null
df_clean = df_clean.dropna(subset=["AUC", "LN_IC50"]).copy()

# keep best 95% using RMSE threshold 
rmse_thr = df_clean["RMSE"].quantile(0.95)
df_clean = df_clean[df_clean["RMSE"] <= rmse_thr].copy()

print("After cleaning:", df_clean.shape)
df_clean.head()


After cleaning: (546438, 24)


Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE,Sample Name,GDSC_Tissue_descriptor_1,GDSC_Tissue_descriptor_2,Cancer_Type_TCGA,Medium,Growth
0,GDSC1,342,15580432,684057,ES5,SIDM00263,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,3.966813,0.985678,0.026081,1.299144,ES5,bone,ewings_sarcoma,Unknown,R,Adherent
1,GDSC1,342,15580806,684059,ES7,SIDM00269,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.69209,0.97269,0.110059,0.156076,ES7,bone,ewings_sarcoma,Unknown,R,Adherent
2,GDSC1,342,15581198,684062,EW-11,SIDM00203,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.47799,0.944459,0.087019,-0.035912,EW-11,bone,ewings_sarcoma,Unknown,R,Adherent
3,GDSC1,342,15581542,684072,SK-ES-1,SIDM01111,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.033564,0.950758,0.01629,-0.434437,SK-ES-1,bone,ewings_sarcoma,Unknown,R,Semi-Adherent
4,GDSC1,342,15581930,687448,COLO-829,SIDM00909,SKCM,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.966007,0.954778,0.180255,0.401702,COLO-829,skin,melanoma,SKCM,R,Adherent


In [24]:
# get rid of bad values, make sure everything is numeric 
df_clean["AUC"] = pd.to_numeric(df_clean["AUC"], errors="coerce")
df_clean["LN_IC50"] = pd.to_numeric(df_clean["LN_IC50"], errors="coerce")
df_clean = df_clean.dropna(subset=["AUC", "LN_IC50"]).copy()


In [25]:
import numpy as np

# params 
eps = 1e-6        
w = 1.0  


#AUC: clip → logit → winsorize → z-score


#clip to (0,1) so logit is defined
df_clean["AUC_clip"] = np.clip(df_clean["AUC"].astype(float), eps, 1 - eps)

#logit for linearity 
df_clean["AUC_logit"] = np.log(df_clean["AUC_clip"] / (1.0 - df_clean["AUC_clip"]))

#winsorize for extreme tails
l, h = np.nanpercentile(df_clean["AUC_logit"], [w, 100 - w])
df_clean["AUC_wz"] = df_clean["AUC_logit"].clip(l, h)

#standardize (z-score)
meanAUC = df_clean["AUC_logit"].mean()
stdAUC = df_clean["AUC_logit"].std(ddof=0)
df_clean["AUC_z"] = (df_clean["AUC_logit"] - meanAUC) / (stdAUC if stdAUC != 0 else 1.0)

#IC50: LN_IC50 → pIC50 → winsorize → z-score


#natural-log IC50 to pIC50 (−log10 IC50)
df_clean["pIC50"] = -df_clean["LN_IC50"].astype(float) / np.log(10.0)

#winsorize pIC50 to reduce outlier extremes 
l2, h2 = np.nanpercentile(df_clean["pIC50"], [w, 100 - w])
df_clean["pIC50_wz"] = df_clean["pIC50"].clip(l2, h2)

#standardize
pic_mu = df_clean["pIC50"].mean()
pic_sd = df_clean["pIC50"].std(ddof=0)
df_clean["pIC50_z"] = (df_clean["pIC50"] - pic_mu) / (pic_sd if pic_sd != 0 else 1.0)

# ICI50 to original units, then also the log 
df_clean["IC50_raw_value"] = np.exp(df_clean["LN_IC50"])    

# no 0 division 
df_clean["AUC_IC50_ratio"] = (
    df_clean["AUC"] / df_clean["LN_IC50"].replace(0, np.nan)
).replace([np.inf, -np.inf], np.nan)

print("AUC (min/max):", float(df_clean["AUC"].min()), float(df_clean["AUC"].max()))
print("AUC_logit mean/std:", float(df_clean["AUC_logit"].mean()), float(df_clean["AUC_logit"].std(ddof=0)))
print("pIC50 mean/std:", float(df_clean["pIC50"].mean()), float(df_clean["pIC50"].std(ddof=0)))

df_clean[["AUC","AUC_clip","AUC_logit","AUC_wz","AUC_z",
          "LN_IC50","pIC50","pIC50_wz","pIC50_z"]].head()


AUC (min/max): 0.005996 0.999552
AUC_logit mean/std: 2.4804236674831186 1.4522381181610693
pIC50 mean/std: -1.077412160057767 1.1578201298874615


Unnamed: 0,AUC,AUC_clip,AUC_logit,AUC_wz,AUC_z,LN_IC50,pIC50,pIC50_wz,pIC50_z
0,0.985678,0.985678,4.231533,4.231533,1.2058,3.966813,-1.722765,-1.722765,-0.557386
1,0.97269,0.97269,3.572812,3.572812,0.752211,2.69209,-1.16916,-1.16916,-0.079242
2,0.944459,0.944459,2.833491,2.833491,0.243119,2.47799,-1.076177,-1.076177,0.001066
3,0.950758,0.950758,2.960513,2.960513,0.330586,2.033564,-0.883166,-0.883166,0.167769
4,0.954778,0.954778,3.049895,3.049895,0.392134,2.966007,-1.28812,-1.28812,-0.181987


In [26]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np

df_encoded = df_clean.copy()
cat_cols = ["GDSC_Tissue_descriptor_1", "Cancer_Type_TCGA"]

# Use sparse_output instead of sparse
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False, dtype=int)

encoded_arr = ohe.fit_transform(df_encoded[cat_cols])
encoded_cols = ohe.get_feature_names_out(cat_cols)

df_ohe = pd.DataFrame(encoded_arr, columns=encoded_cols, index=df_encoded.index)
df_encoded = pd.concat([df_encoded.drop(columns=cat_cols), df_ohe], axis=1)

print(f"Added {len(encoded_cols)} one-hot columns.")
df_encoded.filter(regex="^(GDSC_Tissue_descriptor_1|Cancer_Type_TCGA)").head()

Added 51 one-hot columns.


Unnamed: 0,GDSC_Tissue_descriptor_1_aero_dig_tract,GDSC_Tissue_descriptor_1_bone,GDSC_Tissue_descriptor_1_breast,GDSC_Tissue_descriptor_1_digestive_system,GDSC_Tissue_descriptor_1_kidney,GDSC_Tissue_descriptor_1_large_intestine,GDSC_Tissue_descriptor_1_leukemia,GDSC_Tissue_descriptor_1_lung,GDSC_Tissue_descriptor_1_lung_NSCLC,GDSC_Tissue_descriptor_1_lung_SCLC,GDSC_Tissue_descriptor_1_lymphoma,GDSC_Tissue_descriptor_1_myeloma,GDSC_Tissue_descriptor_1_nervous_system,GDSC_Tissue_descriptor_1_neuroblastoma,GDSC_Tissue_descriptor_1_pancreas,GDSC_Tissue_descriptor_1_skin,GDSC_Tissue_descriptor_1_soft_tissue,GDSC_Tissue_descriptor_1_thyroid,GDSC_Tissue_descriptor_1_urogenital_system,Cancer_Type_TCGA_ACC,Cancer_Type_TCGA_ALL,Cancer_Type_TCGA_BLCA,Cancer_Type_TCGA_BRCA,Cancer_Type_TCGA_CESC,Cancer_Type_TCGA_CLL,Cancer_Type_TCGA_COAD/READ,Cancer_Type_TCGA_DLBC,Cancer_Type_TCGA_ESCA,Cancer_Type_TCGA_GBM,Cancer_Type_TCGA_HNSC,Cancer_Type_TCGA_KIRC,Cancer_Type_TCGA_LAML,Cancer_Type_TCGA_LCML,Cancer_Type_TCGA_LGG,Cancer_Type_TCGA_LIHC,Cancer_Type_TCGA_LUAD,Cancer_Type_TCGA_LUSC,Cancer_Type_TCGA_MB,Cancer_Type_TCGA_MESO,Cancer_Type_TCGA_MM,Cancer_Type_TCGA_NB,Cancer_Type_TCGA_OV,Cancer_Type_TCGA_PAAD,Cancer_Type_TCGA_PRAD,Cancer_Type_TCGA_SCLC,Cancer_Type_TCGA_SKCM,Cancer_Type_TCGA_STAD,Cancer_Type_TCGA_THCA,Cancer_Type_TCGA_UCEC,Cancer_Type_TCGA_UNABLE TO CLASSIFY,Cancer_Type_TCGA_Unknown
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [40]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

df_encoded = df_clean.copy()
cat_cols = ["GDSC_Tissue_descriptor_1", "Cancer_Type_TCGA"]

# Create a LabelEncoder for each categorical column
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le  # Save for later use
    
    # Optional: Print the mapping to see what each number represents
    print(f"\n{col} mapping:")
    print(dict(zip(le.classes_, le.transform(le.classes_))))

print(f"\nLabel encoded {len(cat_cols)} columns.")
df_encoded[cat_cols].head()


GDSC_Tissue_descriptor_1 mapping:
{'aero_dig_tract': np.int64(0), 'bone': np.int64(1), 'breast': np.int64(2), 'digestive_system': np.int64(3), 'kidney': np.int64(4), 'large_intestine': np.int64(5), 'leukemia': np.int64(6), 'lung': np.int64(7), 'lung_NSCLC': np.int64(8), 'lung_SCLC': np.int64(9), 'lymphoma': np.int64(10), 'myeloma': np.int64(11), 'nervous_system': np.int64(12), 'neuroblastoma': np.int64(13), 'pancreas': np.int64(14), 'skin': np.int64(15), 'soft_tissue': np.int64(16), 'thyroid': np.int64(17), 'urogenital_system': np.int64(18)}

Cancer_Type_TCGA mapping:
{'ACC': np.int64(0), 'ALL': np.int64(1), 'BLCA': np.int64(2), 'BRCA': np.int64(3), 'CESC': np.int64(4), 'CLL': np.int64(5), 'COAD/READ': np.int64(6), 'DLBC': np.int64(7), 'ESCA': np.int64(8), 'GBM': np.int64(9), 'HNSC': np.int64(10), 'KIRC': np.int64(11), 'LAML': np.int64(12), 'LCML': np.int64(13), 'LGG': np.int64(14), 'LIHC': np.int64(15), 'LUAD': np.int64(16), 'LUSC': np.int64(17), 'MB': np.int64(18), 'MESO': np.int64(

Unnamed: 0,GDSC_Tissue_descriptor_1,Cancer_Type_TCGA
0,1,31
1,1,31
2,1,31
3,1,31
4,15,26


In [41]:
# label_encoders = {}
# for col in cat_cols:
#     le = LabelEncoder()
#     df_encoded[col] = le.fit_transform(df_encoded[col])
#     label_encoders[col] = le  # Save for later use

#print
for col, le in label_encoders.items():
    print(f"\n{col}:")
    print("-" * 50)
    mapping = dict(zip(le.classes_, range(len(le.classes_))))
    for category, label in mapping.items():
        print(f"  {label}: {category}")


GDSC_Tissue_descriptor_1:
--------------------------------------------------
  0: aero_dig_tract
  1: bone
  2: breast
  3: digestive_system
  4: kidney
  5: large_intestine
  6: leukemia
  7: lung
  8: lung_NSCLC
  9: lung_SCLC
  10: lymphoma
  11: myeloma
  12: nervous_system
  13: neuroblastoma
  14: pancreas
  15: skin
  16: soft_tissue
  17: thyroid
  18: urogenital_system

Cancer_Type_TCGA:
--------------------------------------------------
  0: ACC
  1: ALL
  2: BLCA
  3: BRCA
  4: CESC
  5: CLL
  6: COAD/READ
  7: DLBC
  8: ESCA
  9: GBM
  10: HNSC
  11: KIRC
  12: LAML
  13: LCML
  14: LGG
  15: LIHC
  16: LUAD
  17: LUSC
  18: MB
  19: MESO
  20: MM
  21: NB
  22: OV
  23: PAAD
  24: PRAD
  25: SCLC
  26: SKCM
  27: STAD
  28: THCA
  29: UCEC
  30: UNABLE TO CLASSIFY
  31: Unknown


## Feature Engineering

In [42]:
df_encoded.head()

Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE,Sample Name,GDSC_Tissue_descriptor_1,GDSC_Tissue_descriptor_2,Cancer_Type_TCGA,Medium,Growth,AUC_clip,AUC_logit,AUC_wz,AUC_z,pIC50,pIC50_wz,pIC50_z,IC50_raw_value,AUC_IC50_ratio
0,GDSC1,342,15580432,684057,ES5,SIDM00263,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,3.966813,0.985678,0.026081,1.299144,ES5,1,ewings_sarcoma,31,R,Adherent,0.985678,4.231533,4.231533,1.2058,-1.722765,-1.722765,-0.557386,52.815938,0.248481
1,GDSC1,342,15580806,684059,ES7,SIDM00269,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.69209,0.97269,0.110059,0.156076,ES7,1,ewings_sarcoma,31,R,Adherent,0.97269,3.572812,3.572812,0.752211,-1.16916,-1.16916,-0.079242,14.762497,0.361314
2,GDSC1,342,15581198,684062,EW-11,SIDM00203,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.47799,0.944459,0.087019,-0.035912,EW-11,1,ewings_sarcoma,31,R,Adherent,0.944459,2.833491,2.833491,0.243119,-1.076177,-1.076177,0.001066,11.917287,0.381139
3,GDSC1,342,15581542,684072,SK-ES-1,SIDM01111,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.033564,0.950758,0.01629,-0.434437,SK-ES-1,1,ewings_sarcoma,31,R,Semi-Adherent,0.950758,2.960513,2.960513,0.330586,-0.883166,-0.883166,0.167769,7.641271,0.467533
4,GDSC1,342,15581930,687448,COLO-829,SIDM00909,SKCM,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.966007,0.954778,0.180255,0.401702,COLO-829,15,melanoma,26,R,Adherent,0.954778,3.049895,3.049895,0.392134,-1.28812,-1.28812,-0.181987,19.414244,0.321907


## Creation of Interaction Term IC50 + AUC

In this segment we create sensitivity, disagreement and weighted averages for these terms. The goal is to have two metric targets and be able to train with either a dual output approach or a singlet output with weighted averages. I'll let the training team decided which to use or go with the one with better performance.

In [43]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

df = df_encoded.copy()
scaler = StandardScaler()
df[["z_LN_IC50", "z_AUC"]] = scaler.fit_transform(df[["LN_IC50", "AUC"]])

# For interpretability: low LN_IC50 = sensitive, low AUC = sensitive
df["z_IC50_sens"] = df["z_LN_IC50"]


# Sensitivity (average of both)
df["sensitivity"] = (df["z_IC50_sens"] + df["z_AUC"]) / 2

# Disagreement (difference between metrics)
df["disagreement"] = df["z_AUC"] - df["z_IC50_sens"]


# Weighted averages of both metrics for different α
alphas = [0.25, 0.5, 0.75]
for a in alphas:
    df[f"y_weighted_{a}"] = a * df["z_IC50_sens"] + (1 - a) * df["z_AUC"]

df.head(5)


Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE,Sample Name,GDSC_Tissue_descriptor_1,GDSC_Tissue_descriptor_2,Cancer_Type_TCGA,Medium,Growth,AUC_clip,AUC_logit,AUC_wz,AUC_z,pIC50,pIC50_wz,pIC50_z,IC50_raw_value,AUC_IC50_ratio,z_LN_IC50,z_AUC,z_IC50_sens,sensitivity,disagreement,y_weighted_0.25,y_weighted_0.5,y_weighted_0.75
0,GDSC1,342,15580432,684057,ES5,SIDM00263,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,3.966813,0.985678,0.026081,1.299144,ES5,1,ewings_sarcoma,31,R,Adherent,0.985678,4.231533,4.231533,1.2058,-1.722765,-1.722765,-0.557386,52.815938,0.248481,0.557386,0.714071,0.557386,0.635729,0.156685,0.6749,0.635729,0.596557
1,GDSC1,342,15580806,684059,ES7,SIDM00269,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.69209,0.97269,0.110059,0.156076,ES7,1,ewings_sarcoma,31,R,Adherent,0.97269,3.572812,3.572812,0.752211,-1.16916,-1.16916,-0.079242,14.762497,0.361314,0.079242,0.639497,0.079242,0.359369,0.560255,0.499433,0.359369,0.219306
2,GDSC1,342,15581198,684062,EW-11,SIDM00203,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.47799,0.944459,0.087019,-0.035912,EW-11,1,ewings_sarcoma,31,R,Adherent,0.944459,2.833491,2.833491,0.243119,-1.076177,-1.076177,0.001066,11.917287,0.381139,-0.001066,0.477401,-0.001066,0.238167,0.478468,0.357784,0.238167,0.11855
3,GDSC1,342,15581542,684072,SK-ES-1,SIDM01111,UNCLASSIFIED,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.033564,0.950758,0.01629,-0.434437,SK-ES-1,1,ewings_sarcoma,31,R,Semi-Adherent,0.950758,2.960513,2.960513,0.330586,-0.883166,-0.883166,0.167769,7.641271,0.467533,-0.167769,0.513569,-0.167769,0.1729,0.681338,0.343234,0.1729,0.002565
4,GDSC1,342,15581930,687448,COLO-829,SIDM00909,SKCM,1,Erlotinib,EGFR,EGFR signaling,1045,0.007813,2.0,2.966007,0.954778,0.180255,0.401702,COLO-829,15,melanoma,26,R,Adherent,0.954778,3.049895,3.049895,0.392134,-1.28812,-1.28812,-0.181987,19.414244,0.321907,0.181987,0.536651,0.181987,0.359319,0.354664,0.447985,0.359319,0.270653


## Tissue Descriptors

Lets focus on using only Tissue Descriptor 1 "GDSC_Tissue_descriptor_1". I would rather lean on the larger sample size of each bin within Tissue Descriptor 1 for ease/simplicity. I think there is a good argument for usind Tissue descriptor 2 for personalized medicine and specific drug discovery but its not worth it for now. We can't do a simple bootstrapping method to bring up categories with 5 samples up to 60 without creating significant bias. Tissue Descriptor 1 still has imbalances but the effects won't be as severe. It would be reasonable to combine oversampling and undersampling in this case though.

## Data Splits for Model Training
1
Random splits: This approach is also called Mixed-Set in [8, 39], and it is generally the least challenging, leading to the highest observed performance scores. In this scenario, a randomly selected subset of drug-cell line pairs is excluded from the training set and used as the test set. This train-test Splitting Strategy quantifies how accurate a model is in filling the gaps in a drug-cell lines matrix containing some unobserved values. Practically, this would correspond to filling a non-exhaustive screening on a panel of otherwise known cell lines and drugs. In this scenario, the model is not evaluated in terms of its ability to generalize to cell lines or drugs for which we completely lack drug response measurements.

2
Unseen cell lines: In this case, the train and test splits are made by ensuring that the cell lines in the training set are not present in the test. The test set is constructed by randomly selecting a subset of cell lines and all of their IC50 values from the entire dataset. To achieve high performance scores in this validation, the models need to be able to generalize to unseen cell lines. With respect to the Random Splits, this therefore increases the difficulty of the prediction task.

3
Unseen drugs: The train and test splits are made to ensure that the drugs that appear in the test set are not present in the training set. To perform well in this setting, the model must be able to generalize well to completely unseen drugs.

4
Unseen cell line-drug pairs: This is the most stringent validation setting. In this case, the training and test splits are built to ensure that each of the cell lines and drugs present in the test set are both absent from the training set. This setting therefore evaluates the ability of the model to generalize at the same time to unseen drugs and cell lines, which should be the ultimate goal of the cancer drug sensitivity prediction field. However, until now, generalization in this setting has been nearly impossible, and as such, it is infrequently utilized in evaluations [9].

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def random_split(df, test_size=0.2, random_state=42):
    """
    Completely random splitting.
    """
    return train_test_split(df, test_size=test_size, random_state=random_state, shuffle=True)

def unseen_cell_lines_split(df, test_size=0.2, random_state=42):
    """
    Splits dataset so that cell lines in test are unseen in train.
    """
    cell_lines = df['COSMIC_ID'].unique()
    train_lines, test_lines = train_test_split(cell_lines, test_size=test_size, random_state=random_state)
    train_df = df[df['COSMIC_ID'].isin(train_lines)]
    test_df = df[df['COSMIC_ID'].isin(test_lines)]
    return train_df, test_df

def unseen_drugs_split(df, test_size=0.2, random_state=42):
    """
    Splits dataset so that drugs in test are unseen in train.
    """
    drugs = df['DRUG_ID'].unique()
    train_drugs, test_drugs = train_test_split(drugs, test_size=test_size, random_state=random_state)
    train_df = df[df['DRUG_ID'].isin(train_drugs)]
    test_df = df[df['DRUG_ID'].isin(test_drugs)]
    return train_df, test_df

def unseen_cell_line_drug_pairs_split(df, test_size=0.2, random_state=42):
    """
    Creates disjoint sets of both drugs and cell lines.
    Test set contains combinations of unseen drugs and unseen cell lines.
    """
    drugs = df['DRUG_ID'].unique()
    cell_lines = df['COSMIC_ID'].unique()

    # Select subsets of drugs and cell lines for the test set
    test_drugs = np.random.default_rng(random_state).choice(drugs, size=int(len(drugs) * test_size), replace=False)
    test_cells = np.random.default_rng(random_state + 1).choice(cell_lines, size=int(len(cell_lines) * test_size), replace=False)

    # Test = only pairs where BOTH drug and cell line are unseen
    test_df = df[(df['DRUG_ID'].isin(test_drugs)) & (df['COSMIC_ID'].isin(test_cells))]

    # Train = everything else (ensures no leakage)
    train_df = df[~df.index.isin(test_df.index)]

    return train_df, test_df


In [14]:
train_df, test_df = unseen_drugs_split(df)
train_df.head(5)

Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,...,Medium,Growth,z_LN_IC50,z_AUC,z_IC50_sens,sensitivity,disagreement,y_weighted_0.25,y_weighted_0.5,y_weighted_0.75
0,GDSC1,342,15580432,684057,ES5,SIDM00263,UNCLASSIFIED,1,Erlotinib,EGFR,...,R,Adherent,0.587657,0.738863,-0.587657,0.075603,1.32652,0.407233,0.075603,-0.256027
1,GDSC1,342,15580806,684059,ES7,SIDM00269,UNCLASSIFIED,1,Erlotinib,EGFR,...,R,Adherent,0.113887,0.665612,-0.113887,0.275862,0.779499,0.470737,0.275862,0.080988
2,GDSC1,342,15581198,684062,EW-11,SIDM00203,UNCLASSIFIED,1,Erlotinib,EGFR,...,R,Adherent,0.034313,0.506391,-0.034313,0.236039,0.540704,0.371215,0.236039,0.100863
3,GDSC1,342,15581542,684072,SK-ES-1,SIDM01111,UNCLASSIFIED,1,Erlotinib,EGFR,...,R,Semi-Adherent,-0.130864,0.541917,0.130864,0.336391,0.411053,0.439154,0.336391,0.233627
4,GDSC1,342,15581930,687448,COLO-829,SIDM00909,SKCM,1,Erlotinib,EGFR,...,R,Adherent,0.215692,0.564589,-0.215692,0.174448,0.780282,0.369519,0.174448,-0.020622


In [20]:
test = set(test_df['DRUG_ID'].unique()) 
train = set(train_df['DRUG_ID'].unique())

test.intersection(train)
# Confirm no intersection in left out sets.

set()