## Cytotoxicity Model

### Prepare Packages

In [15]:
import warnings

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GroupShuffleSplit, GroupKFold
from xgboost import XGBRegressor
from boruta import BorutaPy

# NOTE(review): with no category or module argument this silences every warning
# for the whole session, including deprecation and data-quality warnings raised
# by pandas / scikit-learn. Consider narrowing it to the categories that are
# known to be noise.
warnings.filterwarnings("ignore")

### Train The Model

#### Import Data

In [16]:
# Read the GDSC2 dose-response export and preview its first ten rows
gdsc2_csv = 'Datasets/Cytoxicity Model/GDSC2 Dose Response Oct 27.csv'
df = pd.read_csv(gdsc2_csv)
df.head(10)

Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,PATHWAY_NAME,COMPANY_ID,WEBRELEASE,MIN_CONC,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE
0,GDSC2,343,15946310,683667,PFSK-1,SIDM01132,MB,1003,Camptothecin,TOP1,DNA replication,1046,Y,1,1,-1463887,93022,89052,433123
1,GDSC2,343,15946548,684052,A673,SIDM00848,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,1,1,-4869455,61497,111351,-14211
2,GDSC2,343,15946830,684057,ES5,SIDM00263,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,1,1,-3360586,791072,142855,-599569
3,GDSC2,343,15947087,684059,ES7,SIDM00269,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,1,1,-504494,59266,135539,-1516647
4,GDSC2,343,15947369,684062,EW-11,SIDM00203,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,1,1,-3741991,734047,128059,-807232
5,GDSC2,343,15947651,684072,SK-ES-1,SIDM01111,UNCLASSIFIED,1003,Camptothecin,TOP1,DNA replication,1046,Y,1,1,-5142961,582439,137581,-1570016
6,GDSC2,343,15947932,687448,COLO-829,SIDM00909,SKCM,1003,Camptothecin,TOP1,DNA replication,1046,Y,1,1,-1235034,867348,9347,557727
7,GDSC2,343,15948212,687452,5637,SIDM00807,BLCA,1003,Camptothecin,TOP1,DNA replication,1046,Y,1,1,-2632632,834067,76169,-203221
8,GDSC2,343,15948491,687455,RT4,SIDM01085,BLCA,1003,Camptothecin,TOP1,DNA replication,1046,Y,1,1,-2963191,821438,94466,-3832
9,GDSC2,343,15948772,687457,SW780,SIDM01160,BLCA,1003,Camptothecin,TOP1,DNA replication,1046,Y,1,1,-1449138,90505,74109,441154


#### Clean Data

In [17]:
# Cell lines to keep, spelled the way they are expected to appear in the
# cell-line name column of the dose-response table.
target_cell_lines = ['MDA-MB-231', 'HCC38', 'HCC70', 'BT-20', 'MDA-MB-468', 
                     'MDA-MB-157', 'MDA-MB-436', 'Hs578T', 'BT-549', 'CAL148', 'SUM229PE']


def _find_cell_line_column(columns):
    """Return the column that holds cell-line names, or None if there is none.

    Exact, known column names are tried first. The keyword heuristic is only a
    fallback, because substrings such as 'line' can also occur in unrelated
    column names. Identifier columns (e.g. SANGER_MODEL_ID) are deliberately
    not accepted: they hold model IDs, so matching them against cell-line
    names would silently select nothing.
    """
    for known in ('CELL_LINE_NAME', 'Cell_Line'):
        if known in columns:
            return known
    for col in columns:
        lowered = col.lower()
        if 'cell' in lowered or 'line' in lowered:
            return col
    return None


cell_line_col = _find_cell_line_column(df.columns)
if cell_line_col is None:
    raise ValueError("Unable to locate cell line column in the input file.")

# Keep only the rows belonging to the target panel
df_cleaned = df[df[cell_line_col].isin(target_cell_lines)].copy()

if df_cleaned.empty:
    # Fail loudly instead of carrying an empty frame through the rest of the pipeline
    raise ValueError(
        f"No rows in column '{cell_line_col}' match any of the target cell lines."
    )

# Target names with no rows at all, kept for inspection.
# NOTE(review): a non-empty list may just mean the name is spelled differently
# in the dataset (e.g. hyphenation) - verify against the raw column values.
missing_cell_lines = sorted(set(target_cell_lines) - set(df_cleaned[cell_line_col]))

#### Enrich Data

In [18]:
# Load the screened-compounds annotation table and attach it to the cleaned data
df_compounds = pd.read_csv('Datasets/Cytoxicity Model/Screened Compounds v8.5.csv')


def _find_drug_column(columns):
    """Return the first drug/compound column that is not a cell-line column.

    Matching is a case-insensitive substring test applied in column order, so
    whichever matching column comes first is used as the join key - this can be
    an identifier column (e.g. DRUG_ID) rather than a name column.
    Returns None when no column matches.
    """
    for col in columns:
        lowered = col.lower()
        is_drug_col = any(keyword in lowered for keyword in ('drug', 'compound'))
        is_cell_col = any(keyword in lowered for keyword in ('cell', 'line'))
        if is_drug_col and not is_cell_col:
            return col
    return None


# Join key on each side, found with the same rule for both tables
drug_col_cleaned = _find_drug_column(df_cleaned.columns)
drug_col_compounds = _find_drug_column(df_compounds.columns)

if not drug_col_cleaned or not drug_col_compounds:
    tables_without_key = [
        label
        for label, found in (('dose-response table', drug_col_cleaned),
                             ('compound table', drug_col_compounds))
        if not found
    ]
    raise ValueError(
        "Could not find a drug/compound join column in: " + ", ".join(tables_without_key)
    )

# Cast both keys to string so an int-vs-str dtype mismatch cannot break the join
df_cleaned[drug_col_cleaned] = df_cleaned[drug_col_cleaned].astype(str)
df_compounds[drug_col_compounds] = df_compounds[drug_col_compounds].astype(str)

# Left join: every dose-response row is kept, annotated where a compound matches
df_merged = df_cleaned.merge(
    df_compounds,
    left_on=drug_col_cleaned,
    right_on=drug_col_compounds,
    how='left'
)

df_merged.head(10)

Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME_x,PUTATIVE_TARGET,...,MAX_CONC,LN_IC50,AUC,RMSE,Z_SCORE,SCREENING_SITE,DRUG_NAME_y,SYNONYMS,TARGET,TARGET_PATHWAY
0,GDSC2,343,15971670,749717,HCC38,SIDM00675,BRCA,1003,Camptothecin,TOP1,...,1,-1699764,88698,90364,304696,SANGER,Camptothecin,"Camptothecine, (+)-Camptothecin",TOP1,DNA replication
1,GDSC2,343,15992681,905951,BT-549,SIDM00122,BRCA,1003,Camptothecin,TOP1,...,1,-84884,915161,71434,767997,SANGER,Camptothecin,"Camptothecine, (+)-Camptothecin",TOP1,DNA replication
2,GDSC2,343,15994803,905960,MDA-MB-231,SIDM00146,BRCA,1003,Camptothecin,TOP1,...,1,-2117922,885006,78646,77022,SANGER,Camptothecin,"Camptothecine, (+)-Camptothecin",TOP1,DNA replication
3,GDSC2,343,16006591,906801,BT-20,SIDM00893,BRCA,1003,Camptothecin,TOP1,...,1,-2468058,883584,79927,-113616,SANGER,Camptothecin,"Camptothecine, (+)-Camptothecin",TOP1,DNA replication
4,GDSC2,343,16022519,907048,HCC70,SIDM00673,BRCA,1003,Camptothecin,TOP1,...,1,36032,98317,54686,1426347,SANGER,Camptothecin,"Camptothecine, (+)-Camptothecin",TOP1,DNA replication
5,GDSC2,343,16043192,908123,MDA-MB-468,SIDM00628,BRCA,1003,Camptothecin,TOP1,...,1,-2512764,848713,109626,-137957,SANGER,Camptothecin,"Camptothecine, (+)-Camptothecin",TOP1,DNA replication
6,GDSC2,343,16107739,925338,MDA-MB-157,SIDM00529,BRCA,1003,Camptothecin,TOP1,...,1,-1172609,933856,113471,591715,SANGER,Camptothecin,"Camptothecine, (+)-Camptothecin",TOP1,DNA replication
7,GDSC2,343,16134195,1240172,MDA-MB-436,SIDM00629,BRCA,1003,Camptothecin,TOP1,...,1,-1257174,908811,73073,545672,SANGER,Camptothecin,"Camptothecine, (+)-Camptothecin",TOP1,DNA replication
8,GDSC2,343,15971671,749717,HCC38,SIDM00675,BRCA,1004,Vinblastine,Microtubule destabiliser,...,1,-50341,940476,203087,1373245,SANGER,Vinblastine,Velban,Microtubule destabiliser,Mitosis
9,GDSC2,343,15992682,905951,BT-549,SIDM00122,BRCA,1004,Vinblastine,Microtubule destabiliser,...,1,-2179739,842367,117993,602931,SANGER,Vinblastine,Velban,Microtubule destabiliser,Mitosis


#### Retrieve SMILES

In [19]:
import pubchempy as pcp
from chembl_webresource_client.new_client import new_client

# Handle to the ChEMBL "molecule" resource; the ChEMBL lookup helpers below
# call .get() and .search() on it.
molecule = new_client.molecule

def get_smiles_from_name(drug_name):
    """Look up a compound by name in PubChem and return its canonical SMILES.

    Args:
        drug_name: Name passed to PubChem's name search.

    Returns:
        The canonical SMILES of the first hit, or None when nothing is found
        or the lookup fails.
    """
    try:
        compounds = pcp.get_compounds(drug_name, 'name')
        if compounds:
            return compounds[0].canonical_smiles
    except Exception:
        # Best-effort lookup: a failed request just means "no SMILES from this
        # source". Catching Exception (not a bare except) lets KeyboardInterrupt
        # and SystemExit still stop a long-running retrieval loop.
        return None
    return None

def get_smiles_from_chembl_id(drug_id):
    """Fetch the canonical SMILES for a ChEMBL identifier.

    Args:
        drug_id: Candidate identifier. Anything whose string form does not
            start with 'CHEMBL' is rejected without a remote call.

    Returns:
        The canonical SMILES string, or None when the identifier is not a
        ChEMBL ID, the record has no structure, or the lookup fails.
    """
    if not str(drug_id).startswith('CHEMBL'):
        return None
    try:
        mol_data = molecule.get(drug_id)
        if mol_data and 'molecule_structures' in mol_data:
            structures = mol_data['molecule_structures']
            # The key can be present with no structure attached
            if structures:
                return structures['canonical_smiles']
    except Exception:
        # Best-effort lookup; Exception (not a bare except) keeps
        # KeyboardInterrupt / SystemExit working.
        return None
    return None

def get_smiles_from_chembl_name(drug_name):
    """Search ChEMBL by drug name and return the first available canonical SMILES.

    Only the first three search results are inspected; the first one that
    carries structure data wins.

    Args:
        drug_name: Free-text name passed to the ChEMBL molecule search.

    Returns:
        A canonical SMILES string, or None when no inspected result has a
        structure or the lookup fails.
    """
    try:
        mol_data = molecule.search(drug_name)
        if mol_data:
            for mol in mol_data[:3]:  # Check first 3 results
                structures = mol.get('molecule_structures')
                if structures:
                    return structures['canonical_smiles']
    except Exception:
        # Best-effort lookup; Exception (not a bare except) keeps
        # KeyboardInterrupt / SystemExit working.
        return None
    return None

# Add a SMILES column to the merged dataset (filled in below)
df_merged['SMILES'] = None

# Locate the drug name and drug ID columns. There is no early exit, so when
# several columns match, the last matching one is used.
drug_name_col = None
drug_id_col = None

for col in df_merged.columns:
    if any(keyword in col.lower() for keyword in ['drug', 'compound']) and 'name' in col.lower():
        drug_name_col = col
    elif any(keyword in col.lower() for keyword in ['drug', 'compound']) and 'id' in col.lower():
        drug_id_col = col


def _resolve_smiles(drug_id, drug_name):
    """Try each SMILES source in turn: ChEMBL ID, PubChem name, ChEMBL name.

    `drug_id` / `drug_name` are None when the value is missing; the
    corresponding sources are then skipped. Returns the first non-empty
    result, or the last source's (empty) result when none succeeds.
    """
    smiles = None
    if drug_id is not None:
        smiles = get_smiles_from_chembl_id(drug_id)
    if not smiles and drug_name is not None:
        smiles = get_smiles_from_name(drug_name)
    if not smiles and drug_name is not None:
        smiles = get_smiles_from_chembl_name(drug_name)
    return smiles


# Each drug appears once per cell line, so resolve every (id, name) pair only
# once and reuse the answer. This cuts the number of remote calls from one per
# row to one per drug, and guarantees that all rows of a drug share one SMILES.
smiles_cache = {}

for idx, row in df_merged.iterrows():
    drug_id = row[drug_id_col] if drug_id_col and pd.notna(row[drug_id_col]) else None
    drug_name = row[drug_name_col] if drug_name_col and pd.notna(row[drug_name_col]) else None

    cache_key = (drug_id, drug_name)
    if cache_key not in smiles_cache:
        smiles_cache[cache_key] = _resolve_smiles(drug_id, drug_name)

    df_merged.at[idx, 'SMILES'] = smiles_cache[cache_key]

In [20]:
# Drop rows whose SMILES is missing, empty, or whitespace-only
smiles_present = df_merged['SMILES'].notna()
smiles_not_empty = df_merged['SMILES'] != ''
smiles_not_blank = df_merged['SMILES'].str.strip() != ''

df_merged = df_merged[smiles_present & smiles_not_empty & smiles_not_blank].copy()

df_merged.head(10)

Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME_x,PUTATIVE_TARGET,...,LN_IC50,AUC,RMSE,Z_SCORE,SCREENING_SITE,DRUG_NAME_y,SYNONYMS,TARGET,TARGET_PATHWAY,SMILES
0,GDSC2,343,15971670,749717,HCC38,SIDM00675,BRCA,1003,Camptothecin,TOP1,...,-1699764,88698,90364,304696,SANGER,Camptothecin,"Camptothecine, (+)-Camptothecin",TOP1,DNA replication,CC[C@@]1(O)C(=O)OCc2c1cc1n(c2=O)Cc2cc3ccccc3nc2-1
1,GDSC2,343,15992681,905951,BT-549,SIDM00122,BRCA,1003,Camptothecin,TOP1,...,-84884,915161,71434,767997,SANGER,Camptothecin,"Camptothecine, (+)-Camptothecin",TOP1,DNA replication,CC[C@@]1(O)C(=O)OCc2c1cc1n(c2=O)Cc2cc3ccccc3nc2-1
2,GDSC2,343,15994803,905960,MDA-MB-231,SIDM00146,BRCA,1003,Camptothecin,TOP1,...,-2117922,885006,78646,77022,SANGER,Camptothecin,"Camptothecine, (+)-Camptothecin",TOP1,DNA replication,CC[C@@]1(O)C(=O)OCc2c1cc1n(c2=O)Cc2cc3ccccc3nc2-1
3,GDSC2,343,16006591,906801,BT-20,SIDM00893,BRCA,1003,Camptothecin,TOP1,...,-2468058,883584,79927,-113616,SANGER,Camptothecin,"Camptothecine, (+)-Camptothecin",TOP1,DNA replication,CC[C@@]1(O)C(=O)OCc2c1cc1n(c2=O)Cc2cc3ccccc3nc2-1
4,GDSC2,343,16022519,907048,HCC70,SIDM00673,BRCA,1003,Camptothecin,TOP1,...,36032,98317,54686,1426347,SANGER,Camptothecin,"Camptothecine, (+)-Camptothecin",TOP1,DNA replication,CC[C@@]1(O)C(=O)OCc2c1cc1n(c2=O)Cc2cc3ccccc3nc2-1
5,GDSC2,343,16043192,908123,MDA-MB-468,SIDM00628,BRCA,1003,Camptothecin,TOP1,...,-2512764,848713,109626,-137957,SANGER,Camptothecin,"Camptothecine, (+)-Camptothecin",TOP1,DNA replication,CC[C@@]1(O)C(=O)OCc2c1cc1n(c2=O)Cc2cc3ccccc3nc2-1
6,GDSC2,343,16107739,925338,MDA-MB-157,SIDM00529,BRCA,1003,Camptothecin,TOP1,...,-1172609,933856,113471,591715,SANGER,Camptothecin,"Camptothecine, (+)-Camptothecin",TOP1,DNA replication,CC[C@@]1(O)C(=O)OCc2c1cc1n(c2=O)Cc2cc3ccccc3nc2-1
7,GDSC2,343,16134195,1240172,MDA-MB-436,SIDM00629,BRCA,1003,Camptothecin,TOP1,...,-1257174,908811,73073,545672,SANGER,Camptothecin,"Camptothecine, (+)-Camptothecin",TOP1,DNA replication,CC[C@@]1(O)C(=O)OCc2c1cc1n(c2=O)Cc2cc3ccccc3nc2-1
8,GDSC2,343,15971671,749717,HCC38,SIDM00675,BRCA,1004,Vinblastine,Microtubule destabiliser,...,-50341,940476,203087,1373245,SANGER,Vinblastine,Velban,Microtubule destabiliser,Mitosis,CC[C@]1(O)C[C@@H]2CN(CCc3c([nH]c4ccccc34)[C@@]...
9,GDSC2,343,15992682,905951,BT-549,SIDM00122,BRCA,1004,Vinblastine,Microtubule destabiliser,...,-2179739,842367,117993,602931,SANGER,Vinblastine,Velban,Microtubule destabiliser,Mitosis,CC[C@]1(O)C[C@@H]2CN(CCc3c([nH]c4ccccc34)[C@@]...


In [21]:
# Names of every descriptor RDKit exposes, and a calculator that evaluates them all
descriptor_names = [name for name, _ in Descriptors._descList]
calculator = MolecularDescriptorCalculator(descriptor_names)
descriptor_cols = [f'drug_{name}' for name in descriptor_names]

# One row per drug. De-duplicate on DRUG_ID alone (first SMILES wins): if a
# drug carried more than one SMILES, de-duplicating on the (DRUG_ID, SMILES)
# pair would leave several rows per drug and the merge below would multiply
# the dose-response rows.
unique_drugs = df_merged[['DRUG_ID', 'SMILES']].drop_duplicates(subset='DRUG_ID')
drug_descriptors = []

for _, row in unique_drugs.iterrows():
    mol = Chem.MolFromSmiles(row['SMILES'])
    if mol:
        # Drugs whose SMILES cannot be parsed are skipped; after the left
        # merge their descriptor columns are NaN.
        descriptors = calculator.CalcDescriptors(mol)
        result = {'DRUG_ID': row['DRUG_ID']}
        result.update(dict(zip(descriptor_cols, descriptors)))
        drug_descriptors.append(result)

# Passing the columns explicitly keeps the frame mergeable even when no
# molecule could be parsed (otherwise it would have no DRUG_ID column).
drug_features_df = pd.DataFrame(drug_descriptors, columns=['DRUG_ID'] + descriptor_cols)

# Attach the descriptors to every dose-response row; validate guards against
# accidental row multiplication.
df_expanded = df_merged.merge(
    drug_features_df, on='DRUG_ID', how='left', validate='many_to_one'
)

In [22]:
# Restrict to the six cell lines listed below (the ones described as having
# complete multi-modal coverage) and summarise what is left
complete_cell_lines = ['MDA-MB-231', 'BT-20', 'BT-549', 'MDA-MB-468', 'MDA-MB-157', 'HCC70']

in_complete_panel = df_expanded['CELL_LINE_NAME'].isin(complete_cell_lines)
df_final = df_expanded[in_complete_panel].copy()

kept_cell_lines = sorted(df_final['CELL_LINE_NAME'].unique())
summary_values = {
    "starting_pairs": len(df_expanded),
    "final_pairs": len(df_final),
    "unique_cell_lines": ", ".join(kept_cell_lines),
    "unique_drugs": df_final['DRUG_NAME_x'].nunique()
}
dataset_summary = pd.Series(summary_values)

dataset_summary

starting_pairs                                                    1957
final_pairs                                                       1467
unique_cell_lines    BT-20, BT-549, HCC70, MDA-MB-157, MDA-MB-231, ...
unique_drugs                                                       238
dtype: object

In [23]:
# Read the mutation calls and turn them into a model x gene matrix
mutations_df = pd.read_csv('Datasets/Cytoxicity Model/mutations_all_20250318.csv')

# Keep only the models present in the final dataset (matched on SANGER_MODEL_ID)
target_model_ids = df_final['SANGER_MODEL_ID'].unique()
is_target_model = mutations_df['model_id'].isin(target_model_ids)
mutations_filtered = mutations_df[is_target_model]

# One row per model, one column per gene: the max of `coding` over that
# model/gene pair, with 0 where the pair has no entry.
# NOTE(review): intended as a mutated / wild-type flag - assumes `coding` is
# numeric or boolean; confirm against the file.
mutation_pivot = mutations_filtered.pivot_table(
    values='coding',
    index='model_id',
    columns='gene_symbol',
    aggfunc='max',
    fill_value=0
)

# Prefix gene columns with 'mut_' and expose the model id under the key name
# used by the main dataframe
mutation_matrix = (
    mutation_pivot
    .set_axis([f'mut_{gene}' for gene in mutation_pivot.columns], axis=1)
    .reset_index()
    .rename(columns={'model_id': 'SANGER_MODEL_ID'})
)

# Left join so every dose-response row is kept
df_with_mutations = df_final.merge(mutation_matrix, on='SANGER_MODEL_ID', how='left')

In [24]:
# Gene panel used to narrow the mutation (and later CNV) features
cancer_genes = ['TP53', 'BRCA1', 'BRCA2', 'PIK3CA', 'PTEN', 'RB1', 'APC', 'KRAS', 'EGFR', 
                'MYC', 'ERBB2', 'CDH1', 'STK11', 'CDKN2A', 'ATM', 'CHEK2', 'PALB2', 'MLH1', 
                'MSH2', 'VHL', 'NF1', 'NF2', 'SMAD4', 'DCC', 'FHIT', 'WWOX', 'CTNNB1']

all_columns = list(df_with_mutations.columns)

# Mutation columns whose gene (the name with 'mut_' removed) is on the panel
mutation_cols = [name for name in all_columns if name.startswith('mut_')]
cancer_mut_cols = [name for name in mutation_cols
                   if name.replace('mut_', '') in cancer_genes]

# Everything that is not a mutation column is carried over unchanged
original_cols = [name for name in all_columns if not name.startswith('mut_')]
df_filtered = df_with_mutations[original_cols + cancer_mut_cols].copy()


In [25]:
# Copy-number features for the models in the dataset
target_model_ids = df_filtered['SANGER_MODEL_ID'].unique()

# Only three columns are parsed from the CNV file to keep the load cheap;
# row filtering happens after the read
cnv_columns = ['model_id', 'symbol', 'total_copy_number']
cnv_df = pd.read_csv(
    'Datasets/Cytoxicity Model/WES_pureCN_CNV_genes_20250207.csv',
    usecols=cnv_columns
)

# Keep rows for our models and for genes on the cancer panel
is_target_model = cnv_df['model_id'].isin(target_model_ids)
is_cancer_gene = cnv_df['symbol'].isin(cancer_genes)
cnv_filtered = cnv_df[is_target_model & is_cancer_gene]

# Model x gene table of total copy number: repeated model/gene entries are
# averaged, and pairs with no entry are filled with 2
cnv_pivot = cnv_filtered.pivot_table(
    values='total_copy_number',
    index='model_id',
    columns='symbol',
    aggfunc='mean',
    fill_value=2
)

# Prefix gene columns with 'cnv_' and expose the model id under the key name
# used by the main dataframe
cnv_matrix = (
    cnv_pivot
    .set_axis([f'cnv_{gene}' for gene in cnv_pivot.columns], axis=1)
    .reset_index()
    .rename(columns={'model_id': 'SANGER_MODEL_ID'})
)

# Left join so every dose-response row is kept
df_final_genomics = df_filtered.merge(cnv_matrix, on='SANGER_MODEL_ID', how='left')


In [26]:
# Parse the target: the raw LN_IC50 values use a decimal comma, so swap it for
# a dot before converting. astype(str) keeps this working even if the column
# was already read as numeric (where the .str accessor would otherwise fail).
df_final_genomics['LN_IC50_numeric'] = pd.to_numeric(
    df_final_genomics['LN_IC50'].astype(str).str.replace(',', '.', regex=False),
    errors='coerce'
)

# Feature matrix: numeric drug descriptors only (genomic columns are left out)
feature_cols = [col for col in df_final_genomics.columns 
                if col.startswith('drug_')]

X = df_final_genomics[feature_cols].select_dtypes(include=[np.number])
# Treat non-finite descriptor values like missing ones, then fill with 0
X = X.replace([np.inf, -np.inf], np.nan).fillna(0)

# NOTE(review): rows with an unparseable target are mean-imputed rather than
# dropped. They are kept here so X / y stay row-aligned with df_final_genomics
# for the group mapping below; consider dropping them upstream instead.
y = df_final_genomics['LN_IC50_numeric'].fillna(df_final_genomics['LN_IC50_numeric'].mean())


def get_scaffold(smiles):
    """Return the Murcko scaffold of `smiles` as a SMILES string.

    Returns "Unknown" when the input cannot be parsed or scaffold extraction
    fails, so every drug still receives a group label.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            scaffold = MurckoScaffold.GetScaffoldForMol(mol)
            return Chem.MolToSmiles(scaffold)
    except Exception:
        # Best-effort: any RDKit failure maps to the shared "Unknown" group.
        # Exception (not a bare except) keeps KeyboardInterrupt working.
        pass
    return "Unknown"


# Scaffold label per row (DRUG_ID -> scaffold of the drug's first SMILES)
drug_scaffolds = df_final_genomics.groupby('DRUG_ID')['SMILES'].first().apply(get_scaffold)
scaffold_groups = df_final_genomics['DRUG_ID'].map(drug_scaffolds)

# Feature selection with BorutaPy, fitted on the training part only.
# Selecting on all rows would let the held-out scaffolds influence which
# features are kept and inflate the test metrics. GroupShuffleSplit depends
# only on the groups and the seed, so these settings reproduce the hold-out
# split made in the following cell - keep the two in sync.
selection_split = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
selection_train_idx, _ = next(selection_split.split(X, y, groups=scaffold_groups))

rf_selector = RandomForestRegressor(n_estimators=100, random_state=42)
boruta = BorutaPy(rf_selector, n_estimators='auto', verbose=0, random_state=42)
boruta.fit(X.iloc[selection_train_idx].values, y.iloc[selection_train_idx].values)

# Apply the selected columns to every row
selected_features = X.columns[boruta.support_].tolist()
X_selected = X[selected_features]

gkf = GroupKFold(n_splits=5)

# NOTE(review): this loop only materialises the fold splits - no model is
# fitted and `scores` stays empty. Fit and score a model per fold here, or
# remove the loop.
scores = []
for train_idx, test_idx in gkf.split(X_selected, y, scaffold_groups):
    X_train, X_test = X_selected.iloc[train_idx], X_selected.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    

In [27]:
# Scaffold-aware train/test split

def get_scaffold(smiles):
    """Return the Murcko scaffold of `smiles` as a SMILES string.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        The scaffold SMILES, or "Unknown" when `smiles` is not a string or
        cannot be parsed. A string sentinel is returned instead of None so
        the result can be used directly as a group label: missing labels mixed
        with strings break scikit-learn's group splitters.
    """
    if not isinstance(smiles, str):
        return "Unknown"
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return "Unknown"
    return Chem.MolToSmiles(MurckoScaffold.GetScaffoldForMol(mol))

# Scaffold label per row (DRUG_ID -> scaffold of the drug's first SMILES)
scaffolds = df_final_genomics.groupby('DRUG_ID')['SMILES'].first().apply(get_scaffold)
# Rows without a scaffold label share one "Unknown" group; a missing value in
# the group array would make the splitter fail.
scaffold_groups = df_final_genomics['DRUG_ID'].map(scaffolds).fillna("Unknown")

# Single 80/20 split in which a scaffold never appears on both sides
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X_selected, y, groups=scaffold_groups))

# Cheap invariant checks: the split must be scaffold-disjoint and cover every row
assert set(scaffold_groups.iloc[train_idx]).isdisjoint(scaffold_groups.iloc[test_idx]), \
    "scaffold leakage between train and test"
assert len(train_idx) + len(test_idx) == len(X_selected)

X_train, X_test = X_selected.iloc[train_idx], X_selected.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

#### Random Forest and XGBoost Models


In [None]:
# Fitted estimators by display name, and one metrics record per model
trained_models = {}
metrics = []

# The two regressors to compare
random_forest = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)
xgboost_regressor = XGBRegressor(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    eval_metric="rmse",
    random_state=42,
    n_jobs=-1
)
models = {
    "Random Forest": random_forest,
    "XGBoost": xgboost_regressor
}

for name, model in models.items():
    # Fit on the training split and keep the fitted estimator
    model.fit(X_train, y_train)
    trained_models[name] = model

    # Predict on both splits: train R2 shows fit, test metrics show generalisation
    preds_train = model.predict(X_train)
    preds_test = model.predict(X_test)

    record = {"model": name}
    record["r2_train"] = r2_score(y_train, preds_train)
    record["r2_test"] = r2_score(y_test, preds_test)
    record["mae"] = mean_absolute_error(y_test, preds_test)
    record["rmse"] = np.sqrt(mean_squared_error(y_test, preds_test))
    metrics.append(record)

# One row per model, best test R2 first
metrics_table = pd.DataFrame(metrics).set_index("model")
model_metrics = metrics_table.sort_values("r2_test", ascending=False)

model_metrics
