In this notebook, I'll be using Optuna to find the best hyperparameters for our Random Forest model using our Random Split

In [2]:
import pandas as pd

# load in the merged dataframe, containing GDSC1, GDSC2, and cell line metadata
gdsc_merged = pd.read_csv("../GDSC1and2_w_CellLineData.csv")
df = gdsc_merged
gdsc_merged.head(5)

Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,...,LN_IC50,AUC,RMSE,Z_SCORE,Sample Name,GDSC_Tissue_descriptor_1,GDSC_Tissue_descriptor_2,Cancer_Type_TCGA,Medium,Growth
0,GDSC1,342,15580432,684057,ES5,SIDM00263,UNCLASSIFIED,1,Erlotinib,EGFR,...,3.966813,0.985678,0.026081,1.299144,ES5,bone,ewings_sarcoma,,R,Adherent
1,GDSC1,342,15580806,684059,ES7,SIDM00269,UNCLASSIFIED,1,Erlotinib,EGFR,...,2.69209,0.97269,0.110059,0.156076,ES7,bone,ewings_sarcoma,,R,Adherent
2,GDSC1,342,15581198,684062,EW-11,SIDM00203,UNCLASSIFIED,1,Erlotinib,EGFR,...,2.47799,0.944459,0.087019,-0.035912,EW-11,bone,ewings_sarcoma,,R,Adherent
3,GDSC1,342,15581542,684072,SK-ES-1,SIDM01111,UNCLASSIFIED,1,Erlotinib,EGFR,...,2.033564,0.950758,0.01629,-0.434437,SK-ES-1,bone,ewings_sarcoma,,R,Semi-Adherent
4,GDSC1,342,15581930,687448,COLO-829,SIDM00909,SKCM,1,Erlotinib,EGFR,...,2.966007,0.954778,0.180255,0.401702,COLO-829,skin,melanoma,SKCM,R,Adherent


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df[["z_LN_IC50", "z_AUC"]] = scaler.fit_transform(df[["LN_IC50", "AUC"]])

# For interpretability: low LN_IC50 = sensitive, low AUC = sensitive
df["z_IC50_sens"] = df["z_LN_IC50"]


# Sensitivity (average of both)
df["sensitivity"] = (df["z_IC50_sens"] + df["z_AUC"]) / 2

# Disagreement (difference between metrics)
df["disagreement"] = df["z_AUC"] - df["z_IC50_sens"]


# Weighted averages of both metrics for different α
alphas = [0.25, 0.5, 0.75]
for a in alphas:
    df[f"y_weighted_{a}"] = a * df["z_IC50_sens"] + (1 - a) * df["z_AUC"]

df.head(5)

Unnamed: 0,DATASET,NLME_RESULT_ID,NLME_CURVE_ID,COSMIC_ID,CELL_LINE_NAME,SANGER_MODEL_ID,TCGA_DESC,DRUG_ID,DRUG_NAME,PUTATIVE_TARGET,...,Medium,Growth,z_LN_IC50,z_AUC,z_IC50_sens,sensitivity,disagreement,y_weighted_0.25,y_weighted_0.5,y_weighted_0.75
0,GDSC1,342,15580432,684057,ES5,SIDM00263,UNCLASSIFIED,1,Erlotinib,EGFR,...,R,Adherent,0.587657,0.738863,0.587657,0.66326,0.151206,0.701061,0.66326,0.625459
1,GDSC1,342,15580806,684059,ES7,SIDM00269,UNCLASSIFIED,1,Erlotinib,EGFR,...,R,Adherent,0.113887,0.665612,0.113887,0.389749,0.551725,0.52768,0.389749,0.251818
2,GDSC1,342,15581198,684062,EW-11,SIDM00203,UNCLASSIFIED,1,Erlotinib,EGFR,...,R,Adherent,0.034313,0.506391,0.034313,0.270352,0.472078,0.388372,0.270352,0.152333
3,GDSC1,342,15581542,684072,SK-ES-1,SIDM01111,UNCLASSIFIED,1,Erlotinib,EGFR,...,R,Semi-Adherent,-0.130864,0.541917,-0.130864,0.205526,0.672781,0.373722,0.205526,0.037331
4,GDSC1,342,15581930,687448,COLO-829,SIDM00909,SKCM,1,Erlotinib,EGFR,...,R,Adherent,0.215692,0.564589,0.215692,0.390141,0.348897,0.477365,0.390141,0.302917


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def random_split(df, test_size=0.2, random_state=42):
    """
    Completely random splitting.
    """
    return train_test_split(df, test_size=test_size, random_state=random_state, shuffle=True)

In [5]:
# Train test split with random splitting
train_df_random, test_df_random = random_split(df)

print(f"Training samples: {len(train_df_random)}")
print(f"Test samples: {len(test_df_random)}")
print(f"\nSample overlap statistics:")
print(f"Unique drugs in train: {train_df_random['DRUG_ID'].nunique()}")
print(f"Unique drugs in test: {test_df_random['DRUG_ID'].nunique()}")
print(f"Unique cell lines in train: {train_df_random['COSMIC_ID'].nunique()}")
print(f"Unique cell lines in test: {test_df_random['COSMIC_ID'].nunique()}")

Training samples: 460157
Test samples: 115040

Sample overlap statistics:
Unique drugs in train: 621
Unique drugs in test: 621
Unique cell lines in train: 978
Unique cell lines in test: 978


In [6]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Make explicit copies to avoid SettingWithCopyWarning
train_df_random = train_df_random.copy()
test_df_random = test_df_random.copy()

# Prepare features and target
X_train = train_df_random[['DRUG_ID', 'COSMIC_ID']]
y_train = train_df_random['z_LN_IC50']

X_test = test_df_random[['DRUG_ID', 'COSMIC_ID']]
y_test = test_df_random['z_LN_IC50']

## Model Training for z_LN_IC50 Prediction

Fitting modes to predict z_LN_IC50 

### Random Forest and Hyper Parameter Tuning for z_LN_IC50

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import optuna

# Define parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, 20, None]
}

# Create Random Forest model
rf_base = RandomForestRegressor(random_state=42, n_jobs=-1)

# Perform Grid Search with cross-validation
print("Performing Grid Search for Random Forest hyperparameters...")
print("This may take a few minutes...")

grid_search = GridSearchCV(
    estimator=rf_base,
    param_grid=param_grid,
    cv=3,  # 3-fold cross-validation
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

# Print best parameters
print(f"\nBest parameters found:")
for param, value in grid_search.best_params_.items():
    print(f"  {param}: {value}")

print(f"\nBest cross-validation RMSE: {np.sqrt(-grid_search.best_score_):.4f}")

# Use the best model
rf_model = grid_search.best_estimator_

Performing Grid Search for Random Forest hyperparameters...
This may take a few minutes...
Fitting 3 folds for each of 15 candidates, totalling 45 fits


KeyboardInterrupt: 

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import optuna

def objective(trial):
    x = 0
    print('starting trial ', x)
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 5, 50),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'random_state': 42,
        'n_jobs': -1
    }
    

    model = RandomForestRegressor(**params)

    scores = cross_val_score(model, 
                             X_train, 
                             y_train, 
                             cv=3,
                             scoring = 'neg_root_mean_squared_error')
    x += 1
    return np.mean(scores)

In [14]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

[I 2025-11-11 18:01:16,961] A new study created in memory with name: no-name-8eb1a411-61d8-4ba9-93d4-74365d7b2631


starting trial  0


[I 2025-11-11 18:02:05,654] Trial 0 finished with value: -0.6487605746525781 and parameters: {'n_estimators': 176, 'max_depth': 40, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: -0.6487605746525781.


starting trial  0


[I 2025-11-11 18:03:49,831] Trial 1 finished with value: -0.6489057535903366 and parameters: {'n_estimators': 398, 'max_depth': 32, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: -0.6487605746525781.


starting trial  0


[I 2025-11-11 18:07:24,442] Trial 2 finished with value: -0.5285811404131047 and parameters: {'n_estimators': 838, 'max_depth': 45, 'min_samples_split': 16, 'min_samples_leaf': 16, 'max_features': None, 'bootstrap': True}. Best is trial 2 with value: -0.5285811404131047.


starting trial  0


[I 2025-11-11 18:11:15,634] Trial 3 finished with value: -0.6505162775931415 and parameters: {'n_estimators': 863, 'max_depth': 37, 'min_samples_split': 10, 'min_samples_leaf': 9, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 2 with value: -0.5285811404131047.


starting trial  0


[I 2025-11-11 18:12:39,573] Trial 4 finished with value: -0.6753332926384519 and parameters: {'n_estimators': 342, 'max_depth': 33, 'min_samples_split': 14, 'min_samples_leaf': 19, 'max_features': 'log2', 'bootstrap': False}. Best is trial 2 with value: -0.5285811404131047.


starting trial  0


[I 2025-11-11 18:15:45,428] Trial 5 finished with value: -0.5316785076987166 and parameters: {'n_estimators': 630, 'max_depth': 37, 'min_samples_split': 11, 'min_samples_leaf': 8, 'max_features': None, 'bootstrap': True}. Best is trial 2 with value: -0.5285811404131047.


starting trial  0


[I 2025-11-11 18:19:52,560] Trial 6 finished with value: -0.5357141042547405 and parameters: {'n_estimators': 793, 'max_depth': 39, 'min_samples_split': 14, 'min_samples_leaf': 4, 'max_features': None, 'bootstrap': True}. Best is trial 2 with value: -0.5285811404131047.


starting trial  0


[I 2025-11-11 18:21:49,271] Trial 7 finished with value: -0.5373256221943 and parameters: {'n_estimators': 369, 'max_depth': 35, 'min_samples_split': 9, 'min_samples_leaf': 5, 'max_features': None, 'bootstrap': True}. Best is trial 2 with value: -0.5285811404131047.


starting trial  0


[I 2025-11-11 18:23:33,901] Trial 8 finished with value: -0.6667369483579092 and parameters: {'n_estimators': 437, 'max_depth': 38, 'min_samples_split': 12, 'min_samples_leaf': 16, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 2 with value: -0.5285811404131047.


starting trial  0


[I 2025-11-11 18:26:01,668] Trial 9 finished with value: -0.5743820253101303 and parameters: {'n_estimators': 362, 'max_depth': 38, 'min_samples_split': 3, 'min_samples_leaf': 5, 'max_features': None, 'bootstrap': False}. Best is trial 2 with value: -0.5285811404131047.


In [15]:
best_params = study.best_params
print("Best Parameters: ", best_params)

Best Parameters:  {'n_estimators': 838, 'max_depth': 45, 'min_samples_split': 16, 'min_samples_leaf': 16, 'max_features': None, 'bootstrap': True}


In [16]:
# Make predictions

best_model = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)
best_model.fit(X_train, y_train)

# y_train_pred_rf = rf_model.predict(X_train)
# y_test_pred_rf = rf_model.predict(X_test)
y_pred = best_model.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Test RMSE: {rmse:.4f}")
print(f"Test R2: {r2:.4f}")

Test RMSE: 0.5254
Test R2: 0.7235


In [None]:
# Evaluate model
train_rmse_rf = np.sqrt(mean_squared_error(y_train, y_train_pred_rf))
test_rmse_rf = np.sqrt(mean_squared_error(y_test, y_test_pred_rf))
train_r2_rf = r2_score(y_train, y_train_pred_rf)
test_r2_rf = r2_score(y_test, y_test_pred_rf)

print("Random Forest Model Performance:")
print(f"Train RMSE: {train_rmse_rf:.4f}")
print(f"Test RMSE: {test_rmse_rf:.4f}")
print(f"Train R²: {train_r2_rf:.4f}")
print(f"Test R²: {test_r2_rf:.4f}")

In [None]:
# Feature importance
feature_importance = pd.DataFrame({
    'feature': ['DRUG_ID', 'COSMIC_ID'],
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)
print(f"\nFeature Importance:")
print(feature_importance)

# Create scatter plot of actual vs predicted values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_test_pred_rf, alpha=0.5, edgecolors='k', linewidth=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2, label='Perfect Prediction')
plt.xlabel('Actual z_LN_IC50', fontsize=12)
plt.ylabel('Predicted z_LN_IC50', fontsize=12)
plt.title('Random Forest: Actual vs Predicted Values (Test Set)', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Print additional statistics
residuals_rf = y_test - y_test_pred_rf
print(f"\nResiduals Statistics:")
print(f"Mean Residual: {residuals_rf.mean():.4f}")
print(f"Std Residual: {residuals_rf.std():.4f}")
print(f"Min Residual: {residuals_rf.min():.4f}")
print(f"Max Residual: {residuals_rf.max():.4f}")

# Model Training for z_AUC Prediction

Now we train all models to predict z_AUC instead of z_LN_IC50

In [None]:
# Prepare target variable for z_AUC prediction
y_train_auc = train_df_random['z_AUC']
y_test_auc = test_df_random['z_AUC']

print(f"Training samples: {len(y_train_auc)}")
print(f"Test samples: {len(y_test_auc)}")

### Random Forest and Hyper Parameter Tuning for z_AUC

In [None]:
# Random Forest for z_AUC
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, 20, None]
}

rf_base_auc = RandomForestRegressor(random_state=42, n_jobs=-1)
grid_search_rf_auc = GridSearchCV(rf_base_auc, param_grid_rf, cv=3, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
grid_search_rf_auc.fit(X_train, y_train_auc)

print(f"Best params: {grid_search_rf_auc.best_params_}")
print(f"Best CV RMSE: {np.sqrt(-grid_search_rf_auc.best_score_):.4f}")

rf_model_auc = grid_search_rf_auc.best_estimator_
y_train_pred_rf_auc = rf_model_auc.predict(X_train)
y_test_pred_rf_auc = rf_model_auc.predict(X_test)

train_rmse_rf_auc = np.sqrt(mean_squared_error(y_train_auc, y_train_pred_rf_auc))
test_rmse_rf_auc = np.sqrt(mean_squared_error(y_test_auc, y_test_pred_rf_auc))
train_r2_rf_auc = r2_score(y_train_auc, y_train_pred_rf_auc)
test_r2_rf_auc = r2_score(y_test_auc, y_test_pred_rf_auc)

print("Random Forest (z_AUC):")
print(f"Train RMSE: {train_rmse_rf_auc:.4f}, Test RMSE: {test_rmse_rf_auc:.4f}")
print(f"Train R²: {train_r2_rf_auc:.4f}, Test R²: {test_r2_rf_auc:.4f}")

plt.figure(figsize=(10, 6))
plt.scatter(y_test_auc, y_test_pred_rf_auc, alpha=0.5, edgecolors='k', linewidth=0.5)
plt.plot([y_test_auc.min(), y_test_auc.max()], [y_test_auc.min(), y_test_auc.max()], 'r--', lw=2, label='Perfect Prediction')
plt.xlabel('Actual z_AUC', fontsize=12)
plt.ylabel('Predicted z_AUC', fontsize=12)
plt.title('Random Forest: z_AUC Prediction', fontsize=14, fontweight='bold')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()