In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer
from scipy.stats import spearmanr
from sklearn.model_selection import cross_val_score

In [2]:
# Locations of the training feature table and the matching targets
train_df = "train_subset.csv"
train_targets = "train_targets.csv"

# Load the features and discard the first column (cell line labels)
X = pd.read_csv(train_df).iloc[:, 1:]
# Load the targets, retaining only the AAC column
y = pd.read_csv(train_targets).loc[:, 'AAC']

# Report the dimensions of both objects as a quick sanity check
print("Train Features Shape:", X.shape)
print("Train Targets Shape:", y.shape)

Train Features Shape: (742, 456)
Train Targets Shape: (742,)


In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Standardize the features; the scaler is fitted on the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)  # training rows, scaled with their own statistics
X_test_scaled = scaler.transform(X_test)    # test rows, scaled with the training statistics

In [5]:
# Ridge Regression: fit on the scaled training split, then predict the scaled test split
ridge = Ridge().fit(X_train_scaled, y_train)
ridge_test_preds = ridge.predict(X_test_scaled)

# Spearman rank correlation between the held-out targets and the predictions
# (the first field of the result is the correlation coefficient)
ridge_test_corr = spearmanr(y_test, ridge_test_preds)[0]
print(f"Validation Spearman correlation for Ridge Regression: {ridge_test_corr}")

Validation Spearman correlation for Ridge Regression: 0.1540741466299805


In [6]:
# Random forest with a fixed seed, fitted on the scaled training split
rf = RandomForestRegressor(random_state=42).fit(X_train_scaled, y_train)

# Predict the held-out (scaled) test rows
rf_test_preds = rf.predict(X_test_scaled)

# Spearman rank correlation between the held-out targets and the predictions
# (the first field of the result is the correlation coefficient)
rf_test_corr = spearmanr(y_test, rf_test_preds)[0]
print(f"Validation Spearman correlation for Random Forest: {rf_test_corr}")

Validation Spearman correlation for Random Forest: 0.4267535160109827


In [7]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn.metrics import mean_squared_error

In [8]:
# Convert the scaled data into DMatrix format for XGBoost
train_dmatrix = xgb.DMatrix(X_train_scaled, label=y_train)
val_dmatrix = xgb.DMatrix(X_test_scaled, label=y_test)

# XGBoost parameters
params = {
    'objective': 'reg:squarederror',  # squared-error regression loss
    'eval_metric': 'rmse',            # metric logged each round and monitored for early stopping
    'max_depth': 5,                   # maximum depth of each tree
    'eta': 0.1,                       # learning rate
    'subsample': 0.8                  # fraction of training rows sampled per boosting round
}

# Train the XGBoost model.
# The last entry of `evals` ('eval') is the one monitored for early stopping.
# NOTE(review): 'eval' is built from the same hold-out split that is scored
# below, so the reported RMSE / Spearman are optimistic. Consider carving a
# separate validation split out of the training data for early stopping.
watchlist = [(train_dmatrix, 'train'), (val_dmatrix, 'eval')]
model = xgb.train(params, train_dmatrix, num_boost_round=100, evals=watchlist,
                  early_stopping_rounds=10, verbose_eval=True)

# Make predictions on the test set.
# The native Booster.predict may use every tree that was built, including the
# rounds added after the best score was reached, so restrict prediction to the
# best iteration whenever early stopping recorded one.
best_iteration = getattr(model, "best_iteration", None)
if best_iteration is None:
    y_pred = model.predict(val_dmatrix)
else:
    y_pred = model.predict(val_dmatrix, iteration_range=(0, best_iteration + 1))

# Evaluate RMSE.
# Take the square root explicitly: the `squared=False` keyword of
# mean_squared_error is deprecated and absent from recent scikit-learn releases.
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
print("Test RMSE:", rmse)

# Calculate Spearman correlation
spearman_corr = spearmanr(y_pred, y_test).correlation
print("Spearman's correlation:", spearman_corr)

[0]	train-rmse:0.09867	eval-rmse:0.08591
[1]	train-rmse:0.09333	eval-rmse:0.08443
[2]	train-rmse:0.08873	eval-rmse:0.08358
[3]	train-rmse:0.08444	eval-rmse:0.08150
[4]	train-rmse:0.08065	eval-rmse:0.08024
[5]	train-rmse:0.07705	eval-rmse:0.07888
[6]	train-rmse:0.07361	eval-rmse:0.07908
[7]	train-rmse:0.07021	eval-rmse:0.07856
[8]	train-rmse:0.06694	eval-rmse:0.07793
[9]	train-rmse:0.06429	eval-rmse:0.07759
[10]	train-rmse:0.06168	eval-rmse:0.07715
[11]	train-rmse:0.05929	eval-rmse:0.07675
[12]	train-rmse:0.05766	eval-rmse:0.07636
[13]	train-rmse:0.05517	eval-rmse:0.07508
[14]	train-rmse:0.05333	eval-rmse:0.07507
[15]	train-rmse:0.05147	eval-rmse:0.07483
[16]	train-rmse:0.04981	eval-rmse:0.07431
[17]	train-rmse:0.04801	eval-rmse:0.07411
[18]	train-rmse:0.04619	eval-rmse:0.07422
[19]	train-rmse:0.04498	eval-rmse:0.07410
[20]	train-rmse:0.04405	eval-rmse:0.07419
[21]	train-rmse:0.04287	eval-rmse:0.07398
[22]	train-rmse:0.04156	eval-rmse:0.07383
[23]	train-rmse:0.04029	eval-rmse:0.07398
[2