In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer
from scipy.stats import spearmanr
from sklearn.model_selection import cross_val_score

In [24]:
# Input locations (despite the names, these two variables hold CSV paths, not frames)
train_df = "train.csv"
train_targets = "train_targets.csv"

# Feature matrix: every column except the first, which carries the cell line labels
X = pd.read_csv(train_df).iloc[:, 1:]

# Target vector: only the AAC column of the targets file is kept
targets_table = pd.read_csv(train_targets)
y = targets_table['AAC']

# Quick sanity check of the loaded dimensions
print("Train Features Shape:", X.shape)
print("Train Targets Shape:", y.shape)

Train Features Shape: (742, 19920)
Train Targets Shape: (742,)


In [25]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the partition reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Standardise the features using statistics learned from the training rows only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)   # reuses the scaler fitted on the training rows

In [28]:
# Ridge regression baseline: fit on the scaled training rows, then predict the held-out rows
ridge = Ridge().fit(X_train_scaled, y_train)   # fit() hands back the estimator itself
ridge_test_preds = ridge.predict(X_test_scaled)

# Rank agreement (Spearman) between the true and predicted values on the held-out rows
ridge_spearman = spearmanr(y_test, ridge_test_preds)
ridge_test_corr = ridge_spearman.correlation
print(f"Validation Spearman correlation for Ridge Regression: {ridge_test_corr}")

Validation Spearman correlation for Ridge Regression: 0.3233124644591067


In [29]:
# Random forest baseline with a fixed seed, fitted on the scaled training rows
rf = RandomForestRegressor(random_state=42).fit(X_train_scaled, y_train)

# Predictions for the held-out rows
rf_test_preds = rf.predict(X_test_scaled)

# Rank agreement (Spearman) between the true and predicted values on the held-out rows
rf_spearman = spearmanr(y_test, rf_test_preds)
rf_test_corr = rf_spearman.correlation
print(f"Validation Spearman correlation for Random Forest: {rf_test_corr}")

Validation Spearman correlation for Random Forest: 0.4317870776231393


In [30]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn.metrics import mean_squared_error

In [32]:
# Convert the scaled data into DMatrix format for XGBoost
train_dmatrix = xgb.DMatrix(X_train_scaled, label=y_train)
val_dmatrix = xgb.DMatrix(X_test_scaled, label=y_test)

# XGBoost parameters
params = {
    'objective': 'reg:squarederror',  # squared-error regression
    'eval_metric': 'rmse',            # metric logged each round and monitored for early stopping
    'max_depth': 5,
    'eta': 0.1,                       # learning rate
    'subsample': 0.8                  # fraction of rows sampled per boosting round
}

# Train the XGBoost model.
# The last entry of `evals` ('eval') is the one early stopping monitors.
# NOTE(review): the same held-out split drives early stopping and is then used for the
# reported scores below, so those scores are optimistic. Consider carving a separate
# early-stopping split out of the training rows.
watchlist = [(train_dmatrix, 'train'), (val_dmatrix, 'eval')]
model = xgb.train(params, train_dmatrix, num_boost_round=100, evals=watchlist,
                  early_stopping_rounds=10, verbose_eval=True)

# Make predictions on the test set.
# xgb.train returns the booster from the LAST round, and the native Booster.predict uses
# the full model, so restrict prediction to the rounds up to the best one explicitly.
best_rounds = model.best_iteration + 1
y_pred = model.predict(val_dmatrix, iteration_range=(0, best_rounds))

# Evaluate RMSE.
# Take the root explicitly: the `squared=False` flag of mean_squared_error is deprecated
# in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Test RMSE:", rmse)

# Calculate Spearman correlation
spearman_corr = spearmanr(y_pred, y_test).correlation
print("Spearman's correlation:", spearman_corr)

[0]	train-rmse:0.09852	eval-rmse:0.08631
[1]	train-rmse:0.09283	eval-rmse:0.08441
[2]	train-rmse:0.08773	eval-rmse:0.08387
[3]	train-rmse:0.08356	eval-rmse:0.08217
[4]	train-rmse:0.07966	eval-rmse:0.08111
[5]	train-rmse:0.07547	eval-rmse:0.08016
[6]	train-rmse:0.07180	eval-rmse:0.07935
[7]	train-rmse:0.06829	eval-rmse:0.07824
[8]	train-rmse:0.06473	eval-rmse:0.07858
[9]	train-rmse:0.06150	eval-rmse:0.07801
[10]	train-rmse:0.05922	eval-rmse:0.07827
[11]	train-rmse:0.05663	eval-rmse:0.07832
[12]	train-rmse:0.05470	eval-rmse:0.07831
[13]	train-rmse:0.05242	eval-rmse:0.07850
[14]	train-rmse:0.05023	eval-rmse:0.07843
[15]	train-rmse:0.04858	eval-rmse:0.07815
[16]	train-rmse:0.04692	eval-rmse:0.07820
[17]	train-rmse:0.04495	eval-rmse:0.07798
[18]	train-rmse:0.04284	eval-rmse:0.07805
[19]	train-rmse:0.04135	eval-rmse:0.07779
[20]	train-rmse:0.04014	eval-rmse:0.07773
[21]	train-rmse:0.03896	eval-rmse:0.07786
[22]	train-rmse:0.03773	eval-rmse:0.07775
[23]	train-rmse:0.03621	eval-rmse:0.07753
[2