In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from scipy.stats import spearmanr
from sklearn.model_selection import cross_val_score

In [16]:
# Load data
data_path = "/Users/peterher/Downloads/"
train_data = pd.read_csv(data_path + "train.csv")
train_targets = pd.read_csv(data_path + "train_targets.csv")
test_data = pd.read_csv(data_path + "test.csv")

In [20]:
# Cell lines as index 
train_data = train_data.set_index("Unnamed: 0")
test_data = test_data.set_index("Unnamed: 0")
train_targets = train_targets.set_index("sample")

In [21]:
# Merge train_data with train_targets
train_df = train_data.join(train_targets[['AAC', 'tissue']])

X = train_df.drop(columns=['AAC', 'tissue'])  # Gene expression features
y = train_df['AAC']  # Drug response values to predict

In [22]:
# Standardize features (?) 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test_data)

In [30]:
# Train/Validation split - not scaled
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
# Train/Validation split - scaled 
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [28]:
# Ridge Regression
ridge = Ridge()
ridge.fit(X_train, y_train)
ridge_val_preds = ridge.predict(X_val)
ridge_val_corr = spearmanr(y_val, ridge_val_preds).correlation
print(f"Validation Spearman correlation for Ridge Regression: {ridge_val_corr}")

Validation Spearman correlation for Ridge Regression: 0.3339709178043436


In [29]:
# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
rf_val_preds = rf.predict(X_val)
rf_val_corr = spearmanr(y_val, rf_val_preds).correlation
print(f"Validation Spearman correlation for Random Forest: {rf_val_corr}")

Validation Spearman correlation for Random Forest: 0.42036497402971584
