In [1]:
import pandas as pd
from sklearn.svm import LinearSVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from scipy import stats

In [2]:
# Input locations, given relative to the notebook's working directory.
# Note: despite the names, both variables hold path strings, not loaded tables.
train_targets = "../.data/train_targets.csv"   # CSV with the regression targets
train_df = "../.data/train.csv"                # CSV with the training features

In [None]:
# Load the feature table and drop its first column (the cell line labels),
# so that only the remaining columns are passed on as model inputs.
X = pd.read_csv(train_df).iloc[:, 1:]

# From the targets file only the 'AAC' column is used as the response.
y = pd.read_csv(train_targets)["AAC"]

# Quick sanity check of the loaded dimensions.
print("Train Features Shape:", X.shape)
print("Train Targets Shape:", y.shape)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition identical across re-runs of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardize the features, then fit a linear support vector regressor.
# LinearSVR was chosen by the author for faster training on the large
# feature space. Because the scaler is a pipeline step, it is fitted on
# X_train only as part of this call.
print("Training Linear SVR model with scaling...")
linear_svr_pipeline = make_pipeline(
    StandardScaler(),
    LinearSVR(random_state=42, max_iter=1000),
).fit(X_train, y_train)  # fit() returns the fitted pipeline itself
print("Training completed.")

In [None]:
# make preds on test
# Run the fitted pipeline on the held-out split; y_pred is what the
# RMSE and Spearman cells further down compare against y_test.
y_pred = linear_svr_pipeline.predict(X_test)

In [None]:
# Evaluate root-mean-squared error on the held-out split.
# The square root is taken explicitly instead of passing `squared=False`:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where
# it raises a TypeError. This form gives the same value on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print("Test RMSE:", rmse)

In [None]:
# Rank correlation between the predictions and the held-out targets.
# The complete result object returned by spearmanr is printed as-is.
res = stats.spearmanr(a=y_pred, b=y_test)
print("Spearman's correlation:", res)