In [10]:
import pandas as pd
from sklearn.svm import LinearSVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from scipy import stats

In [11]:
# specify file paths
train_df = "../.data/train.csv"
train_targets = "../.data/train_targets.csv"

In [12]:
# read in files
X = pd.read_csv(train_df)
X = X.iloc[:, 1:]                       # remove cell line labels
y = pd.read_csv(train_targets)['AAC']   # keep only AAC column

# preview
print("Train Features Shape:", X.shape)
print("Train Targets Shape:", y.shape)

Train Features Shape: (742, 19920)
Train Targets Shape: (742,)


In [13]:
# split train test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# hyperparam ranges for tuning
C_values = [0.1, 1.0, 10.0]
epsilon_values = [0.01, 0.1, 1.0]

In [15]:
best_rmse = float('inf')
best_params = {}

for C in C_values:
    for epsilon in epsilon_values:
        print(f"Training Linear SVR model with C={C} and epsilon={epsilon}...")
        linear_svr_pipeline = make_pipeline(StandardScaler(), LinearSVR(C=C, epsilon=epsilon, max_iter=1000, random_state=42))
        linear_svr_pipeline.fit(X_train, y_train)
        print("Training completed.")

        # make preds on test
        y_pred = linear_svr_pipeline.predict(X_test)

        # evak RMSE
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        print(f"Test RMSE with C={C}, epsilon={epsilon}: {rmse}")

        # update best parameters if better
        if rmse < best_rmse:
            best_rmse = rmse
            best_params = {'C': C, 'epsilon': epsilon}
            
print(f"Best parameters found: C={best_params['C']}, epsilon={best_params['epsilon']} with RMSE={best_rmse}")

Training Linear SVR model with C=0.1 and epsilon=0.01...
Training completed.




Test RMSE with C=0.1, epsilon=0.01: 0.09899819044257646
Training Linear SVR model with C=0.1 and epsilon=0.1...
Training completed.




Test RMSE with C=0.1, epsilon=0.1: 0.13143115775548203
Training Linear SVR model with C=0.1 and epsilon=1.0...
Training completed.




Test RMSE with C=0.1, epsilon=1.0: 0.1449254113125408
Training Linear SVR model with C=1.0 and epsilon=0.01...
Training completed.




Test RMSE with C=1.0, epsilon=0.01: 0.09899819044257646
Training Linear SVR model with C=1.0 and epsilon=0.1...
Training completed.




Test RMSE with C=1.0, epsilon=0.1: 0.13143115775548203
Training Linear SVR model with C=1.0 and epsilon=1.0...
Training completed.




Test RMSE with C=1.0, epsilon=1.0: 0.1449254113125408
Training Linear SVR model with C=10.0 and epsilon=0.01...
Training completed.




Test RMSE with C=10.0, epsilon=0.01: 0.09899819044257646
Training Linear SVR model with C=10.0 and epsilon=0.1...
Training completed.




Test RMSE with C=10.0, epsilon=0.1: 0.13143115775548203
Training Linear SVR model with C=10.0 and epsilon=1.0...
Training completed.
Test RMSE with C=10.0, epsilon=1.0: 0.1449254113125408
Best parameters found: C=0.1, epsilon=0.01 with RMSE=0.09899819044257646




In [16]:
# train best model
print("Retraining the best Linear SVR model...")
linear_svr_pipeline = make_pipeline(StandardScaler(), LinearSVR(C=best_params['C'], epsilon=best_params['epsilon'], max_iter=1000, random_state=42))
linear_svr_pipeline.fit(X_train, y_train)
print("Retraining completed.")

Retraining the best Linear SVR model...
Retraining completed.


In [17]:
# make preds on test
y_pred = linear_svr_pipeline.predict(X_test)

In [18]:
# eval RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("Test RMSE:", rmse)

Test RMSE: 0.09899819044257646




In [19]:
# spearman corr
res = stats.spearmanr(y_pred, y_test)
print("Spearman's correlation:", res[0])

Spearman's correlation: 0.3233124644591067
