In [24]:
# load libraries
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split, GridSearchCV
from scipy import stats

In [25]:
# specify file paths
train_df = "train.csv"
train_targets = "train_targets.csv"

# read in files
X = pd.read_csv(train_df)
X = X.iloc[:, 1:]                       # remove cell line labels
y = pd.read_csv(train_targets)['AAC']   # keep only AAC column

In [26]:
# split the training dataframe into train and val
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
# initialize LASSO model
lasso = linear_model.Lasso()

# specify parameters for optimization
parameters = {
    'alpha': [0.1, 1, 10, 100],
    'max_iter': [5000, 7500]
  }

# identify optimal parameters
reg = GridSearchCV(
    estimator = lasso,
    param_grid = parameters,
    verbose=2
  )
reg.fit(X_train, y_train)
reg.best_params_                        # {'alpha': 10, 'max_iter': 5000}

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END ...........................alpha=0.1, max_iter=5000; total time=  30.0s
[CV] END ...........................alpha=0.1, max_iter=5000; total time=  28.3s
[CV] END ...........................alpha=0.1, max_iter=5000; total time=  35.0s
[CV] END ...........................alpha=0.1, max_iter=5000; total time=  30.4s
[CV] END ...........................alpha=0.1, max_iter=5000; total time=  28.6s
[CV] END ...........................alpha=0.1, max_iter=7500; total time=  31.5s
[CV] END ...........................alpha=0.1, max_iter=7500; total time=  29.5s
[CV] END ...........................alpha=0.1, max_iter=7500; total time=  30.3s
[CV] END ...........................alpha=0.1, max_iter=7500; total time=  23.7s
[CV] END ...........................alpha=0.1, max_iter=7500; total time=  23.9s
[CV] END .............................alpha=1, max_iter=5000; total time=   6.3s
[CV] END .............................alpha=1, ma

{'alpha': 10, 'max_iter': 5000}

In [30]:
# test best model parameters on test data
reg_best = reg.best_estimator_

# get predicted values for test data
y_pred = reg_best.predict(X_test)

# get spearman's correlation
res = stats.spearmanr(y_pred, y_test)
res[0]                                  # 0.3304392621434899

0.3304392621434899