In [1]:
# load libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split, GridSearchCV
from scipy import stats

In [2]:
# specify file paths
train_df = "train_subset.csv"
train_targets = "train_targets.csv"

# read in files
X = pd.read_csv(train_df)
y = pd.read_csv(train_targets)['AAC']   # keep only AAC column

print(X.shape)
print(y.shape)

(742, 457)
(742,)


In [3]:
# split the training dataframe into train and val
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# initialize LASSO model
en = ElasticNet()

# specify parameters for optimization
parameters = {
    'alpha': [0.1, 1, 10, 100],
    'l1_ratio': [0.2, 0.5, 0.8],
    'max_iter': [5000, 7500]
  }

# identify optimal parameters
reg = GridSearchCV(
    estimator = en,
    param_grid = parameters,
    verbose=2
  )
reg.fit(X_train, y_train)
print('Best params:', reg.best_params_ )

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END .............alpha=0.1, l1_ratio=0.2, max_iter=5000; total time=   0.8s
[CV] END .............alpha=0.1, l1_ratio=0.2, max_iter=5000; total time=   0.8s
[CV] END .............alpha=0.1, l1_ratio=0.2, max_iter=5000; total time=   0.7s


  model = cd_fast.enet_coordinate_descent(


[CV] END .............alpha=0.1, l1_ratio=0.2, max_iter=5000; total time=   1.3s
[CV] END .............alpha=0.1, l1_ratio=0.2, max_iter=5000; total time=   0.9s
[CV] END .............alpha=0.1, l1_ratio=0.2, max_iter=7500; total time=   0.8s
[CV] END .............alpha=0.1, l1_ratio=0.2, max_iter=7500; total time=   0.6s
[CV] END .............alpha=0.1, l1_ratio=0.2, max_iter=7500; total time=   0.5s
[CV] END .............alpha=0.1, l1_ratio=0.2, max_iter=7500; total time=   1.0s
[CV] END .............alpha=0.1, l1_ratio=0.2, max_iter=7500; total time=   0.8s
[CV] END .............alpha=0.1, l1_ratio=0.5, max_iter=5000; total time=   0.8s
[CV] END .............alpha=0.1, l1_ratio=0.5, max_iter=5000; total time=   0.4s
[CV] END .............alpha=0.1, l1_ratio=0.5, max_iter=5000; total time=   0.5s
[CV] END .............alpha=0.1, l1_ratio=0.5, max_iter=5000; total time=   0.3s
[CV] END .............alpha=0.1, l1_ratio=0.5, max_iter=5000; total time=   0.3s
[CV] END .............alpha=

In [7]:
# test best model parameters on test data
reg_best = reg.best_estimator_

# get predicted values for test data
y_pred = reg_best.predict(X_test)

# get spearman's correlation
res = stats.spearmanr(y_pred, y_test)
print('Spearman correlation:', res[0])          # previously 0.3625960117868458

Spearman correlation: 0.4096593593864911
