In [25]:
# load libraries
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from scipy import stats

In [26]:
# specify file paths
train_df = "train_subset.csv"
train_targets = "train_targets.csv"

# read in files
X = pd.read_csv(train_df)
y = pd.read_csv(train_targets)['AAC']   # keep only AAC column

print(X.shape)
print(y.shape)

(742, 457)
(742,)


In [None]:
### FIRST RUN: No scaling, just feature selection

In [27]:
# split the training dataframe into train and val
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
# initialize LASSO model
lasso = linear_model.Lasso()

# specify parameters for optimization
parameters = {
    'alpha': [0.1, 1, 10, 100],
    'max_iter': [5000, 7500]
  }

# identify optimal parameters
reg = GridSearchCV(
    estimator = lasso,
    param_grid = parameters,
    verbose=2
  )
reg.fit(X_train, y_train)
print('Best params:', reg.best_params_ )

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END ...........................alpha=0.1, max_iter=5000; total time=   0.1s
[CV] END ...........................alpha=0.1, max_iter=5000; total time=   0.2s
[CV] END ...........................alpha=0.1, max_iter=5000; total time=   0.2s
[CV] END ...........................alpha=0.1, max_iter=5000; total time=   0.3s
[CV] END ...........................alpha=0.1, max_iter=5000; total time=   0.2s
[CV] END ...........................alpha=0.1, max_iter=7500; total time=   0.2s
[CV] END ...........................alpha=0.1, max_iter=7500; total time=   0.2s
[CV] END ...........................alpha=0.1, max_iter=7500; total time=   0.2s
[CV] END ...........................alpha=0.1, max_iter=7500; total time=   0.2s
[CV] END ...........................alpha=0.1, max_iter=7500; total time=   0.2s
[CV] END .............................alpha=1, max_iter=5000; total time=   0.0s
[CV] END .............................alpha=1, ma

In [31]:
# test best model parameters on test data
reg_best = reg.best_estimator_

# get predicted values for test data
y_pred = reg_best.predict(X_test)

# get spearman's correlation
res = stats.spearmanr(y_pred, y_test)
print('Spearman correlation:', res[0])              # previously 0.3304392621434899

Spearman correlation: 0.38029056801264327


In [None]:
### SECOND RUN: Scaling, just feature selection

In [35]:
# standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [36]:
# split the training dataframe into train and val
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [37]:
# initialize LASSO model
lasso = linear_model.Lasso()

# specify parameters for optimization
parameters = {
    'alpha': [0.1, 1, 10, 100],
    'max_iter': [5000, 7500]
  }

# identify optimal parameters
reg = GridSearchCV(
    estimator = lasso,
    param_grid = parameters,
    verbose=2
  )
reg.fit(X_train, y_train)
print('Best params:', reg.best_params_ )

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END ...........................alpha=0.1, max_iter=5000; total time=   0.0s
[CV] END ...........................alpha=0.1, max_iter=5000; total time=   0.0s
[CV] END ...........................alpha=0.1, max_iter=5000; total time=   0.0s
[CV] END ...........................alpha=0.1, max_iter=5000; total time=   0.0s
[CV] END ...........................alpha=0.1, max_iter=5000; total time=   0.0s
[CV] END ...........................alpha=0.1, max_iter=7500; total time=   0.0s
[CV] END ...........................alpha=0.1, max_iter=7500; total time=   0.0s
[CV] END ...........................alpha=0.1, max_iter=7500; total time=   0.0s
[CV] END ...........................alpha=0.1, max_iter=7500; total time=   0.0s
[CV] END ...........................alpha=0.1, max_iter=7500; total time=   0.0s
[CV] END .............................alpha=1, max_iter=5000; total time=   0.0s
[CV] END .............................alpha=1, ma

In [38]:
# test best model parameters on test data
reg_best = reg.best_estimator_

# get predicted values for test data
y_pred = reg_best.predict(X_test)

# get spearman's correlation
res = stats.spearmanr(y_pred, y_test)
print('Spearman correlation:', res[0])

# TODO: Debug

Spearman correlation: nan


  res = stats.spearmanr(y_pred, y_test)
