In [None]:
# load libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from scipy import stats

import warnings

# silence every warning category for the rest of the session
warnings.simplefilter('ignore')

# fix the global NumPy random state so stochastic steps are repeatable
np.random.seed(seed=42)

In [None]:
# CSV locations: feature matrix and target table
train_df, train_targets = "train_subset.csv", "train_targets.csv"

# load the features as-is; from the target table only the AAC column is used
X = pd.read_csv(train_df)
y = pd.read_csv(train_targets).loc[:, 'AAC']

# report the dimensions of both objects, one per line
print(X.shape, y.shape, sep="\n")

(742, 457)
(742,)


Ensemble 1: Lasso, Elastic Net, Random Forest, and XGBoost

In [None]:
# Evaluate averaged-prediction ensembles of four regressors (Lasso, Elastic Net,
# Random Forest, XGBoost) with 5-fold cross-validation. For every fold each
# ensemble's Spearman and Pearson correlation with the held-out targets is
# printed, followed by the fold's best ensemble; the overall best
# (fold, ensemble) pair by Spearman correlation is printed at the end.

# create dataframe to store results
# NOTE(review): nothing in this cell writes to model_df; it is left untouched
# in case a later cell fills it -- confirm, otherwise remove it.
model_df = pd.DataFrame(columns=['Model', 'Fold', 'Spearman', 'Pearson'])

# one seed for the CV splitter, the inner validation split and the tree models
SEED = 42

# members of each averaged ensemble; the keys are the labels printed below
ENSEMBLE_MEMBERS = {
    "y_pred1": ("ls", "en", "rf", "xg"),
    "y_pred2": ("ls", "rf", "xg"),
    "y_pred3": ("ls", "en", "xg"),
    "y_pred4": ("ls", "en", "rf"),
    "y_pred5": ("en", "rf", "xg"),
    "y_pred6": ("ls", "en"),
    "y_pred7": ("ls", "rf"),
    "y_pred8": ("ls", "xg"),
    "y_pred9": ("en", "rf"),
    "y_pred10": ("en", "xg"),
    "y_pred11": ("rf", "xg"),
}

# initialize the outer folds (5 folds, 80% train, 20% test)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

# best Spearman correlation over all folds, and where it came from;
# start at -inf so that non-positive correlations can still be reported
best_corr = float("-inf")
best_fold = None
best_combo = None

# loop through each of the outer five folds (numbered from 1)
for fold, (train_index, test_index) in enumerate(outer_cv.split(X), start=1):

    # split train and test
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # scale the features (statistics are learned on the training fold only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Validation split for XGBoost early stopping, carved out of the training
    # fold. Using the test fold as eval_set would let the data we score on
    # choose the number of boosting rounds and inflate the reported scores.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=SEED)

    # initialize models; the tree models are seeded explicitly so re-running
    # the cell does not depend on the current global NumPy random state
    ls = linear_model.Lasso(alpha=0.01, max_iter=500)
    en = linear_model.ElasticNet(alpha=1, l1_ratio=0.5, max_iter=1000)
    rf = RandomForestRegressor(n_estimators=200,
                               max_depth=20,
                               min_samples_split=5,
                               min_samples_leaf=1,
                               max_features='sqrt',
                               random_state=SEED)
    xg = xgb.XGBRegressor(tree_method="hist",
                          early_stopping_rounds=2,
                          eval_metric="rmse", verbosity=0,
                          objective='reg:squarederror',
                          max_depth=5, subsample=0.8,
                          random_state=SEED)

    # fit models
    ls.fit(X_train_scaled, y_train)
    # NOTE(review): Elastic Net is fit on the unscaled features while Lasso
    # uses the scaled ones. Left as is because alpha=1 was presumably chosen
    # for unscaled inputs -- confirm before switching to X_train_scaled.
    en.fit(X_train, y_train)
    rf.fit(X_train_scaled, y_train)
    xg.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=0)

    # get predicted values for test data (same feature version as used in fit)
    base_preds = {
        "ls": ls.predict(X_test_scaled),
        "en": en.predict(X_test),
        "rf": rf.predict(X_test_scaled),
        "xg": xg.predict(X_test),
    }

    # average predictions of the member models of every ensemble
    y_pred_combinations = {
        label: sum(base_preds[name] for name in members) / len(members)
        for label, members in ENSEMBLE_MEMBERS.items()
    }

    # compute and print correlations, tracking this fold's best ensemble
    # independently of the global best
    correlations = {}
    fold_best_corr = float("-inf")
    fold_best_combo = None
    for combination, y_pred in y_pred_combinations.items():
        spearman_corr, _ = stats.spearmanr(y_pred, y_test)
        pearson_corr, _ = stats.pearsonr(y_pred, y_test)
        correlations[combination] = {
            "Spearman": spearman_corr,
            "Pearson": pearson_corr,
        }
        print(f"{combination}: Spearman = {spearman_corr:.4f}, Pearson = {pearson_corr:.4f}")
        # strict '>' keeps the first ensemble on ties; NaN never compares greater
        if spearman_corr > fold_best_corr:
            fold_best_corr = spearman_corr
            fold_best_combo = combination

    # print results from fold
    print("Fold", fold, "best Spearman correlation:", fold_best_corr, "and Combo", fold_best_combo)

    # promote the fold winner to global best when it sets a new record
    if fold_best_corr > best_corr:
        best_corr = fold_best_corr
        best_fold = fold
        best_combo = fold_best_combo

# print results
print("\nBest correlation:", best_corr, "from Fold", best_fold, "and Combo", best_combo)

y_pred1: Spearman = 0.4303, Pearson = 0.5152
y_pred2: Spearman = 0.4116, Pearson = 0.4978
y_pred3: Spearman = 0.4131, Pearson = 0.4909
y_pred4: Spearman = 0.4231, Pearson = 0.5135
y_pred5: Spearman = 0.4455, Pearson = 0.5314
y_pred6: Spearman = 0.3924, Pearson = 0.4820
y_pred7: Spearman = 0.4039, Pearson = 0.4984
y_pred8: Spearman = 0.3564, Pearson = 0.4440
y_pred9: Spearman = 0.4456, Pearson = 0.5329
y_pred10: Spearman = 0.4151, Pearson = 0.4970
y_pred11: Spearman = 0.4304, Pearson = 0.5122
Fold 1 best Spearman correlation: 0.4456307324786274 and Combo y_pred9
y_pred1: Spearman = 0.4570, Pearson = 0.4590
y_pred2: Spearman = 0.4728, Pearson = 0.4538
y_pred3: Spearman = 0.4519, Pearson = 0.4597
y_pred4: Spearman = 0.4387, Pearson = 0.4627
y_pred5: Spearman = 0.4583, Pearson = 0.4390
y_pred6: Spearman = 0.4195, Pearson = 0.4621
y_pred7: Spearman = 0.4456, Pearson = 0.4636
y_pred8: Spearman = 0.4771, Pearson = 0.4587
y_pred9: Spearman = 0.4273, Pearson = 0.4424
y_pred10: Spearman = 0.4489