In [1]:
# load libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from scipy import stats

import warnings

# suppress all warnings
warnings.filterwarnings('ignore')

# set seed
np.random.seed(42)

In [2]:
# specify file paths
train_df = "train_subset.csv"
train_targets = "train_targets.csv"

# read in files
X = pd.read_csv(train_df)
y = pd.read_csv(train_targets)['AAC']   # keep only AAC column

print(X.shape)
print(y.shape)

(742, 457)
(742,)


Ensemble 1: Elastic Net, Random Forest, and XGBoost

In [5]:
# create dataframe to store results
model_df = pd.DataFrame(columns=['Model', 'Fold', 'Spearman', 'Pearson'])

# initialize the outer folds (5 folds, 80% train, 20% test)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# initialize variables to store best model correlation and features
best_corr = 0
best_fold = 0
en_feat = None
rf_feat = None
xg_feat = None

# loop through each of the outer five folds
fold = 1
for train_index, test_index in outer_cv.split(X):

  # split train and test
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

  # initialize models
  en = linear_model.ElasticNet(alpha = 1,
                               l1_ratio = 0.5,
                               max_iter = 1000)
  rf = RandomForestRegressor(n_estimators = 200,
                             max_depth = 20,
                             min_samples_split = 5,
                             min_samples_leaf = 1,
                             max_features = 'sqrt')
  xg = xgb.XGBRegressor(tree_method="hist",
                        early_stopping_rounds=2,
                        eval_metric="rmse", verbosity=0,
                        objective='reg:squarederror',
                        max_depth=5, subsample=0.8)

  # fit models
  en.fit(X_train, y_train)
  rf.fit(X_train, y_train)
  xg.fit(X_train, y_train, eval_set = [(X_test, y_test)], verbose=0)


  # get predicted values for test data
  en_y_pred = en.predict(X_test)
  rf_y_pred = rf.predict(X_test)
  xg_y_pred = xg.predict(X_test)

  # average predictions
  y_pred = (en_y_pred + rf_y_pred + xg_y_pred) / 3

  # compute correlations
  s_cor = stats.spearmanr(y_pred, y_test)
  p_cor = stats.pearsonr(y_pred, y_test)

  # save model correlation and features (if better than previous)
  if s_cor[0] > best_corr:
          best_corr = s_cor[0]
          best_fold = fold
          en_feat = en.coef_
          rf_feat = rf.feature_importances_
          xg_feat = xg.feature_importances_

  # save results to dataframe
  new_row = pd.DataFrame({'Model': ['en+rf+xgb'], 'Fold': [fold], 'Spearman': [s_cor[0]], 'Pearson': [p_cor[0]]})
  model_df = pd.concat([model_df, new_row],ignore_index = True)

  # print results from fold
  print("Fold", fold, "Spearman correlation:", s_cor[0])

  fold += 1

# print results
print("\nBest correlation:", best_corr, "from Fold", best_fold)

# create feature importance dataframes
en_feat = pd.DataFrame({
    'Peak': X_train.columns,
    'Weight': en_feat
}).sort_values(by='Weight', ascending=False)
rf_feat = pd.DataFrame({
    'Peak': X_train.columns,
    'Weight': rf_feat
}).sort_values(by='Weight', ascending=False)
xg_feat = pd.DataFrame({
    'Peak': X_train.columns,
    'Weight': xg_feat
}).sort_values(by='Weight', ascending=False)

# save feature importance dataframe
en_feat.to_csv("e1_en_features.csv", index=False)
rf_feat.to_csv("e1_rf_features.csv", index=False)
xg_feat.to_csv("e1_xg_features.csv", index=False)

model_df.to_csv('ensemble1.csv', index=False)

Fold 1 Spearman correlation: 0.44058447359393565
Fold 2 Spearman correlation: 0.45547493194850175
Fold 3 Spearman correlation: 0.5551900997703834
Fold 4 Spearman correlation: 0.5250143910017918
Fold 5 Spearman correlation: 0.54151172843996

Best correlation: 0.5551900997703834 from Fold 3


Ensemble 2: Random Forest and XGBoost

In [6]:
# create dataframe to store results
model_df = pd.DataFrame(columns=['Model', 'Fold', 'Spearman', 'Pearson'])

# initialize the outer folds (5 folds, 80% train, 20% test)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# initialize variables to store best model correlation and features
best_corr = 0
best_fold = 0
rf_feat = None
xg_feat = None

# loop through each of the outer five folds
fold = 1
for train_index, test_index in outer_cv.split(X):

  # split train and test
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

  # initialize models
  rf = RandomForestRegressor(n_estimators = 200,
                             max_depth = 20,
                             min_samples_split = 5,
                             min_samples_leaf = 1,
                             max_features = 'sqrt')
  xg = xgb.XGBRegressor(tree_method="hist",
                        early_stopping_rounds=2,
                        eval_metric="rmse", verbosity=0,
                        objective='reg:squarederror',
                        max_depth=5, subsample=0.8)

  # fit models
  rf.fit(X_train, y_train)
  xg.fit(X_train, y_train, eval_set = [(X_test, y_test)], verbose=0)


  # get predicted values for test data
  rf_y_pred = rf.predict(X_test)
  xg_y_pred = xg.predict(X_test)

  # average predictions
  y_pred = (rf_y_pred + xg_y_pred) / 2

  # compute correlations
  s_cor = stats.spearmanr(y_pred, y_test)
  p_cor = stats.pearsonr(y_pred, y_test)

  # save model correlation and features (if better than previous)
  if s_cor[0] > best_corr:
          best_corr = s_cor[0]
          best_fold = fold
          rf_feat = rf.feature_importances_
          xg_feat = xg.feature_importances_

  # save results to dataframe
  new_row = pd.DataFrame({'Model': ['rf+xgb'], 'Fold': [fold], 'Spearman': [s_cor[0]], 'Pearson': [p_cor[0]]})
  model_df = pd.concat([model_df, new_row],ignore_index = True)

  # print results from fold
  print("Fold", fold, "Spearman correlation:", s_cor[0])

  fold += 1

# print results
print("\nBest correlation:", best_corr, "from Fold", best_fold)

# create feature importance dataframes
rf_feat = pd.DataFrame({
    'Peak': X_train.columns,
    'Weight': rf_feat
}).sort_values(by='Weight', ascending=False)
xg_feat = pd.DataFrame({
    'Peak': X_train.columns,
    'Weight': xg_feat
}).sort_values(by='Weight', ascending=False)

# save feature importance dataframe
rf_feat.to_csv("e2_rf_features.csv", index=False)
xg_feat.to_csv("e2_xg_features.csv", index=False)

model_df.to_csv('ensemble2.csv', index=False)

Fold 1 Spearman correlation: 0.4353132915957636
Fold 2 Spearman correlation: 0.46365285812712426
Fold 3 Spearman correlation: 0.5183584473882327
Fold 4 Spearman correlation: 0.5547495784744916
Fold 5 Spearman correlation: 0.551044016985519

Best correlation: 0.5547495784744916 from Fold 4
