In [13]:
# load libraries
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from scipy import stats

import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): a blanket ignore presumably also hides solver warnings from the
# models fitted below (e.g. hitting max_iter without converging) - confirm
# that this is intended.
warnings.filterwarnings(action='ignore')

# Pin NumPy's global random state so repeated runs draw the same numbers.
np.random.seed(seed=42)

In [2]:
# Input tables, given relative to the notebook's working directory.
train_df = "train_subset.csv"
train_targets = "train_targets.csv"
test_df = "test.csv"

# Training features, training response, and the competition test features.
X_train = pd.read_csv(train_df)
y_train = pd.read_csv(train_targets).loc[:, 'AAC']   # response: AAC column only
X_test = pd.read_csv(test_df)

# Report the dimensions of everything that was loaded.
for _loaded in (X_train, y_train, X_test):
    print(_loaded.shape)

(742, 457)
(742,)
(304, 19921)


In [14]:
# Keep only the test columns that also exist in the training matrix,
# arranged in the training column order.
gene_cols = X_train.columns
X_test = X_test[gene_cols]

# Hold out 20% of the training rows for an internal Spearman correlation check.
Xtr, Xte, ytr, yte = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

Submission 13: Elastic Net

Public score: 0.44524

In [15]:
# Elastic net settings for this submission.
en_params = dict(alpha=1, l1_ratio=1, max_iter=1000)
en = linear_model.ElasticNet(**en_params)

# Train on all labelled rows, then predict the competition test set.
y_pred = en.fit(X_train, y_train).predict(X_test)

# Write the submission file; the default row index is written as the first column.
predictions = pd.DataFrame({'AAC': y_pred})
predictions.to_csv("enpred.csv")

# Internal check: refit the same estimator on the training split and
# rank-correlate its predictions with the held-out responses.
y_pred = en.fit(Xtr, ytr).predict(Xte)
rho, _ = stats.spearmanr(y_pred, yte)
print("Spearman correlation:", rho)

Spearman correlation: 0.38029056801264327


Submission 12: Elastic Net

Public score: 0.41931

In [16]:
# Elastic net settings for this submission.
en_params = dict(alpha=1, l1_ratio=0.8, max_iter=1000)
en = linear_model.ElasticNet(**en_params)

# Train on all labelled rows, then predict the competition test set.
y_pred = en.fit(X_train, y_train).predict(X_test)

# Write the submission file; the default row index is written as the first column.
predictions = pd.DataFrame({'AAC': y_pred})
predictions.to_csv("enpred.csv")

# Internal check: refit the same estimator on the training split and
# rank-correlate its predictions with the held-out responses.
y_pred = en.fit(Xtr, ytr).predict(Xte)
rho, _ = stats.spearmanr(y_pred, yte)
print("Spearman correlation:", rho)

Spearman correlation: 0.40229131352393976


Submission 11: Elastic Net

Public score: 0.29042

In [11]:
# Elastic net settings for this submission.
en_params = dict(alpha=1, l1_ratio=0.2, max_iter=1000)
en = linear_model.ElasticNet(**en_params)

# Train on all labelled rows, then predict the competition test set.
y_pred = en.fit(X_train, y_train).predict(X_test)

# Write the submission file; the default row index is written as the first column.
predictions = pd.DataFrame({'AAC': y_pred})
predictions.to_csv("enpred.csv")

# Internal check: refit the same estimator on the training split and
# rank-correlate its predictions with the held-out responses.
y_pred = en.fit(Xtr, ytr).predict(Xte)
rho, _ = stats.spearmanr(y_pred, yte)
print("Spearman correlation:", rho)

Spearman correlation: 0.36617845653784015


Submission 10: Ensemble 3: Elastic Net and LASSO

Public score: 0.35215

In [None]:
# Ensemble 3: blend an elastic net (fitted on the raw features) with a LASSO
# (fitted on standardized features).

# initialize models
en = linear_model.ElasticNet(alpha = 1,
                             l1_ratio = 0.5,
                             max_iter = 1000)
ls = linear_model.Lasso(alpha = 0.01,
                        max_iter = 500)

# elastic net: trained and applied on the unscaled features
en.fit(X_train, y_train)
en_y_pred = en.predict(X_test)

# Standardize into NEW variables. Rebinding X_train / X_test to the scaler's
# NumPy output would drop the column labels that other cells read through
# X_train.columns and would hand scaled data to every cell run afterwards.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# lasso: trained and applied on the standardized features
ls.fit(X_train_scaled, y_train)
ls_y_pred = ls.predict(X_test_scaled)

# Weighted average of the two models. np.average divides by the sum of the
# weights, so the blend stays on the scale of the individual predictions
# (dividing a 0.6/0.4 weighted sum by 2 again would halve every value).
y_pred = np.average([en_y_pred, ls_y_pred], axis=0, weights=[0.6, 0.4])

# save predictions
predictions = pd.DataFrame({
    #'sampleId': sampleId,
    'AAC': y_pred})
predictions.to_csv("ensemble3pred.csv")

Submission 9: Elastic Net

Public score: 0.35877

In [12]:
# Elastic net settings for this submission.
en_params = dict(alpha=1, l1_ratio=0.5, max_iter=1000)
en = linear_model.ElasticNet(**en_params)

# Train on all labelled rows, then predict the competition test set.
y_pred = en.fit(X_train, y_train).predict(X_test)

# Write the submission file; the default row index is written as the first column.
predictions = pd.DataFrame({'AAC': y_pred})
predictions.to_csv("enpred.csv")

# Internal check: refit the same estimator on the training split and
# rank-correlate its predictions with the held-out responses.
y_pred = en.fit(Xtr, ytr).predict(Xte)
rho, _ = stats.spearmanr(y_pred, yte)
print("Spearman correlation:", rho)

Spearman correlation: 0.4096593593864911


Submission 8: Ensemble 1: Elastic Net, Random Forest, and XGBoost

Public score: 0.34287

In [None]:
# Ensemble 1: elastic net + random forest + XGBoost, blended with weights
# 0.8 / 0.1 / 0.1.

# initialize models
en = linear_model.ElasticNet(alpha = 1,
                              l1_ratio = 0.5,
                              max_iter = 1000)
rf = RandomForestRegressor(n_estimators = 200,
                            max_depth = 20,
                            min_samples_split = 5,
                            min_samples_leaf = 1,
                            max_features = 'sqrt')
xg = xgb.XGBRegressor(tree_method="hist",
                      early_stopping_rounds=2,
                      eval_metric="rmse", verbosity=0,
                      objective='reg:squarederror',
                      max_depth=5, subsample=0.8)

# XGBoost's early stopping needs a validation set: carve 20% out of the
# training data for it.
X_train_xg, X_test_xg, y_train_xg, y_test_xg = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# fit models (elastic net and random forest see all training rows)
en.fit(X_train, y_train)
rf.fit(X_train, y_train)
xg.fit(X_train_xg, y_train_xg, eval_set = [(X_test_xg, y_test_xg)], verbose=0)

# get predicted values for test data
en_y_pred = en.predict(X_test)
rf_y_pred = rf.predict(X_test)
xg_y_pred = xg.predict(X_test)

# Weighted average of the three models. np.average divides by the sum of the
# weights, so the blend stays on the scale of the individual predictions
# (dividing the weighted sum by the model count would shrink it to a third).
y_pred = np.average([en_y_pred, rf_y_pred, xg_y_pred],
                    axis=0, weights=[0.8, 0.1, 0.1])

# save predictions
predictions = pd.DataFrame({
    #'sampleId': sampleId,
    'AAC': y_pred})
predictions.to_csv("ensemble1pred.csv")

# Per-feature weights of each model, labelled with the training column names
# and sorted from largest to smallest weight.
feature_names = X_train.columns
en_feat = pd.DataFrame({
    'Peak': feature_names,
    'Weight': en.coef_
}).sort_values(by='Weight', ascending=False)
rf_feat = pd.DataFrame({
    'Peak': feature_names,
    'Weight': rf.feature_importances_
}).sort_values(by='Weight', ascending=False)
xg_feat = pd.DataFrame({
    'Peak': feature_names,
    'Weight': xg.feature_importances_
}).sort_values(by='Weight', ascending=False)

# save feature importance dataframes
en_feat.to_csv("e1_en_features.csv", index=False)
rf_feat.to_csv("e1_rf_features.csv", index=False)
xg_feat.to_csv("e1_xg_features.csv", index=False)

Submission 7: Ensemble 1: Elastic Net, Random Forest, and XGBoost

Public score: 0.29959

In [None]:
# Ensemble 1: elastic net + random forest + XGBoost, blended with relative
# weights 0.4 / 0.2 / 0.2.

# initialize models
en = linear_model.ElasticNet(alpha = 1,
                              l1_ratio = 0.5,
                              max_iter = 1000)
rf = RandomForestRegressor(n_estimators = 200,
                            max_depth = 20,
                            min_samples_split = 5,
                            min_samples_leaf = 1,
                            max_features = 'sqrt')
xg = xgb.XGBRegressor(tree_method="hist",
                      #early_stopping_rounds=2,
                      eval_metric="rmse", verbosity=0,
                      objective='reg:squarederror',
                      max_depth=5, subsample=0.8)

# split the training dataframe into train and val for XGBoost
# NOTE(review): early stopping is disabled above, so the validation part is
# presumably only monitored while XGBoost still trains on 80% of the rows -
# confirm that this is intended.
X_train_xg, X_test_xg, y_train_xg, y_test_xg = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# fit models (elastic net and random forest see all training rows)
en.fit(X_train, y_train)
rf.fit(X_train, y_train)
xg.fit(X_train_xg, y_train_xg, eval_set = [(X_test_xg, y_test_xg)], verbose=0)

# get predicted values for test data
en_y_pred = en.predict(X_test)
rf_y_pred = rf.predict(X_test)
xg_y_pred = xg.predict(X_test)

# Weighted average of the three models. The weights sum to 0.8, not 1;
# np.average normalizes by that sum, so the result is a proper weighted mean
# on the scale of the individual predictions.
y_pred = np.average([en_y_pred, rf_y_pred, xg_y_pred],
                    axis=0, weights=[0.4, 0.2, 0.2])

# save predictions
predictions = pd.DataFrame({
    #'sampleId': sampleId,
    'AAC': y_pred})
predictions.to_csv("ensemble1pred.csv")

# Per-feature weights of each model, labelled with the training column names
# and sorted from largest to smallest weight.
feature_names = X_train.columns
en_feat = pd.DataFrame({
    'Peak': feature_names,
    'Weight': en.coef_
}).sort_values(by='Weight', ascending=False)
rf_feat = pd.DataFrame({
    'Peak': feature_names,
    'Weight': rf.feature_importances_
}).sort_values(by='Weight', ascending=False)
xg_feat = pd.DataFrame({
    'Peak': feature_names,
    'Weight': xg.feature_importances_
}).sort_values(by='Weight', ascending=False)

# save feature importance dataframes
en_feat.to_csv("e1_en_features.csv", index=False)
rf_feat.to_csv("e1_rf_features.csv", index=False)
xg_feat.to_csv("e1_xg_features.csv", index=False)

Submission 6: Ensemble 1: Elastic Net, Random Forest, and XGBoost

Public score: 0.23668

In [None]:
# Ensemble 1 on standardized features: elastic net + random forest + XGBoost,
# equal-weight mean.

# get gene names
feat = X_train.columns

# initialize models
en = linear_model.ElasticNet(alpha = 1,
                              l1_ratio = 0.5,
                              max_iter = 1000)
rf = RandomForestRegressor(n_estimators = 200,
                            max_depth = 20,
                            min_samples_split = 5,
                            min_samples_leaf = 1,
                            max_features = 'sqrt')
xg = xgb.XGBRegressor(tree_method="hist",
                      #early_stopping_rounds=2,
                      eval_metric="rmse", verbosity=0,
                      objective='reg:squarederror',
                      max_depth=5, subsample=0.8)

# Standardize into NEW variables. Rebinding X_train / X_test to the scaler's
# NumPy output would drop the column labels that other cells read through
# X_train.columns and would hand scaled data to every cell run afterwards.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# split the scaled training data into train and val for XGBoost
X_train_xg, X_test_xg, y_train_xg, y_test_xg = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

# fit models (elastic net and random forest see all training rows)
en.fit(X_train_scaled, y_train)
rf.fit(X_train_scaled, y_train)
xg.fit(X_train_xg, y_train_xg, eval_set = [(X_test_xg, y_test_xg)], verbose=0)

# get predicted values for test data
en_y_pred = en.predict(X_test_scaled)
rf_y_pred = rf.predict(X_test_scaled)
xg_y_pred = xg.predict(X_test_scaled)

# average predictions (equal weights)
y_pred = (en_y_pred + rf_y_pred + xg_y_pred) / 3

# save predictions
predictions = pd.DataFrame({
    #'sampleId': sampleId,
    'AAC': y_pred})
predictions.to_csv("ensemble1pred.csv")

# get feature importances (raw arrays, in the column order stored in `feat`)
en_feat = en.coef_
rf_feat = rf.feature_importances_
xg_feat = xg.feature_importances_

Submission 5: Ensemble 1: Elastic Net, Random Forest, and XGBoost

Public score: 0.27724

In [None]:
# Ensemble 1: elastic net + random forest + XGBoost, equal-weight mean.

# Column labels of the training matrix, reused to label the feature weights.
feat = X_train.columns

# Model definitions.
en = linear_model.ElasticNet(alpha=1, l1_ratio=0.5, max_iter=1000)
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features='sqrt',
)
xg = xgb.XGBRegressor(
    tree_method="hist",
    eval_metric="rmse",
    verbosity=0,
    objective='reg:squarederror',
    max_depth=5,
    subsample=0.8,
)

# XGBoost is trained on an 80% split and evaluated on the remaining 20%.
X_train_xg, X_test_xg, y_train_xg, y_test_xg = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Fit the three models; elastic net and random forest use every training row.
en.fit(X_train, y_train)
rf.fit(X_train, y_train)
xg.fit(X_train_xg, y_train_xg, eval_set=[(X_test_xg, y_test_xg)], verbose=0)

# Predict the competition test set with each model.
en_y_pred = en.predict(X_test)
rf_y_pred = rf.predict(X_test)
xg_y_pred = xg.predict(X_test)

# Equal-weight mean of the three prediction vectors.
prediction_sum = en_y_pred + rf_y_pred + xg_y_pred
y_pred = prediction_sum / 3

# Write the submission file; the default row index is written as the first column.
predictions = pd.DataFrame({'AAC': y_pred})
predictions.to_csv("ensemble1pred.csv")

# Per-feature weights of each model, sorted from largest to smallest.
en_feat = pd.DataFrame({'Peak': feat, 'Weight': en.coef_})
en_feat = en_feat.sort_values(by='Weight', ascending=False)
rf_feat = pd.DataFrame({'Peak': feat, 'Weight': rf.feature_importances_})
rf_feat = rf_feat.sort_values(by='Weight', ascending=False)
xg_feat = pd.DataFrame({'Peak': feat, 'Weight': xg.feature_importances_})
xg_feat = xg_feat.sort_values(by='Weight', ascending=False)

# Persist the three weight tables.
en_feat.to_csv("e1_en_features.csv", index=False)
rf_feat.to_csv("e1_rf_features.csv", index=False)
xg_feat.to_csv("e1_xg_features.csv", index=False)

Submission 3: Ensemble 2: Random Forest and XGBoost

Public score: 0.24924

In [None]:
# Ensemble 2 on standardized features: random forest + XGBoost, equal-weight
# mean.

# get gene names
feat = X_train.columns

# initialize models
rf = RandomForestRegressor(n_estimators = 200,
                            max_depth = 20,
                            min_samples_split = 5,
                            min_samples_leaf = 1,
                            max_features = 'sqrt')
xg = xgb.XGBRegressor(tree_method="hist",
                      early_stopping_rounds=2,
                      eval_metric="rmse", verbosity=0,
                      objective='reg:squarederror',
                      max_depth=5, subsample=0.8)

# Standardize into NEW variables. Rebinding X_train / X_test to the scaler's
# NumPy output would drop the column labels that other cells read through
# X_train.columns and would hand scaled data to every cell run afterwards.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# XGBoost's early stopping needs a validation set: carve 20% out of the
# scaled training data for it.
X_train_xg, X_test_xg, y_train_xg, y_test_xg = train_test_split(X_train_scaled, y_train, test_size=0.2, random_state=42)

# fit models (the random forest sees all training rows)
rf.fit(X_train_scaled, y_train)
xg.fit(X_train_xg, y_train_xg, eval_set = [(X_test_xg, y_test_xg)], verbose=0)

# get predicted values for test data
rf_y_pred = rf.predict(X_test_scaled)
xg_y_pred = xg.predict(X_test_scaled)

# average predictions (equal weights)
y_pred = (rf_y_pred + xg_y_pred) / 2

# save predictions
predictions = pd.DataFrame({
    #'sampleId': sampleId,
    'AAC': y_pred
  })
predictions.to_csv("ensemble2pred.csv")

# Per-feature importances of each model, labelled with the training column
# names captured in `feat` and sorted from largest to smallest.
rf_feat = pd.DataFrame({
    'Feat': feat,
    'Weight': rf.feature_importances_
}).sort_values(by='Weight', ascending=False)
xg_feat = pd.DataFrame({
    'Feat': feat,
    'Weight': xg.feature_importances_
}).sort_values(by='Weight', ascending=False)

# save feature importance dataframes
rf_feat.to_csv("e2_rf_features.csv", index=False)
xg_feat.to_csv("e2_xg_features.csv", index=False)

Submission 2: Ensemble 2: Random Forest and XGBoost

Public score: 0.25283

In [None]:
# Ensemble 2: random forest + XGBoost (with early stopping), equal-weight mean.

# initialize models
rf = RandomForestRegressor(n_estimators = 200,
                            max_depth = 20,
                            min_samples_split = 5,
                            min_samples_leaf = 1,
                            max_features = 'sqrt')
xg = xgb.XGBRegressor(tree_method="hist",
                      early_stopping_rounds=2,
                      eval_metric="rmse", verbosity=0,
                      objective='reg:squarederror',
                      max_depth=5, subsample=0.8)

# XGBoost's early stopping needs a validation set: carve 20% out of the
# training data for it.
X_train_xg, X_test_xg, y_train_xg, y_test_xg = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# fit models (the random forest sees all training rows)
rf.fit(X_train, y_train)
xg.fit(X_train_xg, y_train_xg, eval_set = [(X_test_xg, y_test_xg)], verbose=0)

# get predicted values for test data
rf_y_pred = rf.predict(X_test)
xg_y_pred = xg.predict(X_test)

# average predictions (equal weights)
y_pred = (rf_y_pred + xg_y_pred) / 2

# save predictions
# `sampleId` is not defined by any cell shown in this notebook, so building the
# frame with it raises NameError on a fresh kernel. The column is left out,
# matching the other submission cells; the default row index is written as the
# first CSV column.
predictions = pd.DataFrame({
    #'sampleId': sampleId,
    'AAC': y_pred
  })
predictions.to_csv("ensemble2pred.csv")

# Per-feature importances of each model, labelled with the training column
# names and sorted from largest to smallest.
feature_names = X_train.columns
rf_feat = pd.DataFrame({
    'Peak': feature_names,
    'Weight': rf.feature_importances_
}).sort_values(by='Weight', ascending=False)
xg_feat = pd.DataFrame({
    'Peak': feature_names,
    'Weight': xg.feature_importances_
}).sort_values(by='Weight', ascending=False)

# save feature importance dataframes
rf_feat.to_csv("e2_rf_features.csv", index=False)
xg_feat.to_csv("e2_xg_features.csv", index=False)

Submission 1: Ensemble 2: Random Forest and XGBoost

Public score: 0.19703

In [None]:
# Ensemble 2: random forest + XGBoost (no early stopping), equal-weight mean.

# initialize models
rf = RandomForestRegressor(n_estimators = 200,
                            max_depth = 20,
                            min_samples_split = 5,
                            min_samples_leaf = 1,
                            max_features = 'sqrt')
xg = xgb.XGBRegressor(tree_method="hist",
                      #early_stopping_rounds=2,
                      eval_metric="rmse", verbosity=0,
                      objective='reg:squarederror',
                      max_depth=5, subsample=0.8)

# fit models (both see all training rows)
rf.fit(X_train, y_train)
xg.fit(X_train, y_train)

# get predicted values for test data
rf_y_pred = rf.predict(X_test)
xg_y_pred = xg.predict(X_test)

# average predictions (equal weights)
y_pred = (rf_y_pred + xg_y_pred) / 2

# save predictions
# `sampleId` is not defined by any cell shown in this notebook, so building the
# frame with it raises NameError on a fresh kernel. The column is left out,
# matching the other submission cells; the default row index is written as the
# first CSV column.
predictions = pd.DataFrame({
    #'sampleId': sampleId,
    'AAC': y_pred
  })
predictions.to_csv("ensemble2pred.csv")

# Per-feature importances of each model, labelled with the training column
# names and sorted from largest to smallest.
feature_names = X_train.columns
rf_feat = pd.DataFrame({
    'Peak': feature_names,
    'Weight': rf.feature_importances_
}).sort_values(by='Weight', ascending=False)
xg_feat = pd.DataFrame({
    'Peak': feature_names,
    'Weight': xg.feature_importances_
}).sort_values(by='Weight', ascending=False)

# save feature importance dataframes
rf_feat.to_csv("e2_rf_features.csv", index=False)
xg_feat.to_csv("e2_xg_features.csv", index=False)