In [None]:
# load libraries
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from scipy import stats

import warnings

# Silence every warning category for the remainder of the session.
# NOTE(review): this also hides any convergence warnings the models below may emit.
warnings.filterwarnings(action='ignore', category=Warning)

# Seed NumPy's global random number generator so repeated runs match.
SEED = 42
np.random.seed(SEED)

Submission 20: Elastic Net

Public score: 0.3515

In [None]:
# Input CSV locations for this submission.
train_df = "train_subset_5.csv"
train_targets = "targets_5.csv"
test_df = "test.csv"

# Load the training features, the AAC target column, and the test features.
X_train = pd.read_csv(train_df)
targets = pd.read_csv(train_targets)
y_train = targets['AAC']   # only the AAC column is used as the target
X_test = pd.read_csv(test_df)

# Report the shape of everything that was just loaded.
for loaded in (X_train, y_train, X_test):
    print(loaded.shape)

# Restrict the test frame to the training columns (same order);
# a column missing from the test file raises KeyError here.
gene_cols = X_train.columns
X_test = X_test[gene_cols]
print(X_test.shape)

# Hold out 20% of the training rows to compute an internal Spearman correlation.
Xtr, Xte, ytr, yte = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

(689, 520)
(689,)
(304, 19921)
(304, 520)


In [None]:
# Elastic Net hyperparameters used for this submission.
en_params = dict(alpha=1, l1_ratio=0.5, max_iter=1000)
en = linear_model.ElasticNet(**en_params)

# Train on the full training set, then predict the test set.
y_pred = en.fit(X_train, y_train).predict(X_test)

# Write the predictions to disk. No sampleId column is included; to_csv also
# writes the frame's default integer index as the first (unnamed) column.
predictions = pd.DataFrame({'AAC': y_pred})
predictions.to_csv("enpred.csv")

# Internal check: refit the same estimator on the training split and score the
# held-out split. After this, `en` is the split-trained model, not the one
# that produced enpred.csv, and `y_pred` holds the validation predictions.
y_pred = en.fit(Xtr, ytr).predict(Xte)
rho = stats.spearmanr(y_pred, yte)[0]
print("Spearman correlation:", rho)

Spearman correlation: 0.33977725215920485


Submission 18: Elastic Net

Public score: 0.3923

In [None]:
# Input CSV locations for this submission.
train_df = "train_subset_375.csv"
train_targets = "targets_375.csv"
test_df = "test.csv"

# Load the training features, the AAC target column, and the test features.
X_train = pd.read_csv(train_df)
targets = pd.read_csv(train_targets)
y_train = targets['AAC']   # only the AAC column is used as the target
X_test = pd.read_csv(test_df)

# Report the shape of everything that was just loaded.
for loaded in (X_train, y_train, X_test):
    print(loaded.shape)

# Restrict the test frame to the training columns (same order);
# a column missing from the test file raises KeyError here.
gene_cols = X_train.columns
X_test = X_test[gene_cols]
print(X_test.shape)

# Hold out 20% of the training rows to compute an internal Spearman correlation.
Xtr, Xte, ytr, yte = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Elastic Net hyperparameters used for this submission.
en_params = dict(alpha=1, l1_ratio=0.8, max_iter=1000)
en = linear_model.ElasticNet(**en_params)

# Train on the full training set, then predict the test set.
y_pred = en.fit(X_train, y_train).predict(X_test)

# Write the predictions to disk. No sampleId column is included; to_csv also
# writes the frame's default integer index as the first (unnamed) column.
predictions = pd.DataFrame({'AAC': y_pred})
predictions.to_csv("enpred.csv")

# Internal check: refit the same estimator on the training split and score the
# held-out split. After this, `en` is the split-trained model, not the one
# that produced enpred.csv, and `y_pred` holds the validation predictions.
y_pred = en.fit(Xtr, ytr).predict(Xte)
rho = stats.spearmanr(y_pred, yte)[0]
print("Spearman correlation:", rho)

Spearman correlation: 0.5226599741803645


Submission 17: Elastic Net

Public score: 0.362

In [None]:
# Elastic Net hyperparameters used for this submission.
en_params = dict(alpha=1, l1_ratio=0.5, max_iter=1000)
en = linear_model.ElasticNet(**en_params)

# Train on the full training set, then predict the test set.
# X_train / y_train / X_test are whatever the most recently run loading cell produced.
y_pred = en.fit(X_train, y_train).predict(X_test)

# Write the predictions to disk. No sampleId column is included; to_csv also
# writes the frame's default integer index as the first (unnamed) column.
predictions = pd.DataFrame({'AAC': y_pred})
predictions.to_csv("enpred.csv")

# Internal check: refit the same estimator on the training split and score the
# held-out split. After this, `en` is the split-trained model, not the one
# that produced enpred.csv, and `y_pred` holds the validation predictions.
y_pred = en.fit(Xtr, ytr).predict(Xte)
rho = stats.spearmanr(y_pred, yte)[0]
print("Spearman correlation:", rho)

Spearman correlation: 0.5163882789347635


Submission 17: Elastic Net

Public score: 0.45033

In [None]:
# Elastic Net hyperparameters used for this submission.
en_params = dict(alpha=10, l1_ratio=0.2, max_iter=1000)
en = linear_model.ElasticNet(**en_params)

# Train on the full training set, then predict the test set.
# X_train / y_train / X_test are whatever the most recently run loading cell produced.
y_pred = en.fit(X_train, y_train).predict(X_test)

# Write the predictions to disk. No sampleId column is included; to_csv also
# writes the frame's default integer index as the first (unnamed) column.
predictions = pd.DataFrame({'AAC': y_pred})
predictions.to_csv("enpred.csv")

# Internal check: refit the same estimator on the training split and score the
# held-out split. After this, `en` is the split-trained model, not the one
# that produced enpred.csv, and `y_pred` holds the validation predictions.
y_pred = en.fit(Xtr, ytr).predict(Xte)
rho = stats.spearmanr(y_pred, yte)[0]
print("Spearman correlation:", rho)

Spearman correlation: 0.4643732513468072


Submission 16: Random Forest

Public score: 0.2329

In [None]:
# Random Forest hyperparameters used for this submission.
# random_state is pinned so the forest does not draw from NumPy's global RNG:
# without it, the fitted trees (and therefore the saved predictions and the
# printed correlation) depend on how much of the global random stream earlier
# cells consumed, which makes re-runs and out-of-order runs non-reproducible.
rf = RandomForestRegressor(
    n_estimators = 100,
    max_depth = 20,
    min_samples_split = 2,
    min_samples_leaf = 1,
    max_features = 'sqrt',
    random_state = 42
)

# fit model on the full training set
rf.fit(X_train, y_train)

# get predicted values for test data
y_pred = rf.predict(X_test)

# save predictions (no sampleId column; to_csv also writes the default
# integer index as the first, unnamed column)
predictions = pd.DataFrame({
    #'sampleId': sampleId,
    'AAC': y_pred})
predictions.to_csv("rfpred.csv")

# get correlation: refit the same estimator on the training split and score
# the held-out split. After this, `rf` is the split-trained model, not the
# one that produced rfpred.csv, and `y_pred` holds the validation predictions.
rf.fit(Xtr, ytr)
y_pred = rf.predict(Xte)
print("Spearman correlation:", stats.spearmanr(y_pred, yte)[0])

Spearman correlation: 0.5369938201666503


Submission 15: LASSO

Public score: 0.364

In [None]:
# LASSO hyperparameters used for this submission.
ls_params = dict(alpha=1, max_iter=1000)
ls = linear_model.Lasso(**ls_params)

# Train on the full training set, then predict the test set.
# X_train / y_train / X_test are whatever the most recently run loading cell produced.
y_pred = ls.fit(X_train, y_train).predict(X_test)

# Write the predictions to disk. No sampleId column is included; to_csv also
# writes the frame's default integer index as the first (unnamed) column.
predictions = pd.DataFrame({'AAC': y_pred})
predictions.to_csv("lspred.csv")

# Internal check: refit the same estimator on the training split and score the
# held-out split. After this, `ls` is the split-trained model, not the one
# that produced lspred.csv, and `y_pred` holds the validation predictions.
y_pred = ls.fit(Xtr, ytr).predict(Xte)
rho = stats.spearmanr(y_pred, yte)[0]
print("Spearman correlation:", rho)

Spearman correlation: 0.47962272913186116
