## Second Approach: Machine Learning
Different models of classic machine learning are employed.

#### Import of libraries and dataset

In [1]:
# import all libraries
import numpy as np
import pandas as pd
import re

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [2]:
from helpers import sample_data, load_data

# load data.
data_oligo_1 = np.delete(load_data("data/data-oligo/011021_SFL_SYN211_Oligo_1uM_Rawdata_270spectralcolumns.csv"), 0, 1).T
data_oligo_1 = data_oligo_1[data_oligo_1[:, 0] != -999, :]
data_oligo_2 = np.delete(load_data("data/data-oligo/051021_SFL_SYN211_Oligo_5uM_rawdata_270spectralcolumns.csv"), 0, 1).T
data_oligo_2 = data_oligo_2[data_oligo_2[:, 0] != -999, :]

data_oligo = np.append(data_oligo_1, data_oligo_2, axis = 0)

y_oligo = np.expand_dims(np.zeros(len(data_oligo)), axis=1)

data_PFF1 = np.delete(load_data("data/data-pff/191121_G80_AInII_SYn211_AsynPFF_5microM_rawdata_290spectracolumns.csv"), 0, 1).T
data_PFF1 = data_PFF1[data_PFF1[:, 0] != -999, :]
data_PFF2 = np.delete(load_data("data/data-pff/220421_G80_AInII_SYn211_AsynPFF_20microM_880_spectralcolumns.csv"), 0, 1).T
data_PFF2 = data_PFF2[data_PFF2[:, 0] != -999, :]

data_PFF =  np.append(data_PFF1, data_PFF2, axis = 0)

y_PFF = np.expand_dims(np.ones(len(data_PFF)), axis=1)

mix_50_50 = np.delete(load_data("data/data-mix/1221_G80_AI_SYn211_2uMPFF50__2uMOligo50__rawdata_840spectralcolumns.csv"), 0, 1).T
mix_50_50 = mix_50_50[mix_50_50[:, 0] != -999, :]

y_50_50 = np.expand_dims(np.ones(len(data_PFF))*.5, axis=1)

mix_75_25 = np.delete(load_data("data/data-mix/1221_G80_AI_SYn211_4.5uMPFF75__1.5uMOligo25__Rawdata_710spectralcolumns.csv"), 0, 1).T
mix_75_25 = mix_75_25[mix_75_25[:, 0] != -999, :]

y_75_25 = np.expand_dims(np.ones(len(mix_75_25))*.75, axis=1)

mix_25_75 = np.delete(load_data("data/data-mix/1221_G80_AI_SYn211_4.5uMOligo75%_1.5uMPFF25%_Rawdata_730spectralcolumns.csv"), 0, 1).T
mix_25_75 = mix_25_75[mix_25_75[:, 0] != -999, :]

y_25_75 = np.expand_dims(np.ones(len(mix_25_75))*.25, axis=1)

print(data_oligo.shape, data_PFF.shape, mix_50_50.shape, mix_75_25.shape, mix_25_75.shape)


(540, 133) (1170, 133) (840, 133) (710, 133) (730, 133)


In [5]:
#Build X and y by concatenating the different dataset (after sampling the same number of datapoints for each dataset)
np.random.seed()
X = np.concatenate((data_oligo, data_PFF[np.random.randint(data_PFF.shape[0], size=540),:],mix_50_50[np.random.randint(mix_50_50.shape[0], size=540),:], mix_75_25[np.random.randint(mix_75_25.shape[0], size=540),:]), axis = 0)
y = np.concatenate((y_oligo, y_PFF[np.random.randint(y_PFF.shape[0], size=540),:],y_50_50[np.random.randint(y_50_50.shape[0], size=540),:], y_75_25[np.random.randint(y_75_25.shape[0], size=540),:]), axis = 0)
X_train_, y_train_ = X, y
x_test_, y_test_ = mix_25_75, y_25_75

### Linear regression
Model trained on 0-100, 50-50, 75-25, 100-0 and tested on 25-75. Negative mean absolute error during 5-fold cross validation.

In [6]:
# linear regression
lm = LinearRegression()
scores = cross_val_score(lm, X_train_, y_train_, scoring='neg_mean_absolute_error', cv=5)
for i in range (5):
    print(f"Negative mean absolute error: {scores[i]:.4f}. iteration: {i}")

Negative mean absolute error: -0.6920. iteration: 0
Negative mean absolute error: -0.1895. iteration: 1
Negative mean absolute error: -0.1413. iteration: 2
Negative mean absolute error: -0.1030. iteration: 3
Negative mean absolute error: -0.1223. iteration: 4


In [7]:
lm.fit(X_train_, y_train_)
preds = lm.predict(x_test_)
print(f"Mean absolute error: {sklearn.metrics.mean_absolute_error(y_test_, preds):.4f}")

Mean absolute error: 0.1364


### Ridge Regression
Model trained on 0-100, 50-50, 75-25, 100-0 and tested on 25-75. Negative mean squared error and mean absolute error during 5-fold cross validation.


In [8]:
clf = Ridge(alpha=0.01)
scores = cross_val_score(clf, X_train_, y_train_, scoring='neg_mean_absolute_error', cv=5)
for i in range (5):
    print(f"Negative mean absolute error: {scores[i]:.4f}. iteration: {i}")

Negative mean absolute error: -0.6914. iteration: 0
Negative mean absolute error: -0.1916. iteration: 1
Negative mean absolute error: -0.1418. iteration: 2
Negative mean absolute error: -0.0970. iteration: 3
Negative mean absolute error: -0.1159. iteration: 4


In [9]:
clf.fit(X_train_, y_train_)
preds = clf.predict(x_test_)
print(f"Mean absolute error: {sklearn.metrics.mean_absolute_error(y_test_, preds):.4f}")

Mean absolute error: 0.1200


Cross validation for Ridge Regression to optimize alpha

In [10]:
# step-1: create a cross-validation scheme
folds = KFold(n_splits = 5, shuffle = True, random_state = 100)

# step-2: specify range of hyperparameters to tune
hyper_params = [{'alpha': [10**k for k in range(-4, 4)]}]


# step-3: perform grid search
# 3.1 specify model
lm = Ridge()
lm.fit(X_train_, y_train_)

# 3.2 call GridSearchCV()
model_cv = GridSearchCV(estimator = lm, 
                        param_grid = hyper_params, 
                        scoring= 'neg_mean_absolute_error', 
                        cv = folds, 
                        verbose = 1,
                        return_train_score=True)      

# fit the model
model_cv.fit(X_train_, y_train_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=100, shuffle=True),
             estimator=Ridge(),
             param_grid=[{'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100,
                                    1000]}],
             return_train_score=True, scoring='neg_mean_absolute_error',
             verbose=1)

In [11]:
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.006134,0.001554,0.000787,0.000259,0.0001,{'alpha': 0.0001},-0.133615,-0.136887,-0.129853,-0.137938,...,-0.133716,0.003304,3,-0.121221,-0.121474,-0.124762,-0.123045,-0.125197,-0.12314,0.001633
1,0.013189,0.007352,0.000717,0.000149,0.001,{'alpha': 0.001},-0.133417,-0.136307,-0.129526,-0.137727,...,-0.133389,0.003286,2,-0.121109,-0.121256,-0.124597,-0.122872,-0.125107,-0.122988,0.001651
2,0.007662,0.002865,0.00079,0.000109,0.01,{'alpha': 0.01},-0.133254,-0.135357,-0.128809,-0.138258,...,-0.133013,0.003574,1,-0.12186,-0.121425,-0.125169,-0.122955,-0.125672,-0.123416,0.001718
3,0.013421,0.011008,0.000789,0.000201,0.1,{'alpha': 0.1},-0.136135,-0.138205,-0.129952,-0.140978,...,-0.135309,0.004151,4,-0.126215,-0.125814,-0.129475,-0.126889,-0.130001,-0.127679,0.001724
4,0.012529,0.00877,0.00116,0.000784,1.0,{'alpha': 1},-0.14069,-0.143366,-0.130949,-0.143552,...,-0.138157,0.005473,5,-0.130517,-0.130408,-0.134501,-0.132207,-0.135227,-0.132572,0.00199
5,0.004183,0.000173,0.000698,8.6e-05,10.0,{'alpha': 10},-0.146294,-0.150861,-0.130369,-0.144148,...,-0.140781,0.00806,6,-0.13418,-0.134659,-0.139911,-0.137742,-0.140538,-0.137406,0.002613
6,0.004193,0.000266,0.000653,8e-06,100.0,{'alpha': 100},-0.15408,-0.160021,-0.134492,-0.149642,...,-0.147182,0.009695,7,-0.142366,-0.142415,-0.149231,-0.146443,-0.14949,-0.145989,0.003126
7,0.004246,9.4e-05,0.000771,0.0002,1000.0,{'alpha': 1000},-0.17152,-0.177665,-0.152403,-0.166481,...,-0.164966,0.009292,8,-0.161093,-0.161487,-0.167704,-0.164507,-0.168351,-0.164628,0.003023


### Extremely randomized trees
Model trained on 0-100, 50-50, 75-25, 100-0 and tested on 25-75.


In [19]:
import optuna

def objective(trial):
    # parameters to optimize
    a = trial.suggest_categorical("model_params/etr/n_estimators", [100, 150, 200, 300, 400, 500])
    b = trial.suggest_loguniform("model_params/etr/max_depth", 2, 32)
    c = trial.suggest_categorical("model_params/etr/min_samples_split", [4, 8, 16])
    d = trial.suggest_categorical("model_params/etr/max_features", ['auto', 0.2, 0.4, 0.6, 0.8])
    e = trial.suggest_categorical("model_params/etr/min_samples_leaf", [1, 2, 3, 4])

    reg = ExtraTreesRegressor(n_estimators=a, max_depth=b, min_samples_split=c, max_features=d, min_samples_leaf=e, random_state=0)
    return cross_val_score(reg, X_train_, y_train_.ravel(), cv=5, scoring='neg_mean_absolute_error').mean()

study = optuna.create_study(direction='maximize')
# run param optimization with optuna
study.optimize(objective, n_trials=100)

[32m[I 2021-12-23 17:00:48,153][0m A new study created in memory with name: no-name-0eed5b7c-469a-46b2-99cd-dc8479efc1fe[0m
[32m[I 2021-12-23 17:00:55,651][0m Trial 0 finished with value: -0.18148468317452532 and parameters: {'model_params/etr/n_estimators': 150, 'model_params/etr/max_depth': 11.962485438069068, 'model_params/etr/min_samples_split': 4, 'model_params/etr/max_features': 0.4, 'model_params/etr/min_samples_leaf': 3}. Best is trial 0 with value: -0.18148468317452532.[0m
[32m[I 2021-12-23 17:00:57,690][0m Trial 1 finished with value: -0.34550715594016923 and parameters: {'model_params/etr/n_estimators': 100, 'model_params/etr/max_depth': 2.1644612895529534, 'model_params/etr/min_samples_split': 4, 'model_params/etr/max_features': 0.2, 'model_params/etr/min_samples_leaf': 1}. Best is trial 0 with value: -0.18148468317452532.[0m
[32m[I 2021-12-23 17:01:04,779][0m Trial 2 finished with value: -0.22521452425088798 and parameters: {'model_params/etr/n_estimators': 300,

In [20]:
# ERT with best parameters
reg = ExtraTreesRegressor(n_estimators=400, max_depth=14.27, min_samples_split=4, max_features='auto', min_samples_leaf=1, random_state=0).fit(X_train_, y_train_.ravel())
preds = reg.predict(x_test_)
print(f"Mean absolute error: {sklearn.metrics.mean_absolute_error(y_test_, preds):.4f}")


Mean absolute error: 0.2783
