In [None]:
from math import sqrt

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, KFold, cross_validate

from sklearn.preprocessing import OneHotEncoder
from scikeras.wrappers import KerasRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import SGD, Adam

def mean_relative_error(y_true, y_pred):
    """Return the mean of ``|y_true - y_pred| / |y_true|`` over all samples.

    Both inputs are flattened to 1-D arrays before the element-wise
    comparison. Without that, a 1-D target paired with an ``(n, 1)``
    prediction column would broadcast into an ``(n, n)`` matrix and the mean
    would be taken over every target/prediction pair instead of sample by
    sample. Converting to plain arrays also sidesteps pandas index alignment.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values, one per element of ``y_true``.

    Returns
    -------
    float
        Mean absolute relative error. Zeros in ``y_true`` are not special-cased
        and propagate as ``inf``/``nan`` through the numpy division.

    Raises
    ------
    ValueError
        Raised by numpy when the flattened inputs have incompatible lengths.
    """
    true_flat = np.asarray(y_true).ravel()
    pred_flat = np.asarray(y_pred).ravel()
    return np.mean(np.abs((true_flat - pred_flat) / true_flat))

def median_relative_error(y_true, y_pred):
    """Return the median of ``|y_true - y_pred| / |y_true|`` over all samples.

    Both inputs are flattened to 1-D arrays before the element-wise
    comparison. Without that, a 1-D target paired with an ``(n, 1)``
    prediction column would broadcast into an ``(n, n)`` matrix and the median
    would be taken over every target/prediction pair instead of sample by
    sample. Converting to plain arrays also sidesteps pandas index alignment.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values, one per element of ``y_true``.

    Returns
    -------
    float
        Median absolute relative error. Zeros in ``y_true`` are not
        special-cased and propagate as ``inf``/``nan`` through the division.

    Raises
    ------
    ValueError
        Raised by numpy when the flattened inputs have incompatible lengths.
    """
    true_flat = np.asarray(y_true).ravel()
    pred_flat = np.asarray(y_pred).ravel()
    return np.median(np.abs((true_flat - pred_flat) / true_flat))

# Scorers handed to ``cross_validate``: two custom relative-error scorers plus
# three of scikit-learn's built-in scorer names. Entry order is kept so the
# score columns come out in the same order.
# NOTE(review): the relative-error scorers are built with
# greater_is_better=True, so they are reported as positive errors; if this
# dict is ever used for model selection that flag should probably be
# reconsidered -- confirm against the make_scorer documentation.
scoring = dict(
    mean_relative_error=metrics.make_scorer(
        mean_relative_error, greater_is_better=True
    ),
    median_relative_error=metrics.make_scorer(
        median_relative_error, greater_is_better=True
    ),
    r2='r2',
    neg_mean_absolute_error='neg_mean_absolute_error',
    neg_root_mean_squared_error='neg_root_mean_squared_error',
)

In [None]:
# Load the pre-split training and test tables.
train_index = pd.read_csv("trainDataECFP.csv")
test_index = pd.read_csv("testDataECFP.csv")

# Labels assigned to the one-hot adduct columns, in encoder output order.
# NOTE(review): the encoder decides the column order itself, so these
# hard-coded labels may not match the adduct each column really encodes --
# verify against enc.categories_. The models below use all three columns, so
# only the labelling (not the fit) would be affected.
adduct_cols = ['Adduct_[M+Na]', 'Adduct_[M+H]', 'Adduct_[M-H]']

#One hot encode adducts
enc = OneHotEncoder()
encData = pd.DataFrame(enc.fit_transform(train_index[['Adduct.x']]).toarray())
if len(enc.categories_[0]) != len(adduct_cols):
    # Relabelling the trailing columns below would otherwise silently rename
    # real data columns (or fail with an obscure length mismatch).
    raise ValueError(
        f"expected {len(adduct_cols)} adduct categories, "
        f"found {len(enc.categories_[0])}: {list(enc.categories_[0])}"
    )
# reset_index() gives the frame a fresh 0..n-1 index so it lines up row by row
# with the freshly built encData frame in the concat.
trainData = train_index.reset_index()
trainData = pd.concat([trainData, encData], axis = 1)
trainData.columns = [*trainData.columns[:-len(adduct_cols)], *adduct_cols]

# Reuse the encoder fitted on the training set instead of re-fitting it on the
# test set: this guarantees the test columns have the same meaning, order and
# count as the training columns, and fails loudly on an adduct that was never
# seen during training.
encData = pd.DataFrame(enc.transform(test_index[['Adduct.x']]).toarray())
testData = test_index.reset_index()
testData = pd.concat([testData, encData], axis = 1)
testData.columns = [*testData.columns[:-len(adduct_cols)], *adduct_cols]

# Fingerprint columns are named "1" .. "1024". The list is deliberately not
# called `list`, which would shadow the builtin for every later cell.
fingerprint_cols = [str(i) for i in range(1, 1025)]
train_cols = ['m/z', *adduct_cols, *fingerprint_cols]
X_train = trainData.loc[:, train_cols]
X_test = testData.loc[:, train_cols]
y_train = trainData["CCS_AVG"]
y_test = testData["CCS_AVG"]
# Number of input features; read by the network definitions further down.
ncols = X_train.shape[1]

In [None]:
# Baseline for comparison: plain linear regression scored with 10-fold
# cross-validation on the training set (train-fold scores included).
linModel_scores = pd.DataFrame(
    cross_validate(
        LinearRegression(),
        X_train,
        y_train,
        scoring=scoring,
        cv=10,
        return_train_score=True,
    )
)
# Per-metric averages across the folds, then the per-fold table itself.
print(linModel_scores.mean())
linModel_scores

In [None]:
# Fit the linear baseline on the whole training set and score it on the
# held-out test set.
LinearReg = LinearRegression()
LinearReg.fit(X=X_train, y=y_train)
pred_test = LinearReg.predict(X_test)

rmse = sqrt(metrics.mean_squared_error(y_true=y_test, y_pred=pred_test))
r2 = metrics.r2_score(y_true=y_test, y_pred=pred_test)
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=pred_test)
# The relative-error helper is given a plain ndarray rather than the Series.
mdre = median_relative_error(np.asarray(y_test), pred_test)
print(mae, r2, rmse, mdre)

In [None]:
def create_model():
    """Build and compile the fully connected regression network.

    The input width and the size of the first hidden layer are both read from
    the module-level ``ncols``. Two further ReLU layers (512 and 32 units)
    feed a single linear output unit. The model is compiled with a
    mean-squared-error loss and Adam at a learning rate of 1e-4.

    Returns
    -------
    Sequential
        The compiled, untrained Keras model.
    """
    layers = [
        Dense(ncols, input_shape=(ncols,), activation='relu'),
        Dense(512, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1),
    ]
    network = Sequential(layers)
    network.compile(
        optimizer=Adam(learning_rate=0.0001),
        loss='mean_squared_error',
    )
    return network

# Wrap the network factory so scikit-learn can cross-validate it.
# `model=` is the current SciKeras keyword for the build function; `build_fn=`
# is its deprecated alias.
keras_regressor = KerasRegressor(
    model=create_model,
    epochs=75,
    verbose=1,
    batch_size=128,
    validation_split=0.1,
)

# 10-fold cross-validation with the same scorers as the linear baseline.
keras_scores = cross_validate(
    keras_regressor,
    X_train,
    y_train,
    cv=10,
    scoring=scoring,
    return_train_score=True,
)
keras_scores = pd.DataFrame(keras_scores)
# Per-metric averages across the folds, then the per-fold table itself.
print(keras_scores.mean())
keras_scores

In [None]:
# Final network: built by the same factory used for cross-validation so the
# two architectures cannot drift apart through copy-paste edits.
model = create_model()

# Train on the full training set, holding out the last 10% for validation.
# Despite its name, `nn_model` is the History object returned by fit(); the
# trained network itself stays in `model`.
nn_model = model.fit(
    X_train,
    y_train,
    epochs=75,
    verbose=1,
    batch_size=128,
    validation_split=0.1,
)

In [None]:
import matplotlib.pyplot as plt

# Training vs. held-out loss per epoch, read from the Keras History object.
# The second curve is the `validation_split` hold-out, not cross-validation,
# so it is labelled "validation".
fig, ax = plt.subplots()
ax.plot(nn_model.history['loss'], label='train')
ax.plot(nn_model.history['val_loss'], label='validation')
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(loc='upper left')
plt.show()

In [None]:
# Score the trained network on the held-out test set.
pred_test = model.predict(X_test)

rmse = sqrt(metrics.mean_squared_error(y_true=y_test, y_pred=pred_test))
r2 = metrics.r2_score(y_true=y_test, y_pred=pred_test)
mae = metrics.mean_absolute_error(y_true=y_test, y_pred=pred_test)
# The network ends in a single-unit Dense layer, so the targets are reshaped
# into a column before being compared with the predictions element-wise.
mdre = median_relative_error(np.asarray(y_test).reshape(-1, 1), pred_test)
print(mae, r2, rmse, mdre)