Look at whether any samples' predictions are improved by the inclusion of X10-X17.

In [1]:
%matplotlib notebook
%load_ext autoreload
%autoreload 2

from biomarker.models import Ensemble
from biomarker.data_collection import *
from sklearn import svm
from sklearn.model_selection import cross_val_score


import numpy as np

data_collection


In [2]:
# --- Sample selection -------------------------------------------------------
# Passed to parse_master_file(exclude_keys=...) when the master table is loaded.
EXCLUDE_KEYS = [
    206, 205,
    184, 183,
    82, 81,
    45,
]
# Rows whose 'Key' value is listed here are held out of training and used for
# the spot-check predictions.
TEST_KEYS = [218, 217, 216]

# --- SVM hyper-parameters shared by the SVR / SVC models below --------------
GAMMA = 0.1       # RBF kernel coefficient
EPSILON = 0.001   # SVR epsilon (regression models only)
C = 30            # regularisation parameter

In [3]:
# Load the master table, dropping the rows listed in EXCLUDE_KEYS.
excel = parse_master_file(exclude_keys=EXCLUDE_KEYS)

# Row *positions* of the held-out samples. These are later used to index plain
# NumPy arrays (np.delete(X, test_idxs, axis=0), X[test_idxs], y[test_idxs]),
# so they must be positional. Taking `excel[mask].index` would yield index
# labels instead, which only coincide with positions while the frame carries a
# default 0..n-1 index.
test_idxs = np.flatnonzero(np.isin(excel['Key'], TEST_KEYS)).tolist()
print('Test indices: ', test_idxs)
# alter the weighting of low logK values:
# excel = duplicate_master(excel, 2, 2, how='lt')

# List of data-file names, one entry per row, consumed by the create_x*_matrix
# helpers.
L = get_filename_list(excel['Associated data'])

[3, 9]
Test indices:  [191, 192, 193]


In [4]:
# Preview the first rows of the parsed master table to sanity-check its columns.
excel.head()

Unnamed: 0,Input,Key,Associated data,X10: Category Method,X11: Temperature (K),X12: [Salt*Valency],X13: Category Salt type,X14: [Buffer] (mM),X15: pH,X16: CI #,X17: CI,Unnamed: 11,Output: logK,Output: logKbucket
0,1,17,SB_156,A,,,,,,,,,4.39,1
1,2,18,SB_156,B,,,,50.0,4.74,,,,4.39,1
2,3,19,SB_158,A,,,,,,1.0,CI_2,,4.63,1
3,4,20,SB_159,A,,,,,,2.0,CI_1,,4.8,1
4,5,21,SB_160,A,,,,,,2.0,CI_2,,4.9,1


In [5]:
# Build one feature matrix per descriptor family from the data-file list `L`.
# The builders are called in the same x1 -> x7 order as before.
x1, x4, x5, x6, x7 = (
    build_matrix(L)
    for build_matrix in (
        create_x1_matrix,
        create_x4_matrix,
        create_x5_matrix,
        create_x6_matrix,
        create_x7_matrix,
    )
)

# Targets as plain arrays: the logK values (regression) and their bucket
# labels (classification).
y, y_buck = (
    excel[target_col].values
    for target_col in ('Output: logK', 'Output: logKbucket')
)

In [6]:
# Select the columns between the first three columns and the last two columns
# of the master table, purely by position.
# NOTE(review): judging by the excel.head() preview, this range covers the
# X10..X17 columns and also the 'Unnamed: 11' column — confirm that
# prepare_master expects (or ignores) that extra column.
x10_x17 = excel.iloc[:, 3:-2]
# prepare_master returns the prepared matrix together with its column names.
master, master_names = prepare_master(x10_x17)

In [7]:
# Assemble the design matrix by stacking the feature blocks column-wise.
# To include the X10-X17 block, append `master` to the tuple:
# X = np.hstack((x1,x4,x5,x6,x7,master))
feature_blocks = (x1, x4, x5, x6, x7)
X = np.hstack(feature_blocks)
X.shape

(196, 1050)

In [8]:
# Training data = every row except the held-out test rows. The same rows are
# removed from the features, the regression targets and the bucket labels so
# the three arrays stay aligned.
X_train, y_train, y_buck_train = (
    np.delete(full_array, test_idxs, axis=0)
    for full_array in (X, y, y_buck)
)

In [9]:
# Regression predictions
# Fit an RBF support-vector regressor on the training rows, then print its
# predictions for the held-out rows next to their 'Output: logK' targets.
# NOTE(review): the recorded output shows the same value for all three test
# rows; with an RBF kernel that is what happens when test points are far from
# every support vector (prediction collapses to the intercept) — consider
# checking feature scaling / GAMMA.
clf = svm.SVR(kernel='rbf', C=C, gamma=GAMMA, epsilon=EPSILON)
clf.fit(X_train, y_train)

X_test, y_test = X[test_idxs], y[test_idxs]
prediction = clf.predict(X_test)
print(prediction, y_test)

[6.29753103 6.29753103 6.29753103] [6.54 1.99 1.99]


In [10]:
# Classification cross-validation
# 5-fold cross-validation of an SVC on the logK bucket labels, run on the full
# matrix (test rows included); the cell displays the mean of the fold scores.
clf = svm.SVC(C=C, gamma=GAMMA, decision_function_shape='ovo')
bucket_fold_scores = cross_val_score(clf, X, y_buck, cv=5)
bucket_fold_scores.mean()

0.7862644415917843

In [11]:
# Regression cross-validation
# 5-fold cross-validation of the RBF SVR on the full matrix. The scorer is the
# *negated* mean absolute error, so values closer to zero are better.
clf = svm.SVR(kernel='rbf', C=C, gamma=GAMMA, epsilon=EPSILON)
cvs = cross_val_score(
    clf,
    X,
    y,
    scoring='neg_mean_absolute_error',
    cv=5,
)
cv_mean, cv_std = cvs.mean(), cvs.std()
print('Mean Cross-Validation Score: {}, Stddev: {}'.format(cv_mean, cv_std))

Mean Cross-Validation Score: -2.087047898125759, Stddev: 0.5968168130546884


In [12]:
# Fit the estimator currently bound to `clf` on the training rows and display
# its dual coefficients. When the cells are run in order, `clf` is the SVR
# created in the regression cross-validation cell.
# NOTE(review): in the recorded output many coefficients are exactly +/-3.0e+01,
# i.e. +/-C with C = 30 — this suggests most support vectors sit at the bound;
# confirm whether that is expected.
fitted = clf.fit(X_train, y_train)
fitted.dual_coef_

array([[-1.90606250e+00, -1.66612500e+00, -1.49615625e+00,
        -1.39635937e+00, -7.46648437e-01, -2.99870000e+01,
         3.00000000e+01,  6.21828125e-01,  3.00000000e+01,
         3.00000000e+01,  3.00000000e+01, -3.00000000e+01,
        -3.00000000e+01, -3.00000000e+01,  3.00000000e+01,
        -3.00000000e+01, -3.00000000e+01, -3.00000000e+01,
        -3.00000000e+01,  3.00000000e+01,  2.87813750e+01,
         3.00000000e+01,  9.61414062e-01, -2.89470000e+01,
         3.00000000e+01,  1.28164062e+00,  1.65164062e+00,
         2.01100000e+00,  2.21187500e+00,  2.65142187e+00,
         3.00000000e+01, -2.70463594e+01,  5.93143750e+00,
         3.00000000e+01, -2.46670625e+01,  5.99093750e+00,
        -1.41667188e+00, -8.16648437e-01,  5.10000000e-02,
        -2.94561250e+01, -3.00000000e+01,  3.00000000e+01,
         3.00000000e+01,  3.00000000e+01, -3.00000000e+01,
         1.00141406e+00,  3.00000000e+01, -3.00000000e+01,
         6.51843750e-01,  3.00000000e+01, -2.95765781e+0

In [13]:
# Find the predictive subspace
# Repeat the spot-check fit and the 5-fold regression cross-validation using
# one feature block at a time, to see which block carries predictive signal.
#
# The loop uses its own names (X_feat / X_feat_train) instead of rebinding the
# notebook-level X and X_train. Rebinding them would leave X pointing at the
# last feature block only, so re-running any earlier modelling cell afterwards
# would silently use the wrong matrix.
feat_choices = [x1, x4, x5, x6, x7]
for i, X_feat in enumerate(feat_choices):
    print('Feature: {}'.format(i))
    # Same held-out rows as the main split, so y_train stays aligned.
    X_feat_train = np.delete(X_feat, test_idxs, axis=0)
    print(X_feat_train.shape)

    # Spot-check: train on the training rows, predict the held-out rows.
    clf = svm.SVR(kernel='rbf', epsilon=EPSILON, C=C, gamma=GAMMA)
    clf.fit(X_feat_train, y_train)
    prediction = clf.predict(X_feat[test_idxs])
    print(prediction, y[test_idxs])

    # Cross-validation on all rows of this feature block with a fresh,
    # unfitted estimator (negated MAE: closer to zero is better).
    clf = svm.SVR(kernel='rbf', epsilon=EPSILON, C=C, gamma=GAMMA)
    cvs = cross_val_score(clf, X_feat, y, cv=5, scoring='neg_mean_absolute_error')
    print('Mean Cross-Validation Score: {}, Stddev: {}'.format(cvs.mean(), cvs.std()))

Feature: 0
(193, 250)
[6.29751966 6.29751966 6.29751966] [6.54 1.99 1.99]
Mean Cross-Validation Score: -2.0870453812438625, Stddev: 0.5968145472049823
Feature: 1
(193, 150)
[6.32214365 6.32214365 6.32214365] [6.54 1.99 1.99]
Mean Cross-Validation Score: -2.0865043204969607, Stddev: 0.5949758771406073
Feature: 2
(193, 200)
[6.08829291 6.08829291 6.09394036] [6.54 1.99 1.99]
Mean Cross-Validation Score: -1.8692286137134073, Stddev: 0.4838217734152629
Feature: 3
(193, 300)
[6.24971834 6.24971834 6.24971834] [6.54 1.99 1.99]
Mean Cross-Validation Score: -2.0307575505329476, Stddev: 0.5749235437219055
Feature: 4
(193, 150)
[6.25317197 6.25317197 6.25317197] [6.54 1.99 1.99]
Mean Cross-Validation Score: -2.014820208853879, Stddev: 0.5499397816629243
