In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import make_pipeline
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

Read the dataframe from the CSV file. We first try using 12 features selected by the Random Forest. The train-validation-test split is set at 60-20-20 .

In [41]:
header_list = ['SpMax_L', 'J_Dz(e)', 'nHM', 'F01[N-N]', 'F04[C-N]','NssssC', 'nCb-', 'C%', 'nCp', 'nO', 'F03[C-N]',
               'SdssC', 'HyWi_B(m)', 'LOC', 'SM6_L', 'F03[C-O]', 'Me', 'Mi', 'nN-N', 'nArNO2', 'nCRX3', 'SpPosA_B(p)', 
              'nCIR', 'B01[C-Br]', 'B03[C-Cl]', 'N-073', 'SpMax_A', 'Psi_i_1d', 'B04[C-Br]', 'SdO' , 'TI2_L', 'nCrt',
               'C-026', 'F02[C-N]', 'nHDon', 'SpMax_B(m)', 'Psi_i_A', 'nN', 'SM6_B(m)', 'nArCOOR', 'nX', 'TARGET']
df = pd.read_csv('BioDegData.csv', names = header_list)
X = df[['SpMax_B(m)','SpMax_L','SpPosA_B(p)','Psi_i_A','Mi','F02[C-N]','SM6_B(m)','SdssC','nN','SpMax_A','SdO','J_Dz(e)']]
y = df.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

In [42]:
pipe_gs = make_pipeline(StandardScaler(), MinMaxScaler(), SVC(random_state=1))
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid = [{'svc__C': param_range, 'svc__kernel': ['linear']}, 
              {'svc__C': param_range, 'svc__gamma': param_range, 'svc__kernel': ['rbf']}]
gs = GridSearchCV(estimator=pipe_gs, param_grid=param_grid, scoring='accuracy', refit=True, cv=10, n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

0.8405257936507937
{'svc__C': 10.0, 'svc__gamma': 10.0, 'svc__kernel': 'rbf'}


In [43]:
clf = gs.best_estimator_
scores = cross_val_score(estimator=clf, X=X_val, y=y_val, cv=10, n_jobs=1)
print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

CV accuracy scores: [0.95454545 0.9047619  0.76190476 0.85714286 0.85714286 0.80952381
 0.9047619  0.80952381 0.85714286 0.9047619 ]
CV accuracy: 0.862 +/- 0.054


In [44]:
y_pred = clf.predict(X_test)
print('--------------Results using RF Selection--------------')
print('Accuracy score: %.3f' % accuracy_score(y_test, y_pred))
print('Precision score: %.3f' % precision_score(y_test, y_pred))
print('Recall score: %.3f' % recall_score(y_test, y_pred))
print('F1 score: %.3f' % f1_score(y_test, y_pred))
print('Confusion Matrix: ')
print(confusion_matrix(y_test, y_pred))

--------------Results using RF Selection--------------
Accuracy score: 0.877
Precision score: 0.836
Recall score: 0.789
F1 score: 0.812
Confusion Matrix: 
[[129  11]
 [ 15  56]]


We try the same process using the 31 features our SBS selected.

In [45]:
X = df[['J_Dz(e)', 'nHM', 'F01[N-N]', 'F04[C-N]', 'nCb-', 'C%', 'nCp', 'nO',
       'F03[C-N]', 'SdssC', 'HyWi_B(m)', 'LOC', 'SM6_L', 'F03[C-O]', 'Me',
       'Mi', 'nArNO2', 'nCRX3', 'SpPosA_B(p)', 'B01[C-Br]', 'Psi_i_1d',
       'TI2_L', 'C-026', 'F02[C-N]', 'nHDon', 'SpMax_B(m)', 'Psi_i_A', 'nN',
       'SM6_B(m)', 'nArCOOR', 'nX']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

In [46]:
pipe_gs = make_pipeline(StandardScaler(), MinMaxScaler(), SVC(random_state=1))
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
param_grid = [{'svc__C': param_range, 'svc__kernel': ['linear']}, 
              {'svc__C': param_range, 'svc__gamma': param_range, 'svc__kernel': ['rbf']}]
gs = GridSearchCV(estimator=pipe_gs, param_grid=param_grid, scoring='accuracy', refit=True, cv=10, n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

0.8674107142857144
{'svc__C': 10.0, 'svc__gamma': 1.0, 'svc__kernel': 'rbf'}


In [47]:
clf = gs.best_estimator_
scores = cross_val_score(estimator=clf, X=X_val, y=y_val, cv=10, n_jobs=1)
print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

CV accuracy scores: [0.81818182 0.9047619  0.71428571 0.85714286 0.9047619  0.76190476
 0.95238095 0.76190476 0.85714286 0.95238095]
CV accuracy: 0.848 +/- 0.079


In [48]:
y_pred = clf.predict(X_test)
print('--------------Results using SBS--------------')
print('Accuracy score: %.3f' % accuracy_score(y_test, y_pred))
print('Precision score: %.3f' % precision_score(y_test, y_pred))
print('Recall score: %.3f' % recall_score(y_test, y_pred))
print('F1 score: %.3f' % f1_score(y_test, y_pred))
print('Confusion Matrix: ')
print(confusion_matrix(y_test, y_pred))

--------------Results using SBS--------------
Accuracy score: 0.882
Precision score: 0.848
Recall score: 0.789
F1 score: 0.818
Confusion Matrix: 
[[130  10]
 [ 15  56]]


We then try the same process transforming the feature space with PCA. We include the number of PCA components in our grid search to find the best possible combination of hyperparameters.

In [49]:
X = df.iloc[:, 0:-1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

In [51]:
pipe_pca = make_pipeline(StandardScaler(), MinMaxScaler(), PCA(), SVC(random_state=1))
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
PCA_range = [10,20,30,40]
param_grid = [{'svc__C': param_range, 'svc__kernel': ['linear'], 'pca__n_components': PCA_range}, 
              {'svc__C': param_range, 'svc__gamma': param_range, 'svc__kernel': ['rbf'], 'pca__n_components': PCA_range}]
gs = GridSearchCV(estimator=pipe_pca, param_grid=param_grid, scoring='accuracy', refit=True, cv=10, n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

0.8674851190476192
{'pca__n_components': 30, 'svc__C': 100.0, 'svc__kernel': 'linear'}


In [52]:
clf = gs.best_estimator_
scores = cross_val_score(estimator=clf, X=X_val, y=y_val, cv=10, n_jobs=1)
print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

CV accuracy scores: [0.81818182 0.80952381 0.80952381 0.76190476 0.76190476 0.85714286
 0.9047619  0.85714286 0.76190476 0.95238095]
CV accuracy: 0.829 +/- 0.061


In [53]:
y_pred = clf.predict(X_test)
print('--------------Results using PCA--------------')
print('Accuracy score: %.3f' % accuracy_score(y_test, y_pred))
print('Precision score: %.3f' % precision_score(y_test, y_pred))
print('Recall score: %.3f' % recall_score(y_test, y_pred))
print('F1 score: %.3f' % f1_score(y_test, y_pred))
print('Confusion Matrix: ')
print(confusion_matrix(y_test, y_pred))

--------------Results using PCA--------------
Accuracy score: 0.891
Precision score: 0.833
Recall score: 0.845
F1 score: 0.839
Confusion Matrix: 
[[128  12]
 [ 11  60]]


Same thing with LDA. We include it in the pipeline.

In [54]:
X = df.iloc[:, 0:-1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

In [55]:
pipe_lda = make_pipeline(StandardScaler(), MinMaxScaler(), LinearDiscriminantAnalysis(n_components=1), SVC(random_state=1))
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
solver_choices = ['svd', 'lsqr', 'eigen']
param_grid = [{'svc__C': param_range, 'svc__kernel': ['linear'],'lineardiscriminantanalysis__solver': solver_choices}, 
              {'svc__C': param_range, 'svc__gamma': param_range, 'svc__kernel': ['rbf'], 'lineardiscriminantanalysis__solver': solver_choices}]
gs = GridSearchCV(estimator=pipe_lda, param_grid=param_grid, scoring='accuracy', refit=True, cv=10, n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

0.8484623015873016
{'lineardiscriminantanalysis__solver': 'svd', 'svc__C': 10.0, 'svc__gamma': 1.0, 'svc__kernel': 'rbf'}


In [56]:
clf = gs.best_estimator_
scores = cross_val_score(estimator=clf, X=X_val, y=y_val, cv=10, n_jobs=1)
print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

CV accuracy scores: [0.72727273 0.80952381 0.66666667 0.71428571 0.80952381 0.85714286
 0.80952381 0.80952381 0.85714286 0.85714286]
CV accuracy: 0.792 +/- 0.063


In [57]:
y_pred = clf.predict(X_test)
print('--------------Results using LDA--------------')
print('Accuracy score: %.3f' % accuracy_score(y_test, y_pred))
print('Precision score: %.3f' % precision_score(y_test, y_pred))
print('Recall score: %.3f' % recall_score(y_test, y_pred))
print('F1 score: %.3f' % f1_score(y_test, y_pred))
print('Confusion Matrix: ')
print(confusion_matrix(y_test, y_pred))

--------------Results using LDA--------------
Accuracy score: 0.858
Precision score: 0.747
Recall score: 0.873
F1 score: 0.805
Confusion Matrix: 
[[119  21]
 [  9  62]]
