# Baseline model based on anatomy features

In [1]:
# Parent of the current working directory; reused below as the `path`
# argument of the data loaders.
path = ".."

import sys
# Put that directory first on the module search path
# (presumably so the project's `problem` module can be imported -- confirm).
sys.path.insert(0, path)

In [2]:
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
import seaborn

In [3]:
from problem import get_train_data, get_test_data

# Each loader returns a (data, labels) pair for its split.
data_train, labels_train = get_train_data(path=path)
data_test, labels_test = get_test_data(path=path)

In [4]:
# Display the shapes of the training and test data side by side.
data_train.shape, data_test.shape

((1127, 220), (23, 220))

In [5]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from problem import get_cv

def evaluation(X, y):
    """Cross-validate the FeatureExtractor + Classifier pipeline.

    Parameters
    ----------
    X : input data, passed both to ``get_cv`` and to the pipeline.
    y : target labels matching ``X``.

    Returns
    -------
    dict
        The ``cross_validate`` result, with train and test scores for
        the ``roc_auc`` and ``accuracy`` scorers.
    """
    model = make_pipeline(FeatureExtractor(), Classifier())
    folds = get_cv(X, y)
    metrics = ['roc_auc', 'accuracy']

    return cross_validate(
        model, X, y,
        scoring=metrics,
        cv=folds,
        verbose=1,
        return_train_score=True,
        n_jobs=1,
    )

### Basic KNN

In [87]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.base import BaseEstimator
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA


# Names of the columns used as anatomy features: every training column
# whose name starts with 'anatomy', minus the 'anatomy_select' column.
anatomy_cols = [
    name
    for name in data_train.columns
    if name.startswith('anatomy')
]
# list.remove raises ValueError if 'anatomy_select' is not present.
anatomy_cols.remove('anatomy_select')


class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Project the anatomy columns of a DataFrame onto PCA components.

    Only the columns listed in the module-level ``anatomy_cols`` are read;
    all other columns of the input are ignored.
    """

    def __init__(self):
        # A float n_components in (0, 1) asks PCA to keep as many components
        # as needed to reach that fraction of explained variance.
        self.pca = PCA(0.92)

    def fit(self, X_df, y=None):
        """Fit the PCA on the anatomy columns of ``X_df``.

        Parameters
        ----------
        X_df : DataFrame containing every column named in ``anatomy_cols``.
        y : ignored. It defaults to ``None`` so that ``fit(X_df)`` and the
            inherited ``fit_transform(X_df)`` work without labels.

        Returns
        -------
        self
        """
        self.pca.fit(X_df[anatomy_cols])
        return self

    def transform(self, X_df):
        """Return the PCA projection of the anatomy columns of ``X_df``."""
        # get only the anatomical information
        return self.pca.transform(X_df[anatomy_cols])
        

class Classifier(BaseEstimator):
    """Thin wrapper delegating to a uniform-weight 4-nearest-neighbours model."""

    def __init__(self):
        self.clf = KNeighborsClassifier(weights='uniform', n_neighbors=4)

    def fit(self, X, y):
        """Fit the wrapped KNN model on (X, y) and return this instance."""
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for X."""
        predicted = self.clf.predict(X)
        return predicted

    def predict_proba(self, X):
        """Return the wrapped model's probability estimates for X."""
        probabilities = self.clf.predict_proba(X)
        return probabilities


In [88]:
import numpy as np

# Cross-validate the pipeline on the training split.
results = evaluation(data_train, labels_train)

# Each metric is reported as mean +- standard deviation over the folds.
train_auc = results['train_roc_auc']
print("Training score ROC-AUC: {:.3f} +- {:.3f}".format(
    np.mean(train_auc), np.std(train_auc)))
valid_auc = results['test_roc_auc']
print("Validation score ROC-AUC: {:.3f} +- {:.3f} \n".format(
    np.mean(valid_auc), np.std(valid_auc)))

train_acc = results['train_accuracy']
print("Training score accuracy: {:.3f} +- {:.3f}".format(
    np.mean(train_acc), np.std(train_acc)))
valid_acc = results['test_accuracy']
print("Validation score accuracy: {:.3f} +- {:.3f}".format(
    np.mean(valid_acc), np.std(valid_acc)))

Training score ROC-AUC: 0.786 +- 0.010
Validation score ROC-AUC: 0.542 +- 0.024 

Training score accuracy: 0.687 +- 0.011
Validation score accuracy: 0.544 +- 0.022


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.7s finished
