### Import

In [179]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import feature_selection 
from sklearn import preprocessing
from sklearn import metrics
from sklearn import neighbors
from sklearn import svm
from sklearn import decomposition
from sklearn.decomposition import PCA
from matplotlib import pyplot
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold


### Load data

In [192]:
def load_data(path=None):
    """Load the radiomic feature table and drop zero-variance features.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. When omitted, the file
        ``hn/HN_radiomicFeatures.csv`` (relative to the working directory)
        is read, which is the location this notebook has always used.

    Returns
    -------
    data : pandas.DataFrame
        The feature columns retained by a default ``VarianceThreshold()``.
    label : pandas.Series
        The ``"label"`` column of the CSV.
    feature_names : numpy.ndarray
        Names of the retained feature columns, in column order.
    """
    if path is None:
        path = os.path.join('hn', 'HN_radiomicFeatures.csv')
    raw = pd.read_csv(path, index_col=0)

    label = raw["label"]
    features = raw.drop(columns=["label"])

    # Only the fitted support mask is needed, so fit() is sufficient; the
    # transformed array that fit_transform() would return was never used.
    selector = VarianceThreshold().fit(features)

    # Select the retained columns positionally with the boolean mask instead of
    # building a list of names to drop in place.
    data = features.loc[:, selector.get_support()]

    feature_names = data.columns.values
    return data, label, feature_names

# Load the feature table; these notebook-level names are reused by later cells.
data, label, feature_names = load_data()

# Report the dimensions of the feature table after zero-variance filtering.
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')


(113, 157)
The number of samples: 113
The number of columns: 157


### Check for missing data

In [181]:
# Detect missing values: evaluates to True if any cell of `data` is null.
data.isnull().values.any()

False

### Data split

In [182]:
def split_data(data, label, train_size=0.8, random_state=None, stratify=False):
    """Split features and labels into a training and a test partition.

    Parameters
    ----------
    data : array-like
        Feature table, one row per sample.
    label : array-like
        Labels matching the rows of `data`.
    train_size : float or int, default 0.8
        Passed to ``train_test_split``; the default keeps the 80/20 split
        this notebook has always used.
    random_state : int or None, default None
        Seed for the shuffle. ``None`` keeps the original unseeded behaviour;
        pass an integer to make the split reproducible.
    stratify : bool, default False
        When True, the split is stratified on `label`.

    Returns
    -------
    train_data, test_data, train_label, test_label
        In the order returned by ``train_test_split``.
    """
    train_data, test_data, train_label, test_label = train_test_split(
        data,
        label,
        train_size=train_size,
        random_state=random_state,
        stratify=label if stratify else None,
    )
    return train_data, test_data, train_label, test_label
# train_data, val_data, train_label, val_label = train_test_split(train_val_data, train_val_label, train_size=0.85)

# Split once for the single-run experiments below; split_data() is called
# without a seed here, so the partition changes on every execution.
train_data, test_data, train_label, test_label = split_data(data, label)
print(f'The number of train samples: {train_data.shape[0]}')
# print(f'The number of validation samples: {val_data.shape[0]}')
print(f'The number of test samples: {test_data.shape[0]}')

The number of train samples: 90
The number of test samples: 23


### Scale train and test data (scaler fitted on train only)

In [183]:
def scale_data(train_data, test_data, feature_names):
    """Standardise both splits with a scaler fitted on the training split.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame or array-like
        Feature tables for the two splits.
    feature_names : sequence
        Column names assigned to the returned DataFrames.

    Returns
    -------
    train_data, test_data : pandas.DataFrame
        Scaled copies of the inputs. When an input carries a pandas index it
        is preserved, so the scaled rows stay aligned with their labels.
        Inputs without an index get the default RangeIndex.
    """
    # Fit on the training split only so no test-set statistics leak in.
    scaler = StandardScaler().fit(train_data)

    def _scaled_frame(split):
        # scaler.transform() returns a bare array; re-attach the column names
        # and the original row index (None -> RangeIndex for plain arrays).
        return pd.DataFrame(
            scaler.transform(split),
            columns=feature_names,
            index=getattr(split, "index", None),
        )

    return _scaled_frame(train_data), _scaled_frame(test_data)

# Rebind the split names to their standardised versions (the unscaled splits are no longer referenced).
train_data, test_data = scale_data(train_data, test_data, feature_names)

### Linear Classifier

In [184]:
# Fit a linear classifier with SGDClassifier's default settings on the scaled training split.
clf = SGDClassifier()
clf.fit(train_data, train_label)
# Inspect the fitted model: shape of the coefficient matrix and the intercept.
print(np.shape(clf.coef_))
print(clf.intercept_)

# Accuracy on the data the classifier was trained on.
label_train_pred = clf.predict(train_data)
print(metrics.accuracy_score(train_label, label_train_pred))

# Accuracy on the held-out test split.
y_pred = clf.predict(test_data)
print(metrics.accuracy_score(test_label, y_pred))

(1, 157)
[-80.41123642]
0.9
0.6086956521739131


### PCA + kNN classifier

In [185]:
# Perform a PCA
def perform_pca(train_data, test_data, n_components):
    """Project both splits onto principal components learned from the training split.

    The decomposition is fitted on `train_data` only; the same fitted
    transform is then applied to `train_data` and `test_data`.

    Returns the fitted PCA object followed by the transformed training and
    test arrays.
    """
    fitted_pca = decomposition.PCA(n_components=n_components)
    fitted_pca.fit(train_data)
    # Transform the training split first, then the test split.
    projected_train, projected_test = (
        fitted_pca.transform(split) for split in (train_data, test_data)
    )
    return fitted_pca, projected_train, projected_test

# Reduce the scaled features to 10 principal components.
pca, train_pca, test_pca = perform_pca(train_data, test_data, 10)
# Fraction of the training variance captured by the retained components.
print(sum(pca.explained_variance_ratio_))

# Fit kNN
def knn_classifier(train_pca, test_pca, train_label, test_label, n_neighbors=10):
    """Fit a k-nearest-neighbours classifier and score it on both splits.

    Parameters
    ----------
    train_pca, test_pca : array-like
        Feature matrices for the training and test splits.
    train_label, test_label : array-like
        Labels matching the rows of the corresponding feature matrix.
    n_neighbors : int, default 10
        Number of neighbours used by the classifier. The default matches the
        value that was previously hard-coded.

    Returns
    -------
    score_train, score_test : float
        Result of ``KNeighborsClassifier.score`` on the training and test
        splits respectively.
    """
    knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(train_pca, train_label)
    score_train = knn.score(train_pca, train_label)
    score_test = knn.score(test_pca, test_label)
    return score_train, score_test

# Score kNN on the PCA-reduced splits produced above.
score_train, score_test = knn_classifier(train_pca, test_pca, train_label, test_label)

# Print result
print(f"Training result: {score_train}")
print(f"Test result: {score_test}")

0.7237400595669629
Training result: 0.7111111111111111
Test result: 0.6521739130434783


In [186]:
# F-scores measure how informative each feature is with respect to the labels.
# As described in the scikit-learn documentation of `f_classif`, an F-test is carried out for each feature.
# The F-score is the test statistic of that test: the ratio between the explained (between-class) and the unexplained (within-class) variance.
# Number of top-scoring features to keep in this single-run cell.
kbest = 80

def select_features(kbest, train_data, train_label, test_data):
    """Keep the `kbest` features with the highest F-scores.

    The selector is fitted on the training split and its labels; the test
    split is reduced to the same columns using that fitted selector.

    Returns the reduced training array followed by the reduced test array.
    """
    selector = SelectKBest(score_func=f_classif, k=kbest)
    reduced_train = selector.fit_transform(train_data, train_label)
    return reduced_train, selector.transform(test_data)

# Apply F-score selection to the scaled splits, keeping `kbest` features.
kbest_train, kbest_test = select_features(kbest, train_data, train_label, test_data)

  f = msb / msw


### Find average accuracy over multiple classifications

In [194]:
# Estimate the mean accuracy of the F-score selection -> PCA -> kNN pipeline
# over `loops` independently drawn train/test splits.
loops = 100
score_train_array = np.zeros(loops)
score_test_array = np.zeros(loops)
num_best_features = 80
pca_components = 10

# The loaded table does not change between repetitions, so read the CSV once
# instead of on every iteration.
data, label, feature_names = load_data()

for i in range(loops):
    # Draw a fresh split and standardise it using training statistics only.
    train_data, test_data, train_label, test_label = split_data(data, label)
    train_data, test_data = scale_data(train_data, test_data, feature_names)

    # Find best features based on F-scores
    kbest_train, kbest_test = select_features(num_best_features, train_data, train_label, test_data)

    # Perform a PCA
    pca, train_pca, test_pca = perform_pca(kbest_train, kbest_test, pca_components)

    # Fit kNN
    score_train, score_test = knn_classifier(train_pca, test_pca, train_label, test_label)
    score_train_array[i] = score_train
    score_test_array[i] = score_test


mean_score_train = np.mean(score_train_array)
mean_score_test = np.mean(score_test_array)
print(f"Mean training score: {mean_score_train}")
print(f"Mean test score: {mean_score_test}")


  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw
  f = msb / msw


Mean training score: 0.793111111111111
Mean test score: 0.6982608695652174


### L1-penalised logistic regression (Lasso-style feature selection)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel

# Embedded feature selection: fit an L1-penalised logistic regression and keep
# the features that SelectFromModel marks as supported.
# NOTE(review): `train_data` / `train_label` are whatever the previously
# executed cell left in the kernel — confirm this is the intended split.
sel_ = SelectFromModel(LogisticRegression(C=1, penalty='l1', solver='liblinear'))
sel_.fit(train_data, np.ravel(train_label, order='C'))
train_data = pd.DataFrame(train_data)

# Compute each boolean mask once and reuse it below.
support_mask = sel_.get_support()
zero_coef_mask = (sel_.estimator_.coef_ == 0).ravel()

selected_feat = train_data.columns[support_mask]
print('total features: {}'.format((train_data.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
np.sum(zero_coef_mask)))

# Columns whose fitted coefficient is exactly zero; shown as the cell output.
removed_feats = train_data.columns[zero_coef_mask]
removed_feats