### Import

In [88]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, RandomizedSearchCV
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn import neighbors
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_classif, VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from matplotlib import pyplot


### Load data

In [113]:
def load_data():
    data = pd.read_csv(os.path.join('hn', 'HN_radiomicFeatures.csv'), index_col=0)
    label = data["label"]
    label = label.replace(to_replace={'T12': False, 'T34': True}, value=None)
    data = data.drop(["label"], axis=1)
    feature_names = data.columns.values
    return data, label, feature_names

data, label, feature_names = load_data()
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')


55
The number of samples: 113
The number of columns: 159


### Check for missing data

In [90]:
# Detect missing values in dataframe
data.isnull().values.any()

False

### Data split

In [115]:
def split_data(data, label):
    train_data, test_data, train_label, test_label = train_test_split(data, label, train_size=0.8)
    return train_data, test_data, train_label, test_label
# train_data, val_data, train_label, val_label = train_test_split(train_val_data, train_val_label, train_size=0.85)

train_data, test_data, train_label, test_label = split_data(data, label)
print(f'The number of train samples: {train_data.shape[0]}')
# print(f'The number of validation samples: {val_data.shape[0]}')
print(f'The number of test samples: {test_data.shape[0]}')

42
The number of train samples: 90
The number of test samples: 23


### Scaling train data

In [92]:
def scale_data(train_data, test_data, feature_names):
    scaler = StandardScaler().fit(train_data)
    train_data = scaler.transform(train_data)
    test_data = scaler.transform(test_data)
    train_data = pd.DataFrame(train_data, columns=feature_names)
    test_data = pd.DataFrame(test_data, columns=feature_names)
    return train_data, test_data

### Remove features with 0 variance

In [93]:
def remove_zero_var(train_data, test_data):
    selector = VarianceThreshold()
    selector.fit_transform(train_data)
    zero_var_col = [column for column in train_data.columns if column not in train_data.columns[selector.get_support()]]
    train_data.drop(zero_var_col, inplace=True, axis=1)
    test_data.drop(zero_var_col, inplace=True, axis=1)
    return train_data, test_data

### Select best features with Anova test

In [94]:
def select_features(kbest, train_data, train_label, test_data):
    fs = SelectKBest(score_func=f_classif, k=kbest)
    kbest_train = fs.fit_transform(train_data, train_label)
    kbest_test = fs.transform(test_data)
    return kbest_train, kbest_test

### Lasso Regression

In [95]:
def lasso_regression(train_data, train_label, test_data):
    sel_ = SelectFromModel(LogisticRegression(C=1, penalty='l1', solver='liblinear'))
    sel_.fit(train_data, np.ravel(train_label,order='C'))
    sel_.get_support()
    train_data = pd.DataFrame(train_data)
    test_data = pd.DataFrame(test_data)
    removed_feats = train_data.columns[(sel_.estimator_.coef_ == 0).ravel().tolist()]
    lasso_train = train_data.drop(removed_feats, axis='columns')
    lasso_test = test_data.drop(removed_feats, axis='columns')
    return lasso_train, lasso_test

### Linear Classifier

In [96]:
def linear_classifier(train_data, test_data, train_label, test_label):
    clf = SGDClassifier()
    clf.fit(train_data, train_label)

    train_linear = clf.predict(train_data)
    score_linear_train = metrics.accuracy_score(train_label, train_linear)

    test_linear = clf.predict(test_data)
    score_linear_test = metrics.accuracy_score(test_label, test_linear)
    return score_linear_train, score_linear_test

### kNN classifier

In [97]:
def knn_classifier(train_pca, test_pca, train_label, test_label, knn_neighbors):
    knn = neighbors.KNeighborsClassifier(n_neighbors=knn_neighbors)
    knn.fit(train_pca, train_label)
    score_train = knn.score(train_pca, train_label)
    score_test = knn.score(test_pca, test_label)
    return score_train, score_test

### Random Forest Classifier

In [122]:
def random_forest(train_data, test_data, train_label, test_label):
    clf = RandomForestClassifier()
    n_estimators = range(20,140,15)
    max_depth = range(8, 15, 2)
    min_samples_leaf = range(2, 7)
    min_samples_split = range(3, 10, 2)

    param_grid = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_leaf': min_samples_leaf,
        'min_samples_split': min_samples_split
    }

    rf_random = RandomizedSearchCV(estimator=clf, param_distributions=param_grid, n_iter=100, cv=5)
    rf_random.fit(train_data, train_label)
    print(rf_random.best_params_)
    
    best_params = rf_random.best_params_
    best_model = RandomForestClassifier(n_estimators=best_params['n_estimators'], min_samples_split=best_params['min_samples_split'],
                                    min_samples_leaf=best_params['min_samples_leaf'], max_depth=best_params['max_depth'])
    best_model.fit(train_data,train_label)
    rf_random_train =  best_model.predict(train_data)
    rf_random_test = best_model.predict(test_data)
    score_train = metrics.accuracy_score(train_label, rf_random_train)
    score_test = metrics.accuracy_score(test_label, rf_random_test)
    confusion_matrix_train = metrics.confusion_matrix(train_label, rf_random_train)
    confusion_matrix_test = metrics.confusion_matrix(test_label, rf_random_test)
    return score_train, score_test, confusion_matrix_train, confusion_matrix_test

### Calculate sensitivity, specificity, PPV, NPV

In [124]:
def sens_spec(confusion_matrix):
    true_pos, false_pos, true_neg, false_neg = confusion_matrix.ravel()

    sens = true_pos/(true_pos + false_neg)
    spec = true_neg/(true_neg + false_pos)
    pos_pred_value = true_pos/(true_pos + false_pos)
    neg_pred_value = true_neg/(true_neg + false_neg)
    return sens, spec, pos_pred_value, neg_pred_value

### Find average accuracy over multiple classifications

In [132]:
# Find mean accuracy for kNN classification
loops = 15
score_train_array = np.zeros(loops)
score_test_array = np.zeros(loops)
sens_train_array = np.zeros(loops)
spec_train_array = np.zeros(loops)
pos_pred_value_train_array = np.zeros(loops)
neg_pred_value_train_array = np.zeros(loops)
sens_test_array = np.zeros(loops)
spec_test_array = np.zeros(loops)
pos_pred_value_test_array = np.zeros(loops)
neg_pred_value_test_array = np.zeros(loops)

num_best_features = 80
pca_components = 10
knn_neighbors = 15

for i in range(loops):
    data, label, feature_names = load_data()
    train_data, test_data, train_label, test_label = split_data(data, label)
    train_data, test_data = scale_data(train_data, test_data, feature_names)
    train_data, test_data = remove_zero_var(train_data, test_data)

    # Find best features based on F-scores
    kbest_train, kbest_test = select_features(num_best_features, train_data, train_label, test_data)

    # Lasso regression
    lasso_train, lasso_test = lasso_regression(kbest_train, train_label, kbest_test)

    # Apply random forest classifier
    score_train, score_test, confusion_matrix_train, confusion_matrix_test = random_forest(lasso_train, lasso_test, train_label, test_label)
    score_train_array[i] = score_train
    score_test_array[i] = score_test

    sens_train, spec_train, pos_pred_value_train, neg_pred_value_train = sens_spec(confusion_matrix_train)
    sens_test, spec_test, pos_pred_value_test, neg_pred_value_test = sens_spec(confusion_matrix_test)

    sens_train_array[i] = sens_train
    spec_train_array[i] = spec_train
    pos_pred_value_train_array[i] = pos_pred_value_train
    neg_pred_value_train_array[i] = neg_pred_value_train

    sens_test_array[i] = sens_test
    spec_test_array[i] = spec_test
    pos_pred_value_test_array[i] = pos_pred_value_test
    neg_pred_value_test_array[i] = neg_pred_value_test


mean_score_train = np.mean(score_train_array)
mean_score_test = np.mean(score_test_array)
mean_sens_train = np.mean(sens_train)
mean_spec_train = np.mean(spec_train)
mean_ppv_train = np.mean(pos_pred_value_train)
mean_npv_train = np.mean(neg_pred_value_train)
mean_sens_test = np.mean(sens_test)
mean_spec_test = np.mean(spec_test)
mean_ppv_test = np.mean(pos_pred_value_test)
mean_npv_test = np.mean(neg_pred_value_test)

print(f"Mean training score: {mean_score_train}")
print(f"Mean test score: {mean_score_test}")
print(f"Mean train sens: {mean_sens_train}")
print(f"Mean test sens: {mean_sens_test}")
print(f"Mean train spec: {mean_spec_train}")
print(f"Mean test spec: {mean_spec_test}")

45
{'n_estimators': 35, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 12}
[[45  0]
 [ 0 45]]
46


  spec = true_neg/(true_neg + false_pos)


{'n_estimators': 20, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_depth': 8}
[[40  4]
 [ 0 46]]
41
{'n_estimators': 65, 'min_samples_split': 9, 'min_samples_leaf': 6, 'max_depth': 8}
[[47  2]
 [ 6 35]]
44
{'n_estimators': 95, 'min_samples_split': 9, 'min_samples_leaf': 6, 'max_depth': 12}
[[41  5]
 [ 5 39]]
45
{'n_estimators': 50, 'min_samples_split': 9, 'min_samples_leaf': 4, 'max_depth': 8}
[[44  1]
 [ 3 42]]
Mean training score: 0.9422222222222223
Mean test score: 0.6782608695652174
Mean train sens: 0.5116279069767442
Mean test sens: 0.5
Mean train spec: 0.75
Mean test spec: 0.2
