### Import

In [289]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, RandomizedSearchCV
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn import neighbors
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_classif, VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from matplotlib import pyplot


### Load data

In [260]:
def load_data(path=None):
    """Read the radiomic feature table and separate features from labels.

    Parameters
    ----------
    path : str or os.PathLike, optional
        CSV file to read. When omitted, ``hn/HN_radiomicFeatures.csv``
        (relative to the current working directory) is used.

    Returns
    -------
    data : pandas.DataFrame
        Every column of the CSV except ``"label"``; the first CSV column
        is used as the row index.
    label : pandas.Series
        The ``"label"`` column, indexed like ``data``.
    feature_names : numpy.ndarray
        Column names of ``data``.

    Raises
    ------
    KeyError
        If the CSV has no ``"label"`` column.
    """
    if path is None:
        path = os.path.join('hn', 'HN_radiomicFeatures.csv')

    table = pd.read_csv(path, index_col=0)
    if "label" not in table.columns:
        # Same exception type a plain table["label"] lookup would raise,
        # but with a message that names the offending file.
        raise KeyError(f"'label' column not found in {path!s}")

    label = table["label"]
    data = table.drop(columns=["label"])
    feature_names = data.columns.values
    return data, label, feature_names

# Load the full table once and report its dimensions (rows = samples, columns = features).
data, label, feature_names = load_data()
print(f'The number of samples: {data.shape[0]}')
print(f'The number of columns: {data.shape[1]}')


The number of samples: 113
The number of columns: 159


### Check for missing data

In [261]:
# True if at least one cell of the feature table is missing (NaN/None); False means the table is complete.
data.isna().to_numpy().any()

False

### Data split

In [262]:
def split_data(data, label, train_size=0.8, random_state=None, stratify=False):
    """Randomly split features and labels into a train and a test part.

    Parameters
    ----------
    data : array-like or pandas.DataFrame
        Feature table, one row per sample.
    label : array-like or pandas.Series
        Class label for every row of ``data``.
    train_size : float or int, default 0.8
        Forwarded to ``train_test_split``; the default keeps the original
        80/20 split.
    random_state : int or None, default None
        Seed for the shuffle. ``None`` keeps the original behaviour of a
        different split on every call.
    stratify : bool, default False
        When True the split preserves the class proportions of ``label``.

    Returns
    -------
    train_data, test_data, train_label, test_label
        In the order returned by ``train_test_split``.
    """
    return train_test_split(
        data,
        label,
        train_size=train_size,
        random_state=random_state,
        stratify=label if stratify else None,
    )
# train_data, val_data, train_label, val_label = train_test_split(train_val_data, train_val_label, train_size=0.85)

# Split the loaded table into train and test parts and report their sizes.
train_data, test_data, train_label, test_label = split_data(data, label)
print(f'The number of train samples: {len(train_data)}')
# No separate validation split is made at the moment:
# print(f'The number of validation samples: {len(val_data)}')
print(f'The number of test samples: {len(test_data)}')

The number of train samples: 90
The number of test samples: 23


### Scale train and test data (scaler fitted on the train set only)

In [263]:
def scale_data(train_data, test_data, feature_names):
    """Standardise both splits with a scaler fitted on the training split only.

    Fitting on ``train_data`` alone keeps test-set statistics out of the
    preprocessing step.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame or array-like
        Feature tables with identical column order.
    feature_names : sequence of str
        Column names assigned to the returned frames.

    Returns
    -------
    train_data, test_data : pandas.DataFrame
        Scaled copies of the inputs. When the inputs are DataFrames their
        row index is preserved, so the frames stay aligned with the label
        Series produced by the same split; array inputs get a default
        RangeIndex.
    """
    scaler = StandardScaler().fit(train_data)

    def _to_frame(original):
        # scaler.transform returns a bare ndarray; re-attach column names and
        # the original row index (None -> RangeIndex for non-pandas input).
        return pd.DataFrame(
            scaler.transform(original),
            columns=feature_names,
            index=getattr(original, "index", None),
        )

    return _to_frame(train_data), _to_frame(test_data)

### Remove features with 0 variance

In [264]:
def remove_zero_var(train_data, test_data):
    """Drop every feature that is constant on the training split.

    The variance filter is fitted on ``train_data`` only and the same
    columns are then removed from ``test_data``.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Feature tables sharing the same column names.

    Returns
    -------
    train_data, test_data : pandas.DataFrame
        New frames without the zero-variance columns. The inputs are left
        untouched, so the function can safely be re-run on the same frames.
    """
    selector = VarianceThreshold()
    # Only the fitted support mask is needed; the transformed array is not.
    selector.fit(train_data)
    zero_var_col = train_data.columns[~selector.get_support()]
    return (
        train_data.drop(columns=zero_var_col),
        test_data.drop(columns=zero_var_col),
    )

### Select best features with the ANOVA F-test

In [265]:
def select_features(kbest, train_data, train_label, test_data):
    """Keep the ``kbest`` features ranked highest by the ANOVA F-score.

    The ranking is computed from the training split only; the test split is
    reduced to the same columns.

    Returns
    -------
    kbest_train, kbest_test
        The reduced train and test feature matrices, in that order.
    """
    anova_selector = SelectKBest(score_func=f_classif, k=kbest)
    anova_selector.fit(train_data, train_label)
    reduced_train = anova_selector.transform(train_data)
    reduced_test = anova_selector.transform(test_data)
    return reduced_train, reduced_test

### Feature selection with L1-penalised (lasso) logistic regression

In [266]:
def lasso_regression(train_data, train_label, test_data):
    """Select features with an L1-penalised logistic regression.

    A logistic regression with an L1 penalty is fitted on the training
    split; features whose coefficient is exactly zero are removed from both
    splits.

    Parameters
    ----------
    train_data, test_data : array-like or pandas.DataFrame
        Feature matrices with the same columns.
    train_label : array-like
        Training labels; flattened to 1-D before fitting.

    Returns
    -------
    lasso_train, lasso_test : pandas.DataFrame
        The inputs restricted to the features with a non-zero coefficient.
    """
    sel_ = SelectFromModel(LogisticRegression(C=1, penalty='l1', solver='liblinear'))
    sel_.fit(train_data, np.ravel(train_label, order='C'))

    train_data = pd.DataFrame(train_data)
    test_data = pd.DataFrame(test_data)

    # coef_ is (1, n_features) for two classes but (n_classes, n_features)
    # for more; reduce over the class axis so the mask always has one entry
    # per feature. A feature is kept if any class uses it.
    coefficients = np.atleast_2d(sel_.estimator_.coef_)
    used = np.any(coefficients != 0, axis=0)
    removed_feats = train_data.columns[~used]

    lasso_train = train_data.drop(removed_feats, axis='columns')
    lasso_test = test_data.drop(removed_feats, axis='columns')
    return lasso_train, lasso_test

### Linear Classifier

In [267]:
def linear_classifier(train_data, test_data, train_label, test_label):
    """Fit a default ``SGDClassifier`` and report its accuracy.

    Returns
    -------
    score_linear_train, score_linear_test : float
        Accuracy on the training split and on the test split, in that order.
    """
    model = SGDClassifier()
    model.fit(train_data, train_label)

    def _accuracy(features, expected):
        # Fraction of samples whose predicted label equals the expected one.
        return metrics.accuracy_score(expected, model.predict(features))

    return _accuracy(train_data, train_label), _accuracy(test_data, test_label)

### kNN classifier

In [268]:
def knn_classifier(train_pca, test_pca, train_label, test_label, knn_neighbors):
    """Fit a k-nearest-neighbours classifier and report its accuracy.

    Parameters
    ----------
    train_pca, test_pca : array-like
        Train and test feature matrices.
    train_label, test_label : array-like
        Matching labels.
    knn_neighbors : int
        Number of neighbours used for voting.

    Returns
    -------
    score_train, score_test : float
        Mean accuracy on the training and on the test split, in that order.
    """
    model = neighbors.KNeighborsClassifier(n_neighbors=knn_neighbors)
    model.fit(train_pca, train_label)
    return model.score(train_pca, train_label), model.score(test_pca, test_label)

### Random Forest Classifier

In [293]:
# Randomised hyper-parameter search for the random forest.
#
# NOTE(review): this cell reads `train_data` / `train_label` from the kernel's
# global state, so its result depends on which cell assigned them last
# (the raw split, or the last iteration of the evaluation loop) — confirm the
# intended input before relying on the reported parameters.

# Fixed seed so the sampled candidates, the forests and therefore the printed
# best parameters are reproducible across runs.
RF_SEARCH_SEED = 42

mean_rf_score = []
std_rf_score = []

# Disabled earlier experiment: 7-fold CV accuracy as a function of tree depth.
# max_depth = range(1,10)
# for i in max_depth:
#     clf=RandomForestClassifier(n_estimators=20, max_depth=i)
#     rf_score = cross_val_score(clf, train_data, train_label, cv=7)
#     mean_rf_score.append(np.mean(rf_score))
#     std_rf_score.append(np.std(rf_score))

#print(f"Mean rf score per depth: {mean_rf_score}")
#print(f"Standard deviation: {std_rf_score}")
# clf = RandomForestClassifier(n_estimators=20, max_depth=2)
# learning_curve(clf, train_data, train_label, cv=5, scoring=None)
# n_jobs=-1, train_sizes=np.linspace(0.01, 1.0, 50)

clf = RandomForestClassifier(random_state=RF_SEARCH_SEED)
n_estimators = range(5,125,10)
max_depth = range(4, 15)
min_samples_leaf = range(1, 6)
min_samples_split = range(2, 10, 2)

# Candidate values per hyper-parameter; 100 random combinations are tried below.
param_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split
}

rf_random = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_grid,
    n_iter=100,
    cv=5,
    random_state=RF_SEARCH_SEED,
)
rf_random.fit(train_data, train_label)
print(rf_random.best_params_)


{'n_estimators': 115, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_depth': 14}


In [270]:
def random_forest(train_data, test_data, train_label, test_label, n_estimators=20, **rf_params):
    """Fit a random forest and report its accuracy on both splits.

    Parameters
    ----------
    train_data, test_data : array-like or pandas.DataFrame
        Train and test feature matrices.
    train_label, test_label : array-like
        Matching labels.
    n_estimators : int, default 20
        Number of trees; the default matches the previously hard-coded value.
    **rf_params
        Any further ``RandomForestClassifier`` keyword arguments, e.g.
        ``random_state`` or the ``best_params_`` dictionary of a
        hyper-parameter search (``random_forest(..., **search.best_params_)``).

    Returns
    -------
    score_train, score_test : float
        Accuracy on the training split and on the test split, in that order.
    """
    clf = RandomForestClassifier(n_estimators=n_estimators, **rf_params)
    clf.fit(train_data, train_label)

    train_random_forest = clf.predict(train_data)
    test_random_forest = clf.predict(test_data)

    score_train = metrics.accuracy_score(train_label, train_random_forest)
    score_test = metrics.accuracy_score(test_label, test_random_forest)
    return score_train, score_test

### Find average accuracy over multiple classifications

In [271]:
# Estimate the mean train/test accuracy of the active pipeline
# (scale -> drop constant features -> ANOVA k-best -> L1 selection -> random forest)
# over many independent random train/test splits.
loops = 200
score_train_array = np.zeros(loops)
score_test_array = np.zeros(loops)
num_best_features = 80
pca_components = 10  # not referenced by the active pipeline in this cell
knn_neighbors = 15   # only referenced by the disabled kNN call below

# The CSV does not change between iterations and none of the steps below
# modify `data` or `label`, so read the file once instead of once per loop.
data, label, feature_names = load_data()

for i in range(loops):
    # Fresh random split each iteration; all preprocessing is fitted on the
    # training part only.
    train_data, test_data, train_label, test_label = split_data(data, label)
    train_data, test_data = scale_data(train_data, test_data, feature_names)
    train_data, test_data = remove_zero_var(train_data, test_data)

    # Find best features based on F-scores
    kbest_train, kbest_test = select_features(num_best_features, train_data, train_label, test_data)

    # Further reduce the k-best features with the L1-penalised selection
    lasso_train, lasso_test = lasso_regression(kbest_train, train_label, kbest_test)

    # Classify; the kNN and linear variants are kept for reference but disabled.
    # score_train, score_test = knn_classifier(train_pca, test_pca, train_label, test_label, knn_neighbors)
    # score_train, score_test = linear_classifier(kbest_train, kbest_test, train_label, test_label)
    score_train, score_test = random_forest(lasso_train, lasso_test, train_label, test_label)
    score_train_array[i] = score_train
    score_test_array[i] = score_test


mean_score_train = np.mean(score_train_array)
mean_score_test = np.mean(score_test_array)
print(f"Mean training score: {mean_score_train}")
print(f"Mean test score: {mean_score_test}")

Mean training score: 0.9966666666666666
Mean test score: 0.7278260869565217
