### Import

In [56]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import feature_selection 
from sklearn import preprocessing
from sklearn import metrics
from sklearn import neighbors
from sklearn import svm
from sklearn import decomposition
from sklearn.decomposition import PCA

### Load data

In [57]:
def load_data():
    # this_directory = os.path.dirname(os.path.abspath(__file__))
    data = pd.read_csv(os.path.join('hn', 'HN_radiomicFeatures.csv'), index_col=0)

    return data


data = load_data()
label = data["label"]
data = data.drop(["label"], axis=1)
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')

The number of samples: 113
The number of columns: 159


### Check for missing data

In [58]:
# Detect missing values in dataframe
data.isnull().values.any()

False

### Data split

In [59]:
train_val_data, test_data, train_val_label, test_label = train_test_split(data, label, train_size=0.8)
train_data, val_data, train_label, val_label = train_test_split(train_val_data, train_val_label, train_size=0.85)

print(f'The number of train samples: {train_data.shape[0]}')
print(f'The number of validation samples: {val_data.shape[0]}')
print(f'The number of test samples: {test_data.shape[0]}')

The number of train samples: 76
The number of validation samples: 14
The number of test samples: 23


### Scaler & Linear Classifier

In [60]:
scaler = StandardScaler().fit(train_data)
train_data = scaler.transform(train_data)

clf = SGDClassifier()
clf.fit(train_data, train_label)
print(np.shape(clf.coef_))
print(clf.intercept_)

label_train_pred = clf.predict(train_data)
print(metrics.accuracy_score(train_label, label_train_pred))

y_pred = clf.predict(test_data)
print(metrics.accuracy_score(test_label, y_pred))

(1, 159)
[-38.39942875]
1.0
0.43478260869565216




In [61]:
# exclude features with zero variances
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold()
selector.fit_transform(test_data)
selector.fit_transform(test_data).shape

(23, 155)

### kNN classifier

In [62]:
# Scale the data to be normal
scaler = preprocessing.StandardScaler()
scaler.fit(train_val_data)
train_scaled = scaler.transform(train_val_data)
test_scaled = scaler.transform(test_data)

# Perform a PCA
pca = decomposition.PCA(n_components=10)
pca.fit(train_scaled)
train_pca = pca.transform(train_scaled)
test_pca = pca.transform(test_scaled)
print(sum(pca.explained_variance_ratio_))

# Fit kNN
knn = neighbors.KNeighborsClassifier(n_neighbors=15)
knn.fit(train_pca, train_val_label)
score_train = knn.score(train_pca, train_val_label)
score_test = knn.score(test_pca, test_label)

# Print result
print(f"Training result: {score_train}")
print(f"Test result: {score_test}")

0.7150232582497472
Training result: 0.6666666666666666
Test result: 0.6521739130434783
