In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../dataset/samples_post_extraction_and_labelled.csv')

In [3]:
print(df.shape)
cols_to_drop = [col for col in df.columns if col.startswith('diagnostics_')]
cols_to_drop.extend(['sample_id', 'mycn_amplified'])
print(f'Number of cols to drop: {len(cols_to_drop)}')
print(cols_to_drop)

X = df.drop(columns=cols_to_drop)
print(f'Number of features for model: {len(X.columns)}')

y = df['mycn_amplified']

(47, 131)
Number of cols to drop: 24
['diagnostics_Versions_PyRadiomics', 'diagnostics_Versions_Numpy', 'diagnostics_Versions_SimpleITK', 'diagnostics_Versions_PyWavelet', 'diagnostics_Versions_Python', 'diagnostics_Configuration_Settings', 'diagnostics_Configuration_EnabledImageTypes', 'diagnostics_Image-original_Hash', 'diagnostics_Image-original_Dimensionality', 'diagnostics_Image-original_Spacing', 'diagnostics_Image-original_Size', 'diagnostics_Image-original_Mean', 'diagnostics_Image-original_Minimum', 'diagnostics_Image-original_Maximum', 'diagnostics_Mask-original_Hash', 'diagnostics_Mask-original_Spacing', 'diagnostics_Mask-original_Size', 'diagnostics_Mask-original_BoundingBox', 'diagnostics_Mask-original_VoxelNum', 'diagnostics_Mask-original_VolumeNum', 'diagnostics_Mask-original_CenterOfMassIndex', 'diagnostics_Mask-original_CenterOfMass', 'sample_id', 'mycn_amplified']
Number of features for model: 107


In [4]:
for col in X.columns:
    print(col, X[col].dtype)

original_shape_Elongation float64
original_shape_Flatness float64
original_shape_LeastAxisLength float64
original_shape_MajorAxisLength float64
original_shape_Maximum2DDiameterColumn float64
original_shape_Maximum2DDiameterRow float64
original_shape_Maximum2DDiameterSlice float64
original_shape_Maximum3DDiameter float64
original_shape_MeshVolume float64
original_shape_MinorAxisLength float64
original_shape_Sphericity float64
original_shape_SurfaceArea float64
original_shape_SurfaceVolumeRatio float64
original_shape_VoxelVolume float64
original_firstorder_10Percentile float64
original_firstorder_90Percentile float64
original_firstorder_Energy float64
original_firstorder_Entropy float64
original_firstorder_InterquartileRange float64
original_firstorder_Kurtosis float64
original_firstorder_Maximum float64
original_firstorder_MeanAbsoluteDeviation float64
original_firstorder_Mean float64
original_firstorder_Median float64
original_firstorder_Minimum float64
original_firstorder_Range float6

In [5]:
y.value_counts() # Imbalanced dataset

mycn_amplified
0    34
1    13
Name: count, dtype: int64

In [6]:
# Train validation split
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# # Scaling
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_val = scaler.transform(X_val)

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [8]:
# # Feature selection
# from sklearn.feature_selection import SelectKBest, f_classif, chi2

# prev_features = len(X_train[0])

# selector = SelectKBest(chi2, k=50)
# X_train = selector.fit_transform(X_train, y_train)
# X_val = selector.transform(X_val)

# post_features = len(X_train[0])
# print(f'{post_features} were selected from {prev_features}')

In [9]:
# # Dimensionaltiy reduction (TBC)
# from sklearn.decomposition import PCA

# prev_features = len(X_train[0])

# pca = PCA(n_components=20)
# X_train = pca.fit_transform(X_train)
# X_val = pca.transform(X_val)

# post_features = len(X_train[0])
# print(f'{post_features} were selected from {prev_features} for train and {len(X_val[0])} for validation')

In [10]:
# Resampling
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print(f'X_train changed: {X_train.shape} -> {X_train_res.shape}')
print(f'y_train changed: {y_train.shape} -> {y_train_res.shape}')
print(f'Prev y labels were:\n{y_train.value_counts()}\nNew y labels are:\n{y_train_res.value_counts()}')

X_train changed: (37, 107) -> (54, 107)
y_train changed: (37,) -> (54,)
Prev y labels were:
mycn_amplified
0    27
1    10
Name: count, dtype: int64
New y labels are:
mycn_amplified
0    27
1    27
Name: count, dtype: int64


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

lr = LogisticRegression(max_iter=1000, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
svm = SVC(kernel='rbf', random_state=42)
models = {
    'logistic_regression': lr, 
    'random_forest': rf, 
    'support_vector': svm
}

def eval_results(y_actual, predictions): 
    acc = accuracy_score(y_actual, predictions)
    clas_report = classification_report(y_actual, predictions) # Note that F1 score can only be used if true positives = true negatives 
    conf_matrix = confusion_matrix(y_actual, predictions)
    print('Accuracy:', acc)
    print('Classifcation Report:\n', clas_report)
    print('Confusion Matrix:\n', conf_matrix)

for name, model in models.items():
    print(f'\n=== {name} ===')
    model.fit(X_train, y_train)
    model_predictions = model.predict(X_val)
    eval_results(y_val, model_predictions)


=== logistic_regression ===
Accuracy: 0.6
Classifcation Report:
               precision    recall  f1-score   support

           0       0.67      0.86      0.75         7
           1       0.00      0.00      0.00         3

    accuracy                           0.60        10
   macro avg       0.33      0.43      0.38        10
weighted avg       0.47      0.60      0.53        10

Confusion Matrix:
 [[6 1]
 [3 0]]

=== random_forest ===
Accuracy: 0.7
Classifcation Report:
               precision    recall  f1-score   support

           0       0.70      1.00      0.82         7
           1       0.00      0.00      0.00         3

    accuracy                           0.70        10
   macro avg       0.35      0.50      0.41        10
weighted avg       0.49      0.70      0.58        10

Confusion Matrix:
 [[7 0]
 [3 0]]

=== support_vector ===
Accuracy: 0.7
Classifcation Report:
               precision    recall  f1-score   support

           0       0.70      1.00   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
