In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
from sklearn.manifold import TSNE, Isomap


all_df = pd.read_csv('TrainDataset2024.csv', index_col=False)
all_df.drop('ID', axis=1, inplace=True)
all_df.head()

Unnamed: 0,pCR (outcome),RelapseFreeSurvival (outcome),Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,1,144.0,41.0,0,0,0,1,3,3,1,...,0.517172,0.375126,3.325332,0.002314,3880771.5,473.464852,0.000768,0.182615,0.030508,0.000758
1,0,142.0,39.0,1,1,0,0,3,3,1,...,0.444391,0.444391,3.032144,0.005612,2372009.744,59.45971,0.004383,0.032012,0.001006,0.003685
2,1,135.0,31.0,0,0,0,1,2,1,1,...,0.534549,0.534549,2.485848,0.006752,1540027.421,33.935384,0.007584,0.024062,0.000529,0.006447
3,0,12.0,35.0,0,0,0,1,3,3,1,...,0.506185,0.506185,2.606255,0.003755,6936740.794,46.859265,0.005424,0.013707,0.000178,0.004543
4,0,109.0,61.0,1,0,0,0,2,1,1,...,0.462282,0.462282,2.809279,0.006521,1265399.054,39.621023,0.006585,0.034148,0.001083,0.005626


In [2]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median", missing_values=999)
all_df[:] = imputer.fit_transform(all_df)

# classification target
clf_y = all_df['pCR (outcome)']
# regression target
rgr_y = all_df['RelapseFreeSurvival (outcome)']

### Outlier Removal

In [3]:
def percentile_based_outlier(data, threshold=95):
    diff = (100 - threshold) / 2.0
    minval, maxval = np.percentile(data, [diff, 100 - diff])
    return (data < minval) | (data > maxval)

def mad_based_outlier(points, threshold=3.5):
    points = np.array(points)
    if len(points.shape) == 1:
        points = points[:, None]
    median_y = np.median(points)
    median_absolute_deviation_y = np.median([np.abs(y - median_y) for y in points])
    # Small constant added to avoid division by zero
    modified_z_scores = [0.6745 * (y - median_y) / (median_absolute_deviation_y + 1e-6) for y in points]

    return np.abs(modified_z_scores) > threshold

def std_div(data, threshold=3):
    std = data.std()
    mean = data.mean()
    isOutlier = []
    for val in data:
        if abs(val - mean)/std > threshold:
            isOutlier.append(True)
        else:
            isOutlier.append(False)
    return isOutlier

def outlierVote(data):
    x = percentile_based_outlier(data)
    y = mad_based_outlier(data)
    z = std_div(data)
    temp = list(zip(x, y, z))
    final = []
    for i in range(len(temp)):
        if temp[i].count(False) >= 2:
            final.append(False)
        else:
            final.append(True)
    return final

def plotOutliers(x):
    fig, axes = plt.subplots(nrows=4)
    for ax, func in zip(axes, [percentile_based_outlier, mad_based_outlier, std_div, outlierVote]):
        sns.distplot(x, ax=ax, rug=True, hist=False)
        outliers = func(x)
        ax.plot(outliers, np.zeros_like(outliers), 'ro', clip_on=False)

    kwargs = dict(y=0.95, x=0.05, ha='left', va='top', size=20)
    axes[0].set_title('Percentile-based Outliers', **kwargs)
    axes[1].set_title('MAD-based Outliers', **kwargs)
    axes[2].set_title('STD-based Outliers', **kwargs)
    axes[3].set_title('Majority vote', **kwargs)
    fig.suptitle('Comparing Outlier Tests with n={}'.format(len(x)), size=20)
    fig = plt.gcf()
    fig.set_size_inches(20, 15)

def removeOutliers(data):
    # Remove outliers from the dataframe
    for column in data.columns:
        outliers = outlierVote(all_df[column])
        # Calculate Non-Outlier Maximum using the outliers list
        non_outlier_max = all_df.loc[~np.array(outliers), column].max()
        # Replace outliers with the maximum non-outlier value
        data.loc[outliers, column] = non_outlier_max

removeOutliers(all_df)
# Verify the changes
all_df.head()

# Assign features to X
X = all_df.drop(['pCR (outcome)', 'RelapseFreeSurvival (outcome)'], axis=1)

In [4]:
X.head()
X.iloc[:,11]

0      0.813912
1      0.666118
2      0.645083
3      0.770842
4      0.861035
         ...   
395    0.389439
396    0.915643
397    0.657236
398    0.890276
399    0.464971
Name: original_shape_Elongation, Length: 400, dtype: float64

### Feature Selection

In [24]:
# Create manifold of MRI scan data (columns 11 onwards)

tsne = TSNE(n_components=2, random_state=42)
# isomap = Isomap(n_components=)
X_tsne_mri = tsne.fit_transform(X.iloc[:,11:])

# plot manifold
# plt.scatter(X_tsne_mri[:, -2], X_tsne_mri[:, -1], c=clf_y, cmap="jet")
# plt.axis('off')
# plt.colorbar()
# plt.show()

# combine columns 0-10 with the manifold
X_tsne_mri = np.c_[X.iloc[:,0:11], X_tsne_mri]

X_tsne_mri.shape



(400, 13)

In [None]:
svr = SVR(C=6, gamma=0.1, kernel='rbf')
svr.fit(X_tsne_mri, rgr_y)

y_pred = svr.predict(X_tsne_mri)
mae = mean_absolute_error(rgr_y, y_pred)


print(f'Test MAE: {mae}')



### K_fold training with different n dimensions (tsne)


In [None]:
num_splits = 5
kf = KFold(n_splits=num_splits, shuffle=True, random_state=1)
svr_outer = SVR(C=3, gamma=0.1, kernel='rbf')
svr_inner = SVR(C=3, gamma=0.1, kernel='rbf')

n_dimensions_range = 2 # Set hyperparameter range of values
kf_inner = KFold(n_splits=n_dimensions_range, shuffle=True, random_state=1) # Create kfold for inner loop

best_n_dimensions_list = []   # Best degrees resulting from inner fold
best_val_mae_list = []  # For inner fold
test_mae_list = [] # For outer fold

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index],X.iloc[test_index]
    y_train, y_test = rgr_y.iloc[train_index], rgr_y.iloc[test_index]

    best_n_dimensions = 1
    best_val_mae = np.inf

    degree = 1
    for train_index_inner, val_index in kf_inner.split(X_train):
        X_train_inner, X_val = X_train.iloc[train_index_inner],  X_train.iloc[val_index]
        y_train_inner, y_val = y_train.iloc[train_index_inner],  y_train.iloc[val_index]

        # train model with x_train_inner and y_train_inner with n dimensions
        tsne_inner = TSNE(n_components=degree, random_state=42)

        # Create the pure and interaction terms of degree n
        X_tsne_train_inner = tsne_inner.fit_transform(X_train_inner.iloc[:,11:])
        X_tsne_train_inner = np.c_[X_train_inner.iloc[:,0:11], X_tsne_train_inner]
        X_tsne_val = tsne_inner.fit_transform(X_val.iloc[:,11:])
        X_tsne_val = np.c_[X_val.iloc[:,0:11], X_tsne_val]

        svr_inner.fit(X_tsne_train_inner, y_train_inner)

        # validate model on validation data and get MSE
        y_pred_inner = svr_inner.predict(X_tsne_val)
        mae = mean_absolute_error(y_val, y_pred_inner)
        print(f"Degree {degree} Validation MSE: {mae:.4}", end=" ---- ")

        # if MSE is best so far, save degree and MSE
        if mae < best_val_mae:
            best_val_mae = mae
            best_degree = degree

        degree += 1
    
    best_n_dimensions_list.append(best_degree)
    best_val_mae_list.append(best_val_mae)

    tsne_outer = TSNE(n_components=best_degree, random_state=42)
    X_train_tsne = tsne_outer.fit_transform(X_train.iloc[:,11:])
    X_train_tsne = np.c_[X_train.iloc[:,0:11], X_train_tsne]
    X_test_tsne = tsne_outer.fit_transform(X_test.iloc[:,11:])
    X_test_tsne = np.c_[X_test.iloc[:,0:11], X_test_tsne]
    
    svr_outer.fit(X_train_tsne, y_train)

    y_pred_test = svr_outer.predict(X_test_tsne)
    test_mae = mean_absolute_error(y_test, y_pred_test)
    test_mae_list.append(test_mae)

print("\nSummary of outer folds:")
for i in range(num_splits):
    print(f"Fold {i+1}: Best Degree of {best_n_dimensions_list[i]}, Validation MSE = {best_val_mae_list[i]:.4f}, Test MSE = {test_mae_list[i]:.4f}")

print(f"\nAverage Test MSE: {np.mean(test_mae_list):.4f}")



Degree 1 Validation MSE: 20.47 ---- Degree 2 Validation MSE: 21.57 ---- Degree 1 Validation MSE: 21.26 ---- Degree 2 Validation MSE: 21.69 ---- Degree 1 Validation MSE: 20.46 ---- Degree 2 Validation MSE: 21.55 ---- Degree 1 Validation MSE: 21.46 ---- Degree 2 Validation MSE: 21.88 ---- Degree 1 Validation MSE: 20.89 ---- Degree 2 Validation MSE: 21.11 ---- 
Summary of outer folds:
Fold 1: Best Degree of 1, Validation MSE = 20.4745, Test MSE = 21.6160
Fold 2: Best Degree of 1, Validation MSE = 21.2573, Test MSE = 19.7552
Fold 3: Best Degree of 1, Validation MSE = 20.4650, Test MSE = 21.7220
Fold 4: Best Degree of 1, Validation MSE = 21.4590, Test MSE = 20.5679
Fold 5: Best Degree of 1, Validation MSE = 20.8863, Test MSE = 22.0465

Average Test MSE: 21.1415


### K_fold training with different n dimensions (Isomap)


In [None]:
num_splits = 5
kf = KFold(n_splits=num_splits, shuffle=True, random_state=1)
svr_outer = SVR(C=6, gamma=0.1, kernel='rbf')
svr_inner = SVR(C=6, gamma=0.1, kernel='rbf')

n_dimensions_range = 10 # Set hyperparameter range of values
kf_inner = KFold(n_splits=n_dimensions_range, shuffle=True, random_state=1) # Create kfold for inner loop

best_n_dimensions_list = []   # Best degrees resulting from inner fold
best_val_mae_list = []  # For inner fold
test_mae_list = [] # For outer fold

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index],X.iloc[test_index]
    y_train, y_test = rgr_y.iloc[train_index], rgr_y.iloc[test_index]

    best_n_dimensions = 1
    best_val_mae = np.inf

    degree = 1
    for train_index_inner, val_index in kf_inner.split(X_train):
        X_train_inner, X_val = X_train.iloc[train_index_inner],  X_train.iloc[val_index]
        y_train_inner, y_val = y_train.iloc[train_index_inner],  y_train.iloc[val_index]

        # train model with x_train_inner and y_train_inner with n dimensions
        iso = Isomap(n_components=degree)

        # Create the pure and interaction terms of degree n
        X_iso_train_inner = iso.fit_transform(X_train_inner.iloc[:,11:])
        X_iso_train_inner = np.c_[X_train_inner.iloc[:,0:11], X_iso_train_inner]
        X_iso_val = iso.fit_transform(X_val.iloc[:,11:])
        X_iso_val = np.c_[X_val.iloc[:,0:11], X_iso_val]

        svr_inner.fit(X_iso_train_inner, y_train_inner)

        # validate model on validation data and get MSE
        y_pred_inner = svr_inner.predict(X_iso_val)
        mae = mean_absolute_error(y_val, y_pred_inner)
        print(f"Degree {degree} Validation MSE: {mae:.4}", end=" ---- ")

        # if MSE is best so far, save degree and MSE
        if mae < best_val_mae:
            best_val_mae = mae
            best_degree = degree

        degree += 1
    
    best_n_dimensions_list.append(best_degree)
    best_val_mae_list.append(best_val_mae)

    iso_outer = Isomap(n_components=best_degree)
    X_train_iso = iso_outer.fit_transform(X_train.iloc[:,11:])
    X_train_iso = np.c_[X_train.iloc[:,0:11], X_train_iso]
    X_test_iso = iso_outer.fit_transform(X_test.iloc[:,11:])
    X_test_iso = np.c_[X_test.iloc[:,0:11], X_test_iso]
    
    svr_outer.fit(X_train_iso, y_train)

    y_pred_test = svr_outer.predict(X_test_iso)
    test_mae = mean_absolute_error(y_test, y_pred_test)
    test_mae_list.append(test_mae)

print("\nSummary of outer folds:")
for i in range(num_splits):
    print(f"Fold {i+1}: Best Number of Dimensions of {best_n_dimensions_list[i]}, Validation MSE = {best_val_mae_list[i]:.4f}, Test MSE = {test_mae_list[i]:.4f}")

print(f"\nAverage Test MSE: {np.mean(test_mae_list):.4f}")

Degree 1 Validation MSE: 19.79 ---- Degree 2 Validation MSE: 17.91 ---- Degree 3 Validation MSE: 24.39 ---- 

  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])


Degree 4 Validation MSE: 17.6 ---- Degree 5 Validation MSE: 23.23 ---- 

  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])
  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])


Degree 6 Validation MSE: 19.02 ---- Degree 7 Validation MSE: 19.88 ---- 

  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])
  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])


Degree 8 Validation MSE: 25.82 ---- Degree 9 Validation MSE: 18.64 ---- 

  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])


Degree 10 Validation MSE: 24.82 ---- Degree 1 Validation MSE: 19.0 ---- Degree 2 Validation MSE: 20.4 ---- Degree 3 Validation MSE: 22.08 ---- Degree 4 Validation MSE: 22.58 ---- Degree 5 Validation MSE: 22.47 ---- Degree 6 Validation MSE: 25.19 ---- Degree 7 Validation MSE: 19.11 ---- Degree 8 Validation MSE: 20.35 ---- Degree 9 Validation MSE: 20.57 ---- 

  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])


Degree 10 Validation MSE: 23.29 ---- Degree 1 Validation MSE: 21.02 ---- 

  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])
  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])
  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])


Degree 2 Validation MSE: 19.52 ---- Degree 3 Validation MSE: 21.1 ---- Degree 4 Validation MSE: 17.91 ---- 

  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])


Degree 5 Validation MSE: 22.75 ---- Degree 6 Validation MSE: 19.19 ---- Degree 7 Validation MSE: 19.14 ---- 

  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])
  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])
  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])


Degree 8 Validation MSE: 24.5 ---- Degree 9 Validation MSE: 22.03 ---- 

  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])


Degree 10 Validation MSE: 23.04 ---- 

  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])
  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])


Degree 1 Validation MSE: 19.51 ---- Degree 2 Validation MSE: 17.86 ---- Degree 3 Validation MSE: 26.96 ---- 

  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])
  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])


Degree 4 Validation MSE: 18.83 ---- Degree 5 Validation MSE: 21.36 ---- Degree 6 Validation MSE: 20.58 ---- Degree 7 Validation MSE: 20.81 ---- Degree 8 Validation MSE: 22.21 ---- Degree 9 Validation MSE: 21.84 ---- 

  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])


Degree 10 Validation MSE: 23.51 ---- Degree 1 Validation MSE: 20.76 ---- 

  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])
  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])


Degree 2 Validation MSE: 20.3 ---- Degree 3 Validation MSE: 20.81 ---- Degree 4 Validation MSE: 18.96 ---- Degree 5 Validation MSE: 23.24 ---- Degree 6 Validation MSE: 15.63 ---- Degree 7 Validation MSE: 19.18 ---- 

  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])


Degree 8 Validation MSE: 22.61 ---- Degree 9 Validation MSE: 23.73 ---- 

  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])


Degree 10 Validation MSE: 24.38 ---- 
Summary of outer folds:
Fold 1: Best Degree of 4, Validation MSE = 17.5967, Test MSE = 21.5878
Fold 2: Best Degree of 1, Validation MSE = 18.9973, Test MSE = 19.7838
Fold 3: Best Degree of 4, Validation MSE = 17.9088, Test MSE = 21.7321
Fold 4: Best Degree of 2, Validation MSE = 17.8634, Test MSE = 20.6210
Fold 5: Best Degree of 6, Validation MSE = 15.6297, Test MSE = 22.0553

Average Test MSE: 21.1560


Todo

1. Try DR on subsets of MRI data columns
2. Wrapper method feature selection on remaining features
3. Explore alternative imputation methods (do no imputate values for cateogrical variables, delete observations instead) 
4. Try different DR methods

### ATTEMPT 2: Using DR on each subset of MRI data columns


#### 2a: as normal


#### 2b: w/ Wrapper style feature selection

#### 2c: using alternative imputation methods

#### 2d: combined