In [145]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.svm import SVR

# Importing the dataset
df = pd.read_excel('../TrainDataset2024.xls', sheet_name='Sheet1')
test_pd = pd.read_excel('../TestDatasetExample.xls', sheet_name='Sheet1')


In [146]:
df.head()

Unnamed: 0,ID,pCR (outcome),RelapseFreeSurvival (outcome),Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,TRG002174,1,144.0,41.0,0,0,0,1,3,3,...,0.517172,0.375126,3.325332,0.002314,3880771.5,473.464852,0.000768,0.182615,0.030508,0.000758
1,TRG002178,0,142.0,39.0,1,1,0,0,3,3,...,0.444391,0.444391,3.032144,0.005612,2372009.744,59.45971,0.004383,0.032012,0.001006,0.003685
2,TRG002204,1,135.0,31.0,0,0,0,1,2,1,...,0.534549,0.534549,2.485848,0.006752,1540027.421,33.935384,0.007584,0.024062,0.000529,0.006447
3,TRG002206,0,12.0,35.0,0,0,0,1,3,3,...,0.506185,0.506185,2.606255,0.003755,6936740.794,46.859265,0.005424,0.013707,0.000178,0.004543
4,TRG002210,0,109.0,61.0,1,0,0,0,2,1,...,0.462282,0.462282,2.809279,0.006521,1265399.054,39.621023,0.006585,0.034148,0.001083,0.005626


In [147]:
## Regression
y = df['RelapseFreeSurvival (outcome)']

X_id = df['ID'] # for later

X = df.drop(['RelapseFreeSurvival (outcome)', 'ID'], axis=1)

In [148]:
X.head()

Unnamed: 0,pCR (outcome),Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,LNStatus,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,1,41.0,0,0,0,1,3,3,1,1,...,0.517172,0.375126,3.325332,0.002314,3880771.5,473.464852,0.000768,0.182615,0.030508,0.000758
1,0,39.0,1,1,0,0,3,3,1,1,...,0.444391,0.444391,3.032144,0.005612,2372009.744,59.45971,0.004383,0.032012,0.001006,0.003685
2,1,31.0,0,0,0,1,2,1,1,0,...,0.534549,0.534549,2.485848,0.006752,1540027.421,33.935384,0.007584,0.024062,0.000529,0.006447
3,0,35.0,0,0,0,1,3,3,1,1,...,0.506185,0.506185,2.606255,0.003755,6936740.794,46.859265,0.005424,0.013707,0.000178,0.004543
4,0,61.0,1,0,0,0,2,1,1,0,...,0.462282,0.462282,2.809279,0.006521,1265399.054,39.621023,0.006585,0.034148,0.001083,0.005626


In [149]:
missing_val_strat = 'fill_with_median'

In [150]:
import numpy as np
# outlier detectors
def percentile_based_outlier(data, threshold=95):
  diff = (100 - threshold) / 2.0
  minval, maxval = np.percentile(data, [diff, 100 - diff])
  return (data < minval) | (data > maxval)

# median absolute deviation
def detect_outliers_mad(df, threshold=3):
    medians = df.median()
    mean_abs_dev = (df - medians).abs().median()
    mod_z_score = (0.6745 * (df - medians) / mean_abs_dev)
    return np.abs(mod_z_score) > threshold

# std deviation based
def std_div_outlier(data, threshold=3):
  std = data.std()
  mean = data.mean()
  is_outlier = []
  for v in data:
    is_outlier.append(True if (v > mean + threshold * std) | (v < mean - threshold * std) else False)
  return is_outlier

def vote(data):
  x = percentile_based_outlier(data)
  y = detect_outliers_mad(data)
  z = std_div_outlier(data)
  temp = list(zip(data.index, x, y, z))
  final = []
  for i in range(len(temp)):
    if temp[i].count(False) >= 2:
      final.append(False)
    else:
      final.append(True)
  return final

In [151]:
# Preprocessing - Strategy A - fill missing values with median
def fill_missing_values_with_median(df_X):
  median = df_X.median()

  df_X.replace({col: {999: median[col]} for col in X.columns}, inplace=True)
  df_X.fillna(median, inplace=True)
  
  return df_X

# Preprocessing - Strategy B - drop rows with missing values
def get_missing_rows(df_X):
  return df_X[df_X.isnull().any(axis=1)]


In [None]:
# # 1. fill missing values with median
# X = fill_missing_values_with_median(X)

# 2. drop rows with missing values
missing_rows = get_missing_rows(X)
X = X.drop(missing_rows.index)
y = y.drop(missing_rows.index)

In [153]:
# outlier detection
outliers = X.apply(vote)
outliers.head()
# outliers.sum()

Unnamed: 0,pCR (outcome),Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,LNStatus,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [154]:
# outlier removal - replace with median
def remove_outliers(df_X):
  median = df_X.median()
  df_X = df_X.where(~outliers, other=median, axis=1)
  return df_X

X = remove_outliers(X)


In [155]:
## normalization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
Xs = scaler.fit_transform(X)

In [156]:
from sklearn.decomposition import PCA
feature_names = list(X.columns)
pca = PCA(n_components=10)
Xs_pca = pca.fit_transform(Xs)
Xs_pca=Xs_pca[:,0:5]

In [157]:
Xs_train, Xs_test, y_train, y_test = train_test_split(Xs_pca, y, test_size=0.2, random_state=1)

In [158]:
from sklearn.svm import SVR
from sklearn.model_selection import KFold, GridSearchCV
import numpy as np

# Define the SVR model and the hyperparameters to tune
svr = SVR()

param_grid = {
  'C': [0.1,1,2,3,4,5,6,7,8,9,10,50,100],
  'gamma': [0.1, 0.01, 0.001, 'scale'],
  # 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']
  'kernel': ['rbf']
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# best_score = -np.inf
# best_params = {}

# for C in param_grid['C']:
#   for gamma in param_grid['gamma']:
#     for kernel in param_grid['kernel']:
#       svr.set_params(C=C, gamma=gamma, kernel=kernel)
#       scores = cross_val_score(svr, Xs_pca, y, cv=kf, scoring='neg_mean_absolute_error')
#       mean_score = scores.mean()

#       if mean_score > best_score:
#         best_score = mean_score
#         best_params = {'C': C, 'gamma': gamma, }
# print('Best parameters:', best_params)
# print('Best cross-validation score:', -best_score)

svr = SVR()
grid_search = GridSearchCV(estimator=svr, param_grid=param_grid, cv=kf, scoring='neg_mean_absolute_error')
grid_search.fit(Xs_train, y_train)

print('Best parameters:', grid_search.best_params_)
print('Best cross-validation MAE score:', -grid_search.best_score_)
            

Best parameters: {'C': 6, 'gamma': 0.1, 'kernel': 'rbf'}
Best cross-validation MAE score: 21.204172866472703


In [159]:
### Test data ###
C_test = grid_search.best_params_['C']
gamma_test = grid_search.best_params_['gamma']
kernel_test = grid_search.best_params_['kernel']

svr = SVR(C=C_test, gamma=gamma_test, kernel=kernel_test)
svr.fit(Xs_train, y_train)

y_pred = svr.predict(Xs_test)
mae = np.mean(np.abs(y_test - y_pred))
print('Test MAE:', mae)


Test MAE: 22.43109265222185
