In [10]:
import pandas as pd
import numpy as np

all_df = pd.read_excel('TrainDataset2024.xls', index_col=False)
all_df.drop('ID', axis=1, inplace=True)
print(all_df.shape)
all_df.head()

(400, 120)


Unnamed: 0,pCR (outcome),RelapseFreeSurvival (outcome),Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,1,144.0,41.0,0,0,0,1,3,3,1,...,0.517172,0.375126,3.325332,0.002314,3880771.5,473.464852,0.000768,0.182615,0.030508,0.000758
1,0,142.0,39.0,1,1,0,0,3,3,1,...,0.444391,0.444391,3.032144,0.005612,2372009.744,59.45971,0.004383,0.032012,0.001006,0.003685
2,1,135.0,31.0,0,0,0,1,2,1,1,...,0.534549,0.534549,2.485848,0.006752,1540027.421,33.935384,0.007584,0.024062,0.000529,0.006447
3,0,12.0,35.0,0,0,0,1,3,3,1,...,0.506185,0.506185,2.606255,0.003755,6936740.794,46.859265,0.005424,0.013707,0.000178,0.004543
4,0,109.0,61.0,1,0,0,0,2,1,1,...,0.462282,0.462282,2.809279,0.006521,1265399.054,39.621023,0.006585,0.034148,0.001083,0.005626


### Data imputation

In [11]:
import pickle
from sklearn.impute import SimpleImputer

# load gene clf feature names
import json
keep_feat_names = []
with open('gene_clf_selected_features.json', 'rb') as f:
  keep_feat_names = json.load(f)
  
# replace missing gene with classification result
# see train_gene_classifier.ipynb for more details
with open('log_reg_gene_classifier.pkl', 'rb') as f:
  log_res_clf = pickle.load(f)
  
  # rebuild prediction df
  gene_impute_df = all_df.copy()

  temp_X = gene_impute_df.drop(['pCR (outcome)', 'RelapseFreeSurvival (outcome)'], axis=1)
  y = gene_impute_df['Gene']

  print("before impute:") 
  print(gene_impute_df['Gene'].value_counts())

  keep_df = temp_X[keep_feat_names]
  replace_index = keep_df[keep_df['Gene'] == 999].index

  # get prediction on missing gene
  target = gene_impute_df.loc[replace_index, keep_feat_names]
  target.drop('Gene', axis=1, inplace=True)

  pred = log_res_clf.predict(target)
  gene_impute_df.loc[replace_index, 'Gene'] = pred

  print("after impute:") 
  print(gene_impute_df['Gene'].value_counts())

  # assign back to all_df
  all_df['Gene'] = gene_impute_df['Gene']

# Replace missing values with median of the column
imputer = SimpleImputer(strategy="median", missing_values=999)
all_df[:] = imputer.fit_transform(all_df)

# classification target
clf_y = all_df['pCR (outcome)']
# regression target
rgr_y = all_df['RelapseFreeSurvival (outcome)']

# all_df.to_csv('reg_all_df.csv', index=False)
# all_df.head()

before impute:
Gene
0      193
1      119
999     88
Name: count, dtype: int64
after impute:
Gene
0    281
1    119
Name: count, dtype: int64




### Outlier Removal

In [12]:
from outlier_removal import removeOutliers

# See the outlier_removal.py file for the implementation of the function
removeOutliers(all_df)

# keep pCR outcome as training set
X = all_df.drop(['RelapseFreeSurvival (outcome)'], axis=1)

### Data normalisation

In [13]:
from sklearn.preprocessing import StandardScaler

# Standardise features by removing the mean and scaling to unit variance.
scaler = StandardScaler()
Xs = scaler.fit_transform(X)

print(Xs[:,:5])
print(Xs.shape)

[[ 1.93956303 -0.98809731 -1.09997489 -0.82502865 -0.65465367]
 [-0.51558005 -1.17099914  0.90911166  1.21207912 -0.65465367]
 [ 1.93956303 -1.90260643 -1.09997489 -0.82502865 -0.65465367]
 ...
 [-0.51558005  0.13674891 -1.09997489 -0.82502865 -0.65465367]
 [-0.51558005  1.55423805  0.90911166 -0.82502865 -0.65465367]
 [-0.51558005 -0.53084275  0.90911166 -0.82502865 -0.65465367]]
(400, 119)


### Feature Selection and Dimensionality Reduction

In [14]:
### Feature Selection and Dimensionality Reduction strategy:
#
# 1. Keep all non mri columns
# 2. Select the mri_pca_2MRI features and apply PCA
# 3. Combine the two sets of features

non_mri_indicies = list(range(13))
print(all_df.columns[non_mri_indicies])

# Select the MRI features
mri_indices = list(range(13, Xs.shape[1]))
print(all_df.columns[mri_indices])

# Apply PCA to the MRI features
from sklearn.decomposition import PCA

pca = PCA(n_components=1)
X_mri = Xs[:, mri_indices]
X_mri_pca = pca.fit_transform(X_mri)

# Combine the two sets of features
non_mri_feats = Xs[:, non_mri_indicies]
Xs = np.hstack([non_mri_feats, X_mri_pca])

print("final shape:")
print(Xs.shape)


Index(['pCR (outcome)', 'RelapseFreeSurvival (outcome)', 'Age', 'ER', 'PgR',
       'HER2', 'TrippleNegative', 'ChemoGrade', 'Proliferation',
       'HistologyType', 'LNStatus', 'TumourStage', 'Gene'],
      dtype='object')
Index(['original_shape_Elongation', 'original_shape_Flatness',
       'original_shape_LeastAxisLength', 'original_shape_MajorAxisLength',
       'original_shape_Maximum2DDiameterColumn',
       'original_shape_Maximum2DDiameterRow',
       'original_shape_Maximum2DDiameterSlice',
       'original_shape_Maximum3DDiameter', 'original_shape_MeshVolume',
       'original_shape_MinorAxisLength',
       ...
       'original_glszm_SmallAreaEmphasis',
       'original_glszm_SmallAreaHighGrayLevelEmphasis',
       'original_glszm_SmallAreaLowGrayLevelEmphasis',
       'original_glszm_ZoneEntropy', 'original_glszm_ZonePercentage',
       'original_glszm_ZoneVariance', 'original_ngtdm_Busyness',
       'original_ngtdm_Coarseness', 'original_ngtdm_Complexity',
       'original_

In [15]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(Xs, rgr_y, test_size=0.2, random_state=42)

### Random Forest for Regression + Hyper Param Tuning

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

# Define the model
rnd_forest = RandomForestRegressor(random_state=42)

param_grid = {
    'max_depth': [1,2,3,4,5,6,7,8,9,10],
    'n_estimators': [50, 75, 100],
    'max_features': ['sqrt'],
    'min_samples_split': [2, 5, 7, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform grid search
grid_search = GridSearchCV(estimator=rnd_forest, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_rnd_forest = grid_search.best_estimator_

Fitting 5 folds for each of 360 candidates, totalling 1800 fits
[CV] END max_depth=1, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=1, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=1, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=75; total time=   0.0s
[CV] END max_depth=1, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=1, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=75; total time=   0.0s
[CV] END max_depth=1, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=1, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.0s
[CV] END max_depth=1, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=75; total time

In [17]:
from pprint import pprint

# Make predictions
rnd_pred = best_rnd_forest.predict(X_test)

# Evaluate the model
rnd_mae = mean_absolute_error(y_test, rnd_pred)
rnd_rmse = root_mean_squared_error(y_test, rnd_pred)
rnd_r2 = r2_score(y_test, rnd_pred)

res = {
  'best_params': grid_search.best_params_,
  'mean_absolute_error': rnd_mae,
  'root_mean_squared_error': rnd_rmse,
  'r2_score': rnd_r2
}

pprint(res)

{'best_params': {'max_depth': 9,
                 'max_features': 'sqrt',
                 'min_samples_leaf': 2,
                 'min_samples_split': 7,
                 'n_estimators': 50},
 'mean_absolute_error': np.float64(22.29321120857575),
 'r2_score': -0.1005612104675262,
 'root_mean_squared_error': np.float64(29.183794528533994)}


In [18]:
# save model
with open('rfs_regression_model.pkl', 'wb') as f:
  pickle.dump(best_rnd_forest, f)