### RFS Regression

In [1]:
import pandas as pd
import numpy as np

# Install xlrd package
%pip install xlrd

# Read the test spreadsheet into a DataFrame.
all_df = pd.read_excel('FinalTestDataset2024.xls', index_col=False)

# Pull the patient IDs out of the feature table: pop() returns the 'ID'
# column and removes it from all_df in the same step. IDs is kept so it can
# be paired with the predictions later on.
IDs = all_df.pop('ID')

all_df.head()



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


Unnamed: 0,Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,LNStatus,TumourStage,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,47.0,1,1,0,0,3,2,1,1,2,...,0.49835,0.49835,3.144594,0.003447,8257693.277,150.048587,0.001753,0.03711,0.001369,0.001513
1,41.0,1,1,0,0,3,2,1,0,2,...,0.622381,0.622381,2.061654,0.006535,1568441.643,26.484938,0.009649,0.019352,0.000321,0.008285
2,53.0,0,0,0,1,2,1,1,1,2,...,0.412482,0.412482,3.440353,0.005391,2656924.827,174.606929,0.001594,0.075152,0.005255,0.001444
3,46.0,1,1,0,0,2,1,1,1,3,...,0.378333,0.378333,3.531715,0.007102,1714787.173,96.787378,0.002772,0.053377,0.002666,0.002406
4,39.0,0,0,1,0,2,2,1,1,2,...,0.524767,0.524767,2.186214,0.007896,510479.346,12.789071,0.020072,0.02314,0.000463,0.017172


### Data Imputation

In [2]:
import pickle
from sklearn.impute import SimpleImputer

# Load selected features from gene classifier
import json
# Feature names used by the gene classifier. The list includes 'Gene' itself,
# which is why it is dropped again before calling predict() below.
with open('gene_clf_selected_features.json', 'rb') as f:
    keep_feat_names = json.load(f)

# Replace missing gene with classification result.
# See train_gene_classifier.ipynb for implementation.
#
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# model files that come from a trusted source.
with open('log_reg_gene_classifier.pkl', 'rb') as f:
    log_res_clf = pickle.load(f)

# rebuild prediction df (work on a copy so all_df is only touched once, below)
gene_impute_df = all_df.copy()

keep_df = gene_impute_df[keep_feat_names]
# 999 is the value that marks a missing entry in this dataset
replace_index = keep_df[keep_df['Gene'] == 999].index

# predict() rejects an empty input, so only run the classifier when there is
# at least one row with a missing Gene value.
if len(replace_index) > 0:
    print("Before Impute:")
    # replace_index holds index *labels*, so select with .loc rather than .iloc
    print(gene_impute_df.loc[replace_index, 'Gene'].head(3))

    # get prediction on missing gene
    target = gene_impute_df.loc[replace_index, keep_feat_names].drop(columns='Gene')
    pred = log_res_clf.predict(target)
    gene_impute_df.loc[replace_index, 'Gene'] = pred

    print("After Impute:")
    print(gene_impute_df.loc[replace_index, 'Gene'].head(3))
else:
    print("No missing Gene values to impute.")

# assign back to all_df
all_df['Gene'] = gene_impute_df['Gene']

# Replace missing values with median of the column.
# Two passes: first real NaNs, then the 999 sentinel. Both imputers are fitted
# on this dataset itself.
# NOTE(review): during the first pass any remaining 999 sentinels are still in
# the data and therefore count towards the NaN medians -- confirm this matches
# the preprocessing used when the models were trained.
nan_imputer = SimpleImputer(strategy="median", missing_values=np.nan)
all_df[:] = nan_imputer.fit_transform(all_df)
imputer = SimpleImputer(strategy="median", missing_values=999)
all_df[:] = imputer.fit_transform(all_df)

Before Impute:
9     999
53    999
59    999
Name: Gene, dtype: int64
After Impute:
9     0
53    0
59    0
Name: Gene, dtype: int64




### Outlier Removal

In [3]:
'''
See the outlier_removal.py file for the implementation of the function
'''
from outlier_removal import removeOutliers
# The return value is discarded, so this call presumably modifies all_df in
# place -- TODO confirm against outlier_removal.py.
removeOutliers(all_df)
# X is a second name for the same DataFrame object as all_df (no copy is made),
# so later changes to all_df are visible through X as well.
X = all_df


In [4]:
# Use the saved features from RelapseFreeSurvival_regression.ipynb to shape the
# data into one that can be used for the model.

# Load features from the saved file
import json
with open('rfs_features_cache.json', 'rb') as cache_file:
    features = json.load(cache_file)

# Column positions that are always included. In the current column order these
# resolve to ER, HER2 and Gene (see the printed feature list).
# NOTE(review): all index lists here are positional, so they presumably assume
# the same column order as the notebook that produced the cache -- confirm.
mandatory_features_indices = [1,3,10]
normal_mri_idx = features['normal_cols_idx']
important_features_idx = features['important_features_idx']

# Apply PCA to the MRI features: collapse the selected MRI columns into a
# single principal component. The PCA is fitted on this dataset.
from sklearn.decomposition import PCA
X_mri = X.iloc[:, normal_mri_idx]
pca = PCA(n_components=1)
X_mri_pca = pca.fit_transform(X_mri)

# combine all features (mandatory positions first, then the cached ones)
final_indices = [*mandatory_features_indices, *important_features_idx]
print(f"Final features: {len(final_indices)} -  {X.columns[final_indices]}")

# add pca of normal MRI features as one extra trailing column
Xs = np.hstack((X.iloc[:, final_indices], X_mri_pca))

print(f"Final Shape: {Xs.shape}")

Final features: 18 -  Index(['ER', 'HER2', 'Gene', 'original_firstorder_Range',
       'original_firstorder_Kurtosis', 'original_glszm_ZoneEntropy',
       'original_firstorder_90Percentile', 'Age',
       'original_glszm_SizeZoneNonUniformity', 'original_firstorder_Skewness',
       'original_firstorder_Maximum',
       'original_glszm_SizeZoneNonUniformityNormalized',
       'original_glszm_ZonePercentage', 'original_shape_Elongation',
       'original_firstorder_Variance', 'original_shape_MinorAxisLength',
       'original_shape_Flatness',
       'original_glszm_SmallAreaHighGrayLevelEmphasis'],
      dtype='object')
Final Shape: (133, 19)


### Load Model and Predict

In [5]:
# Unpickle the saved RFS regression model; the file is only needed for the
# load itself, so it is closed before predicting.
with open('rfs_regression_model.pkl', 'rb') as model_file:
    rfs_model = pickle.load(model_file)

# predict RFS for every row of the prepared feature matrix
rfs_pred = rfs_model.predict(Xs)
print(rfs_pred)

# Store the predictions next to the input features as a new column.
all_df['RelapseFreeSurvival (outcome)'] = rfs_pred

[59.54641246 70.82243642 49.66255704 44.5703895  49.53002773 56.20463373
 64.62209643 52.39639681 71.45407897 53.11554919 62.02700913 74.95248868
 52.36669517 74.41069815 62.75282788 63.65100768 52.15807544 59.48319517
 70.02633554 55.01062061 56.69034989 48.99645629 64.72961286 66.19296029
 63.16789995 49.79840333 75.87719694 69.34824993 52.73169268 66.56808336
 67.70361977 51.37089765 50.7686743  61.48667625 43.86791919 60.41486411
 53.95565115 50.14632559 72.52548112 68.46323494 49.92073993 51.80851178
 49.94190096 55.16969612 50.62659803 53.47494932 66.17117175 59.14577707
 71.53421357 48.11962488 51.58430479 63.83974922 51.60304772 65.46701875
 44.29958542 44.72192208 59.33416881 51.59859715 49.52982415 46.08374698
 43.15034696 50.67473581 69.32796729 44.54257685 49.24518726 52.85813191
 49.81509846 53.4451927  57.29330843 48.82363375 59.74580392 45.07436657
 46.60497582 41.34993176 65.10109485 44.72384514 45.87543334 45.25444657
 55.94478838 44.327093   67.31259353 52.05114993 44

In [6]:
### Save the results to csv
# Pair each patient ID with its predicted value and write the two columns to
# RFSPrediction.csv without the DataFrame index.
outcome_column = 'RelapseFreeSurvival (outcome)'
rfs_coutcome = all_df[outcome_column]
reg_outcome = pd.DataFrame({'ID': IDs, outcome_column: rfs_coutcome})
reg_outcome.to_csv('RFSPrediction.csv', index=False)