In [1]:
import joblib
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Install xlrd package
%pip install xlrd

# Load the test dataset. The ID column is split off so that it is not fed to
# the models; it is re-attached when the result files are written.
all_df = pd.read_excel('TestDatasetExample.xls', index_col=False)
IDs = all_df.pop('ID')

# Load the artefacts stored with the pCR classifier.
# NOTE: joblib.load unpickles the file - only load bundles from a trusted source.
saved_model = joblib.load('pcr_classification_model.joblib')
model = saved_model['model']
scaler = saved_model['scaler']
lda = saved_model['lda']
selected_features_indices = saved_model['selected_features_indices']


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


### Pre-process testing data

In [2]:
from sklearn.impute import SimpleImputer

# Replace the 999 missing-value sentinel with the median of each column.
# The frame is rebuilt from the imputed array (same columns and index) rather
# than assigned through `all_df[:] = ...`, which emitted a pandas warning in
# the recorded run.
# NOTE(review): the imputer is fit on the data being scored, not on the
# training data - confirm this matches how the classifier was trained.
# NOTE(review): after this cell no 999 remains in any column (Gene included),
# so the "missing Gene" branch in the Part 2 pre-processing cell cannot trigger.
imputer = SimpleImputer(strategy="median", missing_values=999)
all_df = pd.DataFrame(
    imputer.fit_transform(all_df),
    columns=all_df.columns,
    index=all_df.index,
)

# Standardize the data with the scaler loaded alongside the classifier
Xs = scaler.transform(all_df)

  all_df[:] = imputer.fit_transform(all_df)


### Apply the feature selection and dimensionality reduction used in training

In [3]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Columns the classifier uses directly (ER, HER2 and Gene), picked by the
# indices stored with the model
non_mri_features = Xs[:, selected_features_indices]

# Everything from column 11 onwards is treated as the MRI feature group
mri_indices = [*range(11, Xs.shape[1])]
mri = Xs[:, mri_indices]

# Project the MRI group with the LDA loaded from the model bundle
Xs_lda = lda.transform(Xs[:, 11:])

# Final classifier input: selected columns followed by the LDA projection
Xs = np.concatenate((non_mri_features, Xs_lda), axis=1)

### Predict Classification

In [4]:
# Classify every row with the loaded model and keep the predicted label
# next to the input features
result = model.predict(Xs)
all_df['pCR (outcome)'] = result

# Part 2: RFS Regression

### Pre-process for Regression

In [5]:

from sklearn.impute import SimpleImputer
import pickle

# load gene clf feature names
import json
# Feature names used by the Gene classifier ('Gene' itself is expected to be
# in this list; it is dropped again before predicting)
with open('gene_clf_selected_features.json', 'rb') as f:
  keep_feat_names = json.load(f)

# Rows whose Gene value is the 999 missing-value sentinel
missing_gene_mask = all_df['Gene'] == 999

if not missing_gene_mask.any():
  print("no missing Gene - skipping gene impute")
else:
  # replace missing gene with classification result
  # see train_gene_classifier.ipynb for more details
  # NOTE: pickle.load executes code from the file - only load trusted models.
  with open('log_reg_gene_classifier.pkl', 'rb') as f:
    log_res_clf = pickle.load(f)

  print("before impute:")
  print(all_df['Gene'].value_counts())

  # get prediction on missing gene
  target = all_df.loc[missing_gene_mask, keep_feat_names].drop('Gene', axis=1)
  print("target shape:", target.shape)

  all_df.loc[missing_gene_mask, 'Gene'] = log_res_clf.predict(target)

  print("after impute:")
  print(all_df['Gene'].value_counts())

# Replace remaining missing values with the median of the column.
# This runs whether or not Gene needed imputing, so a 999 in any other column
# is never passed on to the regression model.
imputer = SimpleImputer(strategy="median", missing_values=999)
all_df = pd.DataFrame(
  imputer.fit_transform(all_df),
  columns=all_df.columns,
  index=all_df.index,
)

no missing Gene - skipping gene impute


In [6]:
from sklearn.preprocessing import StandardScaler
# Use a dedicated name so the pre-fitted `scaler` loaded with the pCR
# classifier is not overwritten; re-running the classification
# pre-processing cell would otherwise pick up this scaler instead.
# NOTE(review): this scaler is fit on the data being scored - confirm that
# matches how the regression model was trained.
rfs_scaler = StandardScaler()
Xs = rfs_scaler.fit_transform(all_df)

In [7]:
### Feature selection and dimensionality reduction for the RFS model:
#
# 1. Keep the first 13 columns unchanged
# 2. Compress every remaining column into a single PCA component
# 3. Stack the two groups side by side
#
# NOTE(review): in the recorded output the first 13 columns include
# 'original_shape_Elongation' and 'original_shape_Flatness', and the PCA group
# ends with 'pCR (outcome)'. Confirm this split matches the column layout the
# regression model was trained on.

from sklearn.decomposition import PCA

# Columns passed through as-is
non_mri_indicies = list(range(13))
print(all_df.columns[non_mri_indicies])

# Columns reduced by PCA
mri_indices = list(range(non_mri_indicies[-1] + 1, Xs.shape[1]))
print(all_df.columns[mri_indices])

# Fit a one-component PCA on the second group
pca = PCA(n_components=1)
X_mri = Xs[:, mri_indices]
X_mri_pca = pca.fit_transform(X_mri)

# Pass-through columns first, PCA component last
non_mri_feats = Xs[:, non_mri_indicies]
Xs = np.concatenate((non_mri_feats, X_mri_pca), axis=1)

print("final shape:", Xs.shape, sep="\n")

Index(['Age', 'ER', 'PgR', 'HER2', 'TrippleNegative', 'ChemoGrade',
       'Proliferation', 'HistologyType', 'LNStatus', 'TumourStage', 'Gene',
       'original_shape_Elongation', 'original_shape_Flatness'],
      dtype='object')
Index(['original_shape_LeastAxisLength', 'original_shape_MajorAxisLength',
       'original_shape_Maximum2DDiameterColumn',
       'original_shape_Maximum2DDiameterRow',
       'original_shape_Maximum2DDiameterSlice',
       'original_shape_Maximum3DDiameter', 'original_shape_MeshVolume',
       'original_shape_MinorAxisLength', 'original_shape_Sphericity',
       'original_shape_SurfaceArea',
       ...
       'original_glszm_SmallAreaLowGrayLevelEmphasis',
       'original_glszm_ZoneEntropy', 'original_glszm_ZonePercentage',
       'original_glszm_ZoneVariance', 'original_ngtdm_Busyness',
       'original_ngtdm_Coarseness', 'original_ngtdm_Complexity',
       'original_ngtdm_Contrast', 'original_ngtdm_Strength', 'pCR (outcome)'],
      dtype='object', length

### Load Model and Predict

In [8]:
# Deserialize the RFS regressor; the file is only needed for the load itself
with open('rfs_regression_model.pkl', 'rb') as model_file:
  rfs_model = pickle.load(model_file)

# Predict RFS for every row and store it next to the pCR prediction
rfs_pred = rfs_model.predict(Xs)
all_df['RelapseFreeSurvival (outcome)'] = rfs_pred


In [9]:
# Show the first rows of both predicted outcome columns
outcome_columns = ['pCR (outcome)', 'RelapseFreeSurvival (outcome)']
print(all_df[outcome_columns].head())

   pCR (outcome)  RelapseFreeSurvival (outcome)
0              0                      51.998804
1              1                      53.630021
2              0                      47.038868


In [10]:
### Save results to CSV
import os

# to_csv does not create missing directories, so make sure the output folder
# exists before writing
os.makedirs('results', exist_ok=True)

pcr_outcome = all_df['pCR (outcome)']
rfs_outcome = all_df['RelapseFreeSurvival (outcome)']

# One file per task, each pairing the original ID with its prediction
clf_outcome = pd.DataFrame({'ID': IDs, 'pCR (outcome)': pcr_outcome})
reg_outcome = pd.DataFrame({'ID': IDs, 'RelapseFreeSurvival (outcome)': rfs_outcome})

clf_outcome.to_csv('results/classification_outcome.csv', index=False)
reg_outcome.to_csv('results/regression_outcome.csv', index=False)