In [161]:
'''
train three models
- mandatory features - ['ER', 'HER2', 'Gene']
- all non mandatory and non MRI features
- mri features
'''

src = 'imputed_csv/gene_predicted_rest_median_imputed_no_outliers.csv'

import pandas as pd
df = pd.read_csv(src)

y = df['RelapseFreeSurvival (outcome)']
ID = df['ID']
X = df.drop(columns=['RelapseFreeSurvival (outcome)', 'ID'])

mand_df = X[['ER', 'HER2', 'Gene', 'pCR (outcome)']].copy()

X.drop(columns=['ER', 'HER2', 'Gene'], inplace=True)

mri_cols = X.columns[9:]
mri_df = X[mri_cols].copy()

X.drop(columns=mri_cols, inplace=True)
rest_df = X

In [162]:
from sklearn.ensemble import RandomForestRegressor
rnd_forest = RandomForestRegressor(
  max_depth=20,
  max_features="sqrt",
  min_samples_leaf=4,
  min_samples_split=10,
  n_estimators=50
)

In [163]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

# mand_df
scaler = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(mand_df, y, test_size=0.2, random_state=42)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)


rnd_forest.fit(X_train_scaled, y_train)
rnd_pred = rnd_forest.predict(X_test_scaled)
rnd_mae = mean_absolute_error(y_test, rnd_pred)
rnd_rmse = root_mean_squared_error(y_test, rnd_pred)
rnd_r2 = r2_score(y_test, rnd_pred)

print(f'Mandatory features: MAE: {rnd_mae}, RMSE: {rnd_rmse}, R2: {rnd_r2}')

Mandatory features: MAE: 23.338929612477767, RMSE: 28.76485435847793, R2: -0.027991090691737552


In [164]:
# mri_df
scaler = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(mri_df, y, test_size=0.2, random_state=42)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

rnd_forest.fit(X_train_scaled, y_train)
rnd_pred = rnd_forest.predict(X_test_scaled)
rnd_mae = mean_absolute_error(y_test, rnd_pred)
rnd_rmse = root_mean_squared_error(y_test, rnd_pred)
rnd_r2 = r2_score(y_test, rnd_pred)

print(f'mri features: MAE: {rnd_mae}, RMSE: {rnd_rmse}, R2: {rnd_r2}')

mri features: MAE: 21.233116930539232, RMSE: 26.8224935603013, R2: 0.106152785182002


In [165]:
# mri_df PCA
scaler = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(mri_df, y, test_size=0.2, random_state=42)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

from sklearn.decomposition import PCA
pca = PCA(n_components=5)
X_train_scaled = pca.fit_transform(X_train_scaled)
X_test_scaled = pca.fit_transform(X_test_scaled)

PCA_df = pd.DataFrame()
PCA_df['PCA_1'] = X_train_scaled[:,0]
PCA_df['PCA_2'] = X_train_scaled[:,1]
# PCA_df['PCA_3'] = X_train_scaled[:,2]

PCA_test_df = pd.DataFrame()
PCA_test_df['PCA_1'] = X_test_scaled[:,0]
PCA_test_df['PCA_2'] = X_test_scaled[:,1]
# PCA_test_df['PCA_3'] = X_test_scaled[:,2]

rnd_forest.fit(PCA_df, y_train)
rnd_pred = rnd_forest.predict(PCA_test_df)
rnd_mae = mean_absolute_error(y_test, rnd_pred)
rnd_rmse = root_mean_squared_error(y_test, rnd_pred)
rnd_r2 = r2_score(y_test, rnd_pred)

print(f'mri features: MAE: {rnd_mae}, RMSE: {rnd_rmse}, R2: {rnd_r2}')

mri features: MAE: 23.54789260872154, RMSE: 28.447189677007866, R2: -0.005411223252455466


In [166]:
# rest_df
scaler = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(rest_df, y, test_size=0.2, random_state=42)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

rnd_forest.fit(X_train_scaled, y_train)
rnd_pred = rnd_forest.predict(X_test_scaled)
rnd_mae = mean_absolute_error(y_test, rnd_pred)
rnd_rmse = root_mean_squared_error(y_test, rnd_pred)
rnd_r2 = r2_score(y_test, rnd_pred)

print(f'mri features: MAE: {rnd_mae}, RMSE: {rnd_rmse}, R2: {rnd_r2}')

mri features: MAE: 23.695424014236618, RMSE: 28.850835312522104, R2: -0.03414580667131428


In [167]:
import numpy as np
import xgboost as xgb
import lightgbm

from sklearn.linear_model import LinearRegression
df = pd.read_csv(src)
cols = df.columns
# mri_cols = cols[14:]

# # Split features into pipelines
clinical_features = ['Age', 'ChemoGrade', 'HistologyType', 'LNStatus', 'TumourStage']
biomarker_features = ['ER', 'PgR', 'HER2', 'TrippleNegative', 'Proliferation', 'Gene']
imaging_features = cols[14:]

# Create individual models
clinical_model = RandomForestRegressor()
# biomarker_model = xgb.XGBRegressor()
biomarker_model = RandomForestRegressor()
# imaging_model = lightgbm.LGBMRegressor()
imaging_model = RandomForestRegressor()

from sklearn.model_selection import train_test_split

# Split data
X_clinical = df[clinical_features]
X_biomarker = df[biomarker_features]
X_imaging = df[imaging_features]

X_train_clin, X_test_clin, y_train, y_test = train_test_split(X_clinical, y, test_size=0.2, random_state=42)
X_train_bio, X_test_bio, _, _ = train_test_split(X_biomarker, y, test_size=0.2, random_state=42)
X_train_img, X_test_img, _, _ = train_test_split(X_imaging, y, test_size=0.2, random_state=42)

# Train individual models
clinical_model.fit(X_train_clin, y_train)
biomarker_model.fit(X_train_bio, y_train)
imaging_model.fit(X_train_img, y_train)

# Generate predictions for meta-model
clinical_pred_train = clinical_model.predict(X_train_clin)
biomarker_pred_train = biomarker_model.predict(X_train_bio)
imaging_pred_train = imaging_model.predict(X_train_img)

# Stack meta-features for training
meta_features_train = np.column_stack([clinical_pred_train, biomarker_pred_train, imaging_pred_train])

print(meta_features_train)

# Meta-model training
meta_model = LinearRegression().fit(meta_features_train, y_train)

[[ 73.54916667  58.97725057  76.51      ]
 [ 34.5         49.21876695  48.095     ]
 [ 68.075       43.18812545  51.99166667]
 [ 36.01        58.97725057  44.88666667]
 [ 61.66        58.97725057  55.10833333]
 [102.85        64.21719728 101.22      ]
 [ 65.34833333  55.69008333  65.35333333]
 [ 48.89        40.26099903  54.79      ]
 [ 74.30833333  62.98293057  84.75      ]
 [ 53.19564286  49.21876695  40.7975    ]
 [ 69.21        64.21719728  70.23333333]
 [ 54.21        58.97725057  56.85583333]
 [ 34.47666667  56.98359524  38.32583333]
 [ 49.77666667  39.82845679  44.91333333]
 [ 71.81        64.21719728  72.0525    ]
 [ 31.33        64.21719728  22.39583333]
 [ 65.98333333  59.98615821  64.76166667]
 [ 44.485       43.18812545  46.63416667]
 [ 54.37166667  73.74338492  62.55166667]
 [ 50.1         55.69008333  48.27583333]
 [ 64.797       58.68288     68.14333333]
 [ 76.32        43.18812545  79.535     ]
 [ 60.89916667  61.08813925  64.2       ]
 [ 44.03        49.21876695  43.74

In [168]:
# Generate test predictions for meta-model
clinical_pred_test = clinical_model.predict(X_test_clin)
biomarker_pred_test = biomarker_model.predict(X_test_bio)
imaging_pred_test = imaging_model.predict(X_test_img)

# Stack meta-features for testing
meta_features_test = np.column_stack([clinical_pred_test, biomarker_pred_test, imaging_pred_test])

# Meta-model predictions
meta_pred = meta_model.predict(meta_features_test)

# Evaluate the meta-model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

mse = mean_squared_error(y_test, meta_pred)
r2 = r2_score(y_test, meta_pred)
mae = mean_absolute_error(y_test, meta_pred)
print(f"Meta-Model MSE: {mse}")
print(f"Meta-Model MAE: {mae}")
print(f"Meta-Model R²: {r2}")


Meta-Model MSE: 753.6850556756132
Meta-Model MAE: 22.125750197593604
Meta-Model R²: 0.06361403454624337
