In [143]:
'''
train three models
- mandatory features - ['ER', 'HER2', 'Gene']
- all non mandatory and non MRI features
- mri features
'''

src = 'imputed_csv/gene_predicted_rest_median_imputed_no_outliers.csv'

import pandas as pd
df = pd.read_csv(src)

y = df['RelapseFreeSurvival (outcome)']
ID = df['ID']
X = df.drop(columns=['RelapseFreeSurvival (outcome)', 'ID'])

mand_df = X[['ER', 'HER2', 'Gene', 'pCR (outcome)']].copy()

X.drop(columns=['ER', 'HER2', 'Gene'], inplace=True)

mri_cols = X.columns[9:]
mri_df = X[mri_cols].copy()

X.drop(columns=mri_cols, inplace=True)
rest_df = X

In [144]:
from sklearn.ensemble import RandomForestRegressor
rnd_forest = RandomForestRegressor(
  max_depth=20,
  max_features="sqrt",
  min_samples_leaf=4,
  min_samples_split=10,
  n_estimators=50
)

In [145]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

# mand_df
scaler = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(mand_df, y, test_size=0.2, random_state=42)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)


rnd_forest.fit(X_train_scaled, y_train)
rnd_pred = rnd_forest.predict(X_test_scaled)
rnd_mae = mean_absolute_error(y_test, rnd_pred)
rnd_rmse = root_mean_squared_error(y_test, rnd_pred)
rnd_r2 = r2_score(y_test, rnd_pred)

print(f'Mandatory features: MAE: {rnd_mae}, RMSE: {rnd_rmse}, R2: {rnd_r2}')

Mandatory features: MAE: 23.41907038363716, RMSE: 28.73801283441071, R2: -0.02607347460949705


In [146]:
# mri_df
scaler = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(mri_df, y, test_size=0.2, random_state=42)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

rnd_forest.fit(X_train_scaled, y_train)
rnd_pred = rnd_forest.predict(X_test_scaled)
rnd_mae = mean_absolute_error(y_test, rnd_pred)
rnd_rmse = root_mean_squared_error(y_test, rnd_pred)
rnd_r2 = r2_score(y_test, rnd_pred)

print(f'mri features: MAE: {rnd_mae}, RMSE: {rnd_rmse}, R2: {rnd_r2}')

mri features: MAE: 21.62071884746266, RMSE: 26.95915774033452, R2: 0.09702103915334848


In [147]:
# mri_df PCA
scaler = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(mri_df, y, test_size=0.2, random_state=42)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

from sklearn.decomposition import PCA
pca = PCA(n_components=10)
X_train_scaled = pca.fit_transform(X_train_scaled)
X_test_scaled = pca.fit_transform(X_test_scaled)

PCA_df = pd.DataFrame()
PCA_df['PCA_1'] = X_train_scaled[:,0]
PCA_df['PCA_2'] = X_train_scaled[:,1]
PCA_df['PCA_3'] = X_train_scaled[:,2]

PCA_test_df = pd.DataFrame()
PCA_test_df['PCA_1'] = X_test_scaled[:,0]
PCA_test_df['PCA_2'] = X_test_scaled[:,1]
PCA_test_df['PCA_3'] = X_test_scaled[:,2]

rnd_forest.fit(PCA_df, y_train)
rnd_pred = rnd_forest.predict(PCA_test_df)
rnd_mae = mean_absolute_error(y_test, rnd_pred)
rnd_rmse = root_mean_squared_error(y_test, rnd_pred)
rnd_r2 = r2_score(y_test, rnd_pred)

print(f'mri features: MAE: {rnd_mae}, RMSE: {rnd_rmse}, R2: {rnd_r2}')

mri features: MAE: 21.688529472205904, RMSE: 27.334988637191824, R2: 0.07166914098971455


In [148]:
# rest_df
scaler = StandardScaler()
X_train, X_test, y_train, y_test = train_test_split(rest_df, y, test_size=0.2, random_state=42)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

rnd_forest.fit(X_train_scaled, y_train)
rnd_pred = rnd_forest.predict(X_test_scaled)
rnd_mae = mean_absolute_error(y_test, rnd_pred)
rnd_rmse = root_mean_squared_error(y_test, rnd_pred)
rnd_r2 = r2_score(y_test, rnd_pred)

print(f'mri features: MAE: {rnd_mae}, RMSE: {rnd_rmse}, R2: {rnd_r2}')

mri features: MAE: 23.065431190767647, RMSE: 28.216322738124152, R2: 0.010841652979916794


In [158]:
import numpy as np
import xgboost as xgb
import lightgbm

from sklearn.linear_model import LinearRegression
df = pd.read_csv(src)
cols = df.columns
# mri_cols = cols[14:]

# # Split features into pipelines
clinical_features = ['Age', 'ChemoGrade', 'HistologyType', 'LNStatus', 'TumourStage']
biomarker_features = ['ER', 'PgR', 'HER2', 'TrippleNegative', 'Proliferation', 'Gene']
imaging_features = cols[14:]

# Create individual models
clinical_model = RandomForestRegressor()
# biomarker_model = xgb.XGBRegressor()
biomarker_model = RandomForestRegressor()
# imaging_model = lightgbm.LGBMRegressor()
imaging_model = RandomForestRegressor()

from sklearn.model_selection import train_test_split

# Split data
X_clinical = df[clinical_features]
X_biomarker = df[biomarker_features]
X_imaging = df[imaging_features]

X_train_clin, X_test_clin, y_train, y_test = train_test_split(X_clinical, y, test_size=0.2, random_state=42)
X_train_bio, X_test_bio, _, _ = train_test_split(X_biomarker, y, test_size=0.2, random_state=42)
X_train_img, X_test_img, _, _ = train_test_split(X_imaging, y, test_size=0.2, random_state=42)

# Train individual models
clinical_model.fit(X_train_clin, y_train)
biomarker_model.fit(X_train_bio, y_train)
imaging_model.fit(X_train_img, y_train)

# Generate predictions for meta-model
clinical_pred_train = clinical_model.predict(X_train_clin)
biomarker_pred_train = biomarker_model.predict(X_train_bio)
imaging_pred_train = imaging_model.predict(X_train_img)

# Stack meta-features for training
meta_features_train = np.column_stack([clinical_pred_train, biomarker_pred_train, imaging_pred_train])

print(meta_features_train)

# Meta-model training
meta_model = LinearRegression().fit(meta_features_train, y_train)

[[ 72.8         59.53913815  81.72833333]
 [ 37.54        48.9478425   48.4625    ]
 [ 61.335       44.27156488  52.62166667]
 [ 39.12        59.53913815  43.4775    ]
 [ 61.26        59.53913815  54.47      ]
 [104.49        64.54895045 105.87416667]
 [ 65.60666667  55.12678694  63.52166667]
 [ 47.42916667  42.83823304  50.77      ]
 [ 77.48666667  64.09275843  81.23333333]
 [ 54.817       48.9478425   39.81833333]
 [ 72.01        64.54895045  72.4275    ]
 [ 53.95        59.53913815  54.34416667]
 [ 33.15583333  53.40988095  41.10333333]
 [ 52.90166667  40.26547     46.48416667]
 [ 79.515       64.54895045  75.10666667]
 [ 36.35        64.54895045  28.06583333]
 [ 67.86833333  58.63392394  65.545     ]
 [ 44.95583333  44.27156488  46.08416667]
 [ 53.705       75.02438889  63.06583333]
 [ 48.47083333  55.12678694  49.12416667]
 [ 66.64        59.47638027  69.8225    ]
 [ 73.61        44.27156488  79.72      ]
 [ 60.8075      62.77533333  64.99      ]
 [ 44.50166667  48.9478425   42.74

In [157]:
# Generate test predictions for meta-model
clinical_pred_test = clinical_model.predict(X_test_clin)
biomarker_pred_test = biomarker_model.predict(X_test_bio)
imaging_pred_test = imaging_model.predict(X_test_img)

# Stack meta-features for testing
meta_features_test = np.column_stack([clinical_pred_test, biomarker_pred_test, imaging_pred_test])

# Meta-model predictions
meta_pred = meta_model.predict(meta_features_test)

# Evaluate the meta-model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

mse = mean_squared_error(y_test, meta_pred)
r2 = r2_score(y_test, meta_pred)
mae = mean_absolute_error(y_test, meta_pred)
print(f"Meta-Model MSE: {mse}")
print(f"Meta-Model MAW: {mae}")
print(f"Meta-Model R²: {r2}")


Meta-Model MSE: 762.6197095329928
Meta-Model MAW: 22.65859170911335
Meta-Model R²: 0.05251352987889524
