In [10]:
import pandas as pd
import numpy as np

# Read the training spreadsheet and discard its 'ID' column in one chained
# expression; the trailing expression shows the resulting (rows, cols).
all_df = pd.read_excel('TrainDataset2024.xls', index_col=False).drop(columns='ID')
all_df.shape

(400, 120)

### Data imputation

In [11]:
import json
import pickle

from sklearn.impute import SimpleImputer

# 999 marks a missing value in this dataset: it is the value tested for in
# the Gene column and the `missing_values` passed to the median imputer.
MISSING_VALUE = 999

# load gene clf feature names
with open('gene_clf_selected_features.json', 'rb') as f:
  keep_feat_names = json.load(f)

# replace missing gene with classification result
# see train_gene_classifier.ipynb for more details
# NOTE: pickle.load can execute arbitrary code from the file, so only load a
# classifier file that you produced yourself.
with open('log_reg_gene_classifier.pkl', 'rb') as f:
  log_res_clf = pickle.load(f)
# The file handle is only needed for loading, so everything below runs
# after it has been closed.

# rebuild prediction df
gene_impute_df = all_df.copy()

temp_X = gene_impute_df.drop(['pCR (outcome)', 'RelapseFreeSurvival (outcome)'], axis=1)
y = gene_impute_df['Gene']

print("before impute:") 
print(gene_impute_df['Gene'].value_counts())

# rows whose Gene value is the missing marker
keep_df = temp_X[keep_feat_names]
replace_index = keep_df[keep_df['Gene'] == MISSING_VALUE].index

# classifier input: the selected features without the Gene column itself
target = gene_impute_df.loc[replace_index, keep_feat_names].drop('Gene', axis=1)

# Only predict when there is something to fill. On a re-run of this cell
# every Gene value has already been replaced, and calling predict() on an
# empty frame would raise a ValueError.
if len(replace_index) > 0:
  pred = log_res_clf.predict(target)
  gene_impute_df.loc[replace_index, 'Gene'] = pred

print("after impute:") 
print(gene_impute_df['Gene'].value_counts())

# assign back to all_df
all_df['Gene'] = gene_impute_df['Gene']

# Replace missing values with median of the column
# NOTE(review): the imputer is fitted on every column of all_df (including
# the two outcome columns) and on all rows, i.e. before the train/test
# split further down -- confirm this is intended.
imputer = SimpleImputer(strategy="median", missing_values=MISSING_VALUE)
all_df[:] = imputer.fit_transform(all_df)

# classification target
clf_y = all_df['pCR (outcome)']
# regression target
rgr_y = all_df['RelapseFreeSurvival (outcome)']

all_df.shape

before impute:
Gene
0      193
1      119
999     88
Name: count, dtype: int64
after impute:
Gene
0    281
1    119
Name: count, dtype: int64




(400, 120)

### Outlier Removal

In [12]:
from outlier_removal import removeOutliers
# Run the project's outlier helper on the imputed frame. Its return value is
# not used here -- presumably it modifies all_df in place; confirm in
# outlier_removal.
removeOutliers(all_df)
# Feature matrix: every column except the two outcome columns.
X = all_df.drop(columns=['RelapseFreeSurvival (outcome)', 'pCR (outcome)'])

### Dimensionality Reduction - Manifold Learning

In [None]:
from umap import UMAP
from sklearn.preprocessing import StandardScaler

# Split X by position: the first 11 columns are standardised, the remaining
# columns are reduced to a 2-component UMAP embedding.
# (presumably columns 11+ are the MRI-derived features, going by the
# variable names -- confirm against the dataset description)
non_mri_part = X.iloc[:, :11]
mri_part = X.iloc[:, 11:]

scaler = StandardScaler()
Xs_non_mri = scaler.fit_transform(non_mri_part)

umap = UMAP(n_components=2, random_state=42)
X_umap_mri = umap.fit_transform(mri_part)

# Place the scaled columns and the embedding side by side.
Xs = np.hstack((Xs_non_mri, X_umap_mri))
Xs.shape

  warn(


(400, 13)

In [None]:
from sklearn.manifold import TSNE, Isomap

# Same layout as the UMAP cell, but the columns from position 11 onwards are
# embedded with t-SNE. Running this cell overwrites Xs (and Xs_non_mri).
mri_part = X.iloc[:, 11:]
non_mri_part = X.iloc[:, 0:11]

tsne = TSNE(n_components=2, random_state=42)
X_tsne_mri = tsne.fit_transform(mri_part)

standard_scaler = StandardScaler()
Xs_non_mri = standard_scaler.fit_transform(non_mri_part)

# Scaled leading columns followed by the two t-SNE components.
Xs = np.concatenate([Xs_non_mri, X_tsne_mri], axis=1)


### Train Test Split

In [14]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the target is the regression
# outcome (rgr_y) and the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    Xs,
    rgr_y,
    test_size=0.2,
    random_state=42,
)

### SVR Hyper-parameter Tuning

In [None]:
# svr 
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

# Candidate SVR settings: RBF kernel only, sweeping C and gamma.
param_grid = dict(
    C=[0.1, 1, 10, 30, 45, 50, 75, 80, 85, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['rbf'],
)

# 5-fold cross-validation over every combination, scored by negative MSE
# and run on all available cores.
grid_search = GridSearchCV(
    estimator=SVR(),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END ......................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=0.001, kernel=rbf; total time=   0.0s
[CV] END .....................C=0.1, gamma=0.00

### Validation

In [18]:
from sklearn.metrics import mean_absolute_error, r2_score

# Take the best estimator found by the grid search and predict the
# held-out split.
best = grid_search.best_estimator_
y_pred = best.predict(X_test)

# Show the first ten predictions.
print(y_pred[:10])

# Error metrics of the predictions against the held-out targets.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MAE: ", mae)
print("R2: ", r2)

[54.07159418 50.29602761 54.07181845 54.08068968 54.07203675 53.91419663
 54.07739314 52.86987333 54.12574331 54.08793857]
MAE:  21.359574037769008
R2:  -0.02750186442999225


### Random Forest for Regression + Hyper Param Tuning

In [17]:
# Persist the tuned estimator (`best`) to disk with pickle.
with open('rfs_regression_model.pkl', 'wb') as model_file:
  pickle.dump(best, model_file)