### Imports 

In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import os
import virtual_biopsy_utils as vbu
import integration_images_features_utils as image_utils
import ast
import delong

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score, brier_score_loss, precision_score, recall_score
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
import shap

In [2]:
# Load the Sentara dataset through the project helper.
# NOTE(review): `overwrite=True` presumably rebuilds and re-saves the pickle at
# `path` on every run instead of reusing a cached copy — confirm in virtual_biopsy_utils.
sen_data = vbu.load_sentara(path = '../pkls/sentara.pkl', overwrite=True)

shape: (2395, 7713)


In [3]:
# Load the pickled list of shared feature names.
# The file is opened in a `with` block so the handle is closed deterministically;
# the previous `pkl.load(open(...))` form left it open until garbage collection.
# NOTE: unpickling executes arbitrary code — only load pickles produced by this project.
with open('../pkls/shared_features_mac_sen.pkl', 'rb') as handle:
    shared = pkl.load(handle)

# Number of shared features (displayed as the cell output).
len(shared)

1842

### Add calculated features to Sentara

* Add BMI estimation

In [4]:
# Rebind `sen_data` to the helper's return value.
# NOTE(review): presumably this adds a BMI-estimate feature to the frame; because the
# cell feeds `sen_data` back into itself, confirm the helper is safe to run twice.
sen_data = vbu.add_bmi_estimation_sentara(df=sen_data)

* Add likelihood of obesity estimation

In [5]:
# Rebind `sen_data` to the helper's return value.
# NOTE(review): presumably this adds a likelihood-of-obesity feature; it may depend on
# the BMI estimate added in the previous step — confirm the required cell order.
sen_data = vbu.add_likelihood_obesity_estimation_sentara(df=sen_data)

* Add breast density estimation

In [6]:
# Rebind `sen_data` to the helper's return value.
# NOTE(review): presumably this adds a breast-density estimate feature — confirm in
# virtual_biopsy_utils, including whether re-running the cell is harmless.
sen_data = vbu.add_density_estimation_sentara(df = sen_data)

### Use shared features only

In [7]:
# Keep only the columns named in the `shared` feature list loaded earlier.
# `.copy()` makes the subset an independent frame: new label columns are assigned to
# it in the following cell, and writing into a bare column selection is what triggers
# pandas' SettingWithCopyWarning.
sen_data = sen_data[shared].copy()

In [8]:
# Collapse the outcome columns into a binary malignant-vs-benign target.
# The OR is parenthesised on purpose: `+` binds tighter than `|`, so the previous
# `a | b + 0` evaluated as `a | (b + 0)` and the integer cast was applied to a single
# operand instead of to the combined flag.
is_malignant = (
    sen_data['outcome_cancer_type_DCIS'] | sen_data['outcome_cancer_type_Invasive']
) + 0  # `+ 0` turns a boolean result into 0/1 integers

# `assign` returns a new frame, so the new columns are never written into a column
# selection of the original data. BenignAll is the complement of Malignant.
sen_data = sen_data.assign(
    outcome_cancer_type_Malignant=is_malignant,
    outcome_cancer_type_BenignAll=1 - is_malignant,
)

### Split data

In [9]:
# Split the prepared frame into train / validation / test feature and label sets.
# The three paths are handed to the helper together with `overwrite=True`.
# NOTE(review): presumably the helper writes each split to its path — confirm.
(
    x_train, y_train,
    x_val, y_val,
    x_test, y_test,
) = vbu.split_sentara(
    sen_data,
    train_path='../pkls/sentara_train.pkl',
    val_path='../pkls/sentara_val.pkl',
    test_path='../pkls/sentara_test.pkl',
    overwrite=True,
)

Number of samples in train: 1685, val: 357 and test: 353


In [10]:
# Show the (rows, columns) shape of each split: features then labels, for train, val, test.
print(*(split.shape for split in (x_train, y_train, x_val, y_val, x_test, y_test)))

(1685, 1830) (1685, 7) (357, 1830) (357, 7) (353, 1830) (353, 7)


### Add imaging features to all sets

In [11]:
# Predictions file produced by the image classifier for the entire set (train, val, test).
pred_file_path = '../input_files/final_tal_predictons_without_annotation_all.csv'

pred = image_utils.compute_predictions_malignant_benign_images_sentara(pred_file_path)

# Index the predictions by study once and reuse the result for all three joins.
pred_by_study = pred.set_index('study_id')

# Left-join the imaging features onto each feature set by `study_id`.
x_train = x_train.join(pred_by_study, on='study_id')
x_val = x_val.join(pred_by_study, on='study_id')
x_test = x_test.join(pred_by_study, on='study_id')

# A left join only changes the row count when `pred` repeats a study_id; that would
# silently misalign the features with their labels, so fail loudly instead.
assert (len(x_train), len(x_val), len(x_test)) == (len(y_train), len(y_val), len(y_test)), \
    "joining image predictions changed the number of rows; check `pred` for duplicate study_id values"

In [12]:
# Re-check the shapes after the join: the feature sets gain columns, row counts should not move.
print(*(split.shape for split in (x_train, y_train, x_val, y_val, x_test, y_test)))

(1685, 1862) (1685, 7) (357, 1862) (357, 7) (353, 1862) (353, 7)


In [15]:
# Column-wise totals of the training labels, one value per outcome column
# (this is the number of positives per class if the columns are 0/1 indicators).
y_train.sum(axis=0)

outcome_cancer_type_BenignHR       60
outcome_cancer_type_DCIS          173
outcome_cancer_type_Papilloma     100
outcome_cancer_type_Invasive      440
outcome_cancer_type_Benign        956
outcome_cancer_type_Malignant     610
outcome_cancer_type_BenignAll    1075
dtype: int64

In [16]:
# Column-wise totals of the validation labels, one value per outcome column
# (this is the number of positives per class if the columns are 0/1 indicators).
y_val.sum(axis=0)

outcome_cancer_type_BenignHR      11
outcome_cancer_type_DCIS          44
outcome_cancer_type_Papilloma     23
outcome_cancer_type_Invasive      89
outcome_cancer_type_Benign       202
outcome_cancer_type_Malignant    132
outcome_cancer_type_BenignAll    225
dtype: int64

In [17]:
# Column-wise totals of the test labels, one value per outcome column
# (this is the number of positives per class if the columns are 0/1 indicators).
y_test.sum(axis=0)

outcome_cancer_type_BenignHR      10
outcome_cancer_type_DCIS          40
outcome_cancer_type_Papilloma     17
outcome_cancer_type_Invasive      86
outcome_cancer_type_Benign       205
outcome_cancer_type_Malignant    126
outcome_cancer_type_BenignAll    227
dtype: int64

In [13]:
# Sanity check: first-level index values present in the training labels but missing
# from the training features. An empty list means every label row has a feature row.
list(
    set(y_train.index.get_level_values(0).tolist())
    - set(x_train.index.get_level_values(0).tolist())
)

[]

In [14]:
# Persist the final feature/label sets for the malignant-vs-benign (MALBEN) task.
# Each entry pairs a destination path with the object to pickle; entries are written
# in the listed order with the highest available pickle protocol.
pickle_targets = [
    ('../pkls/x_test_ready_for_testing_MALBEN.pkl', x_test),
    ('../pkls/y_test_ready_for_testing_MALBEN.pkl', y_test),
    ('../pkls/x_train_ready_for_training_MALBEN.pkl', x_train),
    ('../pkls/y_train_ready_for_training_MALBEN.pkl', y_train),
    ('../pkls/x_val_ready_for_training_MALBEN.pkl', x_val),
    ('../pkls/y_val_ready_for_training_MALBEN.pkl', y_val),
]

for pkl_path, frame in pickle_targets:
    with open(pkl_path, 'wb') as handle:
        pkl.dump(frame, handle, protocol=pkl.HIGHEST_PROTOCOL)