In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import os
import virtual_biopsy_utils as vbu
import integration_images_features_utils as image_utils
import ast
import delong
import shap

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

### Load sentara

In [2]:
sen_data = vbu.load_sentara(path = '../pkls/sentara.pkl', overwrite=True)

shape: (2395, 7713)


### Load shared features between maccabi and sentara 

In [3]:
shared = pkl.load(open('../pkls/shared_features_mac_sen.pkl', 'rb'))

len(shared)

1850

### Load and add additional features from Ella: suspicious calcifications, spiculated leasions, arch distortion

In [4]:
# spic_lesions_studies = [x.split('\t') for x in open('../input_files/additional_features_spiculated_lesions_sentara.txt').readlines()]
# arch_dist_studies = [x.split('\t') for x in open('../input_files/additional_features_architectural_distortion_sentara.txt').readlines()]
# susp_calc_studies = [x.split('\t') for x in open('../input_files/additional_features_suspicius_calcifications_sentara.txt').readlines()]

# spic_lesions_studies = [item[1][:-1] for item in spic_lesions_studies[1:]]
# arch_dist_studies = [item[1][:-1] for item in arch_dist_studies[1:]]
# susp_calc_studies = [item[1][:-1] for item in susp_calc_studies[1:]]

# sen_data['spiculated_lesions_report'] = np.array([x in spic_lesions_studies for x in sen_data.study_id.tolist()]).astype(int)
# sen_data['architectural_distortion_report'] = np.array([x in arch_dist_studies for x in sen_data.study_id.tolist()]).astype(int)
# sen_data['suspicious_calcifications_report'] = np.array([x in susp_calc_studies for x in sen_data.study_id.tolist()]).astype(int)

### Add calculated features to sentara

* Add BMI estimation

In [5]:
sen_data = vbu.add_bmi_estimation_sentara(df=sen_data)

* Add likelihood of obesity estimation

In [6]:
sen_data = vbu.add_likelihood_obesity_estimation_sentara(df=sen_data)

* Breast density estimation

In [7]:
sen_data = vbu.add_density_estimation_sentara(df = sen_data)

### Use shared features only

In [8]:
sen_data = sen_data[shared]

### Split data

In [9]:
x_train, y_train, x_val, y_val, x_test, y_test = vbu.split_sentara(sen_data, 
                                train_path = '../pkls/sentara_train.pkl',
                                val_path = '../pkls/sentara_val.pkl', 
                                test_path = '../pkls/sentara_test.pkl', overwrite = True)

Number of samples in train: 1685, val: 357 and test: 353


### Transform multilabel to single label - only on the train set

In [10]:
sen_data_train = x_train.combine_first(y_train)

cancers = [x for x in sen_data_train if 'outcome_cancer_type' in x]

temp = []
for _, row in sen_data_train.iterrows():
    
    cols_one = row[cancers].loc[lambda x:x==True].index
    for col_name in cols_one:
        temp_row = row.copy()
        temp_row[cancers] = 0
        temp_row[col_name] = 1
        temp.append(temp_row)
        
sen_data_train = pd.DataFrame(temp)
multiindex = sen_data_train.index.tolist()
sen_data_train.reset_index(inplace=True)
sen_data_train['patient_id'] = [item[1] for item in multiindex]
sen_data_train['study_date'] = [item[2] for item in multiindex]
sen_data_train.set_index(['patient_id', 'study_date'], inplace = True, append=True)
sen_data_train.drop('index', axis = 1, inplace=True)
print('New number of samples of training set: %d' %sen_data_train.shape[0])

# Redefine new training dataframes with the single label samples
y_train = sen_data_train[[x for x in sen_data_train.columns if x.startswith('outcome_cancer_')]]
x_train = sen_data_train.drop(columns=[x for x in sen_data_train.columns if x.startswith('outcome_')])

New number of samples of training set: 1729


In [11]:
y_val.to_csv('sentara_outputs_for_delong.csv')

### Add imaging features to train and val sets

* Predictions

In [12]:
pred_file_path = '../input_files/final_tal_predictons_without_annotation_train_and_val.csv'

pred = image_utils.compute_predictions_images_sentara(pred_file_path)
x_train = x_train.join(pred.set_index('study_id'), on='study_id')
x_val = x_val.join(pred.set_index('study_id'), on='study_id')

* Microcalcifications

In [13]:
# micro = image_utils.get_microcalc_features()
# x_train = x_train.join(micro.set_index('study_id'), on='study_id')
# x_val = x_val.join(micro.set_index('study_id'), on='study_id')

* Findings size

In [14]:
# finding = image_utils.get_findings_size_features()
# x_train = x_train.join(finding.set_index('study_id'), on='study_id')
# x_val = x_val.join(finding.set_index('study_id'), on='study_id')

* Findings_x_max 

In [15]:
# finding_x = image_utils.get_findings_x_max_features(overwrite=False)

# x_train = x_train.join(finding_x.set_index('study_id'), on='study_id')
# x_val = x_val.join(finding_x.set_index('study_id'), on='study_id')

* Findings_y_max 

In [16]:
# finding_y = image_utils.get_findings_y_max_features(overwrite=False)

# x_train = x_train.join(finding_y.set_index('study_id'), on='study_id')
# x_val = x_val.join(finding_y.set_index('study_id'), on='study_id')

* Types: Calcification, breast assymetry, tumor, architectural distortion, axillary lymphadenopathy

In [17]:
# types = image_utils.get_types_features(overwrite=False)

# x_train = x_train.join(types.set_index('study_id'), on='study_id')
# x_val = x_val.join(types.set_index('study_id'), on='study_id')

#### Drop features we dont use

In [18]:
# if clinical only or clinical+images:
x_train.drop(columns=['study_id'], inplace=True) #studyid
x_val.drop(columns=['study_id'], inplace=True)

#if images only:
# imaging_feats = [x for x in x_train if 'pred' in x]
# x_train = x_train[imaging_feats]
# x_val = x_val[imaging_feats]

print(x_train.shape, x_val.shape, y_train.shape, y_val.shape)

(1729, 1937) (357, 1937) (1729, 5) (357, 5)


#### Fill missing values in categorical data

In [19]:
cat_feats = [x for x in x_train.columns if 'ind' in x] + [x for x in x_train.columns if 'cnt' in x] +\
              [ 'breast_density_past']

x_train[cat_feats] = x_train[cat_feats].fillna(x_train[cat_feats].mode().iloc[0])
x_val[cat_feats] = x_val[cat_feats].fillna(x_val[cat_feats].mode().iloc[0])

### Logistic Regression 

In [20]:
categories = ['outcome_cancer_type_DCIS', 'outcome_cancer_type_Invasive', 'outcome_cancer_type_BenignHR',
           'outcome_cancer_type_Papilloma', 'outcome_cancer_type_Benign']



LogReg_pipeline = Pipeline([
    ('imputation', SimpleImputer(missing_values = np.nan, strategy = 'mean')), # impute values with mean
    ('scaler', MinMaxScaler()), 
    ('clf', OneVsRestClassifier(LogisticRegression(random_state = 42), n_jobs=-1)),
])


grid_params_lr = {'clf__estimator__C': [0.001, 0.01, 0.1, 1, 10],
                 'clf__estimator__tol': [1e-3, 1e-4, 1e-5]}

LR= RandomizedSearchCV(estimator = LogReg_pipeline,
                 param_distributions = grid_params_lr,
                 scoring = 'roc_auc', n_iter = 15,
                 cv=10, random_state=42) 

predict_probs = []

for category in categories:
    print('**Processing class {} ...**'.format(category))
    
    if os.path.isfile('../pkls/cancer_prediction_shared_features_pkls/LogReg/feature_set_both/best_model_randomized_search_' + str(category) + '.pkl'):
        best_model = pkl.load(open('../pkls/cancer_prediction_shared_features_pkls/LogReg/feature_set_both/best_model_randomized_search_' + str(category) + '.pkl', 'rb'))
    else:
    
        LR.fit(x_train, y_train[category])
        y_pred = LR.predict(x_val)


        scores = LR.fit(x_train, y_train[category]).decision_function(x_val)
        prob = LR.predict_proba(x_val)[:, 1]

        predict_probs.append(prob)
        
        print('AUC is {:.2f} [{:.2f}, {:.2f}]'.format(roc_auc_score(y_val[category], 
                                        prob), *delong.get_delong_ci(prob, y_val[category])))

        print('\n')
        
        pkl.dump(LR.best_estimator_, open('../pkls/cancer_prediction_shared_features_pkls/LogReg/feature_set_both/best_model_randomized_search_' + str(category) + '.pkl', 'wb')) 

    
#     if os.path.isfile('../pkls/cancer_prediction_pkls/LogReg/without dicom/best_model_randomized_search_' + str(category) + '.pkl'):
#         best_model = pkl.load(open('../pkls/cancer_prediction_pkls/LogReg/without dicom/best_model_randomized_search_' + str(category) + '.pkl', 'rb'))
    

    #SHAP
#     if os.path.isfile('../pkls/cancer_prediction_pkls/LogReg/shap_values_' + str(category) + '.pkl'):
#         shap_values = pkl.load(open('../pkls/cancer_prediction_shared_features_pkls/LogReg/shap_values_' + str(category) + '.pkl', 'rb'))
#     else:
#         x_val_sample = shap.sample(x_val, 50)
#         explainer = shap.KernelExplainer(best_model.predict_proba, x_val_sample) 
#         shap_values = explainer.shap_values(x_val_sample)
#         pkl.dump(shap_values, open('../pkls/cancer_prediction_shared_features_pkls/LogReg/shap_values_' + str(category) + '.pkl', 'wb'))

**Processing class outcome_cancer_type_DCIS ...**
AUC is 0.68 [0.59, 0.76]


**Processing class outcome_cancer_type_Invasive ...**
AUC is 0.50 [0.42, 0.57]


**Processing class outcome_cancer_type_BenignHR ...**
AUC is 0.52 [0.33, 0.71]


**Processing class outcome_cancer_type_Papilloma ...**
AUC is 0.51 [0.38, 0.64]


**Processing class outcome_cancer_type_Benign ...**
AUC is 0.55 [0.48, 0.60]




In [21]:
np.savetxt('predict_probs_Logres_both.csv', predict_probs, delimiter=',')

In [None]:
# x_val_sample = shap.sample(x_val, 50)
# explainer = shap.KernelExplainer(LR.best_estimator_.predict_proba, x_val_sample) 
# shap_values = explainer.shap_values(x_val_sample)

In [None]:
# shap.summary_plot(shap_values[1], x_val_sample, plot_type = "dot")