In [1]:
import numpy as np
import pandas as pd
import datetime as dt
from time import time

from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.pipeline import make_pipeline

from sklearn.metrics import f1_score, auc, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import classification_report, confusion_matrix


# Show (nearly) all columns when displaying or summarising wide frames.
pd.options.display.max_info_columns = 250
pd.options.display.max_columns = 500

# To make pretty plots
import matplotlib.pyplot as plt
import seaborn as sns

# matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so pick whichever name is installed.
for _style_name in ("seaborn-v0_8-ticks", "seaborn-ticks"):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_style('ticks')

# Notebook-wide figure defaults.
plt.rcParams['figure.figsize'] = (6, 4)
plt.rcParams['axes.titlesize'] = 22
plt.rcParams['axes.labelsize'] = 20
plt.rcParams['xtick.labelsize'] = 16
plt.rcParams['ytick.labelsize'] = 16

%matplotlib inline

# Load data

In [2]:
# Read the EMR table and discard the duration / over7d / over14d / over21d
# columns in one expression; `over72h` is kept and used as the target later on.
df = pd.read_csv("../data/emr-filled-2984.csv").drop(
    columns=["duration", "over7d", "over14d", "over21d"]
)
df.head()

Unnamed: 0,stay_id,over72h,gender,weight,height,pao2fio2ratio,co2_total_max,co2_total_min,ph_max,ph_min,lactate_max,lactate_min,heart_rate_max,heart_rate_min,mbp_ni_max,mbp_ni_min,mbp_arterial_max,mbp_arterial_min,resp_rate_max,resp_rate_min,spo2_max,spo2_min,temp_max,temp_min,glucose_max,glucose_min,epinephrine,vasopressin,dobutamine,norepinephrine,phenylephrine,dopamine,count_of_vaso,fio2_max,fio2_min,peep_max,peep_min,plateau_pressure_max,plateau_pressure_min,rrt,neuroblocker,admission_location,insurance,language,ethnicity,marital_status,age,hours_in_hosp_before_intubation,congestive_heart_failure,cerebrovascular_disease,dementia,chronic_pulmonary_disease,rheumatic_disease,mild_liver_disease,diabetes_without_cc,diabetes_with_cc,paraplegia,renal_disease,malignant_cancer,severe_liver_disease,metastatic_solid_tumor,aids,sinus_rhythm,SOFA,apsiii
0,30074509,1,M,122.0,173.0,182.5,35.0,26.0,7.4,7.23,2.2,0.3,128.0,76.0,119.0,71.0,111.0,65.0,31.0,11.0,100.0,87.0,38.33,34.0,155.0,89.0,1,1,1,1,1,1,6,100.0,40.0,13.0,5.0,29.0,16.0,0,0,EMERGENCY ROOM,Medicaid,ENGLISH,WHITE,SINGLE,59,2,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0.0,9,61
1,30488518,1,F,111.0,170.0,326.666667,37.0,33.0,7.49,7.43,2.1,1.1,101.0,47.0,106.0,27.0,108.0,43.0,25.0,5.0,100.0,91.0,38.17,36.67,446.0,132.0,1,1,1,1,1,1,6,50.0,30.0,10.0,0.0,19.0,17.0,0,0,EMERGENCY ROOM,Medicaid,ENGLISH,WHITE,DIVORCED,61,96,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1.0,3,47
2,30679928,1,F,82.2,157.0,97.5,30.0,12.0,7.48,7.17,6.8,1.5,203.0,75.0,107.0,44.0,261.0,62.0,37.0,12.0,100.0,92.0,39.22,35.0,390.0,59.0,1,1,1,1,1,1,6,100.0,40.0,10.0,0.0,26.0,16.0,1,0,EMERGENCY ROOM,Medicare,ENGLISH,WHITE,DIVORCED,83,6,0,0,0,1,0,1,0,0,0,0,1,0,1,0,1.0,13,83
3,30767458,1,M,87.6,178.0,234.285714,29.0,25.0,7.45,7.42,0.9,0.7,144.0,58.0,113.0,53.0,160.0,56.0,24.0,8.0,100.0,92.0,38.78,36.78,127.0,74.0,1,1,1,1,1,1,6,100.0,35.0,5.6,0.0,15.0,10.0,0,0,TRANSFER FROM HOSPITAL,Medicare,ENGLISH,WHITE,MARRIED,67,27,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,2,40
4,30963638,1,M,143.3,178.0,110.0,36.0,31.0,7.46,7.41,1.6,1.2,130.0,70.0,114.0,49.0,113.0,55.0,45.0,10.0,100.0,78.0,39.56,36.78,206.0,92.0,1,1,1,1,1,1,6,40.0,35.0,11.0,0.0,27.0,22.0,0,0,EMERGENCY ROOM,Medicaid,ENGLISH,WHITE,SINGLE,65,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0.0,7,41


In [3]:
# NOTE(review): the recorded summary shows implausible extremes -- glucose_max
# up to 999999, plateau_pressure_max up to 2910, hours_in_hosp_before_intubation
# up to 28371, weight down to 1.3. They look like sentinel or entry errors;
# confirm whether they should be cleaned before modelling.
df.describe()

Unnamed: 0,stay_id,over72h,weight,height,pao2fio2ratio,co2_total_max,co2_total_min,ph_max,ph_min,lactate_max,lactate_min,heart_rate_max,heart_rate_min,mbp_ni_max,mbp_ni_min,mbp_arterial_max,mbp_arterial_min,resp_rate_max,resp_rate_min,spo2_max,spo2_min,temp_max,temp_min,glucose_max,glucose_min,epinephrine,vasopressin,dobutamine,norepinephrine,phenylephrine,dopamine,count_of_vaso,fio2_max,fio2_min,peep_max,peep_min,plateau_pressure_max,plateau_pressure_min,rrt,neuroblocker,age,hours_in_hosp_before_intubation,congestive_heart_failure,cerebrovascular_disease,dementia,chronic_pulmonary_disease,rheumatic_disease,mild_liver_disease,diabetes_without_cc,diabetes_with_cc,paraplegia,renal_disease,malignant_cancer,severe_liver_disease,metastatic_solid_tumor,aids,sinus_rhythm,SOFA,apsiii
count,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0,2984.0
mean,35066500.0,0.491622,84.172297,169.146972,258.944856,29.26441,21.897453,7.448281,7.291327,3.084688,1.295583,118.294236,64.848525,101.375,54.447721,120.864611,53.254223,32.818365,9.427949,99.942359,89.291555,38.169715,36.158073,2567.63941,94.011059,1.0,1.0,1.0,1.0,1.0,1.0,6.0,80.690684,38.654491,9.723023,2.698425,23.819236,15.668599,0.137399,0.095509,63.547922,91.526139,0.305295,0.196381,0.030831,0.313003,0.037198,0.192359,0.254357,0.075402,0.083445,0.243968,0.119303,0.086796,0.05429,0.011729,0.609249,8.324732,67.802949
std,2848316.0,0.500014,25.163838,8.679247,117.013541,6.334311,6.165518,0.068061,0.112077,2.858095,0.918088,22.857336,14.288507,21.229912,11.984427,38.87869,14.523168,7.787307,3.821289,0.323452,10.251683,0.850611,1.040328,48374.498096,30.025794,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.027861,6.863469,4.701488,2.905619,53.345907,4.198831,0.344326,0.293966,16.737439,594.928248,0.460609,0.397326,0.172889,0.463793,0.189279,0.39422,0.435572,0.264083,0.2766,0.429546,0.324199,0.281583,0.226626,0.107683,0.488,4.143517,28.17571
min,30004890.0,0.0,1.3,122.0,34.0,13.0,0.0,7.1,6.68,0.5,0.2,62.0,10.0,33.0,11.0,1.0,1.0,15.0,1.0,95.0,1.0,32.2,15.0,81.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,25.0,20.0,5.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0
25%,32589570.0,0.0,67.075,165.0,177.5,25.0,18.0,7.41,7.23,1.5,0.9,102.0,55.0,87.0,48.0,105.0,52.0,27.0,7.0,100.0,88.0,37.56,36.11,156.0,75.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,50.0,35.0,6.0,0.0,19.0,13.0,0.0,0.0,53.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,47.0
50%,35200690.0,0.0,81.3,169.146972,250.0,29.0,22.0,7.45,7.31,2.1,1.1,117.0,64.0,100.0,54.0,113.0,55.0,32.0,9.0,100.0,92.0,38.1,36.44,195.0,92.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,100.0,40.0,9.0,0.0,22.0,15.0,0.0,0.0,66.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,8.0,64.0
75%,37479900.0,1.0,96.4,173.0,326.166667,32.0,25.0,7.49,7.37,3.5,1.4,132.0,74.0,112.0,61.0,121.0,59.0,37.0,12.0,100.0,95.0,38.78,36.67,262.0,110.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,100.0,40.0,12.0,5.0,26.0,18.0,0.0,0.0,76.0,59.25,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,11.0,86.0
max,39997980.0,1.0,264.0,198.0,1115.0,66.0,52.0,7.71,7.61,29.1,14.5,247.0,125.0,216.0,117.0,299.0,241.0,69.0,30.0,100.0,100.0,43.06,39.17,999999.0,343.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,100.0,100.0,85.0,21.0,2910.0,32.0,1.0,1.0,97.0,28371.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,23.0,189.0


### Drop constant columns

In [4]:
# Keep only the columns whose number of distinct non-null values is not exactly one.
df = df.loc[:, df.nunique().ne(1)]

### Merge with CXR data

In [5]:
# Load the CXR table; it carries stay_id and over72h alongside the per-study
# finding columns (see the displayed header).
df_cxr = pd.read_csv("../data/cxr-filled-2984-v2.csv")
df_cxr

Unnamed: 0,stay_id,subject_id,study_id,over72h,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices,fluid_overload,lung_infection,no_events
0,30074509,17638202,58483943.0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
1,30488518,10878728,54819095.0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,3
2,30679928,15355458,57154020.0,1,1,1,0,1,0,0,0,0,0,1,0,0,0,1,1,0,5
3,30767458,12562737,56148939.0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,1,1,1,3
4,30963638,13185626,51204041.0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2979,39303987,11482582,56374267.0,1,1,1,0,1,0,0,0,0,0,1,0,0,0,1,1,0,5
2980,39339013,18893665,53803838.0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,3
2981,39426851,16326458,52006496.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2
2982,39460777,12356657,56806751.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,2


In [6]:
# No `on=` is given, so pandas joins on every column name the two frames share
# (per the displayed headers: stay_id and over72h) with the default inner join.
# NOTE(review): this rebinds `df`; re-running the cell merges an already-merged frame.
df = df.merge(df_cxr)

In [7]:
# Positions 0-3 are stay_id, subject_id, study_id and over72h; everything from
# position 4 onward is a CXR-derived column.
df_cxr.columns[4:].tolist()

['Atelectasis',
 'Cardiomegaly',
 'Consolidation',
 'Edema',
 'Enlarged Cardiomediastinum',
 'Fracture',
 'Lung Lesion',
 'Lung Opacity',
 'No Finding',
 'Pleural Effusion',
 'Pleural Other',
 'Pneumonia',
 'Pneumothorax',
 'Support Devices',
 'fluid_overload',
 'lung_infection',
 'no_events']

### Define feature sets

In [8]:
# Columns routed to the numeric (StandardScaler) branch of the preprocessor.
# The list mixes continuous measurements with 0/1 indicator columns (rrt,
# neuroblocker, the comorbidity flags, sinus_rhythm). The vasopressor columns
# are absent: they are constant in this data (std 0 in the summary above) and
# are removed by the "Drop constant columns" step.
all_numeric_features = ['weight', 'height', 'pao2fio2ratio', 'co2_total_max', 'co2_total_min',
                        'ph_max', 'ph_min', 'lactate_max', 'lactate_min', 'heart_rate_max',
                        'heart_rate_min', 'mbp_ni_max', 'mbp_ni_min', 'mbp_arterial_max',
                        'mbp_arterial_min', 'resp_rate_max', 'resp_rate_min', 'spo2_max',
                        'spo2_min', 'temp_max', 'temp_min', 'glucose_max', 'glucose_min', 
                        'fio2_max', 'fio2_min',
                        'peep_max', 'peep_min', 'plateau_pressure_max', 'plateau_pressure_min',
                        'rrt', 'neuroblocker', 'age', 'hours_in_hosp_before_intubation',
                        'congestive_heart_failure', 'cerebrovascular_disease', 'dementia',
                        'chronic_pulmonary_disease', 'rheumatic_disease', 'mild_liver_disease',
                        'diabetes_without_cc', 'diabetes_with_cc', 'paraplegia',
                        'renal_disease', 'malignant_cancer', 'severe_liver_disease',
                        'metastatic_solid_tumor', 'aids', 'sinus_rhythm', 'SOFA', 'apsiii']

# String-valued columns routed to the one-hot branch of the preprocessor.
all_categorical_features = ['gender', 'admission_location', 'insurance', 
                            'language', 'ethnicity', 'marital_status']

In [9]:
# all_numeric_features += df_cxr.columns[4:].tolist()
# Positions 18-19 of df_cxr are 'fluid_overload' and 'lung_infection'.
# Only names not already present are appended, so re-running this cell does
# not add the same feature twice.
cxr_features = df_cxr.columns[18:20].tolist()
all_numeric_features += [f for f in cxr_features if f not in all_numeric_features]

# Feature selection

In [None]:
# Rank the numeric columns by absolute correlation with the target.
# String columns (gender, insurance, ...) are excluded explicitly because
# recent pandas versions no longer drop them silently inside `corrwith`.
(
    df.select_dtypes(include="number")
    .drop(columns="over72h")
    .corrwith(df["over72h"])
    .abs()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# selected_features = ["resp_rate_max", "temp_max", "peep_max", "ph_max", "apsiii"]
# Hand-picked subset of the numeric features (every entry also appears in
# all_numeric_features); used by the correlation heatmap and as an optional
# alternative feature set further down.
ryo_features = ["co2_total_max", "co2_total_min", "ph_max", "ph_min", 
                "lactate_max", "lactate_min", "heart_rate_max", "heart_rate_min", 
                "mbp_ni_max", "mbp_ni_min", "mbp_arterial_max", "mbp_arterial_min", 
                "resp_rate_max", "resp_rate_min", "spo2_max", "spo2_min", 
                "temp_max", "temp_min", "glucose_min", "SOFA", "apsiii", 
                "fio2_max", "fio2_min", "peep_max", "peep_min", 
                "plateau_pressure_max", "plateau_pressure_min", "pao2fio2ratio", "rrt", "neuroblocker"]

In [None]:
# Draw the heatmap on its own 10x10 figure instead of changing the global
# default figure size, which would leak into every later plot.
plt.figure(figsize=(10, 10))
# sns.heatmap(df.drop(all_categorical_features, axis=1).corr(), cmap="RdBu_r", vmax=1, vmin=-1);
sns.heatmap(df[ryo_features].corr(), cmap="RdBu_r", vmax=1, vmin=-1);

# Preprocessing

In [10]:
# encode categorical
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
# NOTE(review): `ohe` is not referenced again in the visible part of the
# notebook (the pipelines build their own OneHotEncoder) -- confirm before removing.
ohe = OneHotEncoder()

In [11]:
# Per-branch transformers consumed by define_preprocessor.
# No imputation step is included (the earlier SimpleImputer steps stay disabled).

# Numeric branch: standardise each column.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical branch: one-hot encode; categories unseen during fit are ignored
# at transform time rather than raising.
categorical_transformer = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

In [12]:
def define_preprocessor(features):
    """Build the ColumnTransformer for the requested feature names.

    Each name in `features` is routed to the numeric branch when it appears in
    `all_numeric_features` and to the categorical branch when it appears in
    `all_categorical_features`. Names found in neither list are not passed to
    any branch.

    Returns an unfitted ColumnTransformer with a 'num' and a 'cat' entry, in
    that order.
    """
    numeric_features, categorical_features = [], []
    for name in features:
        if name in all_numeric_features:
            numeric_features.append(name)
        if name in all_categorical_features:
            categorical_features.append(name)

    return ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features),
        ]
    )

In [13]:
def get_X_and_y(df, features, target='over72h', test_size=0.2, random_state=42):
    """Split `df` into stratified train/test feature matrices and targets.

    Parameters
    ----------
    df : DataFrame containing the feature columns and the target column.
    features : iterable of column names to keep as model inputs.
    target : name of the label column; also used for stratification.
    test_size : share of rows held out for testing.
    random_state : seed for the split, fixed by default so repeated calls
        return the same partition.

    Returns
    -------
    X_train, y_train, X_test, y_test
    """
    # Coerce to a list so a tuple or Index of names selects several columns
    # instead of being read as a single (multi-level) key.
    features = list(features)

    df_train, df_test = train_test_split(
        df, test_size=test_size, random_state=random_state, stratify=df[target]
    )
    return df_train[features], df_train[target], df_test[features], df_test[target]

# Test different models

In [14]:
def evaluate_model(y, y_proba, class_names, string, thresh=None, show_plots=True, digits=2, save_figures=False, filename=""):
    """Report binary classification quality from predicted class probabilities.

    Prints a classification report, draws a confusion matrix and, when
    `show_plots` is true, ROC and precision-recall curves for column 1 of
    `y_proba`.

    Parameters
    ----------
    y : true labels.
    y_proba : 2-D array with one column per class; column 1 is treated as the
        positive-class score.
    class_names : display names for the two classes, used as tick labels;
        `class_names[1]` labels the curves.
    string : name of the evaluated split, used only in the printed header.
    thresh : optional decision threshold on column 1. When None, the predicted
        class is the arg-max over the columns.
    show_plots : draw the ROC and precision-recall curves.
    digits : number of decimals in the classification report.
    save_figures : save each figure as `filename` + "_CM.png" / "_ROC.png" / "_PR.png".
    filename : path prefix for the saved figures.
    """
    # Generate predictions. `is not None` so an explicit threshold of 0 is
    # honoured instead of silently falling back to arg-max.
    if thresh is not None:
        y_pred = np.where(y_proba[:, 1] > thresh, 1, 0)
    else:
        y_pred = np.argmax(y_proba, axis=1)
    print("Model evaluation on the %s set" % string)
    print()
    # Classification report
    print("Classification report:")
    print(classification_report(y, y_pred, digits=digits))

    # Plot confusion matrix: colours follow the row-normalised rates, the
    # annotations show raw counts.
    cm_counts = confusion_matrix(y, y_pred)
    cm_rates = confusion_matrix(y, y_pred, normalize="true")
    plt.figure();
    sns.heatmap(cm_rates,
                annot=cm_counts, fmt="d",
                cmap="Blues", cbar=False,
                xticklabels=class_names, yticklabels=class_names)
    plt.yticks(rotation=0)
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion matrix");
    if save_figures:
        plt.savefig(filename + "_CM.png", bbox_inches='tight', dpi=300, transparent=True, pad_inches=0);

    if show_plots:
        # Side effect kept from the original: resets the global default figure size.
        plt.rcParams['figure.figsize'] = (6, 4)
        # One indicator column per label value; column 1 marks the second class.
        y_dummy = pd.get_dummies(y, drop_first=False).values
        y_positive = y_dummy[:, 1]
        score_positive = y_proba[:, 1]

        # Plot ROC curves
        plt.figure();
        fpr, tpr, _ = roc_curve(y_positive, score_positive)
        roc_auc = roc_auc_score(y_positive, score_positive)
        plt.plot(fpr, tpr, lw=3, label=class_names[1] + " (AUC = %0.2f)" % roc_auc)

        # Chance-level diagonal.
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title("ROC curve")
        plt.legend(loc="lower right");

        if save_figures:
            plt.savefig(filename + "_ROC.png", bbox_inches='tight', dpi=300, transparent=True, pad_inches=0);

        # Plot precision-recall curves
        plt.figure();
        prec, rec, _ = precision_recall_curve(y_positive, score_positive)
        pr_auc = auc(rec, prec)
        plt.plot(rec, prec, lw=3, label=class_names[1] + " (PRAUC = %0.2f)" % pr_auc)

        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel("Recall")
        plt.ylabel("Precision")
        plt.title("Precision-Recall curve")
        plt.legend(loc="lower right");

        if save_figures:
            plt.savefig(filename + "_PR.png", bbox_inches='tight', dpi=300, transparent=True, pad_inches=0);
            
            
def benchmark_cv_score(clf, X, y, class_names, evaluate="score"):
    """Benchmark an estimator with 10-fold cross-validation.

    Parameters
    ----------
    clf : estimator (or pipeline) to cross-validate.
    X, y : training features and labels.
    class_names : display names forwarded to `evaluate_model` in "predict" mode.
    evaluate : "score" prints mean macro precision / recall / F1 (with twice
        the standard deviation across folds) and the elapsed time; "predict"
        collects out-of-fold probabilities and passes them to `evaluate_model`.

    Raises
    ------
    ValueError
        If `evaluate` is neither "score" nor "predict".
    """
    # Imported locally under a private name: a later cell runs `import time`,
    # which rebinds the notebook-level name `time` from the function to the
    # module and would make a plain `time()` call fail on re-execution.
    from time import time as _wall_clock

    if evaluate not in ("score", "predict"):
        raise ValueError("evaluate must be 'score' or 'predict', got %r" % (evaluate,))

    print('_' * 80)
    print()
    print("Model training: ")
#     print(clf)

    if evaluate == "score":
        scoring = {"precision_macro": "precision_macro",
                   "recall_macro": "recall_macro",
                   "f1_macro": "f1_macro"}

        t0 = _wall_clock()
        scores = cross_validate(clf, X, y, n_jobs=-1, cv=10, scoring=scoring)
        train_time = _wall_clock() - t0
        print("train time: %0.3fs" % train_time)

        for label, key in (("Precision", "test_precision_macro"),
                           ("Recall", "test_recall_macro"),
                           ("F1 score", "test_f1_macro")):
            print("Average %s: %0.3f (+/- %0.2f)" % (label, scores[key].mean(), scores[key].std() * 2))
    else:
        y_proba = cross_val_predict(clf, X, y, n_jobs=-1, cv=10, method="predict_proba")
        evaluate_model(y, y_proba, class_names, "CV", show_plots=True)

In [15]:
# Model inputs: every numeric feature followed by every categorical feature.
features = [*all_numeric_features, *all_categorical_features]
# features = ryo_features

# The preprocessor and the train/test split are both derived from `features`.
preprocessor = define_preprocessor(features)
X_train, y_train, X_test, y_test = get_X_and_y(df, features)
print(X_train.shape, y_train.shape)

(2387, 58) (2387,)


In [16]:
class_names = ("Less than 72 hours", "Over 72 hours")

# The tree-based models are seeded so that the benchmark numbers are
# repeatable from one run of the notebook to the next.
clfs = (
    LogisticRegression(max_iter=1000),
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
)

for clf in clfs:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', clf)])
    benchmark_cv_score(pipe, X_train, y_train, class_names)

________________________________________________________________________________

Model training: 
train time: 0.971s
Average Precision: 0.813 (+/- 0.06)
Average Recall: 0.811 (+/- 0.05)
Average F1 score: 0.811 (+/- 0.05)
________________________________________________________________________________

Model training: 
train time: 0.888s
Average Precision: 0.735 (+/- 0.06)
Average Recall: 0.732 (+/- 0.05)
Average F1 score: 0.731 (+/- 0.05)
________________________________________________________________________________

Model training: 
train time: 0.825s
Average Precision: 0.713 (+/- 0.05)
Average Recall: 0.712 (+/- 0.05)
Average F1 score: 0.712 (+/- 0.05)
________________________________________________________________________________

Model training: 
train time: 0.854s
Average Precision: 0.817 (+/- 0.04)
Average Recall: 0.813 (+/- 0.04)
Average F1 score: 0.813 (+/- 0.04)


In [None]:
# Out-of-fold probability evaluation for the first candidate in `clfs`.
clf = clfs[0]
pipe = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", clf)])
benchmark_cv_score(pipe, X_train, y_train, class_names, evaluate="predict")

# Hyperparameter tuning

In [17]:
import copy, math, os, pickle, time, pandas as pd, numpy as np, scipy.stats as ss

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score, f1_score

class DictDist():
    """Joint sampler over a dict of independent distributions.

    Every value of `dict_of_rvs` must offer `rvs(n)` returning an indexable
    collection of `n` draws; its key becomes a key of each sampled dict.
    """

    def __init__(self, dict_of_rvs):
        self.dict_of_rvs = dict_of_rvs

    def rvs(self, n):
        """Return a list of `n` dicts; the i-th dict holds the i-th draw of every key."""
        draws = {name: dist.rvs(n) for name, dist in self.dict_of_rvs.items()}
        return [
            {name: column[i] for name, column in draws.items()}
            for i in range(n)
        ]
    
class Choice():
    """Uniform sampler over a fixed sequence of options."""

    def __init__(self, options):
        self.options = options

    def rvs(self, n):
        """Return a list of `n` options picked by random integer index."""
        picks = ss.randint(0, len(self.options)).rvs(n)
        return [self.options[idx] for idx in picks]

In [18]:
N = 15  # number of hyperparameter draws for the initial candidate list
SEED = 1443  # seed applied to NumPy's global RNG below
# Random-forest search space: one integer distribution per hyperparameter.
RF_dist = DictDist({
    'n_estimators': ss.randint(50, 500),
    'max_depth': ss.randint(2, 10),
    'min_samples_split': ss.randint(2, 75),
    'min_samples_leaf': ss.randint(1, 50),
})
# Seed before sampling; later draws from RF_dist continue the same random
# stream, so the cell execution order determines which candidates are drawn.
np.random.seed(SEED)
# NOTE(review): this list is overwritten by a fresh draw of 45 candidates
# before the search is run further down.
RF_hyperparams_list = RF_dist.rvs(N)

In [19]:
def score_test(clf, X_test, y_test):
    """Print held-out metrics for a fitted binary classifier.

    Reports AUROC and average precision from the positive-class probability
    (column 1 of `predict_proba`), then accuracy, F1 and the full
    classification report from the hard predictions.

    Parameters
    ----------
    clf : fitted estimator exposing `predict` and `predict_proba`.
    X_test : test features.
    y_test : true test labels.
    """
    y_true = np.asarray(y_test)
    y_score = clf.predict_proba(X_test)[:, 1]
    y_pred = clf.predict(X_test)

    # Local names deliberately avoid `auc`, which is also an sklearn function
    # imported at the top of the notebook.
    auroc = roc_auc_score(y_true, y_score)
    auprc = average_precision_score(y_true, y_score)
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print('AUROC', auroc)
    print('AUPR', auprc)
    print('ACC', acc)
    # F1 was computed but never shown before.
    print('F1', f1)
    print(classification_report(y_true, y_pred, digits=4))

In [20]:
def cv_with_RF_on_data(X_train, y_train, RF_hyperparams_list, fold=10, prep=None, random_state=None):
    """Pick the random-forest hyperparameters with the best cross-validated AUROC.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    RF_hyperparams_list : non-empty list of keyword dicts for RandomForestClassifier.
    fold : number of cross-validation folds (or any `cv` accepted by cross_validate).
    prep : preprocessing step placed in front of the forest. Defaults to the
        notebook-level `preprocessor`, which is what earlier callers rely on.
    random_state : optional seed for every forest; a 'random_state' entry in
        a hyperparameter dict takes precedence.

    Returns
    -------
    dict
        The entry of `RF_hyperparams_list` with the highest mean ROC AUC.

    Raises
    ------
    ValueError
        If `RF_hyperparams_list` is empty.
    """
    if len(RF_hyperparams_list) == 0:
        raise ValueError("RF_hyperparams_list is empty; nothing to evaluate")
    if prep is None:
        prep = preprocessor  # notebook-level default

    mean_aucs = []
    for n, hp in enumerate(RF_hyperparams_list):
        forest = RandomForestClassifier(**{'random_state': random_state, **hp})
        clf = Pipeline(steps=[('preprocessor', prep),
                              ('classifier', forest)])
        cv_scores = cross_validate(clf, X_train, y_train,
                                   scoring=('roc_auc', 'f1'),
                                   cv=fold)
        mean_auc = np.mean(cv_scores['test_roc_auc'])
        mean_f1 = np.mean(cv_scores['test_f1'])
#         print(hp)
        print(n, {'auc': mean_auc, 'f1': mean_f1})
        mean_aucs.append(mean_auc)

    # Selection is by AUROC only; F1 is printed for information.
    best_hp = RF_hyperparams_list[np.argmax(mean_aucs)]
    print('best hyperparams based on auc', best_hp)
    return best_hp

### Combined features

In [None]:
# X_train_cb, y_train_cb, X_test_cb, y_test_cb = get_X_and_y(df.merge(df_cxr))

In [21]:
# Draw 45 fresh candidates (replacing the earlier list) and keep the one with
# the best cross-validated AUROC on the training split.
RF_hyperparams_list = RF_dist.rvs(45)
best_hp = cv_with_RF_on_data(X_train, y_train, RF_hyperparams_list)

0 {'auc': 0.856649734334192, 'f1': 0.7658955848146695}
1 {'auc': 0.8626714952626191, 'f1': 0.7663823268360717}
2 {'auc': 0.8846565368642209, 'f1': 0.7842640271308134}
3 {'auc': 0.8902527697028695, 'f1': 0.7903919606765817}
4 {'auc': 0.8824606671938001, 'f1': 0.7880353630642284}
5 {'auc': 0.8760522792438223, 'f1': 0.7773701890673983}
6 {'auc': 0.8623398769285252, 'f1': 0.7666822216794138}
7 {'auc': 0.8867939268681898, 'f1': 0.7935265738292685}
8 {'auc': 0.8821746966004209, 'f1': 0.7895866873358566}
9 {'auc': 0.8792152174392192, 'f1': 0.7810236890689637}
10 {'auc': 0.8896583520380534, 'f1': 0.7904358331106254}
11 {'auc': 0.8786484410142498, 'f1': 0.7815404700999579}
12 {'auc': 0.8588177606099251, 'f1': 0.7631550998721585}
13 {'auc': 0.8817126638604827, 'f1': 0.7840892366965175}
14 {'auc': 0.8762201551006521, 'f1': 0.7772044197228853}
15 {'auc': 0.8561064618441667, 'f1': 0.7550844166703117}
16 {'auc': 0.8786400025216363, 'f1': 0.782258802568367}
17 {'auc': 0.8827445073106747, 'f1': 0.7907

In [22]:
# Refit the selected configuration on the whole training split and report
# its metrics on the held-out test split.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(**best_hp)),
    ]
)
clf.fit(X_train, y_train)
score_test(clf, X_test, y_test)

AUROC 0.9033141728040237
AUPR 0.9024430038467348
ACC 0.7973199329983249
              precision    recall  f1-score   support

           0     0.7905    0.8191    0.8045       304
           1     0.8050    0.7747    0.7896       293

    accuracy                         0.7973       597
   macro avg     0.7977    0.7969    0.7970       597
weighted avg     0.7976    0.7973    0.7972       597



In [23]:
# Class probabilities of the fitted pipeline on the held-out test split.
y_proba = clf.predict_proba(X_test)

In [24]:
# Persist the test-set probabilities for later use outside this notebook.
np.save("../results/y-proba-combined-v2", y_proba)

### MIMIC features only

In [None]:
# get_X_and_y needs the feature list. Here it is the combined list with every
# CXR-derived column removed.
# NOTE(review): assumes "mimic ft only" means the features without the CXR
# columns -- confirm.
cxr_columns = set(df_cxr.columns[4:])
features_ft = [f for f in features if f not in cxr_columns]
X_train_ft, y_train_ft, X_test_ft, y_test_ft = get_X_and_y(df, features_ft)

# cv_with_RF_on_data reads the notebook-level `preprocessor`, so it is rebuilt
# for the reduced column set; the one built earlier would ask for CXR columns
# that this split does not contain.
preprocessor = define_preprocessor(features_ft)

RF_hyperparams_list = RF_dist.rvs(45)
best_hp = cv_with_RF_on_data(X_train_ft, y_train_ft, RF_hyperparams_list)

In [None]:
# Fit and score on the MIMIC-only split (previously the combined split was
# used here, which merely repeated the combined result). The preprocessor is
# built from the columns of X_train_ft so it only asks for columns that exist.
preprocessor_ft = define_preprocessor(list(X_train_ft.columns))
clf = Pipeline(steps=[('preprocessor', preprocessor_ft),
                      ('classifier', RandomForestClassifier(**best_hp))])
clf.fit(X_train_ft, y_train_ft)
score_test(clf, X_test_ft, y_test_ft)

### Try to get some feature importances

In [None]:
# Stand-alone forest (outside a Pipeline) so feature_importances_ can be read directly.
model = RandomForestClassifier(**best_hp)

In [None]:
# Build a dedicated preprocessor for the MIMIC-only columns instead of
# aliasing the shared `preprocessor`: fitting an alias refits the shared
# object in place, and that object may expect columns X_train_ft lacks.
prep = define_preprocessor(list(X_train_ft.columns))
x_train = prep.fit_transform(X_train_ft)
x_test = prep.transform(X_test_ft)

In [None]:
# x_train was derived from X_train_ft, so pair it with the labels returned by
# the same split.
model.fit(x_train, y_train_ft)

In [None]:
# The ColumnTransformer lists the 'num' branch before 'cat', and that branch
# only scales, so the leading importances line up one-to-one with the numeric
# feature names. `numeric_features` is rebuilt here because it only exists
# inside define_preprocessor; zip stops at the last numeric name, which
# replaces the stale hard-coded slice.
numeric_features = [f for f in X_train_ft.columns if f in all_numeric_features]
feats = dict(zip(numeric_features, model.feature_importances_))

In [None]:
# One-column frame of importances indexed by feature name, then a horizontal
# bar chart of the ten largest.
importances = (
    pd.DataFrame.from_dict(feats, orient='index')
    .rename(columns={0: 'Gini-importance'})
)
importances.sort_values(by='Gini-importance').tail(10).plot(kind='barh')  # , rot=45)

In [None]:
# Everything after the numeric block is a one-hot categorical column; flag
# any whose importance exceeds 0.025. The offset is derived from the data
# rather than hard-coded.
n_numeric = sum(f in all_numeric_features for f in X_train_ft.columns)
for i in model.feature_importances_[n_numeric:]:
    if i > 0.025:
        print('oh no')

In [None]:
# Combined-feature variant. X_train_cb / X_test_cb are never defined in the
# notebook; the combined split is X_train / X_test.
# NOTE(review): assumes "cb" refers to that split -- confirm.
model = RandomForestClassifier(**best_hp)
prep = define_preprocessor(features)
x_train_cb = prep.fit_transform(X_train)
x_test_cb = prep.transform(X_test)
model.fit(x_train_cb, y_train)

In [None]:
# Numeric outputs come first in the transformed matrix (the 'num' branch
# precedes 'cat'), so zip pairs each numeric name with its importance and
# stops there. The former `[:-6]` slice assumed six trailing columns, but
# one-hot encoding produces one column per category, not per feature.
# NOTE(review): assumes `model` was fitted on the combined `features` -- confirm.
numeric_features = [f for f in features if f in all_numeric_features]
feats = dict(zip(numeric_features, model.feature_importances_))

In [None]:
# One-column frame of importances indexed by feature name, then a horizontal
# bar chart of the ten largest.
importances = (
    pd.DataFrame.from_dict(feats, orient='index')
    .rename(columns={0: 'Gini-importance'})
)
importances.sort_values(by='Gini-importance').tail(10).plot(kind='barh')  # , rot=45)

In [None]:
# Check every one-hot categorical column, i.e. everything after the numeric
# block, rather than only the last six entries.
# NOTE(review): assumes `model` was fitted on the combined `features` -- confirm.
n_numeric = sum(f in all_numeric_features for f in features)
for i in model.feature_importances_[n_numeric:]:
    if i > 0.025:
        print('oh no')

In [None]:
# `categorical_features` only exists inside define_preprocessor, so the count
# is recomputed from the notebook-level lists.
len([f for f in features if f in all_categorical_features])

In [None]:
# Recomputed from the notebook-level lists so the cell does not depend on a
# name that only exists inside define_preprocessor.
len([f for f in features if f in all_numeric_features])

In [None]:
# Show a few rows instead of dumping the whole training frame.
X_train_ft.head()

In [None]:
# Number of input columns seen when the most recently fitted `clf` was fitted.
clf.n_features_in_

# Feature selection