# Input and Settings

## Prediction Filenames

In [None]:
dp_model = "resnet50-anjuu-512"
print(dp_model)
patients_stacking_splits_filename = "../csv/patients_stacking_splits.csv"
print(patients_stacking_splits_filename)

In [None]:
valid_filename = "../submissions/Bac/oof_valid_resnet50-anjuu-512.csv.gz"
print(valid_filename)
sub_filename = "../submissions/Bac/avg_resnet50-anjuu-512.csv.gz" # LB 0.067
print(sub_filename)

## Turn META_NUMBER_IMAGES_IN_USE ON/OFF to create 2 outputs

In [None]:
# Switch between the two output variants: when False, every predefined
# feature whose name contains "rank" is excluded from the stacking models.
META_NUMBER_IMAGES_IN_USE = True
NUMBER_FOLDS = 6 # use 6 when the stage 1 set is included
print(NUMBER_FOLDS)

In [None]:
# The post-processed outputs carry the metadata mode in their name, so the
# "with_meta" and "wo_meta" runs write to different files.
_meta_tag = "with_meta" if META_NUMBER_IMAGES_IN_USE else "wo_meta"
_post_suffix = "_post_{}.csv".format(_meta_tag)

valid_post_filename = valid_filename.replace(".csv", _post_suffix)
print(valid_post_filename)

sub_post_filename = sub_filename.replace(".csv", _post_suffix)
print(sub_post_filename)

In [None]:
# Names of the features produced by the feature-creation step.
predefined_features = [
    'p', 'rank', 'rank_inv', 'rank_max',
    'p1', 'p1_std', 'p1_skew', 'p1_list_std',
    'p1_inv', 'p1_inv_std', 'p1_inv_skew', 'p1_inv_list_std',
    'rank_perc', 'p_next', 'p_prev',
]

# Without the meta features, drop everything derived from the rank.
if META_NUMBER_IMAGES_IN_USE:
    excluded_meta_features = []
else:
    excluded_meta_features = [name for name in predefined_features if "rank" in name]
print(excluded_meta_features)

## Directories

In [None]:
models_h2o_dir = "../models_h2o/{}_{}/".format(dp_model, "with_meta" if META_NUMBER_IMAGES_IN_USE else "wo_meta")
print(models_h2o_dir)

In [None]:
!mkdir -p $models_h2o_dir

In [None]:
input_dir = "../input/"

# Introduction

In [None]:
from __future__ import print_function
import sklearn
import sklearn.datasets
import sklearn.ensemble
import pandas as pd
import numpy as np
# import h2o
# from h2o.estimators.random_forest import H2ORandomForestEstimator
# from h2o.estimators.gbm import H2OGradientBoostingEstimator

np.random.seed(2019)

import datetime as dt
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import pickle
import re

%matplotlib inline

In [None]:
from pylab import rcParams
rcParams['figure.figsize'] = 5, 5

In [None]:
from sklearn.metrics import log_loss
pd.options.display.float_format = '{:,.5f}'.format

# Metadata

In [None]:
# usecols = ['ID', 'Label', 'Sub_type', 'PatientID']

usecols = ["sop_instance_uid", "patient_id", "study_instance_uid"] + \
    ["image_position_patient_2"]

train_metadata = pd.read_csv(input_dir + "df_dicom_metadata_train.csv", usecols = usecols)
print(train_metadata.shape)
train_metadata.head(3)

In [None]:
train_label = pd.read_csv(input_dir + "stage_1_train.csv", usecols = None)
print(train_label.shape)
train_label.head(3)

In [None]:
# Split each ID on "_": field 1 (re-prefixed with "ID_") becomes the image
# identifier and field 2 the sub-type name.
train_label["sop_instance_uid"] = train_label["ID"].apply(lambda x: "ID_" + x.split("_")[1])
train_label["SubType"] = train_label["ID"].apply(lambda x: x.split("_")[2])
train_label.head(3)

# Long -> wide: one row per image and one column per sub-type; if an
# image/sub-type pair occurs more than once the maximum Label is kept.
train_label = pd.pivot_table(train_label, values='Label', index=['sop_instance_uid'], columns=['SubType'], aggfunc=np.max).reset_index()
train_label.head(3)

In [None]:
print(train_metadata.shape)
train_metadata = pd.merge(train_metadata, train_label, on="sop_instance_uid")
print(train_metadata.shape)
train_metadata.head()

In [None]:
train_patients = train_metadata.groupby(["patient_id"])["any"].max().to_frame("any").reset_index()
train_patients.head(3)

In [None]:
train_patients["any"].value_counts()

In [None]:
test_metadata = pd.read_csv(input_dir + "df_dicom_metadata_test.csv", usecols = usecols)
test_metadata["set"] = 0
print(test_metadata.shape)
test_metadata.head(3)

In [None]:
test_metadata["patient_id"].unique().shape[0]

# Validation Set

In [None]:
valid = pd.read_csv(valid_filename, compression="gzip" if ".gz" in valid_filename else None)
print(valid.shape)
valid.head(3)

## Pivot

In [None]:
# A two-column frame that has a "Label" column is treated as the long format
# and pivoted to one row per image; any other layout is left as loaded.
if (len(valid.columns) == 2) and ("Label" in valid.columns):
    print("Pivoting the validation set ...")
    # Split each ID on "_": field 1 (re-prefixed with "ID_") is the image
    # identifier, field 2 the sub-type name.
    valid["sop_instance_uid"] = valid["ID"].apply(lambda x: "ID_" + x.split("_")[1])
    valid["SubType"] = valid["ID"].apply(lambda x: x.split("_")[2])
    valid.head(3)

    # One column per sub-type; duplicated image/sub-type pairs keep the maximum.
    valid = pd.pivot_table(valid, values='Label', index=['sop_instance_uid'], columns=['SubType'], aggfunc=np.max).reset_index()
valid.head(3)

## Rename columns

In [None]:
cols = ['any', 'epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural']
print(cols)

In [None]:
p_cols_map = { c : "p_{}".format(c) for c in cols}
valid.rename(columns=p_cols_map, inplace=True)
valid.head()

## Join with metadata

In [None]:
train_metadata.head()

In [None]:
meta_cols = ["sop_instance_uid", "patient_id", "study_instance_uid","image_position_patient_2"]

df = pd.merge(valid[["sop_instance_uid"] + ["p_{}".format(c) for c in cols]], 
              train_metadata[meta_cols + cols], on=["sop_instance_uid"])
print(valid.shape, df.shape)
df.head(3)

# Modeling 

In [None]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators import H2OGeneralizedLinearEstimator
from h2o.estimators import H2ONaiveBayesEstimator
from h2o.estimators import H2OStackedEnsembleEstimator
from h2o.estimators import H2OXGBoostEstimator
from h2o.grid import H2OGridSearch

In [None]:
port = 54321
h2o.init(port=port,max_mem_size="100G")

In [None]:
# Column roles read as module-level names by create_features.
id_col = "sop_instance_uid"  # image identifier
key_cols = ["patient_id", "study_instance_uid"]  # grouping keys
pos_col = "image_position_patient_2"  # value the images are ranked by within a group
ordered_cols = key_cols + [pos_col]

# Order the rows by group and then by position, and renumber the index.
df.sort_values(ordered_cols, ascending=True, inplace=True)
df.reset_index(drop=True, inplace=True)
df.head(10)

## Feature Creation

In [None]:
def create_features(df, label_col="any", WIN3 = True, is_train=True):
    """Build the per-image stacking features for one label.

    The module-level names ``id_col``, ``key_cols``, ``pos_col`` and
    ``ordered_cols`` select the identifier, grouping and ordering columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``id_col``, every column of ``ordered_cols`` and the column
        ``"p_<label_col>"``; with ``is_train=True`` also ``label_col``.
    label_col : str
        Label whose ``"p_<label_col>"`` column is renamed to ``p``.
    WIN3 : bool
        When true, add ``p_next`` / ``p_prev``: the ``p`` of the following /
        preceding row of the same group.
    is_train : bool
        When true, carry the ``label_col`` column through to the output.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``ordered_cols`` with a fresh integer index and
        the added columns ``rank``, ``rank_inv``, ``rank_max``, ``p1*``,
        ``p1_inv*``, ``rank_perc`` and, if requested, ``p_next`` /
        ``p_prev``. Rows at the edges of a group hold NaN in the history
        features. ``df`` itself is not modified.
    """
    pred_col = "p_{}".format(label_col)
    selected_cols = [id_col] + ordered_cols + [pred_col]
    if is_train:
        selected_cols.append(label_col)

    df_sub = df[selected_cols].rename(columns={pred_col: "p"})
    df_sub = df_sub.sort_values(ordered_cols, ascending=True).reset_index(drop=True)

    # Position of each image inside its group, counted from both ends,
    # plus the number of images in the group.
    df_sub["rank"] = df_sub.groupby(key_cols)[pos_col].rank(ascending=True).astype(int)
    df_sub["rank_inv"] = df_sub.groupby(key_cols)[pos_col].rank(ascending=False).astype(int)
    df_sub["rank_max"] = df_sub.groupby(key_cols)["rank"].transform("count")

    def list_std(x):
        # Standard deviation of the strictly positive entries only
        # (NaN compares false against 0 and is therefore left out as well).
        return np.std(x[x > 0])

    # Statistics over all *earlier* rows of the group: shift() removes the
    # current row before the expanding window is evaluated.
    history_stats = (
        ("", lambda s: s.shift().expanding().mean()),
        ("_std", lambda s: s.shift().expanding().std()),
        ("_skew", lambda s: s.shift().expanding().skew()),
        # raw must be a real boolean; list_std works on the plain ndarray.
        ("_list_std", lambda s: s.shift().expanding().apply(list_std, raw=True)),
    )

    # Forward history ("p1*"): rows in ascending position order.
    # transform() keeps the row index, so the assignment aligns by label.
    for suffix, stat in history_stats:
        df_sub["p1" + suffix] = df_sub.groupby(key_cols)["p"].transform(stat)

    # Backward history ("p1_inv*"): same statistics with each group walked
    # from the other end. The sorted copy is built once and reused.
    reversed_p = df_sub.sort_values(key_cols + ["rank_inv"])[key_cols + ["p"]]
    for suffix, stat in history_stats:
        df_sub["p1_inv" + suffix] = reversed_p.groupby(key_cols)["p"].transform(stat)

    df_sub["rank_perc"] = df_sub["rank"] / df_sub["rank_max"]

    if WIN3:
        # Neighbouring predictions are taken inside the group so that the
        # first/last image never sees a value from a different group.
        by_group = df_sub.groupby(key_cols)["p"]
        p_next, p_prev = by_group.shift(-1), by_group.shift(1)
        df_sub["p_next"] = p_next
        df_sub["p_prev"] = p_prev
    return df_sub

## Modeling for each label

### Modeling

In [None]:
df_stacking_patients = pd.read_csv(patients_stacking_splits_filename)
print(df_stacking_patients.shape[0])
print(df_stacking_patients["stacking_fold"].value_counts())
df_stacking_patients.head(3)

In [None]:
all_valid_patients = set(df_stacking_patients[df_stacking_patients["stacking_fold"] < NUMBER_FOLDS]["patient_id"])
print(len(all_valid_patients))

In [None]:
# Modeling
#
# For every label: build the features, then for every stacking fold train a
# GBM on the other folds' patients, score it on the held-out patients, save
# the model and keep the held-out predictions in valid_h2o_parts.
# all_kpi_valid[label][fold] ends up as (loss0, loss1, model_path).

all_kpi_valid = {}
valid_h2o_parts = []
for label_col in cols[:]: # "any"
    print("----------------------------------------")
    print("Processing label {} ...".format(label_col))

    # Feature creation
    print("Feature creating ...")
    df_sub = create_features(df, label_col)

    # Identifier, ordering and ground-truth columns are not used as predictors.
    excluded_cols = [id_col] + ordered_cols + cols
    #print(excluded_cols)
    feature_cols = [c for c in df_sub.columns if c not in excluded_cols]
    # Number of images per study
    feature_cols = [c for c in feature_cols if c not in excluded_meta_features]
    print("Features:", feature_cols)

    # Drop NA
    # Rows with any missing feature are removed, so they get no stacked prediction.
    df_ok = df_sub.dropna()
    print(df_ok.shape, df_sub.shape)

    fold_kpi_valid = {}
    for fold in range(NUMBER_FOLDS):
        # Fold
        # The split is made on patient ids, so a patient is never in both parts.
        patients_valid = set(df_stacking_patients[df_stacking_patients["stacking_fold"] == fold]["patient_id"])
        patients_train = all_valid_patients - patients_valid
        print(fold, len(patients_train), len(patients_valid))
    
        # Split
        # NOTE(review): two new H2OFrames are created per label/fold and never
        # removed here -- confirm this is acceptable for the cluster's memory.
        data_train = h2o.H2OFrame(df_ok[df_ok["patient_id"].isin(patients_train)]) 
        data_valid = h2o.H2OFrame(df_ok[df_ok["patient_id"].isin(patients_valid)]) 

        # asfactor() turns the label into a categorical target.
        data_train["TARGET"] = data_train[label_col].asfactor()
        data_valid["TARGET"] = data_valid[label_col].asfactor()

        # Modeling
        model_id = 'model_{}_{}'.format(label_col, fold)
        print("Modeling {} ...".format(model_id))

        # GBM with nfolds=5 on the training frame and logloss-based early stopping.
        model = H2OGradientBoostingEstimator(model_id = model_id, ntrees=100, nfolds=5,
                                                stopping_rounds=5, stopping_metric="logloss", stopping_tolerance=0.02,
                                                seed=2019)

        model.train(x=feature_cols, y="TARGET", training_frame=data_train, validation_frame=data_valid)

        # Save
        # All folds of a label go to the same directory; the returned path is
        # stored in the KPI tuple below.
        model_path = h2o.save_model(model=model, path=models_h2o_dir + 'model_{}'.format(label_col), force=True)
        print(model_path)

        # Feature importance
        df_fi = model._model_json['output']['variable_importances'].as_data_frame()
        print(df_fi.head(5))

        # model.varimp_plot();

        # Validation
        hf_y_pred = model.predict(data_valid)
        # The feature column "p1" is not among the columns selected here, so the
        # "p1" used below comes from the prediction frame (presumably the
        # predicted probability of class 1 -- confirm against the H2O output).
        df_y_pred = (data_valid[["sop_instance_uid"] + key_cols + ["p", label_col]]).cbind(hf_y_pred).as_data_frame()
        print(df_y_pred.shape)
        df_y_pred.head(3)

        # loss0: log loss of the input prediction "p"; loss1: log loss of the model output.
        loss0 = log_loss(y_true=df_y_pred[label_col], y_pred=df_y_pred["p"])
        loss1 = log_loss(y_true=df_y_pred[label_col], y_pred=df_y_pred["p1"])
        print("Fold {}: origin loss = {}, new loss = {}".format(fold, loss0, loss1)) # 0.11695034962163679, 0.11695034962163679

        fold_kpi_valid[fold] = (loss0, loss1, model_path)
        # Append
        # Rebuild the "<image id>_<label>" ID used by the prediction files.
        df_y_pred["ID"] = df_y_pred["sop_instance_uid"] + "_{}".format(label_col)
        valid_h2o_parts.append(df_y_pred[["ID", "p1"]])
        #

        print("----------------------------------------")
    all_kpi_valid[label_col] = fold_kpi_valid
len(valid_h2o_parts)

In [None]:
del model

In [None]:
all_kpi_valid

### OOF predictions

In [None]:
# Stack the per-label / per-fold prediction frames ...
valid_h2o = pd.concat(valid_h2o_parts, axis=0)
print(valid_h2o.shape)
print(valid_h2o.head(3))
# ... and reduce them to one row per ID holding the mean p1.
valid_h2o = valid_h2o.groupby("ID", as_index=False)["p1"].mean()
valid_h2o.head(3)

## Smooth

In [None]:
def weighted_mean(x):
    """Return the 1-3-1 weighted average of a three-element window.

    ``x`` is indexed by position, so it has to be a plain array
    (pandas ``rolling(...).apply(..., raw=True)``).
    """
    return (x[0] + 3*x[1] + x[2]) / 5


def smooth_interpolate_sub(sub_filename, metadata_filename = "df_dicom_metadata_train.csv"):
    """Smooth each prediction with its two neighbours of the same study.

    For every class in the module-level list ``cols`` the predictions are
    joined with the metadata, ordered by patient, study and
    ``image_position_patient_2``, and replaced by
    ``(previous + 3 * current + next) / 5``. Rows without both neighbours
    keep their original ``Label``.

    Parameters
    ----------
    sub_filename : str
        CSV with the columns ``ID`` and ``Label``; it is read as gzip when
        the name contains ".gz". The ID is split on "_": field 1
        (re-prefixed with "ID_") is the image id, field 2 the class.
    metadata_filename : str
        CSV providing ``sop_instance_uid``, ``patient_id``,
        ``study_instance_uid`` and ``image_position_patient_2``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The prediction frame extended with ``sop_instance_uid``, ``cls``,
        one ``f3_<class>`` column per class and the combined ``f3_Label``,
        followed by the metadata frame that was read.
    """
    sub = pd.read_csv(sub_filename, usecols = None, compression="gzip" if ".gz" in sub_filename else None)

    sub["sop_instance_uid"] = sub["ID"].apply(lambda x: "ID_" + x.split("_")[1])
    sub["cls"] = sub["ID"].apply(lambda x: x.split("_")[2])

    key_cols = ['patient_id', 'study_instance_uid', 'image_position_patient_2']
    usecols = ['sop_instance_uid'] + key_cols
    meta = pd.read_csv(metadata_filename, usecols = usecols)

    sub2 = sub.copy()
    for c in cols:
        print(c)
        f2_col = "f2_{}".format(c)
        f3_col = "f3_{}".format(c)

        sub_cls = sub[sub["cls"] == c].copy()
        sub_cls = pd.merge(sub_cls, meta, on="sop_instance_uid")
        sub_cls.sort_values(key_cols, inplace=True)
        sub_cls.reset_index(drop=True, inplace=True)

        # raw=True hands weighted_mean a positional ndarray; with a Series
        # window x[0] would be a label lookup instead of "first element".
        # Dropping the group level restores the row index for alignment.
        sub_cls[f2_col] = (sub_cls.groupby('study_instance_uid')['Label']
                           .rolling(3).apply(weighted_mean, raw=True)
                           .reset_index(level=0, drop=True))
        # rolling() stores the result on the last row of the window; shifting
        # by one row inside the study moves it onto the centre row.
        sub_cls[f3_col] = sub_cls.groupby('study_instance_uid')[f2_col].shift(-1)

        sub2 = pd.merge(sub2, sub_cls[["ID", f3_col]], on="ID", how="left")

    # Start from the raw prediction and overwrite it wherever a smoothed
    # value is available.
    sub2["f3_Label"] = sub2["Label"]
    for c in cols:
        print(c)
        f3_col = "f3_{}".format(c)
        has_smoothed = sub2[f3_col].notna()
        sub2.loc[has_smoothed, "f3_Label"] = sub2.loc[has_smoothed, f3_col]

    return sub2, meta

In [None]:
print(valid_filename)

valid_smooth, meta = smooth_interpolate_sub(sub_filename = valid_filename,
       metadata_filename = input_dir + "df_dicom_metadata_train.csv")
valid_smooth.head(8)

## Combination (supported labels)

In [None]:
#supported_labels = ["any"]
all_supported_labels = cols
print(all_supported_labels)

In [None]:
# Keep only the labels for which the stacked model beats the input
# prediction: the log loss summed over all folds must be strictly lower.
supported_labels = []
for label_col in all_supported_labels: #label_col = "subdural"
    print("----------------------------------------")

    # kpi[fold] is (loss of the input prediction, loss of the stacked model, model path).
    kpi = all_kpi_valid[label_col]
    kpi_0 = sum(kpi[fold][0] for fold in range(NUMBER_FOLDS))
    kpi_1 = sum(kpi[fold][1] for fold in range(NUMBER_FOLDS))

    if kpi_0 <= kpi_1:
        print("NOT OK => ignore {}".format(label_col))
        continue

    # Report the mean over the folds that were actually summed, not a fixed 5.
    print(label_col, kpi_0 / float(NUMBER_FOLDS), kpi_1 / float(NUMBER_FOLDS))
    supported_labels.append(label_col)
print(supported_labels)

In [None]:
print(valid_h2o.shape)
pd.merge(valid_smooth[["ID", "Label", "cls", "f3_Label"]], valid_h2o, on="ID").shape

In [None]:
final_valid = pd.merge(valid_smooth[["ID", "Label", "cls", "f3_Label"]], valid_h2o, on="ID", how="left")
print(valid_smooth.shape, final_valid.shape)
print(final_valid.isnull().sum())
final_valid.head(20)

In [None]:
# Rows that have a stacked prediction for a supported label take it;
# every other row falls back to the smoothed value.
supported_index = final_valid["p1"].notna() & final_valid["cls"].isin(supported_labels)

final_valid["Label"] = final_valid["f3_Label"].where(~supported_index, final_valid["p1"])
final_valid.head(30)

In [None]:
print(valid_post_filename, final_valid.shape)
final_valid[["ID", "Label"]].to_csv(valid_post_filename, compression="gzip", index=None)

# Submission

In [None]:
print(sub_filename)
sub = pd.read_csv(sub_filename, compression="gzip" if ".gz" in sub_filename else None)
print(sub.shape)
sub.head(3)

In [None]:
print(sub[sub["ID"].isin(["ID_28fbab7eb_epidural"])])
sub.tail(3)

In [None]:
sample = pd.read_csv(input_dir + "stage_1_sample_submission.csv")
print(sample.shape)
sample.head(3)

In [None]:
sub = pd.merge(sample[["ID"]], sub, on="ID") # Ordered
print(sub.shape)
sub.head(3)

In [None]:
sub["sop_instance_uid"] = sub["ID"].apply(lambda x: "ID_" + x.split("_")[1])
sub["SubType"] = sub["ID"].apply(lambda x: x.split("_")[2])
sub.head(3)

sub = pd.pivot_table(sub, values='Label', index=['sop_instance_uid'], columns=['SubType'], aggfunc=np.max).reset_index()
sub.head(3)

In [None]:
print(meta_cols)

## H2O

In [None]:
# Score the submission set with every saved fold model of each supported
# label; the per-fold predictions are collected in sub_h2o_parts.
sub_h2o_parts = []
for label_col in supported_labels: #label_col = "subdural"
    print("----------------------------------------")    
    # kpi[fold][2] is the path of the model saved for that fold.
    kpi = all_kpi_valid[label_col]
        
    # Feature creation
    print("Feature creating ...")
    # Rename the label column to "p_<label>" and attach the test metadata,
    # which is the input layout create_features expects.
    df_sub_temp = pd.merge(sub[["sop_instance_uid", label_col]].rename(columns={label_col: "p_{}".format(label_col)}), \
                 test_metadata[meta_cols], on="sop_instance_uid")
    df_sub = create_features(df_sub_temp, label_col, is_train=False)
    df_sub.head(3)

    # Rows with any missing feature are dropped and get no H2O prediction here.
    df_sub_ok = df_sub.dropna().copy()
    print(df_sub_ok.shape, df_sub.shape)
    hf_sub = h2o.H2OFrame(df_sub_ok) 
        
    for fold in range(NUMBER_FOLDS):
        model_path = kpi[fold][2]
        print("Processing label {} ...".format(label_col))
        print(model_path)

        model_sub = h2o.load_model(path=model_path)

        hf_y_sub_pred = model_sub.predict(hf_sub)
        #df_y_sub_pred = (hf_sub[["sop_instance_uid"]+key_cols].cbind(hf_y_sub_pred)).as_data_frame()
        # Only the image id is kept next to the prediction columns, so "p1"
        # below comes from the prediction frame (presumably the predicted
        # probability of class 1 -- confirm against the H2O output).
        df_y_sub_pred = (hf_sub[["sop_instance_uid"]].cbind(hf_y_sub_pred)).as_data_frame()
        print("Output:", df_y_sub_pred.shape, df_y_sub_pred.columns)

        # Rebuild the "<image id>_<label>" ID used by the submission file.
        df_y_sub_pred["ID"] = df_y_sub_pred["sop_instance_uid"] + "_{}".format(label_col)

        # Append
        sub_h2o_parts.append(df_y_sub_pred[["ID", "p1"]])
    
print(len(sub_h2o_parts))

In [None]:
# Stack the per-label / per-fold prediction frames ...
sub_h2o = pd.concat(sub_h2o_parts, axis=0)
print(sub_h2o.shape)
print(sub_h2o.head(3))
# ... and average the fold models: one row per ID holding the mean p1.
sub_h2o = sub_h2o.groupby("ID", as_index=False)["p1"].mean()
sub_h2o.head(3)

In [None]:
sub_h2o[sub_h2o["ID"].isin(["ID_584e7fced_any"])]

## Smooth and Combination

In [None]:
print(sub_filename)
sub_smooth, meta = smooth_interpolate_sub(sub_filename = sub_filename,
       metadata_filename = input_dir + "df_dicom_metadata_test.csv")
sub_smooth.head(8)

In [None]:
print(sub_h2o.shape)
pd.merge(sub_smooth[["ID", "Label", "cls", "f3_Label"]], sub_h2o, on="ID").shape

In [None]:
final_sub = pd.merge(sub_smooth[["ID", "Label", "cls", "f3_Label"]], sub_h2o, on="ID", how="left")
print(sub_smooth.shape, final_sub.shape)
print(final_sub.isnull().sum())
final_sub.head(20)

In [None]:
# Rows that have a stacked prediction for a supported label take it;
# every other row falls back to the smoothed value.
supported_index = final_sub["p1"].notna() & final_sub["cls"].isin(supported_labels)

final_sub["Label"] = final_sub["f3_Label"].where(~supported_index, final_sub["p1"])
final_sub.head(30)

## Save

In [None]:
sample.head(3)

In [None]:
final_sub = pd.merge(sample[["ID"]], final_sub, on="ID")
print(final_sub.shape)
final_sub.head(10)

In [None]:
print(sub_post_filename)
final_sub[["ID", "Label"]].to_csv(sub_post_filename, compression="gzip", index=None)

In [None]:
print(sub_post_filename)
print("Done")