In [62]:
import onnxruntime as ort
import sys
sys.path.append("..")
import matplotlib.pyplot as plt
import pytorch_lightning as pl
from argparse import ArgumentParser
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from src.utils.DataLoader import HidaDataLoader
import pandas as pd
import scipy.special as sc
import numpy as np
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import os

from transformers import ViTFeatureExtractor
from xgboost import XGBClassifier

In [59]:
transform = transforms.Compose([
    transforms.CenterCrop(size=(900,900)),
    transforms.ToTensor()
])


# get predictions from ResNet image classifier1:

In [60]:
dl = HidaDataLoader(num_workers=0, batch_size=8, data_path="../data", transform=transform)
dl.train_split = 0.6
dl.setup()

ONNX_FILE = "C:/Users/Tobias/PycharmProjects/HIDA_LFL/logs/checkpoints/HIDA/model_193.onnx"
options = ort.SessionOptions()
options.inter_op_num_threads = 12
options.intra_op_num_threads = 12

ort_sess = ort.InferenceSession(ONNX_FILE, sess_options=options)
ort_sess.get_inputs()[0].name

valid_dataloader = dl.val_dataloader()

input_name = ort_sess.get_inputs()[0].name
output_name = ort_sess.get_outputs()[0].name

predictions = []
targets = []

counter = 0
for batch in tqdm(valid_dataloader):
    model_input, label, (label_name, image_name) = batch

    outputs_single = sc.expit(ort_sess.run([output_name], {input_name: model_input.cpu().numpy()})[0]).T[0]
    predictions += list(outputs_single)
    targets += list(image_name)
    counter += 1

resnset_results = dict(predictions_resnet=predictions, images=targets)
resnet_df = pd.DataFrame(resnset_results)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 44/44 [07:14<00:00,  9.87s/it]


In [57]:
dl = HidaDataLoader(num_workers=0, batch_size=2, data_path="../data", transform=transform)
dl.train_split = 0.6
dl.setup()

ONNX_FILE = "C:/Users/Tobias/Downloads/model_9.onnx"
options = ort.SessionOptions()
options.inter_op_num_threads = 12
options.intra_op_num_threads = 12

ort_sess2 = ort.InferenceSession(ONNX_FILE, sess_options=options)
ort_sess2.get_inputs()[0].name

valid_dataloader = dl.val_dataloader()

input_name = ort_sess2.get_inputs()[0].name
output_name = ort_sess2.get_outputs()[0].name

predictions = []
targets = []


feature_extractor = ViTFeatureExtractor(do_resize=False, do_normalize=False)

counter = 0
for batch in tqdm(valid_dataloader):
    model_input, label, (label_name, image_name) = batch
    features = feature_extractor(model_input, return_tensors="pt")
    outputs_single = sc.expit(ort_sess2.run([output_name], {input_name: features["pixel_values"][0]})[0]).T[0]
    predictions += list(outputs_single)
    targets += list(image_name)
    counter += 1

vision_results = dict(predictions_resnet=predictions, images=targets)
vision_df = pd.DataFrame(vision_results)

  0%|                                                                                                                                                                                  | 0/173 [00:01<?, ?it/s]


ValueError: Unable to create tensor, you should probably activate padding with 'padding=True' to have batched tensors with the same length.

# impute datasets:


In [4]:

train_data = "../data/trainSet/trainSet.txt"
test_data = "../data/testSet/testSet.txt"

df_train = pd.read_csv(train_data)
df_train_length_idx = len(df_train)
df_test = pd.read_csv(test_data)

# Merge the two datasets
train_test = [df_train, df_test]
df_train_test = pd.concat(train_test)
# df_train_test['Prognosis'].loc[df_train_test['Prognosis'] == '<undefined>'] = np.nan
df_train_test.loc[df_train_test['Prognosis'] == '<undefined>', 'Prognosis'] = np.nan

# all variables in the dataset (incl. outcome)
variables = list(df_train_test.columns[3:])
# which variables to use for catboost (only numerical ones!)
variables_for_regression = ['WBC', 'Temp_C', 'CRP', 'Fibrinogen', 'LDH', 'Ddimer', 'Ox_percentage', 'PaO2', 'SaO2', 'pH', 'Age']
variables_for_classification = [ 'RespiratoryFailure', 'Sex', 'CardiovascularDisease', 'DifficultyInBreathing', 'Cough']

# Set up catboost for each variable separately
# Here we don't update and always use the original dataset with missing values in all variables

# Generate new dataframe for imputed values (for this we copy the original test_trai datetset and store the index of the imputed ones and fill them in)
df_imputed_train_test = df_train_test.copy()
del df_imputed_train_test['Prognosis']

# Prognosis should be Boolian
df_imputed_train_test['Prognosis']= np.nan
df_imputed_train_test.loc[df_train_test['Prognosis']=='MILD', 'Prognosis'] = 0
df_imputed_train_test.loc[df_train_test['Prognosis']=='SEVERE', 'Prognosis'] = 1


df_train_test['Prognosis'] = df_imputed_train_test['Prognosis']

if not os.path.isfile("imputed_test.csv"):
    for catboost_variable in variables_for_regression:
        print(catboost_variable)
        catboost_features = variables.copy()
        catboost_features.remove(catboost_variable)
        catboost_df_train_test = df_train_test[df_train_test[catboost_variable].notna()]
        X_train = catboost_df_train_test[catboost_features]
        y_train = catboost_df_train_test[catboost_variable]

        index_missing = df_train_test[catboost_variable].isna()
        catboost_df_test = df_train_test[df_train_test[catboost_variable].isna()]
        X_test = catboost_df_test[catboost_features]
        y_test = catboost_df_test[catboost_variable]

        # fit the model 
        model = CatBoostRegressor(verbose=0, iterations=1000, task_type="GPU", devices='0:1')
        model.fit(X_train.values, y_train.values)

        # make a prediction
        yhat = model.predict(X_test.values)
        df_imputed_train_test.loc[df_imputed_train_test[catboost_variable].isna(), catboost_variable] = yhat.T[0].copy()


    for catboost_variable in variables_for_classification:
        print(catboost_variable)
        catboost_features = variables.copy()
        catboost_features.remove(catboost_variable)
        catboost_df_train_test = df_train_test[df_train_test[catboost_variable].notna()]
        X_train = catboost_df_train_test[catboost_features]
        y_train = catboost_df_train_test[catboost_variable]

        index_missing = df_train_test[catboost_variable].isna()
        catboost_df_test = df_train_test[df_train_test[catboost_variable].isna()]
        X_test = catboost_df_test[catboost_features]
        y_test = catboost_df_test[catboost_variable]

        # fit the model 
        model = CatBoostClassifier(verbose=0, iterations=1000, task_type="GPU", devices='0:1')
        model.fit(X_train.values, y_train.values)

        # make a prediction
        yhat = model.predict(X_test.values)
        df_imputed_train_test.loc[df_imputed_train_test[catboost_variable].isna(), catboost_variable] = yhat.T[0].copy()

    # separate the two datasets:
    df_imputed_train = df_imputed_train_test.iloc[0:df_train_length_idx, : ]
    df_imputed_test = df_imputed_train_test.iloc[df_train_length_idx:, :]

    df_imputed_train.to_csv("imputed_train.csv")
    df_imputed_test.to_csv("imputed_test.csv")

else:
    df_imputed_train = pd.read_csv("imputed_train.csv")
    df_imputed_test = pd.read_csv("imputed_test.csv")


# Train tabular models
### train catboost not imputed:

In [5]:
df_for_rf = df_train.copy()
df_for_rf["Prognosis"] = df_for_rf["Prognosis"].astype("category").cat.codes
df_for_rf = df_for_rf[variables]

catboost_features = variables.copy()
catboost_features.remove("Prognosis")

train_valid_split = 0.8
sep_index = int(863 * train_valid_split)

X_train = df_for_rf[catboost_features].iloc[0:sep_index]
y_train = df_for_rf["Prognosis"].iloc[0:sep_index]


X_test = df_for_rf[catboost_features].iloc[sep_index:]
y_test = df_for_rf["Prognosis"].iloc[sep_index:]
image_names = df_train["ImageFile"].iloc[sep_index:]

# fit the model 
cat_boost_raw_model = CatBoostClassifier(verbose=0, iterations=1000, task_type="GPU", devices='0:1')
cat_boost_raw_model.fit(X_train.values, y_train.values)

<catboost.core.CatBoostClassifier at 0x273b0f23040>

### xgboost not imputed

In [63]:
# get predictions from xgboost not imputed:
df_for_xg = df_train.copy()
df_for_xg["Prognosis"] = df_for_xg["Prognosis"].astype("category").cat.codes
df_for_xg = df_for_xg[variables]

xgboost_features = variables.copy()
xgboost_features.remove("Prognosis")

train_valid_split = 0.8
sep_index = int(863 * train_valid_split)

X_train = df_for_xg[xgboost_features].iloc[0:sep_index]
y_train = df_for_xg["Prognosis"].iloc[0:sep_index]


X_test = df_for_xg[xgboost_features].iloc[sep_index:]
y_test = df_for_xg["Prognosis"].iloc[sep_index:]
image_names = df_train["ImageFile"].iloc[sep_index:]

# fit model on training data
model_xboost = XGBClassifier()
model_xboost.fit(X_train.values, y_train.values)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

### train catboost imputed:

In [64]:
catboost_features = variables.copy()
catboost_features.remove("Prognosis")

train_valid_split = 0.8
sep_index = int(863 * train_valid_split)

X_train = df_imputed_train[catboost_features].iloc[0:sep_index]
y_train = df_imputed_train["Prognosis"].iloc[0:sep_index]

X_test = df_imputed_train[catboost_features].iloc[sep_index:]
y_test = df_imputed_train["Prognosis"].iloc[sep_index:]
image_names = df_train["ImageFile"].iloc[sep_index:]

# fit the model
cat_boost_imputed_model = CatBoostClassifier(verbose=0, iterations=1000, task_type="GPU", devices='0:1')
cat_boost_imputed_model.fit(X_train.values, y_train.values)

<catboost.core.CatBoostClassifier at 0x27383dba850>

### train imputed random forest:

In [65]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.values)
X_test_scaled = scaler.transform(X_test.values)

rf_regressor = RandomForestRegressor(n_estimators=20, random_state=0)
rf_regressor.fit(X_train_scaled, y_train)

RandomForestRegressor(n_estimators=20, random_state=0)

### xgboost imputed 

In [66]:
# fit model on training data
model_xboost_imputed = XGBClassifier()
model_xboost_imputed.fit(X_train.values, y_train.values)






XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

# Get predictions to train final catboost:

In [75]:
# from catboost raw:
df_for_rf = df_train.copy()
df_for_rf["Prognosis"] = df_for_rf["Prognosis"].astype("category").cat.codes
df_for_rf = df_for_rf[variables]

catboost_features = variables.copy()
catboost_features.remove("Prognosis")

train_valid_split = 0.6
sep_index = int(863 * train_valid_split)

X_test = df_for_rf[catboost_features].iloc[sep_index:]
y_test = df_for_rf["Prognosis"].iloc[sep_index:]
image_names = df_train["ImageFile"].iloc[sep_index:]

yhat = cat_boost_raw_model.predict(X_test.values)

catboost_results = dict(predictions_cb_raw=list(yhat), images=image_names.to_list())
catboost_results_df = pd.DataFrame(catboost_results)

# from xgboost raw:
X_test = df_for_xg[xgboost_features].iloc[sep_index:]
y_test = df_for_xg["Prognosis"].iloc[sep_index:]
image_names = df_train["ImageFile"].iloc[sep_index:]
y_pred = model_xboost.predict(X_test.values)
xgboost_results = dict(predictions_xg=list(y_pred), images=image_names.to_list())
xgboost_results_df = pd.DataFrame(xgboost_results)

# from catbtoost imputed:
catboost_features = variables.copy()
catboost_features.remove("Prognosis")

X_test = df_imputed_train[catboost_features].iloc[sep_index:]
y_test = df_imputed_train["Prognosis"].iloc[sep_index:]
image_names = df_train["ImageFile"].iloc[sep_index:]

yhat = cat_boost_imputed_model.predict(X_test.values)

catboost_imputed_results = dict(predictions_cb_imputed=list(yhat), images=image_names.to_list(), targets=y_test.values)
catboost_imputed_results_df = pd.DataFrame(catboost_imputed_results)

# from random forest:
X_test_scaled = scaler.transform(X_test.values)
y_pred = rf_regressor.predict(X_test_scaled)

rf_results = dict(predictions_rf=list(y_pred), images=image_names.to_list())
rf_results_df = pd.DataFrame(rf_results)


# from xgboost imputed:
y_pred = model_xboost_imputed.predict(X_test.values)
xgboost_imputed_results = dict(predictions_xg_imputed=list(y_pred), images=image_names.to_list())
xgboost_imputed_results_df = pd.DataFrame(xgboost_imputed_results)



# Stack predicions and train final model:

In [76]:
rf_results_df = rf_results_df.set_index("images")
catboost_imputed_results_df = catboost_imputed_results_df.set_index("images")
catboost_results_df = catboost_results_df.set_index("images")
resnet_df = resnet_df.set_index("images")
xgboost_imputed_results_df = xgboost_imputed_results_df.set_index("images")
xgboost_results_df = xgboost_results_df.set_index("images")

In [77]:
all_predictions = pd.concat([rf_results_df, catboost_imputed_results_df, catboost_results_df, resnet_df, xgboost_imputed_results_df, xgboost_results_df], axis=1)
X_train = all_predictions.drop(columns="targets").values[:-10]
X_test = all_predictions.drop(columns="targets").values[-10:]
y_train = all_predictions["targets"].values[:-10]
y_test = all_predictions["targets"].values[-10:]

# final stacking model:
final_model = CatBoostClassifier(verbose=0, iterations=1000, task_type="GPU", devices='0:1')
final_model.fit(X_train, y_train)
yhat = final_model.predict(X_test)

In [78]:
all_predictions

Unnamed: 0_level_0,predictions_rf,predictions_cb_imputed,targets,predictions_cb_raw,predictions_resnet,predictions_xg_imputed,predictions_xg
images,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
P_404.png,0.05,0.0,0.0,0,0.444716,0.0,0
P_428.png,0.15,0.0,0.0,0,0.512289,0.0,0
P_507.png,0.80,1.0,1.0,1,0.457178,1.0,1
P_508.png,0.85,1.0,1.0,1,0.618459,1.0,1
P_526.png,0.65,0.0,1.0,0,0.639944,1.0,1
...,...,...,...,...,...,...,...
P_1_12.png,0.70,1.0,1.0,1,0.516278,1.0,1
P_1_8.png,0.70,1.0,1.0,1,0.419664,1.0,1
P_1_10.png,0.10,0.0,0.0,0,0.442719,0.0,0
P_1_26.png,0.35,0.0,1.0,1,0.453410,1.0,1


# Get final predictions on testset for submission:

In [79]:
# from resnet:
dl = HidaDataLoader(num_workers=0, batch_size=8, data_path="../data", transform=transform)
dl.setup(stage="test")

predictions = []
targets = []
test_dataloader = dl.test_dataloader()

counter = 0
for batch in tqdm(test_dataloader):
    model_input, label, (label_name, image_name) = batch
    outputs_single = sc.expit(ort_sess.run([output_name], {input_name: model_input.cpu().numpy()})[0]).T[0]
    predictions += list(outputs_single)
    targets += list(image_name)

resnset_results = dict(predictions_resnet=predictions, images=targets)
resnet_df = pd.DataFrame(resnset_results)
resnet_df["images"] = resnet_df["images"].apply(lambda x: x.split(".")[0])

# from catboost raw:
df_for_rf = df_test.copy()
df_for_rf["Prognosis"] = df_for_rf["Prognosis"].astype("category").cat.codes
df_for_rf = df_for_rf[variables]

catboost_features = variables.copy()
catboost_features.remove("Prognosis")

X_test = df_for_rf[catboost_features]
y_test = df_for_rf["Prognosis"]
image_names = df_test["PatientID"]

yhat = cat_boost_raw_model.predict(X_test.values)

catboost_results = dict(predictions_cb_raw=list(yhat), images=image_names.to_list())
catboost_results_df = pd.DataFrame(catboost_results)

# from xgboost raw:
y_pred = model_xboost.predict(X_test.values)
xgboost_results = dict(predictions_xg=list(y_pred), images=image_names.to_list())
xgboost_results_df = pd.DataFrame(xgboost_results)

# from catbtoost imputed:
catboost_features = variables.copy()
catboost_features.remove("Prognosis")

X_test = df_imputed_test[catboost_features]
y_test = df_imputed_test["Prognosis"]
image_names = df_test["PatientID"]

yhat = cat_boost_imputed_model.predict(X_test.values)

catboost_imputed_results = dict(predictions_cb_imputed=list(yhat), images=image_names.to_list(), targets=y_test.values)
catboost_imputed_results_df = pd.DataFrame(catboost_imputed_results)

# from random forest:
X_test_scaled = scaler.transform(X_test.values)
y_pred = rf_regressor.predict(X_test_scaled)

rf_results = dict(predictions_rf=list(y_pred), images=image_names.to_list())
rf_results_df = pd.DataFrame(rf_results)

# from xgboost imputed:
y_pred = model_xboost_imputed.predict(X_test.values)
xgboost_imputed_results = dict(predictions_xg_imputed=list(y_pred), images=image_names.to_list())
xgboost_imputed_results_df = pd.DataFrame(xgboost_imputed_results)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [01:45<00:00,  8.82s/it]


### stack final predictions

In [80]:
# stack predicions:
rf_results_df = rf_results_df.set_index("images")
catboost_imputed_results_df = catboost_imputed_results_df.set_index("images")
catboost_results_df = catboost_results_df.set_index("images")
resnet_df = resnet_df.set_index("images")
xgboost_imputed_results_df = xgboost_imputed_results_df.set_index("images")
xgboost_results_df = xgboost_results_df.set_index("images")

In [81]:

# make final prediction:
all_predictions = pd.concat([rf_results_df, catboost_imputed_results_df, catboost_results_df, resnet_df, xgboost_imputed_results_df, xgboost_results_df], axis=1)
X_test = all_predictions.drop(columns="targets").values
yhat = final_model.predict(X_test)

In [82]:
catboost_results_df

Unnamed: 0_level_0,predictions_cb_raw
images,Unnamed: 1_level_1
P_102,0
P_117,0
P_16,0
P_118,0
P_114,0
...,...
P_88,0
P_92,0
P_86,1
P_9,1
