In [1]:
import onnxruntime as ort
import sys
sys.path.append("..")
import matplotlib.pyplot as plt
import pytorch_lightning as pl
from argparse import ArgumentParser
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from src.utils.DataLoader import HidaDataLoader
import pandas as pd
import scipy.special as sc
import numpy as np
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import os

In [2]:
transform = transforms.Compose([
    transforms.CenterCrop(size=(900,900)),
    transforms.ToTensor()
])
dl = HidaDataLoader(num_workers=0, batch_size=8, data_path="../data", transform=transform)
dl.setup()
    


In [4]:
# get predictions from ResNet image classifier1:

ONNX_FILE = "C:/Users/Tobias/PycharmProjects/HIDA_LFL/logs/checkpoints/HIDA/model_193.onnx"
options = ort.SessionOptions()
options.inter_op_num_threads = 12
options.intra_op_num_threads = 12

ort_sess = ort.InferenceSession(ONNX_FILE, sess_options=options)
ort_sess.get_inputs()[0].name




valid_dataloader = dl.val_dataloader()

input_name = ort_sess.get_inputs()[0].name
output_name = ort_sess.get_outputs()[0].name

predictions = []
targets = []

counter = 0
for batch in tqdm(valid_dataloader):
    model_input, label, (label_name, image_name) = batch

    outputs_single = sc.expit(ort_sess.run([output_name], {input_name: model_input.cpu().numpy()})[0]).T[0]
    predictions += list(outputs_single)
    targets += list(image_name)
    counter += 1

resnset_results = dict(predictions_resnet=predictions, images=targets)
resnet_df = pd.DataFrame(resnset_results)

  0%|                                                                                                                                                                                   | 0/11 [00:00<?, ?it/s]

('P_678.png', 'P_673.png', 'P_669.png', 'P_675.png', 'P_640.png', 'P_694.png', 'P_692.png', 'P_807.png')


  9%|███████████████▌                                                                                                                                                           | 1/11 [00:08<01:25,  8.53s/it]

('P_810.png', 'P_811.png', 'P_796.png', 'P_795.png', 'P_756.png', 'P_824.png', 'P_768.png', 'P_814.png')


 18%|███████████████████████████████                                                                                                                                            | 2/11 [00:17<01:20,  8.99s/it]

('P_783.png', 'P_819.png', 'P_833.png', 'P_843.png', 'P_828.png', 'P_791.png', 'P_803.png', 'P_801.png')


 27%|██████████████████████████████████████████████▋                                                                                                                            | 3/11 [00:27<01:13,  9.21s/it]

('P_825.png', 'P_835.png', 'P_829.png', 'P_841.png', 'P_832.png', 'P_1_7.png', 'P_1_1.png', 'P_844.png')


 36%|██████████████████████████████████████████████████████████████▏                                                                                                            | 4/11 [00:36<01:04,  9.16s/it]

('P_1_92.png', 'P_1_57.png', 'P_1_56.png', 'P_1_103.png', 'P_1_142.png', 'P_1_87.png', 'P_1_101.png', 'P_1_59.png')


 45%|█████████████████████████████████████████████████████████████████████████████▋                                                                                             | 5/11 [00:45<00:55,  9.25s/it]

('P_1_111.png', 'P_1_102.png', 'P_1_154.png', 'P_1_82.png', 'P_1_61.png', 'P_1_73.png', 'P_1_69.png', 'P_1_44.png')


 55%|█████████████████████████████████████████████████████████████████████████████████████████████▎                                                                             | 6/11 [00:55<00:46,  9.33s/it]

('P_1_54.png', 'P_1_51.png', 'P_1_23.png', 'P_1_76.png', 'P_1_31.png', 'P_1_55.png', 'P_1_20.png', 'P_1_17.png')


 64%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                              | 7/11 [01:04<00:37,  9.34s/it]

('P_1_37.png', 'P_1_34.png', 'P_1_16.png', 'P_1_22.png', 'P_1_25.png', 'P_1_42.png', 'P_1_65.png', 'P_1_100.png')


 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                              | 8/11 [01:14<00:28,  9.38s/it]

('P_1_107.png', 'P_1_79.png', 'P_1_60.png', 'P_1_123.png', 'P_1_134.png', 'P_1_156.png', 'P_1_47.png', 'P_1_118.png')


 82%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                               | 9/11 [01:23<00:18,  9.33s/it]

('P_1_53.png', 'P_1_80.png', 'P_1_85.png', 'P_1_21.png', 'P_1_163.png', 'P_1_128.png', 'P_1_77.png', 'P_1_110.png')


 91%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌               | 10/11 [01:32<00:09,  9.20s/it]

('P_1_126.png', 'P_1_18.png', 'P_1_12.png', 'P_1_8.png', 'P_1_10.png', 'P_1_26.png', 'P_1_146.png')


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [01:40<00:00,  9.13s/it]


In [6]:
# get predictions from VIT image classifier:



In [19]:
# impute datasets:

train_data = "../data/trainSet/trainSet.txt"
test_data = "../data/testSet/testSet.txt"

df_train = pd.read_csv(train_data)
df_train_length_idx = len(df_train)
df_test = pd.read_csv(test_data)

# Merge the two datasets
train_test = [df_train, df_test]
df_train_test = pd.concat(train_test)
# df_train_test['Prognosis'].loc[df_train_test['Prognosis'] == '<undefined>'] = np.nan
df_train_test.loc[df_train_test['Prognosis'] == '<undefined>', 'Prognosis'] = np.nan

# all variables in the dataset (incl. outcome)
variables = list(df_train_test.columns[3:])
# which variables to use for catboost (only numerical ones!)
variables_for_regression = ['WBC', 'Temp_C', 'CRP', 'Fibrinogen', 'LDH', 'Ddimer', 'Ox_percentage', 'PaO2', 'SaO2', 'pH', 'Age']
variables_for_classification = [ 'RespiratoryFailure', 'Sex', 'CardiovascularDisease', 'DifficultyInBreathing', 'Cough']

# Set up catboost for each variable separately
# Here we don't update and always use the original dataset with missing values in all variables

# Generate new dataframe for imputed values (for this we copy the original test_trai datetset and store the index of the imputed ones and fill them in)
df_imputed_train_test = df_train_test.copy()
del df_imputed_train_test['Prognosis']

# Prognosis should be Boolian
df_imputed_train_test['Prognosis']= np.nan
df_imputed_train_test.loc[df_train_test['Prognosis']=='MILD', 'Prognosis'] = 0
df_imputed_train_test.loc[df_train_test['Prognosis']=='SEVERE', 'Prognosis'] = 1


df_train_test['Prognosis'] = df_imputed_train_test['Prognosis']

if not os.path.isfile("imputed_test.csv"):
    for catboost_variable in variables_for_regression:
        print(catboost_variable)
        catboost_features = variables.copy()
        catboost_features.remove(catboost_variable)
        catboost_df_train_test = df_train_test[df_train_test[catboost_variable].notna()]
        X_train = catboost_df_train_test[catboost_features]
        y_train = catboost_df_train_test[catboost_variable]

        index_missing = df_train_test[catboost_variable].isna()
        catboost_df_test = df_train_test[df_train_test[catboost_variable].isna()]
        X_test = catboost_df_test[catboost_features]
        y_test = catboost_df_test[catboost_variable]

        # fit the model 
        model = CatBoostRegressor(verbose=0, iterations=1000, task_type="GPU", devices='0:1')
        model.fit(X_train.values, y_train.values)

        # make a prediction
        yhat = model.predict(X_test.values)
        df_imputed_train_test.loc[df_imputed_train_test[catboost_variable].isna(), catboost_variable] = yhat.T[0].copy()


    for catboost_variable in variables_for_classification:
        print(catboost_variable)
        catboost_features = variables.copy()
        catboost_features.remove(catboost_variable)
        catboost_df_train_test = df_train_test[df_train_test[catboost_variable].notna()]
        X_train = catboost_df_train_test[catboost_features]
        y_train = catboost_df_train_test[catboost_variable]

        index_missing = df_train_test[catboost_variable].isna()
        catboost_df_test = df_train_test[df_train_test[catboost_variable].isna()]
        X_test = catboost_df_test[catboost_features]
        y_test = catboost_df_test[catboost_variable]

        # fit the model 
        model = CatBoostClassifier(verbose=0, iterations=1000, task_type="GPU", devices='0:1')
        model.fit(X_train.values, y_train.values)

        # make a prediction
        yhat = model.predict(X_test.values)
        df_imputed_train_test.loc[df_imputed_train_test[catboost_variable].isna(), catboost_variable] = yhat.T[0].copy()

    # separate the two datasets:
    df_imputed_train = df_imputed_train_test.iloc[0:df_train_length_idx, : ]
    df_imputed_test = df_imputed_train_test.iloc[df_train_length_idx:, :]

    df_imputed_train.to_csv("imputed_train.csv")
    df_imputed_test.to_csv("imputed_test.csv")

else:
    df_imputed_train = pd.read_csv("imputed_train.csv")
    df_imputed_test = pd.read_csv("imputed_test.csv")


WBC
Temp_C
CRP
Fibrinogen
LDH
Ddimer
Ox_percentage
PaO2
SaO2
pH
Age
RespiratoryFailure
Sex
CardiovascularDisease
DifficultyInBreathing
Cough


In [20]:
# get predictions from catboost not imputed:

df_for_rf = df_train.copy()
df_for_rf["Prognosis"] = df_for_rf["Prognosis"].astype("category").cat.codes
df_for_rf = df_for_rf[variables]

catboost_features = variables.copy()
catboost_features.remove("Prognosis")

train_valid_split = 0.9
sep_index = int(863 * train_valid_split)

X_train = df_for_rf[catboost_features].iloc[0:sep_index]
y_train = df_for_rf["Prognosis"].iloc[0:sep_index]


X_test = df_for_rf[catboost_features].iloc[sep_index:]
y_test = df_for_rf["Prognosis"].iloc[sep_index:]
image_names = df_train["ImageFile"].iloc[sep_index:]

# fit the model 
model = CatBoostClassifier(verbose=0, iterations=1000, task_type="GPU", devices='0:1')
model.fit(X_train.values, y_train.values)

# make a prediction
yhat = model.predict(X_test.values)

catboost_results = dict(predictions_cb_raw=list(yhat), images=image_names.to_list())
catboost_results_df = pd.DataFrame(catboost_results)

In [33]:
# get predictions from catboost imputed:
catboost_features = variables.copy()
catboost_features.remove("Prognosis")

train_valid_split = 0.9
sep_index = int(863 * train_valid_split)

X_train = df_imputed_train[catboost_features].iloc[0:sep_index]
y_train = df_imputed_train["Prognosis"].iloc[0:sep_index]

X_test = df_imputed_train[catboost_features].iloc[sep_index:]
y_test = df_imputed_train["Prognosis"].iloc[sep_index:]
image_names = df_train["ImageFile"].iloc[sep_index:]

# fit the model 
model = CatBoostClassifier(verbose=0, iterations=1000, task_type="GPU", devices='0:1')
model.fit(X_train.values, y_train.values)

# make a prediction
yhat = model.predict(X_test.values)

catboost_imputed_results = dict(predictions_cb_imputed=list(yhat), images=image_names.to_list(), targets=y_test.values)
catboost_imputed_results_df = pd.DataFrame(catboost_imputed_results)

In [22]:
# get predictions from imputed random forest:
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train.values)
X_test_scaled = sc.transform(X_test.values)


regressor = RandomForestRegressor(n_estimators=20, random_state=0)
regressor.fit(X_train_scaled, y_train)
y_pred = regressor.predict(X_test_scaled)

rf_results = dict(predictions_rf=list(y_pred), images=image_names.to_list())
rf_results_df = pd.DataFrame(rf_results)

In [44]:
# stack predicions:
rf_results_df = rf_results_df.set_index("images")
catboost_imputed_results_df = catboost_imputed_results_df.set_index("images")
catboost_results_df = catboost_results_df.set_index("images")
resnet_df = resnet_df.set_index("images")

all_predictions = pd.concat([rf_results_df, catboost_imputed_results_df,resnet_df, catboost_results_df], axis=1)
X_train = all_predictions.drop(columns="targets").values[:-10]
X_test = all_predictions.drop(columns="targets").values[-10:]
y_train = all_predictions["targets"].values[:-10]
y_test = all_predictions["targets"].values[-10:]


# final stacking model:
model = CatBoostClassifier(verbose=0, iterations=1000, task_type="GPU", devices='0:1')
model.fit(X_train, y_train)
yhat = model.predict(X_test)

In [43]:
for x,y in zip(y_test, yhat):
    print(x,y)

1.0 0.0
0.0 0.0
0.0 1.0
1.0 1.0
1.0 1.0
1.0 0.0
1.0 0.0
0.0 0.0
1.0 1.0
0.0 0.0
