In [6]:
# this file should be used from the root of the repository
import pyeddl.eddl as eddl
import pyecvl.ecvl as ecvl
from pyeddl.tensor import Tensor

import pandas as pd
import numpy as np
from posixpath import join
import yaml
import os

from utils.data_partitioning import load_data_split
from eddl_lib.uc5_dataset import Uc5Dataset

import json


In [7]:
# ALL LABELS, INCLUDING NORMAL
exp_fld = "/mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl/wp6"
cnn_fn = "cnn_84val_neptune179.onnx"
ds_fn = "img_reports_phi2_enc.tsv"
img_fld = "/mnt/datasets/uc5/std-dataset/image"

# read files from exp_fld
train_ids, valid_ids, test_ids = load_data_split(exp_fld)

data split read from disk. |train|=5230, |valid|=748, |test|=1492


In [27]:
# ALL LABELS, INCLUDING NORMAL
exp_fld = "/mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl/wp6_without_normal"
cnn_fn = "best_cnn.onnx"
ds_fn = "img_reports.tsv"
img_fld = "/mnt/datasets/uc5/std-dataset/image"

# read files from exp_fld
train_ids, valid_ids, test_ids = load_data_split(exp_fld)

data split read from disk. |train|=3214, |valid|=460, |test|=917


specify paths before this cell

In [34]:


cnn = eddl.import_net_from_onnx_file(join(exp_fld, cnn_fn))
eddl.build(
    cnn,
    eddl.rmsprop(0.01),
    ["soft_cross_entropy"],
    ["categorical_accuracy"],
    eddl.CS_GPU(mem="full_mem"),  # if args.gpu else eddl.CS_CPU(mem=args.mem),
    False  # do not initialize weights to random values
)
cnn.resize(1)
# eddl.summary(cnn)
eddl.set_mode(cnn, 0)

ds = pd.read_csv(join(exp_fld, ds_fn), sep="\t").set_index("filename")  # .set_index("image_filename")
print(ds.shape)
print(ds.T.head())
if "labels" not in ds.columns:
    ds["labels"] = ds["auto_labels"]


semantic_dim = eddl.getLayer(cnn, "cnn_out").output.shape[1]
print("semantic dimension:", semantic_dim)


(4591, 4)
filename    /mnt/datasets/uc5/UC5_pipeline_forked/data/image/CXR2122_IM-0747-1001.png  \
report_id                                                 2122                          
enc_text     15 6 4 18 36 2. 15 6 4 26 12 50 11 25 2. 5 34 ...                          
auto_labels                                                  0                          
mesh_labels                                                 17                          

filename    /mnt/datasets/uc5/UC5_pipeline_forked/data/image/CXR2502_IM-1027-1001-0001.png  \
report_id                                                 2502                               
enc_text     15 6 39 18 250 8 58 20 5 24 198 548 2. 86 243 ...                               
auto_labels                                              15;17                               
mesh_labels                                      2;16;20;22;24                               

filename    /mnt/datasets/uc5/UC5_pipeline_forked/data/image/CXR2502_IM-1

CS with full memory setup
Building model without initialization
copying onnx params to devices


In [35]:
# aux functions
def load_image(filename):
    augs = ecvl.SequentialAugmentationContainer([
                ecvl.AugToFloat32(divisor=255.0),
                ecvl.AugNormalize([0.48197903, 0.48197903, 0.48197903], [0.26261734, 0.26261734, 0.26261734]),
                ecvl.AugResizeDim([300, 300]),
                ecvl.AugCenterCrop([224, 224]),  # to do: test random crop also in prediction
                ])
    img = ecvl.ImRead(filename, flags=None)  # , flags=ecvl.ImReadMode.GRAYSCALE)
    ecvl.RearrangeChannels(img, img, "xyc")
    augs.Apply(img)
    ecvl.RearrangeChannels(img, img, "cxy")
    return img

def label_list(lab_str):
    return [int(s) for s in lab_str.split(";")]

In [40]:
split_ids = train_ids + valid_ids
tsv_name = "thresholds_trva.tsv"

split_ids = test_ids
tsv_name = "thresholds_te.tsv"

predictions = np.empty( (semantic_dim, len(split_ids)) )
targets = np.empty_like(predictions)
predictions.fill(np.nan)
targets.fill(np.nan)

for pos, id in enumerate(split_ids):
    if ((pos+1) % 1000) == 0:
        print(f"# {pos+1}/{len(split_ids)}")
    elif ((pos+1) % 100) == 0:
        print(".", end="")
    img = load_image(join(img_fld, id))
    # img = ecvl.ImageToTensor(img)
    a = np.expand_dims(np.array(img, copy=False), axis=0)  # add batch dimension
    eddl.forward(cnn, [Tensor.fromarray(a)])

    layer = eddl.getLayer(cnn, "cnn_out")
    p = np.array(eddl.getOutput(layer), copy=False)
    predictions[:, pos] = p
    
    labels = np.zeros_like(p)
    lab_list = label_list(ds.loc[id].labels)
    # print(f"{pos}: n_labels: {len(lab_list)}")
    for l in lab_list:
        labels[0, l] = 1
    targets[:, pos] = labels
    
    # n_labels = nnz(labels)
    # if n_labels != len(lab_list):
    #     print(pos)
    #     print(lab_list)
    #     print(labels)
    
    # print(labels)
    # if pos == 50:
    #     print("dev mode, breaking at 50")
    #     break
    #print(p.shape)
    # cnn_out_in = eddl.Input([semantic_dim], name="in_semantic_features")
# for i, split in enumerate([train_ids, valid_ids, test_ids]):

print("cell done.")

.........cell done.


In [41]:
with open( join(exp_fld, "index2lab.json"), "r") as fin:
    i2l = json.load(fin)

for i, l in i2l.items():
    print(f"{i}: {l}")

from sklearn.metrics import roc_curve, accuracy_score
import matplotlib.pyplot as plt
import math
from numpy import count_nonzero as nnz

auc_acc = []
J_acc = []

auc_ths = []
J_ths = []
label_names = []

for i in range(predictions.shape[0]):  # for i over labels
    y_est = predictions[i, :]  # these are the predictions made by the cnn
    y = targets[i, :]  # there are the true target values 
    fpr, tpr, thresholds = roc_curve(y, y_est)  # check threshoold
    
    # auc
    crit = np.sqrt(tpr * (1 - fpr) )
    m2 = thresholds[np.argmax(crit)]
    auc_ths.append(m2)
    y_est1 = np.where(y_est > m2, 1, 0)
    auc_acc.append(accuracy_score(y_est1, y) * 100)

    # Youden
    J = tpr - fpr
    ij = np.argmax(J)
    th_j = thresholds[ij]
    J_ths.append(th_j)
    y_est4 = np.where(y_est > th_j, 1, 0)
    J_acc.append(accuracy_score(y_est4, y)*100)
    label_names.append(i2l[str(i)])

    print(f"*** label {i}: {i2l[str(i)]}")
    print(f"  - auc1:", auc_acc[-1])
    print(f"  - Youden:", J_acc[-1])
    print(f"auc {m2}, youden {th_j}")

0: 
1: aorta
2: atelectasis
3: calcified granuloma
4: cardiomegaly
5: deformity
6: degenerative change
7: diaphragm
8: emphysema
9: granuloma
10: granulomatous disease
11: hiatal hernia
12: infiltrates
13: misc
14: nodule
15: opacity
16: osteophyte
17: pleural effusion
18: pneumonia
19: pulmonary edema
20: pulmonary emphysema
21: scarring
22: sternotomy
23: thoracic aorta
*** label 0: 
  - auc1: 51.035986913849506
  - Youden: 32.93347873500545
auc 0.1751871258020401, youden 0.059371016919612885
*** label 1: aorta
  - auc1: 33.26063249727372
  - Youden: 15.485278080697928
auc 0.002168679144233465, youden 0.0009834381053224206
*** label 2: atelectasis
  - auc1: 59.10577971646674
  - Youden: 71.53762268266085
auc 0.0166606567800045, youden 0.03361339867115021
*** label 3: calcified granuloma
  - auc1: 50.4907306434024
  - Youden: 40.458015267175576
auc 0.017004063352942467, youden 0.010816198773682117
*** label 4: cardiomegaly
  - auc1: 48.63685932388223
  - Youden: 28.46237731733915
auc 

In [None]:
from sklearn.metrics import roc_curve, accuracy_score
import matplotlib.pyplot as plt
import math
from numpy import count_nonzero as nnz

auc_acc = []
J_acc = []

auc_ths = []
J_ths = []

label_names = []

for i in range(predictions.shape[0]):  # for i over labels
    y_est = predictions[i, :]  # these are the predictions made by the cnn
    y = targets[i, :]  # there are the true target values 
    fpr, tpr, thresholds = roc_curve(y, y_est)  # check threshoold
    
    # auc
    crit = np.sqrt(tpr * (1 - fpr) )
    m2 = thresholds[np.argmax(crit)]
    auc_ths.append(m2)
    y_est1 = np.where(y_est > m2, 1, 0)
    auc_acc.append(accuracy_score(y_est1, y) * 100)

    # Youden
    J = tpr - fpr
    ij = np.argmax(J)
    th_j = thresholds[ij]
    J_ths.append(th_j)
    y_est4 = np.where(y_est > th_j, 1, 0)
    J_acc.append(accuracy_score(y_est4, y)*100)
    

    print(f"*** label {i}")
    print(f"  - auc1:", auc_acc[-1])
    print(f"  - Youden:", J_acc[-1])
    print(f"auc {m2}, youden {th_j}")

In [42]:
d = {"label": label_names, "auc_t": auc_ths, "auc_acc": auc_acc, "youden_t": J_ths, "youden_acc": J_acc}
df = pd.DataFrame.from_dict(d)
# df["different"] = df.auc_t != df.youden_t
display(df)
print(df[["auc_acc", "youden_acc"]].mean(axis=0))

fn = join(exp_fld, tsv_name)
# fn = join(exp_fld, "thresholds_te.tsv")
df.to_csv(fn, sep="\t", index=False ) 
print("saved:", fn)

df2 = pd.read_csv(fn, sep="\t").set_index("label")
display(df2)

Unnamed: 0,label,auc_t,auc_acc,youden_t,youden_acc
0,,0.175187,51.035987,0.059371,32.933479
1,aorta,0.002169,33.260632,0.000983,15.485278
2,atelectasis,0.016661,59.10578,0.033613,71.537623
3,calcified granuloma,0.017004,50.490731,0.010816,40.458015
4,cardiomegaly,0.032663,48.636859,0.011207,28.462377
5,deformity,0.013412,52.671756,0.007267,29.225736
6,degenerative change,0.009607,42.202835,0.000588,17.339149
7,diaphragm,0.006194,44.274809,0.003861,28.462377
8,emphysema,0.010036,43.293348,0.00254,11.668484
9,granuloma,0.007796,48.418757,0.226414,94.002181


auc_acc       51.522174
youden_acc    41.771174
dtype: float64
saved: /mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl/wp6_without_normal/thresholds_te.tsv


Unnamed: 0_level_0,auc_t,auc_acc,youden_t,youden_acc
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,0.175187,51.035987,0.059371,32.933479
aorta,0.002169,33.260632,0.000983,15.485278
atelectasis,0.016661,59.10578,0.033613,71.537623
calcified granuloma,0.017004,50.490731,0.010816,40.458015
cardiomegaly,0.032663,48.636859,0.011207,28.462377
deformity,0.013412,52.671756,0.007267,29.225736
degenerative change,0.009607,42.202835,0.000588,17.339149
diaphragm,0.006194,44.274809,0.003861,28.462377
emphysema,0.010036,43.293348,0.00254,11.668484
granuloma,0.007796,48.418757,0.226414,94.002181
