In [15]:
# this file should be used from the root of the repository
import pyeddl.eddl as eddl
import pyecvl.ecvl as ecvl
from pyeddl.tensor import Tensor

import pandas as pd
import numpy as np
from posixpath import join
import yaml
import os

from utils.data_partitioning import load_data_split
from eddl_lib.uc5_dataset import Uc5Dataset

import json


In [7]:
# ALL LABELS, INCLUDING NORMAL
exp_fld = "/mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl/wp6"
cnn_fn = "cnn_84val_neptune179.onnx"
ds_fn = "img_reports_phi2_enc.tsv"
img_fld = "/mnt/datasets/uc5/std-dataset/image"

# read files from exp_fld
train_ids, valid_ids, test_ids = load_data_split(exp_fld)

data split read from disk. |train|=5230, |valid|=748, |test|=1492


In [27]:
# ALL LABELS, INCLUDING NORMAL
exp_fld = "/mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl/wp6_without_normal"
cnn_fn = "best_cnn.onnx"
ds_fn = "img_reports.tsv"
img_fld = "/mnt/datasets/uc5/std-dataset/image"

# read files from exp_fld
train_ids, valid_ids, test_ids = load_data_split(exp_fld)

data split read from disk. |train|=3214, |valid|=460, |test|=917


In [2]:
# MAY 2022, ALL LABELS, INCLUDING NORMAL, last 
exp_fld = "/mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl/wp6last"
cnn_fn = "best_cnn.onnx"
ds_fn = "img_reports.tsv"
img_fld = "/mnt/datasets/uc5/std-dataset/image"

# read files from exp_fld
train_ids, valid_ids, test_ids = load_data_split(exp_fld)

data split read from disk. |train|=4954, |valid|=708, |test|=1413


specify paths before this cell

In [3]:


cnn = eddl.import_net_from_onnx_file(join(exp_fld, cnn_fn))
eddl.build(
    cnn,
    eddl.rmsprop(0.01),
    ["soft_cross_entropy"],
    ["categorical_accuracy"],
    eddl.CS_GPU(mem="full_mem"),  # if args.gpu else eddl.CS_CPU(mem=args.mem),
    False  # do not initialize weights to random values
)
cnn.resize(1)
# eddl.summary(cnn)
eddl.set_mode(cnn, 0)

ds = pd.read_csv(join(exp_fld, ds_fn), sep="\t").set_index("filename")  # .set_index("image_filename")
print(ds.shape)
print(ds.T.head())
if "labels" not in ds.columns:
    ds["labels"] = ds["auto_labels"]


semantic_dim = eddl.getLayer(cnn, "cnn_out").output.shape[1]
print("semantic dimension:", semantic_dim)


(7075, 4)
filename    /mnt/datasets/uc5/UC5_pipeline_forked/data/image/CXR2122_IM-0747-1001.png  \
report_id                                                 2122                          
enc_text     1 15 6 4 18 36 2. 1 15 6 4 26 12 50 11 25 2. 1...                          
auto_labels                                                  0                          
mesh_labels                                                 17                          

filename    /mnt/datasets/uc5/UC5_pipeline_forked/data/image/CXR2502_IM-1027-1001-0001.png  \
report_id                                                 2502                               
enc_text     1 15 6 39 18 250 8 58 20 5 24 198 548 2. 1 86 ...                               
auto_labels                                              16;18                               
mesh_labels                                      2;16;20;22;24                               

filename    /mnt/datasets/uc5/UC5_pipeline_forked/data/image/CXR2502_IM-1

Generating Random Table
CS with full memory setup
Building model without initialization
Selecting GPU device 0
EDDL is running on GPU device 0, Tesla V100-SXM2-32GB
CuBlas initialized on GPU device 0, Tesla V100-SXM2-32GB
CuRand initialized on GPU device 0, Tesla V100-SXM2-32GB
CuDNN initialized on GPU device 0, Tesla V100-SXM2-32GB
copying onnx params to devices
sh: 1: dot: not found
[PLOT] Unable to run the following command:
	=> dot -T pdf ./tmp.dot >./smodel.pdf


In [4]:
# aux functions
def load_image(filename):
    augs = ecvl.SequentialAugmentationContainer([
                ecvl.AugToFloat32(divisor=255.0),
                ecvl.AugNormalize([0.48197903, 0.48197903, 0.48197903], [0.26261734, 0.26261734, 0.26261734]),
                ecvl.AugResizeDim([300, 300]),
                ecvl.AugCenterCrop([224, 224]),  # to do: test random crop also in prediction
                ])
    img = ecvl.ImRead(filename, flags=None)  # , flags=ecvl.ImReadMode.GRAYSCALE)
    ecvl.RearrangeChannels(img, img, "xyc")
    augs.Apply(img)
    ecvl.RearrangeChannels(img, img, "cxy")
    return img

def label_list(lab_str):
    return [int(s) for s in lab_str.split(";")]

In [5]:
split_ids = train_ids + valid_ids
tsv_name = "thresholds_trva.tsv"

split_ids = test_ids
tsv_name = "thresholds_te.tsv"

predictions = np.empty( (semantic_dim, len(split_ids)) )
targets = np.empty_like(predictions)
predictions.fill(np.nan)
targets.fill(np.nan)

for pos, id in enumerate(split_ids):
    if ((pos+1) % 1000) == 0:
        print(f"# {pos+1}/{len(split_ids)}")
    elif ((pos+1) % 100) == 0:
        print(".", end="")
    img = load_image(join(img_fld, id))
    # img = ecvl.ImageToTensor(img)
    a = np.expand_dims(np.array(img, copy=False), axis=0)  # add batch dimension
    eddl.forward(cnn, [Tensor.fromarray(a)])

    layer = eddl.getLayer(cnn, "cnn_out")
    p = np.array(eddl.getOutput(layer), copy=False)
    predictions[:, pos] = p
    
    labels = np.zeros_like(p)
    lab_list = label_list(ds.loc[id].labels)
    # print(f"{pos}: n_labels: {len(lab_list)}")
    for l in lab_list:
        labels[0, l] = 1
    targets[:, pos] = labels
    
    # n_labels = nnz(labels)
    # if n_labels != len(lab_list):
    #     print(pos)
    #     print(lab_list)
    #     print(labels)
    
    # print(labels)
    # if pos == 50:
    #     print("dev mode, breaking at 50")
    #     break
    #print(p.shape)
    # cnn_out_in = eddl.Input([semantic_dim], name="in_semantic_features")
# for i, split in enumerate([train_ids, valid_ids, test_ids]):

print("cell done.")

.........# 1000/1413
....cell done.


In [14]:
with open( join(exp_fld, "auto_index2lab.json"), "r") as fin:
    i2l = json.load(fin)

for i, l in i2l.items():
    print(f"{i}: {l}")

from sklearn.metrics import roc_curve, accuracy_score
import matplotlib.pyplot as plt
import math
from numpy import count_nonzero as nnz

auc_acc = []
J_acc = []

auc_ths = []
J_ths = []
label_names = []

for i in range(predictions.shape[0]):  # for i over labels
    y_est = predictions[i, :]  # these are the predictions made by the cnn
    y = targets[i, :]  # there are the true target values 
    fpr, tpr, thresholds = roc_curve(y, y_est)  # check threshoold
    
    # auc
    crit = np.sqrt(tpr * (1 - fpr) )
    m2 = thresholds[np.argmax(crit)]
    auc_ths.append(m2)
    y_est1 = np.where(y_est > m2, 1, 0)
    auc_acc.append(accuracy_score(y_est1, y) * 100)

    # Youden
    J = tpr - fpr
    ij = np.argmax(J)
    th_j = thresholds[ij]
    J_ths.append(th_j)
    y_est4 = np.where(y_est > th_j, 1, 0)
    J_acc.append(accuracy_score(y_est4, y)*100)
    label_names.append(i2l[str(i)])

    print(f"*** label {i}: {i2l[str(i)]}")
    print(f"  - auc1:", auc_acc[-1])
    print(f"  - Youden:", J_acc[-1])
    print(f"auc {m2}, youden {th_j}")

0: 
1: aorta
2: atelectasis
3: calcified granuloma
4: cardiomegaly
5: deformity
6: degenerative change
7: diaphragm
8: emphysema
9: granuloma
10: granulomatous disease
11: hiatal hernia
12: infiltrates
13: misc
14: nodule
15: normal
16: opacity
17: osteophyte
18: pleural effusion
19: pneumonia
20: pulmonary edema
21: pulmonary emphysema
22: scarring
23: sternotomy
24: thoracic aorta
*** label 0: 
  - auc1: 37.08421797593772
  - Youden: 19.10828025477707
auc 0.002322520362213254, youden 0.000525999057572335
*** label 1: aorta
  - auc1: 50.03538570417552
  - Youden: 17.8343949044586
auc 0.0025211507454514503, youden 0.001001763972453773
*** label 2: atelectasis
  - auc1: 46.355272469922156
  - Youden: 32.41330502477
auc 0.00887772161513567, youden 0.003619846422225237
*** label 3: calcified granuloma
  - auc1: 58.244869072894545
  - Youden: 58.244869072894545
auc 0.005392505321651697, youden 0.005392505321651697
*** label 4: cardiomegaly
  - auc1: 60.438782731776364
  - Youden: 60.438782

In [12]:
from sklearn.metrics import roc_curve, accuracy_score
import matplotlib.pyplot as plt
import math
from numpy import count_nonzero as nnz

auc_acc = []
J_acc = []

auc_ths = []
J_ths = []

label_names = []

for i in range(predictions.shape[0]):  # for i over labels
    y_est = predictions[i, :]  # these are the predictions made by the cnn
    y = targets[i, :]  # there are the true target values 
    fpr, tpr, thresholds = roc_curve(y, y_est)  # check threshoold
    
    # auc
    crit = np.sqrt(tpr * (1 - fpr) )
    m2 = thresholds[np.argmax(crit)]
    auc_ths.append(m2)
    y_est1 = np.where(y_est > m2, 1, 0)
    auc_acc.append(accuracy_score(y, y_est1) * 100)

    # Youden
    J = tpr - fpr
    ij = np.argmax(J)
    th_j = thresholds[ij]
    J_ths.append(th_j)
    y_est4 = np.where(y_est > th_j, 1, 0)
    J_acc.append(accuracy_score(y, y_est4)*100)
    

    print(f"*** label {i}")
    print(f"  - auc1:", auc_acc[-1])
    print(f"  - Youden:", J_acc[-1])
    print(f"auc {m2}, youden {th_j}")

*** label 0
  - auc1: 37.08421797593772
  - Youden: 19.10828025477707
auc 0.002322520362213254, youden 0.000525999057572335
*** label 1
  - auc1: 50.03538570417552
  - Youden: 17.8343949044586
auc 0.0025211507454514503, youden 0.001001763972453773
*** label 2
  - auc1: 46.355272469922156
  - Youden: 32.41330502477
auc 0.00887772161513567, youden 0.003619846422225237
*** label 3
  - auc1: 58.244869072894545
  - Youden: 58.244869072894545
auc 0.005392505321651697, youden 0.005392505321651697
*** label 4
  - auc1: 60.438782731776364
  - Youden: 60.438782731776364
auc 0.026809075847268105, youden 0.026809075847268105
*** label 5
  - auc1: 47.77070063694268
  - Youden: 29.582448690728945
auc 0.019269632175564766, youden 0.011968121863901615
*** label 6
  - auc1: 51.238499646142955
  - Youden: 41.8966737438075
auc 0.01135784201323986, youden 0.006883059628307819
*** label 7
  - auc1: 56.475583864118896
  - Youden: 86.05803255484784
auc 0.0007500104838982224, youden 0.002519395900890231
*** l

In [13]:
d = {"label": label_names, "auc_t": auc_ths, "auc_acc": auc_acc, "youden_t": J_ths, "youden_acc": J_acc}
print(len(label_names))
df = pd.DataFrame.from_dict(d)
# df["different"] = df.auc_t != df.youden_t
display(df)
print(df[["auc_acc", "youden_acc"]].mean(axis=0))


fn = join(exp_fld, tsv_name)
print(fn)
# fn = join(exp_fld, "thresholds_te.tsv")
# df.to_csv(fn, sep="\t", index=False ) 
print("saved:", fn)

df2 = pd.read_csv(fn, sep="\t").set_index("label")
display(df2)

ValueError: All arrays must be of the same length