In [None]:
import pandas as pd
from posixpath import join
import numpy as np
import random
from numpy import count_nonzero as nnz
from collections import defaultdict

# Base directory of the experiment.
# NOTE(review): the ".../experiments_eddl/" prefix appears twice in this literal
# (".../experiments_eddl//mnt/datasets/..."), which looks like an accidental double
# concatenation — confirm the intended directory before relying on this path.
base_path = "/mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl//mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl/eddl_ext_CNN_20tags"
 

### RAW
Inspect the `reports_raw.tsv` file, which contains the raw XML content.

In [None]:
# Load the raw reports table (tab-separated); na_filter=False turns off pandas'
# missing-value detection when parsing.
ds = pd.read_csv(
    "/mnt/datasets/uc5/UC5_pipeline_forked/experiments_eddl/reports_raw.tsv",
    sep="\t",
    na_filter=False,
)
print(f"raw reports, shape {ds.shape}")
print("** columns:")
for column_name in ds.columns:
    print(f" - {column_name}")

# Discard the reports that have no images attached.
no_image_mask = ds.n_images == 0
print(f"*** number of reports without images {nnz(no_image_mask)}")
ds = ds.loc[~no_image_mask].reset_index()

num_reports = ds.shape[0]
num_images = ds.n_images.sum()
print(f"reports {num_reports}, images {num_images}")

# Number of reports for each count of images, major MeSH terms and auto terms.
for count_col in ("n_images", "n_major_mesh", "n_auto_term"):
    display(ds[["id", count_col]].groupby([count_col]).agg(["count"]))

# Drop the temporaries of this cell; later cells do not need them.
del no_image_mask, count_col

<h3>Empty auto terms</h3>There are many reports with an empty set of auto terms. Check how many of them correspond to normal reports.

In [None]:
# Reports whose major MeSH field is exactly "normal".
normal_ids = ds.major_mesh == "normal"
print(f"normal reports (mesh): {nnz(normal_ids)}")

# Reports with no auto terms, and the subset of them that MeSH marks as normal.
empty_auto_ids = ds.n_auto_term == 0
normal_auto_ids = empty_auto_ids & normal_ids
n_normal_auto = nnz(normal_auto_ids)
n_empty_auto = nnz(empty_auto_ids)
print(f"empty auto terms that are normal according to mesh: {n_normal_auto}/{n_empty_auto}")

# From here on, empty_auto_ids marks only the NON-normal reports without auto terms
# (removing normal_auto_ids from empty_auto_ids is the same as removing normal_ids).
empty_auto_ids = empty_auto_ids & ~normal_ids
n_empty_not_normal = nnz(empty_auto_ids)
print(f"empty auto terms that are not normal according to mesh: {n_empty_not_normal}")


### combos
unique combinations of terms (combos), on unprocessed, raw data

In [None]:

# Print five randomly chosen raw values of each term column (major MeSH first, then auto terms).
for header, column in (
    ("major mesh, sample:\n\t", ds.major_mesh),
    ("auto term, sample:\n\t", ds.auto_term),
):
    print(header, random.sample(column.tolist(), 5))

def get_unique_set(col):
    """Collect the distinct terms found in an iterable of ';'-separated strings.

    Every entry of ``col`` is split on ';' and each piece is stripped of
    surrounding whitespace and lower-cased before being added to the result.
    Entries that are the empty string are skipped.

    Returns:
        set[str]: the normalized pieces.
    """
    return {
        piece.strip().lower()
        for entry in col
        if len(entry) != 0
        for piece in entry.split(";")
    }

# Distinct (normalized) terms appearing in the two raw term columns.
u_mesh_combos = get_unique_set(ds.major_mesh)
u_auto_combos = get_unique_set(ds.auto_term)

print(f"unique mesh combos: {len(u_mesh_combos)}")
print(f"unique auto combos: {len(u_auto_combos)}")

# random.sample() requires a sequence: passing a set was deprecated in Python 3.9 and
# raises TypeError from Python 3.11. sorted() turns each set into a list; min() keeps
# the sample size valid when fewer than 10 terms are available.
print("unique mesh, sample:", random.sample(sorted(u_mesh_combos), min(10, len(u_mesh_combos))))
print("unique auto, sample:", random.sample(sorted(u_auto_combos), min(10, len(u_auto_combos))))

# Terms of the major MeSH column for the non-normal reports that have no auto terms.
mesh_in_empty_auto = get_unique_set(ds.major_mesh.loc[empty_auto_ids])
print(f"auto tags associated to non-normal empty auto terms: {len(mesh_in_empty_auto)} terms")



<h3>MeSH ONLY</h3>
From now on, only mesh terms. 

Terms are structured as follows:
HEADING/subheading;HEADING/subheading/...

Simplify the terms: keep only the HEADING part and remove the SUBHEADINGS:
HEADING;HEADING;
<b>Some heading terms are repeated: in the raw files, repeated headings are specialized (and thus made unique) by their subheadings.</b>

In [None]:
# there are no empty major mesh fields
# expected syntax:
import re
# Show three randomly chosen raw major-MeSH strings as examples of the syntax parsed below.
print("sample of mesh terms")
print(random.sample(ds.major_mesh.tolist(), 3))

# TODO
# some headings terms appear more than once, ex:
#      Calcified Granuloma/mediastinum/large;Calcified Granuloma/lung/hilum/right/large --> ['calcified granuloma', 'calcified granuloma']
def simplify_terms(terms):
    """Reduce a raw MeSH string to its unique, normalized headings.

    The input is a ';'-separated list of ``heading/subheading/...`` groups.
    For each group only the heading (the text before the first '/') is kept;
    it is lower-cased, stripped, and runs of whitespace inside multi-word
    headings are collapsed to a single space.

    Args:
        terms (str): raw field, e.g. ``"Opacity/lung;Cardiomegaly/mild"``.

    Returns:
        list[str]: the distinct headings in alphabetical order. Sorting makes
        the result deterministic, so the same set of headings always yields the
        same list (and the same ';'-joined string). Empty headings, produced by
        an empty input or a trailing ';', are discarded.
    """
    headings = set()
    for group in terms.split(";"):
        # heading/subheadings -> heading, with internal whitespace normalized
        heading = re.sub(r"\s+", " ", group.split("/")[0]).strip().lower()
        if heading:
            headings.add(heading)
    return sorted(headings)

# One list of simplified headings per report.
simplified_mesh = ds.major_mesh.apply(simplify_terms)

ds["labels"] = simplified_mesh
# labels_s: the same labels joined into a single ';'-separated string
ds["labels_s"] = simplified_mesh.apply(";".join)
ds["n_labels"] = ds.labels.apply(len)

# The two tables below can differ: a raw field may contain the same heading more than
# once, each time with different subheadings (all counted by n_major_mesh), whereas
# n_labels is computed from the simplified headings.
gnl = ds[["id", "n_labels"]].groupby(["n_labels"]).agg(["count"])
mm_g = ds[["id", "n_major_mesh"]].groupby(["n_major_mesh"]).agg(["count"])
display(gnl)
display(mm_g)

print("number of unique combinations of terms:", len(ds.labels_s.unique()))

# Vocabulary of all headings that occur in at least one report.
u_mesh = {heading for report_labels in ds.labels for heading in report_labels}

print("unique mesh terms: ", len(u_mesh))
del mm_g, gnl


In [None]:
# For every term, count the reports that carry it and the images of those reports.
occ_reps = {}
occ_imgs = {}
for ut in u_mesh:
    carries_term = ds.labels.apply(lambda labels: ut in labels)
    occ_reps[ut] = nnz(carries_term)
    occ_imgs[ut] = ds.n_images[carries_term].sum()

# Candidate thresholds: a term is kept when its image count is strictly greater.
thresholds = [90, 100, 110, 120, 130]

# th2num[i]: {term: image count} restricted to the terms kept at thresholds[i]
th2num = [
    {term: count for term, count in occ_imgs.items() if count > threshold}
    for threshold in thresholds
]

for threshold, d in zip(thresholds, th2num):
    print(f"threshold {threshold}, number of tags {len(d)}")
    # labels of each report restricted to the kept terms, and how many survive
    kept_labels = ds.labels.apply(lambda labels: [lab for lab in labels if lab in d])
    n_kept = kept_labels.apply(len)
    print(n_kept.value_counts())
    iii = n_kept > 1
    jjj = n_kept == 0
    print(f"\t {nnz(iii)}/{len(iii)} with multiple tags")
    print(f"\t {nnz(jjj)} without tags")

# iii / jjj still hold the masks computed for the last threshold
print(ds.labels[iii])
print(ds.labels[jjj])






In [None]:
# Build binary occurrence matrices: one with a row per report, one with a row per image.

# Column order: "normal" first, then the remaining terms in alphabetical order.
terms = ["normal"] + [l for l in sorted(u_mesh) if l != "normal"]

print("normal" in terms)

assert "normal" in terms
assert "normal" not in terms[1:]

rep_matrix = []  # one row per report
matrix = []      # one row per image: the report's encoding repeated for each of its images
index = []       # image filename for each row of `matrix`

for t in ds.itertuples():
    enc = [term in t.labels for term in terms]
    rep_matrix.append(enc)
    # image_filename holds the ';'-separated filenames of the report's images
    for fn in t.image_filename.split(";"):
        index.append(fn)
        matrix.append(enc)

rep_ds = pd.DataFrame(data=rep_matrix, columns=terms)
rep_ds["id"] = ds.id
print(f"dataframe, index is report id: {rep_ds.shape}")
display(rep_ds)

# FIX: the filenames were previously assigned to rep_ds (one row per report), which
# produced a misaligned, truncated column. They belong to img_ds (one row per image).
# They are stored as the index rather than a column so that the numeric sums computed
# on img_ds keep working.
# NOTE(review): assumes image filenames are unique across reports — confirm.
img_ds = pd.DataFrame(
    data=matrix,
    columns=terms,
    index=pd.Index(index, name="image_filename"),
)
print(f"dataframe, index is image_filename: {img_ds.shape}")
display(img_ds)

# Number of images carrying each term.
frequencies = img_ds.sum(axis=0)
print(frequencies)


In [None]:
# use img_ds
def label_imbalance(df):
    """Compute per-label imbalance measures for a binary label matrix.

    Args:
        df: DataFrame with one column per label; column sums are taken as the
            number of positive rows of each label.

    Returns:
        DataFrame indexed by label with columns:
          - n1: column sum (positives), n0: number of rows minus n1;
          - IRLbl: largest n1 divided by the label's n1;
          - ImR: n0 / n1 (written assuming 1 is the minority class);
          - m_IRLbl, m_ImR: means of IRLbl and ImR, repeated on every row.
    """
    positives = df.sum(axis=0)
    negatives = df.shape[0] - positives

    lds = pd.concat([positives, negatives], axis=1, keys=["n1", "n0"])

    # per-label measures first, then their means broadcast to every row
    lds = lds.assign(
        IRLbl=lambda d: 1 / d["n1"] * d["n1"].max(axis=0),
        ImR=lambda d: d["n0"] / d["n1"],
    )
    lds = lds.assign(
        m_IRLbl=lds["IRLbl"].mean(),
        m_ImR=lds["ImR"].mean(),
    )
    return lds

# Imbalance measures over all labels of the per-image matrix.
lab_imb = label_imbalance(img_ds)
display(lab_imb)

def drop_columns(df, columns):
    """Remove label columns and the rows left without any label.

    Args:
        df: binary label matrix (one column per label).
        columns: labels (column names) to remove.

    Returns:
        A new DataFrame without ``columns`` and without the rows whose
        remaining labels sum to zero; ``df`` itself is not modified.
    """
    remaining = df.drop(columns=columns)
    n_labels = remaining.sum(axis=1)
    # Select rows with a positional boolean mask instead of dropping by index label:
    # with a non-unique index, a label-based drop would also remove every other row
    # sharing the label of an empty row.
    keep = ~(n_labels == 0).to_numpy()
    return remaining.loc[keep]

# Same measures after removing the "normal" column and the images left without any label.
img_ds_others = drop_columns(img_ds, ["normal"])
lab_imb_others = label_imbalance(img_ds_others)
display(lab_imb_others)



In [None]:


import math

# Label imbalance computed directly from the reports table.
# lab2img: label -> filenames of the images of every report carrying that label
lab2img = defaultdict(list)
n_images = 0
for row in ds.itertuples():
    images = row.image_filename.split(";")
    n_images += len(images)
    for l in row.labels:
        lab2img[l].extend(images)

print(f"total number of images {n_images}, labels {len(lab2img)}")

# One record per label: images with the label (n_1) and without it (n_0).
records = [
    {'label': k, 'n_1': len(v), 'n_0': (n_images - len(v))}
    for k, v in lab2img.items()
]
df = pd.DataFrame.from_records(records)

max_n_1 = df.n_1.max()
df["IRLbl"] = 1 / df.n_1 * max_n_1
df["ImR"] = df.n_0 / df.n_1
print(max_n_1)
display(df)

n_labels = df.shape[0]

# Means of the two measures over the labels.
meanIR = df.IRLbl.sum() / n_labels
print(meanIR)

meanImR = df.ImR.sum() / n_labels

# Coefficient of variation of each measure: sqrt(sum((x - mean)^2 / (n - 1))) / mean.
spread_IR = math.sqrt(((df.IRLbl - meanIR)**2 / (n_labels - 1)).sum())
spread_ImR = math.sqrt(((df.ImR - meanImR)**2 / (n_labels - 1)).sum())
CVIR = 1 / meanIR * spread_IR
CVImR = 1 / meanImR * spread_ImR




In [None]:
# Peek at the first ten values of the id column.
print(ds.id[:10])

In [None]:
# TODO: substitute synonyms in the vocabulary

pd.set_option('display.max_rows', 20)

voc = pd.read_csv(join("text", "radiology_vocabulary_final.csv"), na_filter=False)
print("radiology vocabulary: ", voc.shape)
print(voc.columns)

# Synonyms of each vocabulary row: the string cells from the 4th column onwards,
# stripped and lower-cased. Cells that are empty after stripping are discarded, so
# whitespace-only cells do not become an empty synonym.
syns_s = voc.apply(
    lambda row: [
        syn
        for syn in (v.strip().lower() for v in row.iloc[3:] if isinstance(v, str))
        if syn
    ],
    axis=1,
)

term2syns = dict()  # normalized term -> list of its synonyms
syn2term = dict()   # synonym -> normalized term

for term, syns in zip(voc["Term"], syns_s):
    # Use the same normalized key in both maps, so that every value of syn2term is
    # also a key of term2syns (the term was previously lower-cased but not stripped here).
    key = term.lower().strip()
    term2syns[key] = syns
    for syn in syns:
        syn2term[syn] = key

# Show a few random entries; min() keeps the sample size valid for small vocabularies.
random_keys = random.sample(list(term2syns.keys()), min(5, len(term2syns)))
for key in random_keys:
    print(f" - {key}: {term2syns[key]}")

# Check which MeSH headings are not present in the vocabulary loaded above.
missing_terms = [t for t in sorted(u_mesh) if t not in term2syns]

# The expected missing terms are two: "no indexing" and "normal".
print(f"missing terms: {len(missing_terms)}")
for t in missing_terms:
    print(t)

