In [54]:
import pandas as pd
import json
import re
import numpy as np
import os

Results were collected between 11.06. -- 23.07.2021

Prior to parsing, spurious responses were discarded on the basis of:

* All extreme values
* Inappropriate comments
* Non-target user profile (userProfiling_position == 'other')

## Data parsing

In [55]:
# Loading Data
RESULTS_FILEPATH = "data/20210726170044_results.json"
figure_save_dir = os.path.splitext(RESULTS_FILEPATH)[0]

ids_to_exclude = [
    "1624013360083", #inappropriate comment, extreme values
    "1623688924533", #test input
    ]

with open(RESULTS_FILEPATH) as f:
    # remove image0/image1 vars since it prevents proper df merging
    text = f.read()
    text = re.sub(r"_image[0-3]", "", text)
    text = re.sub(r"user_profiling_", "user_", text)
    text = re.sub(r"user_", "userProfiling_", text)
    lines = text.splitlines()
    data = []
    for line in lines[1:-1]:
        entry = json.loads(line)
        entry["id"] = str(round(entry["id"]))
        if "userProfiling_aiFamiliarity" not in entry['userProfiling_'].keys():
            entry["userProfiling_"]["userProfiling_aiFamiliarity"]="missing"
        data.append(entry)
    df = pd.json_normalize(data)
    df.drop(labels=["__v", "_id.$oid"], axis=1, inplace=True)

renamed = [name.split("_.")[-1] for name in df.columns]
col_rename = {i: j for i, j in zip(df.columns, renamed)}
df = df.rename(columns=col_rename)
df.drop_duplicates(subset=['id'], inplace=True)

df = df[~df.id.isin(ids_to_exclude)]
df = df[df.userProfiling_position != "other"]
df.set_index("id", inplace=True)
df



Unnamed: 0_level_0,userProfiling_age,userProfiling_position,userProfiling_useOfAI,userProfiling_useOfDP,userProfiling_mlFamiliarity,userProfiling_aiFamiliarity,saliencyMaps_globalSaliency_understandability,saliencyMaps_globalSaliency_usability,saliencyMaps_globalSaliency_informativeness,saliencyMaps_globalSaliency_value,...,trustScores_borderlineCases_value,userProfiling_useOfAI_details,saliencyMaps_globalSaliency_comments,saliencyMaps_localSaliency_comments,conceptAttribution_textAttributes_comments,trustScores_borderlineCases_comments,userProfiling_comments,counterfactuals_twoAxisCounterfactuals_comments,counterfactuals_prototypeInterpolation_comments,userProfiling_position-Comment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1623319499818,30-40,Assisting physician (Assistenzarzt) for pathology/neuropathology,in routine diagnostics,in routine diagnostics,1,missing,5,5,6,6,...,6,,,,,,,,,
1623330321488,41-50,Researcher in pathology/neuropathology,in research,in research,3,missing,7,7,7,7,...,3,"from MindPeak (BreastIHC), from VMscope (Cognition Master)","There is one cell slightly to the right of the center that is labeled as positive and is quite darkly stained by IHC, but does not show up on the saliency map, which is a bit surprising. It looks like an endothelial cell.",Per-cell saliency is probably too much detail.,"This is excellent but I think there could be quite a lot of additional factors, so this would need some supervision.","High confidence should provide examples of both classes (pos. and neg.), no?",,,,
1623391402856,30-40,Researcher in pathology/neuropathology,[in research],[in research],7,5,6,3,4,5,...,6,,,,,,,,,
1623391916479,41-50,Technician (MTA) for pathology/neuropathology,[in routine diagnostics],[in research],7,4,5,7,3,6,...,6,,,,,,,,,
1623439901422,30-40,Trainee (Assistenzarzt) in pathology/neuropathology,[in research],[in research],2,5,4,5,6,4,...,5,QuPath immunohistochemistry positive cell detection,,,,,,,,
1623441394465,30-40,Consultant (Facharzt) for pathology/neuropathology,[in research],[in research],7,7,6,5,5,6,...,5,Aiforia,,,,,,,,
1623655000204,51-60,Researcher in pathology/neuropathology,[none],[none],2,3,5,5,5,5,...,5,,,,,,,,,
1623674498302,41-50,Consultant (Facharzt) for pathology/neuropathology,[none],[in routine diagnostics],4,5,1,1,1,1,...,2,,,,,Confidence for what? positive or negative or both?,,,,
1623676868025,41-50,Consultant (Facharzt) for pathology/neuropathology,[none],[none],5,5,3,3,2,2,...,6,,,,,,,,,
1623678209878,51-60,Consultant (Facharzt) for pathology/neuropathology,"[in routine diagnostics, in research]","[in research, in routine diagnostics]",4,2,6,6,5,5,...,4,Roche Diagnostics,,,,,,,,


In [56]:
user_df = df[
    [
        "userProfiling_age",
        "userProfiling_position",
        "userProfiling_useOfDP",
        "userProfiling_useOfAI",
        "userProfiling_useOfAI_details",
        "userProfiling_mlFamiliarity",
        "userProfiling_aiFamiliarity"
    ]
]

fields = ["Understandability", "Usability", "Informativeness", "Value"]

instance_identifiers = {
    "Counterfactuals (One-axis)": "counterfactuals_prototypeInterpolation",
    "Counterfactuals (Two-axis)": "counterfactuals_twoAxisCounterfactuals",
    "Saliency Map (Local)": "saliencyMaps_localSaliency",
    "Saliency Map (Global)": "saliencyMaps_globalSaliency",
    "Concept Attribution": "conceptAttribution_textAttributes",
    "Prototypes": "prototypes_prototypes",
    "Trust Scores": "trustScores_borderlineCases",
}

result_dataframes = dict()
comments = dict()
for name, id_ in instance_identifiers.items():
    result_dataframes[name] = df[[f"{id_}_{field.lower()}" for field in fields]]
    result_dataframes[name].columns = fields

    comments_column = f"{id_}_comments"
    if comments_column in df.columns:
        comments[name] = df[[comments_column]].dropna().values.squeeze(axis=1).tolist()
    else:
        comments[name] = []

In [57]:
df1 = pd.DataFrame()

for name, df in result_dataframes.items():
    df1[name] = df[df > 4].count()
df1 = df1.transpose()
for field in fields:
    df1 = df1.sort_values(by=field, ascending=False)

index_sorted = df1.index
df1
    

Unnamed: 0,Understandability,Usability,Informativeness,Value
Trust Scores,19,14,18,17
Counterfactuals (One-axis),19,16,16,16
Concept Attribution,17,17,13,15
Counterfactuals (Two-axis),15,14,14,14
Prototypes,20,14,13,12
Saliency Map (Global),13,12,11,12
Saliency Map (Local),12,14,9,9


In [58]:
comments

{'Counterfactuals (One-axis)': ["Like previous, initial reaction is that interpolation is useful but doesn't give the full picture of possible negatives. Perhaps clicking around each cell and seeing the interpolation for each would feel more useful",
  'What is a counterfactual? '],
 'Counterfactuals (Two-axis)': ['It seems there may be many counterfactual examples that are more closely related to the positive example which could be useful for understanding the nuances and building trust',
  'Do not understand this question'],
 'Saliency Map (Local)': ['Per-cell saliency is probably too much detail. '],
 'Saliency Map (Global)': ['There is one cell slightly to the right of the center that is labeled as positive and is quite darkly stained by IHC, but does not show up on the saliency map, which is a bit surprising. It looks like an endothelial cell. '],
 'Concept Attribution': ['This is excellent but I think there could be quite a lot of additional factors, so this would need some super

## Overall comparison

## Stacked diverging bar charts

In [59]:
from typing import Optional
from collections import Counter

import plotly
import plotly.graph_objects as go
import plotly.express as px

from PIL import Image
import io
from IPython.display import display

def plotly_figure_to_image(fig, extension: str = 'png', **kwargs):
    img_bytes = fig.to_image(format=extension, **kwargs)
    return Image.open(io.BytesIO(img_bytes))

def save_image(image, name: str, save_dir: str = "images", extension: str="png"):
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    filename = "".join(i for i in name if i not in "\\/:*?<>|()- ")
    filepath = f"{save_dir}/{filename}.{extension}"
    image.save(filepath)

def stackedBarChartDF(
    df: pd.DataFrame,
    title: str,
    palette: list,
    diverging: bool = False,
    attributes: list = ["Understandability", "Usability", "Informativeness", "Value"],
    labels: Optional[list] = None,
    save_fig: bool = False,
    save_name: Optional[str] = None,
    save_dir: str = "images",
    width: int = 1200,
    height: int = 400,
    font: dict = {'size' : 14},
    web_display: bool = False,
    legend: dict=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.4,
            xanchor="center",
            x=0.5),
    showlegend: bool=False,
    **kwargs
):
    counters = [Counter(df[attribute]) for attribute in attributes]

    fig = go.Figure()  # type: ignore

    category_order = [
        "Strongly<br>disagree",
        "Disagree<br>",
        "Slightly<br>disagree",
        "Neutral<br>",
        "Slightly<br>agree",
        "Agree<br>",
        "Strongly<br>agree",
    ]

    number_order = [1, 2, 3, 4, 5, 6, 7]

    labels = labels if labels else attributes

    def add_bar_trace(num, transform, showlegend: bool = True):
        rating = number_order[num]
        counts = [counter[rating] for counter in counters]
        total_count = sum(counters[0].values())

        xvals = [transform(count) / total_count for count in counts]

        fig.add_trace(
            go.Bar(
                x=xvals,
                y=labels,
                # text=counts,
                # textposition='inside',
                orientation="h",
                name=category_order[num],
                width=0.8,
                marker_color=palette[num],
                legendgroup=category_order[num],
                showlegend=False,
                
            )  # type: ignore
        )

    if diverging:
        # negative side
        add_bar_trace(3, lambda x: x * -0.5)
        for num in reversed(range(0, 3)):
            add_bar_trace(num, lambda x: x * -1)

        # positive side
        add_bar_trace(3, lambda x: x * 0.5, showlegend=False)
        for num in range(4, 7):
            add_bar_trace(num, lambda x: x)
    else:
        for num in reversed(range(0,7)):
            add_bar_trace(num, lambda x: x)
    

    for num in reversed(range(0, 7)):
            fig.add_trace(
                go.Bar(
                    x=[0] * 7,
                    y=labels,
                    orientation="h",
                    name=category_order[num],
                    marker_color=palette[num],
                    legendgroup=category_order[num]
                )  # type: ignore
            )

    fig.update_layout(
        barmode="relative" if diverging else "stack",
        yaxis_autorange="reversed",
        # title=dict(
        #     text=title,
        #     x=0.0,
        #     y=0.92
        #     ),
        xaxis={"tick0": 0, "tickformat": "%", "range": [-1, 1]},
        legend=legend,
        showlegend=showlegend,
        font=font,
        autosize=False,
        **kwargs
    )

    if not diverging:
        fig.update_layout(
            xaxis = {
                "range" : [0, 1],
                'side': 'top',
                "color" : palette[-1]
                }, 
            xaxis2= {
                'anchor': 'y', 
                'overlaying': 'x', 
                'side': 'bottom', 
                "tickformat": "%", 
                "range": [1, 0], 
                "color" : palette[0]
                })
        fig.data[-1].update(xaxis='x2')

    im = plotly_figure_to_image(fig, width=width, height=height, validate=True)
    
    if web_display: 
        fig.show()
    else:
        display(im)

    if save_fig:
        save_image(im, save_name if save_name else title, save_dir)

palette = [plotly.colors.diverging.RdBu[n] for n in [2, 3, 4, 5, 6, 7, 8]]  # type: ignore
space = " " * 4
labels = [
    f"I find the explanation{space}<br>intuitively understandable{space}",
    f"The explanation helps me to understand{space}<br>factors relevant to the algorithm{space}",
    f"The explanation helps me to decide whether{space}<br> I can trust the generated annotations{space}",
    f"The explanation provides me with{space}<br>valuable information for my work{space}",
]

counters = []

# for name, df in result_dataframes.items():
#     fig, counter = stackedBarChartDF(
#         df,
#         name,
#         palette,
#         labels=labels,
#         web_display=True,
#         save_fig=False,
#         save_dir=figure_save_dir,
#         font={'size':14},
#         width=1200,
#         height=400
#     )

#     px.imshow(df.corr(method='spearman'), zmin=0, width = 500, height=300).show()

#     for comment in comments[name]:
#         print(comment)
#     counters.append(counter) 

In [63]:
for name, df in result_dataframes.items():
    df.columns = labels
    px.imshow(
        df.corr(method=method),
        title=f"{method} correlation between responses for {name}",
        zmin=0).show()

    for comment in comments[name]:
        print(comment)

Like previous, initial reaction is that interpolation is useful but doesn't give the full picture of possible negatives. Perhaps clicking around each cell and seeing the interpolation for each would feel more useful
What is a counterfactual? 


It seems there may be many counterfactual examples that are more closely related to the positive example which could be useful for understanding the nuances and building trust
Do not understand this question


Per-cell saliency is probably too much detail. 


There is one cell slightly to the right of the center that is labeled as positive and is quite darkly stained by IHC, but does not show up on the saliency map, which is a bit surprising. It looks like an endothelial cell. 


This is excellent but I think there could be quite a lot of additional factors, so this would need some supervision. 


High confidence should provide examples of both classes (pos. and neg.), no? 
Confidence for what? positive or negative or both?
I felt I needed the pos or neg labels in addition to the confidence


In [61]:
import plotly.express as px

df2 = pd.DataFrame()

category_order = [
        "Strongly<br>disagree",
        "Disagree<br>",
        "Slightly<br>disagree",
        "Neutral<br>",
        "Slightly<br>agree",
        "Agree<br>",
        "Strongly<br>agree",
    ]

number_order = [1, 2, 3, 4, 5, 6, 7]
response_dict = dict(zip(number_order, category_order))

for name in index_sorted:
    df = result_dataframes[name]
    df2[name] = df.median()

px.imshow(
    df2,
    title="median responses from 1=strongly disagree to 7=strongly agree",
    zmin=1, 
    zmax= 7, 
    color_continuous_scale=palette
    )

In [62]:
index=0
width = 1900
font={'size':24,'family':'Times New Roman'}
labels = [
    f"I find the explanation intuitively understandable{space}",
    f"The explanation helps me to understand factors relevant to the algorithm{space}",
    f"The explanation helps me to decide whether I can trust the generated annotations{space}",
    f"The explanation provides me with valuable information for my work{space}",
]

for index, name in enumerate(index_sorted_by_usability[:-1]):
    stackedBarChartDF(
        result_dataframes[name],
        name,
        palette,
        labels=labels,
        showlegend=False,
        web_display=False,
        save_fig=True,
        save_dir=figure_save_dir,
        save_name=f"{index}_{name}",
        font=font,
        width=width,
        height=300,
        margin=dict(
            r=2,
            l=0,
            t=10,
            b=10,
        )
    )

final_index = index_sorted_by_usability[-1]
stackedBarChartDF(
        result_dataframes[final_index],
        final_index,
        palette,
        labels=labels,
        showlegend=True,
        web_display=False,
        save_fig=True,
        save_dir=figure_save_dir,
        save_name=f"{index+1}_{final_index}",
        font=font,
        width=width,
        height=380,
        margin=dict(
            r=2,
            l=0,
            t=10,
            b=80,
        ),
        legend=dict(
            font=font,
            orientation="h",
            yanchor="bottom",
            y=-0.5,
            xanchor="center",
            x=0.5,
            valign="top",
            itemwidth=40),
    )


KeyError: 'Understandability'

In [None]:
labels = [
    f"I find the explanation intuitively understandable",
    f"The explanation helps me to understand factors relevant to the algorithm",
    f"The explanation helps me to decide whether I can trust the generated annotations",
    f"The explanation provides me with valuable information for my work",
]

for name, df in result_dataframes.items():
    print(name)
    for index, column in enumerate(df.columns):
        percentage_agree = df[df[column] > 4][column].count() / df[column].count()
        percentage_stronglyagree = df[df[column] > 5][column].count() / df[column].count()

        percentage_disagree = df[df[column] < 4][column].count() / df[column].count()
        percentage_stronglydisagree = df[df[column] < 3][column].count() / df[column].count()
        print(
            '\t', labels[index], "\n\t\t", 
            '{:.0%}'.format(percentage_stronglydisagree), " disagree or strongly disagree", " | ",
            '{:.0%}'.format(percentage_disagree), " disagree", " | ", 
            '{:.0%}'.format(percentage_agree), " agree", " | "
            '{:.0%}'.format(percentage_stronglyagree), " agree or strongly agree")

Counterfactuals (One-axis)
	 I find the explanation intuitively understandable 
		 8%  disagree or strongly disagree  |  12%  disagree  |  76%  agree  | 60%  agree or strongly agree
	 The explanation helps me to understand factors relevant to the algorithm 
		 16%  disagree or strongly disagree  |  24%  disagree  |  64%  agree  | 48%  agree or strongly agree
	 The explanation helps me to decide whether I can trust the generated annotations 
		 16%  disagree or strongly disagree  |  24%  disagree  |  64%  agree  | 44%  agree or strongly agree
	 The explanation provides me with valuable information for my work 
		 24%  disagree or strongly disagree  |  28%  disagree  |  64%  agree  | 28%  agree or strongly agree
Counterfactuals (Two-axis)
	 I find the explanation intuitively understandable 
		 20%  disagree or strongly disagree  |  28%  disagree  |  60%  agree  | 32%  agree or strongly agree
	 The explanation helps me to understand factors relevant to the algorithm 
		 12%  disagree or s

## User profiling

In [None]:
new_user_df = user_df.copy()
new_user_df.columns = [column.split("_")[1] for column in user_df.columns]
new_user_df

Unnamed: 0_level_0,age,position,useOfDP,useOfAI,useOfAI,mlFamiliarity,aiFamiliarity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1623319499818,30-40,Assisting physician (Assistenzarzt) for pathol...,in routine diagnostics,in routine diagnostics,,1,missing
1623330321488,41-50,Researcher in pathology/neuropathology,in research,in research,"from MindPeak (BreastIHC), from VMscope (Cogni...",3,missing
1623391402856,30-40,Researcher in pathology/neuropathology,[in research],[in research],,7,5
1623391916479,41-50,Technician (MTA) for pathology/neuropathology,[in research],[in routine diagnostics],,7,4
1623439901422,30-40,Trainee (Assistenzarzt) in pathology/neuropath...,[in research],[in research],QuPath immunohistochemistry positive cell dete...,2,5
1623441394465,30-40,Consultant (Facharzt) for pathology/neuropatho...,[in research],[in research],Aiforia,7,7
1623655000204,51-60,Researcher in pathology/neuropathology,[none],[none],,2,3
1623674498302,41-50,Consultant (Facharzt) for pathology/neuropatho...,[in routine diagnostics],[none],,4,5
1623676868025,41-50,Consultant (Facharzt) for pathology/neuropatho...,[none],[none],,5,5
1623678209878,51-60,Consultant (Facharzt) for pathology/neuropatho...,"[in research, in routine diagnostics]","[in routine diagnostics, in research]",Roche Diagnostics,4,2


In [None]:
pd.set_option('display.max_colwidth', None)
px.histogram(new_user_df['position'].replace(
    {
        "Assisting physician (Assistenzarzt) for pathology/neuropathology" : "Trainee",
        "Trainee (Assistenzarzt) in pathology/neuropathology" : "Trainee",
        "Trainee (Assistenzärztin*arzt) in pathology/neuropathology" : "Trainee",
        "Consultant (Facharzt) for pathology/neuropathology" : "Consultant",
        "Consultant (Fachärztin*arzt) for pathology/neuropathology" : "Consultant",
        "Researcher in pathology/neuropathology" : "Researcher",
        "Technician (MTA) for pathology/neuropathology" : "Technician"
    }
)).update_xaxes(categoryorder="total descending")

In [None]:
px.histogram(new_user_df['age'])

In [None]:
print("familiar with ML: ",new_user_df[new_user_df['mlFamiliarity'] > 4].count()[0])
print("not familiar with ML: ",new_user_df[new_user_df['mlFamiliarity'] < 4].count()[0])

print("\nfamiliar with AI: ", new_user_df[2:][new_user_df[2:]['aiFamiliarity'] > 4].count()[0])
print("not familiar with AI: ", new_user_df[2:][new_user_df[2:]['aiFamiliarity'] < 4].count()[0])
print("missing: 2")

familiar with ML:  13
not familiar with ML:  10

familiar with AI:  12
not familiar with AI:  7
missing: 2


In [None]:
px.histogram(new_user_df['useOfAI'].iloc[:,0])

In [None]:
px.histogram(new_user_df['useOfDP'])

In [None]:
px.scatter(x= new_user_df[2:]['mlFamiliarity'], y = new_user_df[2:]['aiFamiliarity'])

mlFamiliarity = new_user_df['mlFamiliarity'][2:].astype('int32')
aiFamiliarity = new_user_df['aiFamiliarity'][2:].astype('int32')
aiFamiliarity.corr(mlFamiliarity, method="spearman")

0.5970744598756759