In [1]:
from sklearn import metrics
from scipy import stats
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats
# Notebook-wide display configuration.
# Let pandas show up to 500 columns / 200 rows before truncating.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 200)
# One theme call is enough: sns.set() is only an alias of sns.set_theme(),
# so the former bare sns.set() was fully overridden by this call.
sns.set_theme(style='whitegrid')
# Default figure size (inches) for every matplotlib figure in the notebook.
plt.rcParams["figure.figsize"] = (10,10)


def format_e(n):
    """Format ``n`` in scientific notation without trailing zeros in the mantissa.

    The number is rendered with ``'%2e'`` (six decimals), then trailing zeros
    and a dangling decimal point are stripped from the mantissa, e.g.
    ``12345 -> '1.2345e+04'``.
    """
    parts = ('%2e' % n).split('e')
    mantissa = parts[0].rstrip('0').rstrip('.')
    return mantissa + 'e' + parts[1]

def mean_confidence_interval(data, confidence=0.95, rounding=4):
    """Student-t confidence interval for the mean of ``data``.

    Returns a tuple ``(mean, lower, upper)`` with every entry rounded to
    ``rounding`` decimals. The half-width is the standard error of the mean
    times the two-sided t quantile with ``len(data) - 1`` degrees of freedom.
    """
    values = np.array(data) * 1.0
    dof = len(values) - 1
    center = np.mean(values)
    std_err = scipy.stats.sem(values)
    half_width = std_err * scipy.stats.t.ppf((1 + confidence) / 2., dof)
    bounds = (center, center - half_width, center + half_width)
    return tuple(round(bound, rounding) for bound in bounds)

def confidence_interval(data, size=10000, func=np.mean, percentiles=(2.5, 97.5), seed=42):
    """Percentile bootstrap confidence interval for a statistic of ``data``.

    Draws ``size`` bootstrap samples (``len(data)`` values sampled from
    ``data`` with replacement), evaluates ``func`` on each sample and returns
    the requested percentiles of the resulting bootstrap replicates.

    Parameters
    ----------
    data : 1-D array-like
        Observations to resample.
    size : int
        Number of bootstrap replicates.
    func : callable
        Statistic evaluated on every bootstrap sample; must return a scalar.
    percentiles : sequence of float
        Percentiles of the replicate distribution to return. The default
        ``(2.5, 97.5)`` gives a two-sided 95% interval.
    seed : int or None
        Seed of the private random generator. The default 42 yields the same
        draws as the earlier implementation, which called
        ``np.random.seed(42)`` on the global generator.

    Returns
    -------
    numpy.ndarray
        One value per entry of ``percentiles``.
    """
    # A private legacy generator keeps results reproducible without resetting
    # the global NumPy random state as a side effect of every call.
    rng = np.random.RandomState(seed)
    n_obs = len(data)

    bs_replicates = np.empty(size)
    for i in range(size):
        # One bootstrap sample -> one replicate of the statistic.
        bs_replicates[i] = func(rng.choice(data, size=n_obs))

    return np.percentile(bs_replicates, list(percentiles))

# Root folder holding one sub-folder per study phase (phase1/, phase2/, phase3/).
# Change this single constant to run the notebook on another machine.
DATA_DIR = "/home/kti01/Documents/My Files/Projects/Overlap/data"
# Values of the `issues` column (English and German wording) whose rows are dropped.
LOW_QUALITY_ISSUES = ['Insufficient Image Quality', 'unzureichende Bildqualität']
# Columns that identify the same case in phase 2 and phase 3.
MERGE_KEYS = ['participant', 'mask']


def load_phase_metadata(phase):
    """Read ``<DATA_DIR>/phase<phase>/metadata_phase<phase>.csv`` into a DataFrame."""
    return pd.read_csv(os.path.join(DATA_DIR, f"phase{phase}", f"metadata_phase{phase}.csv"))


metadata_phase1 = load_phase_metadata(1)
metadata_phase2 = load_phase_metadata(2)
metadata_phase3 = load_phase_metadata(3)
#metadata_phase3 = metadata_phase3[metadata_phase3['language'] == 'en']
#metadata_phase2 = metadata_phase2[metadata_phase2.participant.isin(metadata_phase3.participant)]

# Drop rows flagged with an image-quality issue in either phase before pairing them.
metadata_phase2 = metadata_phase2[~metadata_phase2.issues.isin(LOW_QUALITY_ISSUES)]
metadata_phase3 = metadata_phase3[~metadata_phase3.issues.isin(LOW_QUALITY_ISSUES)]
# Inner join keeps only (participant, mask) pairs present in both phases.
# Non-key columns shared by both frames get pandas' default suffixes:
# `_x` for phase 2 (left) and `_y` for phase 3 (right).
metadata = pd.merge(metadata_phase2, metadata_phase3, how='inner', left_on=MERGE_KEYS, right_on=MERGE_KEYS)


In [2]:
# Cohen's kappa between each participant's floored prediction and the AI
# prediction, computed once on the `_x` columns (stored as phase2) and once on
# the `_y` columns (stored as phase3).
participant_list = list(metadata.participant.unique())
cases_by_participant = [metadata[metadata.participant == pid] for pid in participant_list]
kappas2 = [
    metrics.cohen_kappa_score(np.floor(cases.prediction_x), cases.AI_prediction_x)
    for cases in cases_by_participant
]
kappas3 = [
    metrics.cohen_kappa_score(np.floor(cases.prediction_y), cases.AI_prediction_y)
    for cases in cases_by_participant
]
kappas_df = pd.DataFrame({'participant': participant_list, 'phase2': kappas2, 'phase3': kappas3})

# Mean kappa per phase, then a paired t-test across participants.
for phase in ('phase2', 'phase3'):
    print(kappas_df[phase].mean())
stats.ttest_rel(kappas_df.phase2, kappas_df.phase3)

0.5274136758353467
0.580195675643555


Ttest_relResult(statistic=-2.8650650389347954, pvalue=0.004958064948696825)

## Agreement overall

In [21]:
# Per-participant agreement rate: share of that participant's cases where the
# AI prediction equals the floored participant prediction, for the `_x`
# columns (phase2) and the `_y` columns (phase3).
records = []
for pid in metadata.participant.unique():
    cases = metadata[metadata.participant == pid]
    n_cases = cases.shape[0]
    records.append({
        'participant': pid,
        'phase2': (cases.AI_prediction_x == np.floor(cases.prediction_x)).sum() / n_cases,
        'phase3': (cases.AI_prediction_y == np.floor(cases.prediction_y)).sum() / n_cases,
    })
agreement = pd.DataFrame(records, columns=['participant', 'phase2', 'phase3'])
# Absolute change in agreement from phase 2 to phase 3.
agreement['change'] = agreement['phase3'] - agreement['phase2']

# Paired t-test across participants, followed by the two phase means.
print(stats.ttest_rel(agreement.phase2, agreement.phase3))
for phase in ('phase2', 'phase3'):
    print(agreement[phase].mean())

Ttest_relResult(statistic=-2.6387152838834345, pvalue=0.009475211056486086)
0.7708207022862195
0.7947723253757737


In [24]:
# Mean per-participant change in agreement and its bootstrap percentile interval.
change = agreement['change']
print(change.mean())
print(confidence_interval(change))

0.023951623089554126
[0.00649165 0.04225401]


In [12]:
# Bootstrap percentile interval of the per-participant phase-3 agreement.
confidence_interval(agreement['phase3'])

array([0.77096383, 0.81770241])

In [6]:
# Pooled agreement over all merged rows (not averaged per participant).
n_rows = metadata.shape[0]
print((metadata.AI_prediction_x == np.floor(metadata.prediction_x)).sum() / n_rows)
# Phase-3 predictions are compared with the phase-3 AI column (`_y`), matching
# the per-participant table; this line previously used AI_prediction_x.
print((metadata.AI_prediction_y == np.floor(metadata.prediction_y)).sum() / n_rows)


0.7712952158693116
0.79463243873979


## Agreement on erroneous AI predictions

In [28]:
# Same per-participant agreement table, restricted to cases where
# AI_prediction_y differs from benign_malignant_y. Participants with no such
# case are left out.
records = []
for pid in metadata.participant.unique():
    cases = metadata[metadata.participant == pid]
    ai_wrong = cases[cases['AI_prediction_y'] != cases['benign_malignant_y']]
    n_wrong = len(ai_wrong)
    if n_wrong:
        records.append({
            'participant': pid,
            'phase2': (ai_wrong.AI_prediction_x == np.floor(ai_wrong.prediction_x)).sum() / n_wrong,
            'phase3': (ai_wrong.AI_prediction_y == np.floor(ai_wrong.prediction_y)).sum() / n_wrong,
        })
agreement = pd.DataFrame(records, columns=['participant', 'phase2', 'phase3'])
# Absolute change (not divided by the phase-2 value).
agreement['change'] = agreement['phase3'] - agreement['phase2']

# Paired t-test across participants, followed by the two phase means.
print(stats.ttest_rel(agreement.phase2, agreement.phase3))
for phase in ('phase2', 'phase3'):
    print(agreement[phase].mean())

Ttest_relResult(statistic=-1.5410324833223832, pvalue=0.12620707597496442)
0.6312121212121212
0.678939393939394


In [30]:
# Mean change in agreement and its bootstrap percentile interval, taken from
# whichever `agreement` table is currently in scope.
mean_change = agreement['change'].mean()
change_ci = confidence_interval(agreement['change'])
print(mean_change)
print(change_ci)

0.04772727272727273
[-0.01242424  0.10863636]


In [7]:
# Bootstrap percentile interval of the per-participant agreement, per phase.
for phase in ('phase2', 'phase3'):
    print(confidence_interval(agreement[phase]))

[0.56954167 0.69090909]
[0.61893561 0.73651894]


In [9]:
# Pooled rows where AI_prediction_x differs from benign_malignant_x, and the
# share of those rows where the floored phase-2 prediction equals the AI one.
ai_is_wrong = metadata.AI_prediction_x != metadata.benign_malignant_x
df = metadata[ai_is_wrong]
follows_ai = df.AI_prediction_x == np.floor(df.prediction_x)
follows_ai.sum() / df.shape[0]

0.6226993865030674

In [10]:
# Rebuild the subset here instead of reading the global `df`: the
# per-participant loops in other cells rebind `df`, so the result used to
# depend on which cell happened to run last.
ai_wrong_phase2 = metadata[metadata.AI_prediction_x != metadata.benign_malignant_x]
# Phase-3 predictions are compared with the phase-3 AI column (`_y`), as in the
# per-participant table; this expression previously used AI_prediction_x.
(ai_wrong_phase2.AI_prediction_y == np.floor(ai_wrong_phase2.prediction_y)).sum() / ai_wrong_phase2.shape[0]

0.6625766871165644

In [22]:
# Rows where the participant was wrong in phase 2 and right in phase 3.
# Each phase is checked against its own label column (`_x` / `_y`), the same
# way the opposite case (right in phase 2, wrong in phase 3) is computed; the
# phase-3 check previously used benign_malignant_x.
wrong_in_phase2 = np.floor(metadata.prediction_x) != metadata.benign_malignant_x
right_in_phase3 = np.floor(metadata.prediction_y) == metadata.benign_malignant_y
metadata[wrong_in_phase2 & right_in_phase3].shape

(130, 20)

In [23]:
# Rows where the participant was right in phase 2 and wrong in phase 3
# (floored prediction compared with the label column of the same phase).
right_in_phase2 = np.floor(metadata.prediction_x) == metadata.benign_malignant_x
wrong_in_phase3 = np.floor(metadata.prediction_y) != metadata.benign_malignant_y
metadata[right_in_phase2 & wrong_in_phase3].shape

(116, 20)