In [1]:
import json 
import copy
import pickle
import os
import matplotlib.pyplot as plt
from sklearn.metrics import jaccard_score
from matplotlib.path import Path
from torchvision import transforms
from torch.utils.data import Dataset
from skimage import draw
from skimage import io
import torch
import numpy as np
import pandas as pd
plt.rcParams["figure.figsize"] = (15,8)
from skimage.io import imread, imshow, imsave
from skimage.color import rgb2gray
from skimage.draw import polygon
from skimage import draw
from sklearn.metrics import confusion_matrix
import cv2
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
from collections import Counter
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)


# Lookup table from the full explanation text shown to annotators (English or
# German wording) to the short feature code used as a column name downstream.
# Each row is (code, label, label, ...); every label in a row maps to that code.
# 'XX' marks placeholder / free-text choices that carry no feature.
mappings = {
    label: code
    for code, *labels in (
        ('TRBL',
         'Thick reticular or branched lines',
         'Dicke retikuläre oder verzweigte Linien'),
        ('ESA',
         'Eccentrically located structureless area (any colour except skin colour, white and grey)',
         'Exzentrisch gelegenes, strukturloses Areal jeglicher Farbe, außer hautfarben, weiß und grau'),
        ('GP',
         'Grey patterns',
         'Graue Muster'),
        ('PV',
         'Polymorphous vessels',
         'Polymorphe Gefäße'),
        ('PRL',
         'Pseudopods or radial lines at the lesion margin that do not occupy the entire lesional circumference',
         'Pseudopodien oder radiale Linien am Läsionsrand, die nicht den gesamten Läsionsumfang einnehmen'),
        ('BDG',
         'Black dots or globules in the periphery of the lesion',
         'Schwarze Punkte oder Schollen in der Läsionsperipherie'),
        ('WLSA',
         'White lines or white structureless area',
         'Weiße Linien oder weißes strukturloses Areal'),
        ('PLR',
         'Parallel lines on ridges (acral lesions only)',
         'Parallele Linien auf den Leisten (nur akrale Läsionen)'),
        ('PES',
         'Pigmentation extends beyond the area of the scar (only after excision)',
         'Pigmentierung überschreitet Narbenareal (nur nach Entfernung)'),
        ('PIF',
         'Pigmentation invades the openings of hair follicles (facial lesions)',
         'Pigmentierung überschreitet Follikelöffnung (Gesichtsläsionen)'),
        ('OPC',
         'Only one pattern and only one colour',
         'Nur ein Muster und nur eine Farbe'),
        ('SPC',
         'Symmetrical combination of patterns and;or colours',
         'Symmetrische Kombination von Mustern und;oder Farben'),
        ('MVP',
         'Monomorphic vascular pattern',
         'Monomorphes Gefäßmuster'),
        ('PRLC',
         'Pseudopods or radial lines at the lesional margin involving the entire lesional circumference',
         'Pseudopodien oder radiale Linien am Läsionsrand über den gesamten Läsionsumfang'),
        ('PLF',
         'Parallel lines in the furrows (acral lesions only)',
         'Parallele Linien in den Furchen (nur akrale Läsionen)'),
        ('PDES',
         'Pigmentation does not extend beyond the area of the scar (only after excision)',
         'Pigmentierung überschreitet Narbe nicht (nur nach Entfernung)'),
        ('APC',
         'Asymmetric combination of multiple patterns and;or colours in the absence of other melanoma criteria',
         'Asymmetrische Kombination mehrerer Muster und;oder Farben ohne weitere Melanomkriterien'),
        ('MS',
         'Melanoma simulator',
         'Melanomsimulator'),
        ('XX',
         'Please select at least one explanation.',
         'Bitte wählen Sie mindestens eine Erklärung aus.'),
        # This label appears only in German wording in the table.
        ('XX',
         'Andere (bitte angeben)'),
    )
    for label in labels
}

# Short feature codes used as indicator / annotation column names; 'None' is the
# code process_annotation falls back to when no feature remains for a polygon.
char_class_labels = [
    'TRBL', 'ESA', 'BDG', 'GP', 'PV', 'PRL', 'WLSA', 'PLR', 'PES', 'PIF',
    'OPC', 'SPC', 'MVP', 'PRLC', 'PLF', 'PDES', 'APC', 'MS',
    'None',
]

# Annotation language per annotator folder: the four listed below are German
# ('de'), every other annotator is English ('en').
language_dict = {
    annotator: ('de' if annotator in ('1', '1a', 'gt3', 'gt4') else 'en')
    for annotator in (
        '1', '1a',
        'gt1', 'gt2', 'gt3', 'gt4', 'gt7', 'gt10',
        'gt12', 'gt13', 'gt14', 'gt15', 'gt16', 'gt17', 'gt19',
    )
}


def process_annotation(polygons, language='en', width=600, height=450, label_map=None):
    """Convert one raw polygon annotation into integer coordinates and feature codes.

    Parameters
    ----------
    polygons : dict
        A single annotation record. Two entries are read:
        ``polygons['data']['points']`` (a sequence of ``[x, y]`` pairs) and
        ``polygons['children'][key]['data']['selected']`` (a list of explanation
        strings), where ``key`` is ``'Explanation'`` when ``language == 'en'``
        and ``'Erklärung'`` for any other language.
    language : str, default 'en'
        Selects which explanation key is looked up (see above).
    width, height : number, default 600 and 450
        Factors applied to the x and y columns respectively.
        NOTE(review): presumably the points are normalised to [0, 1] and these
        are the image dimensions in pixels -- confirm against the annotation tool.
    label_map : dict or None, default None
        Maps explanation text to a short code. When None, the module-level
        ``mappings`` table is used.

    Returns
    -------
    polygon : numpy.ndarray
        Integer array of shape ``(n_points, 2)``; scaled values are truncated
        by ``astype(int)``. An empty point list yields shape ``(0, 2)``.
    explanation : list of str
        Codes for the selected explanations with every ``'XX'`` placeholder
        removed; ``['None']`` when nothing remains.

    Raises
    ------
    KeyError
        If the expected keys are missing from ``polygons`` or a selected
        explanation is not present in the label map.
    """
    explanation_key = 'Explanation' if language == 'en' else 'Erklärung'
    if label_map is None:
        label_map = mappings

    # Force float so the in-place scaling below is valid for any numeric factor.
    polygon = np.array(polygons['data']['points'], dtype=float)
    if polygon.size == 0:
        # np.array([]) is 1-D; give it two columns so the indexing below works.
        polygon = polygon.reshape(0, 2)
    polygon[:, 0] *= width
    polygon[:, 1] *= height
    polygon = polygon.astype(int)

    selected = polygons['children'][explanation_key]['data']['selected']
    # Translate to codes and drop the 'XX' placeholders in a single pass.
    explanation = [code for code in (label_map[exp] for exp in selected) if code != 'XX']

    if len(explanation) == 0:
        explanation.append('None')

    return polygon, explanation

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# NOTE(review): `data` is not assigned in any earlier cell visible here; it is
# first assigned further down, where the consolidated ground-truth pickle is
# read. On a fresh "Restart & Run All" this cell would therefore raise
# NameError unless it is moved below that load.
data

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,APC,BDG,ESA,GP,MS,MVP,None,OPC,PDES,PES,PIF,PLF,PLR,PRL,PRLC,PV,SPC,TRBL,WLSA,benign_malignant,image,APC_annotation,BDG_annotation,ESA_annotation,GP_annotation,MS_annotation,MVP_annotation,None_annotation,OPC_annotation,PDES_annotation,PES_annotation,PIF_annotation,PLF_annotation,PLR_annotation,PRL_annotation,PRLC_annotation,PV_annotation,SPC_annotation,TRBL_annotation,WLSA_annotation,annotator
0,HAM_0005678,ISIC_0031023,mel,histo,60.0,male,chest,vidir_modern,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,ISIC_0031023.jpg,-1,"[[[280, 116], [272, 83], [289, 74], [329, 64],...",-1,"[[[280, 116], [272, 83], [289, 74], [329, 64],...",-1,-1,-1,-1,-1,-1,-1,-1,-1,"[[[280, 116], [272, 83], [289, 74], [329, 64],...",-1,-1,-1,"[[[280, 116], [272, 83], [289, 74], [329, 64],...","[[[285, 113], [266, 101], [239, 119], [232, 12...",1
1,HAM_0005191,ISIC_0031177,mel,histo,40.0,female,back,rosendahl,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,ISIC_0031177.jpg,-1,"[[[36, 208], [34, 263], [76, 317], [119, 344],...",-1,"[[[36, 208], [34, 263], [76, 317], [119, 344],...",-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,"[[[36, 208], [34, 263], [76, 317], [119, 344],...",-1,1
2,HAM_0007310,ISIC_0026120,mel,histo,55.0,male,back,vidir_modern,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,ISIC_0026120.jpg,-1,"[[[328, 57], [311, 21], [284, 16], [233, 27], ...",-1,"[[[357, 155], [320, 133], [287, 124], [265, 14...",-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,"[[[357, 155], [320, 133], [287, 124], [265, 14...",-1,"[[[328, 57], [311, 21], [284, 16], [233, 27], ...","[[[357, 155], [320, 133], [287, 124], [265, 14...",1
3,HAM_0004476,ISIC_0030417,mel,histo,70.0,male,face,vidir_modern,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,ISIC_0030417.jpg,-1,"[[[135, 168], [123, 120], [139, 90], [173, 67]...",-1,"[[[135, 168], [123, 120], [139, 90], [173, 67]...",-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,"[[[135, 168], [123, 120], [139, 90], [173, 67]...",-1,-1,-1,1
4,HAM_0000876,ISIC_0026531,mel,histo,55.0,male,abdomen,vidir_modern,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,ISIC_0026531.jpg,-1,"[[[186, 144], [180, 114], [158, 110], [136, 16...",-1,"[[[186, 144], [180, 114], [158, 110], [136, 16...",-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,"[[[283, 298], [269, 304], [242, 332], [232, 34...",-1,"[[[186, 144], [180, 114], [158, 110], [136, 16...","[[[283, 298], [269, 304], [242, 332], [232, 34...",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215,HAM_0006538,ISIC_0029397,mel,histo,5.0,male,back,vidir_molemax,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,ISIC_0029397.jpg,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,"[[[125, 338], [134, 388], [146, 419], [194, 42...",-1,gt19
216,HAM_0000282,ISIC_0027155,mel,histo,65.0,male,neck,rosendahl,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,ISIC_0027155.jpg,-1,"[[[331, 319], [342, 311], [341, 291], [324, 29...","[[[385, 204], [400, 205], [402, 221], [395, 24...",-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,"[[[215, 212], [179, 219], [162, 251], [161, 27...","[[[87, 128], [128, 155], [160, 137], [183, 140...",gt19
217,HAM_0000125,ISIC_0028777,mel,histo,60.0,male,lower extremity,rosendahl,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,ISIC_0028777.jpg,-1,"[[[191, 211], [162, 239], [176, 274], [199, 30...","[[[196, 209], [211, 178], [239, 163], [281, 14...",-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,"[[[365, 210], [378, 221], [388, 236], [397, 24...",-1,-1,-1,-1,-1,gt19
218,HAM_0006423,ISIC_0028352,mel,histo,55.0,male,back,rosendahl,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,ISIC_0028352.jpg,-1,-1,-1,"[[[252, 145], [315, 118], [374, 135], [442, 14...",-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,"[[[163, 196], [181, 174], [190, 153], [216, 16...","[[[200, 264], [233, 257], [260, 236], [281, 21...",gt19


## Process labels into tabular format

In [3]:
# HAM10000 metadata, restricted to melanoma / nevus rows whose dx_type is 'histo'.
ham = pd.read_csv("/home/kti01/Documents/My Files/Data/HAM10000/HAM10000/metadata")
is_mel_or_nv = ham['dx'].isin(['mel', 'nv'])
is_histo = ham['dx_type'] == 'histo'
ham = ham[is_mel_or_nv & is_histo]

# image_id -> diagnosis ('mel' / 'nv') for the rows kept above.
dx_dict = {image_id: dx for image_id, dx in zip(ham['image_id'], ham['dx'])}

# Annotator folder names under the ground_truth directory.
gts = [
    '1', '1a',
    'gt1', 'gt2', 'gt3', 'gt4', 'gt7', 'gt10',
    'gt12', 'gt13', 'gt14', 'gt15', 'gt16', 'gt17', 'gt19',
]

# For every annotator folder: read the train / validation / test label files,
# turn the polygon annotations into one row per image (binary feature
# indicators plus, per feature, the list of polygons carrying it) and pickle
# the combined frame as <annotator folder>/metadata.pkl.
for gt in gts:

    print('Processing ', gt)
    datasets = []
    data_dir = "/home/kti01/Documents/My Files/Projects/Overlap/data/ground_truth/"+gt
    language = language_dict[gt]

    # The JSON keys of the two diagnosis buckets depend on the annotation language.
    if language == 'en':
        melanoma = 'Melanoma'
        nevus = 'Nevus'
    else:
        melanoma = 'Melanom'
        nevus = 'Nävus'

    for file in ["train.json", "validation.json", "test.json"]:

        data_file = os.path.join(data_dir, file)

        # JSON text is UTF-8; be explicit so the German keys do not depend on the locale.
        with open(data_file, encoding='utf-8') as f:
            annotations = json.load(f)['labels']

        # Image name -> annotations for that image.
        raw_labels = {annotation['dataId']: annotation['annotations'] for annotation in annotations}

        # Rename keys: drop the diagnosis prefix (text before the first '_') and the file extension.
        raw_labels = {'_'.join(k.split('_')[1:]).split('.')[0]: v for k, v in raw_labels.items()}

        # Collect one entry per polygon. The diagnosis the annotator chose is the
        # bucket (melanoma or nevus key) that is non-empty.
        image_list = []
        polygons_list = []
        explanations_list = []
        for k, v in raw_labels.items():
            has_melanoma = len(v[melanoma]) != 0
            has_nevus = len(v[nevus]) != 0

            # Exclude the image if the annotated diagnosis differs from the true one ...
            if (has_melanoma and dx_dict[k] == 'nv') or (has_nevus and dx_dict[k] == 'mel'):
                continue

            # ... or if both melanoma and nevus features were annotated. With dx
            # restricted to 'mel' / 'nv' the check above already covers this case;
            # it is kept as an explicit guard.
            if has_melanoma and has_nevus:
                continue

            # If both buckets are empty this iterates over nothing.
            chosen_annotations = v[melanoma] if has_melanoma else v[nevus]
            for polygons in chosen_annotations:
                polygon, explanation = process_annotation(polygons, language=language)

                image_list.append(k)
                polygons_list.append(polygon)
                explanations_list.append(explanation)

        labels = pd.DataFrame({'image_id': image_list, 'polygon': polygons_list, 'explanation': explanations_list})

        # --- Binary feature indicators per image ---------------------------------
        # One dummy row per (polygon, feature); the exploded index still points at
        # the originating row of `labels`, so the join below lines them up.
        indicators = pd.get_dummies(labels['explanation'].explode())

        # Add characteristics that the participant has not chosen in any image.
        for col in char_class_labels:
            if col not in indicators.columns:
                indicators[col] = 0

        # Aggregate only image_id + indicator columns. Summing the full frame would
        # also hit the object columns ('polygon', 'explanation'), which only works
        # on pandas versions that silently drop non-numeric columns.
        metadata_df = labels[['image_id']].join(indicators)
        metadata_df = metadata_df.groupby('image_id').sum()
        metadata_df = metadata_df.reset_index()

        # Several polygons may carry the same feature: clamp the counts to 0 / 1.
        for col in metadata_df.columns.drop('image_id'):
            metadata_df.loc[metadata_df[col] > 0, col] = 1

        metadata_df = pd.merge(ham, metadata_df, on='image_id', how='inner')
        metadata_df['benign_malignant'] = (metadata_df['dx'] != 'nv').astype(int)
        metadata_df['image'] = metadata_df['image_id'] + '.jpg'

        # --- Polygons per (image, feature) ---------------------------------------
        # Group every polygon under each feature it was tagged with. Dict insertion
        # order keeps the row order of `labels`.
        polygons_by_feature = {}
        for image_id, polygon, explanation in zip(labels['image_id'], labels['polygon'], labels['explanation']):
            for exp in explanation:
                polygons_by_feature.setdefault((image_id, exp), []).append(polygon)

        final_df = pd.DataFrame({
            'image_id': [image_id for image_id, _ in polygons_by_feature],
            'explanation': [exp for _, exp in polygons_by_feature],
            'polygon': list(polygons_by_feature.values()),
        })
        # One row per image, one column per feature; each (image, feature) pair
        # occurs exactly once, so 'first' simply picks its polygon list.
        final_df = final_df.pivot_table(values='polygon', index=final_df['image_id'], columns='explanation', aggfunc='first')

        # -1 marks "feature not annotated for this image".
        final_df = final_df.fillna(-1)

        # Add char columns not present in the current df
        for col in char_class_labels:
            if col not in final_df.columns:
                final_df[col] = -1

        # Rename columns: <feature> -> <feature>_annotation
        final_df = final_df.add_suffix('_annotation')

        final_merged_df = pd.merge(metadata_df, final_df, on='image_id', how='inner')

        datasets.append(final_merged_df)

    # Stack the three splits of this annotator and store them next to the label files.
    annotator_df = pd.concat(datasets)
    annotator_df = annotator_df.reset_index(drop=True)

    with open(os.path.join(data_dir, 'metadata.pkl'), 'wb') as fp:
        pickle.dump(annotator_df, fp, protocol=pickle.HIGHEST_PROTOCOL)
        
# Annotator 1a's annotations are corrections of images already annotated by
# annotator 1. Stack both frames with 1a last, then keep only the last row per
# image_id so a corrected image replaces the original one.
# The merged frame overwrites the pickle in annotator 1's folder.
gt1 = pd.read_pickle("/home/kti01/Documents/My Files/Projects/Overlap/data/ground_truth/1/metadata.pkl")
gt1a = pd.read_pickle("/home/kti01/Documents/My Files/Projects/Overlap/data/ground_truth/1a/metadata.pkl")

one = pd.concat([gt1, gt1a], ignore_index=True)
one = one.drop_duplicates('image_id', keep='last')
one = one.reset_index(drop=True)

with open("/home/kti01/Documents/My Files/Projects/Overlap/data/ground_truth/1/metadata.pkl", 'wb') as fp:
    pickle.dump(one, fp, protocol=pickle.HIGHEST_PROTOCOL)
    


Processing  1
Processing  1a
Processing  gt1
Processing  gt2
Processing  gt3
Processing  gt4
Processing  gt7
Processing  gt10
Processing  gt12
Processing  gt13
Processing  gt14
Processing  gt15
Processing  gt16
Processing  gt17
Processing  gt19


In [4]:
# Combine every annotator's metadata into one dataframe and save it.
# Folder '1a' is not read here; the list below only names folder '1'.
metadata_list = []
for gt in ['1', 'gt1', 'gt2', 'gt3', 'gt4', 'gt7', 'gt10', 'gt12', 'gt13', 'gt14', 'gt15', 'gt16', 'gt17', 'gt19']:
    data_dir = "/home/kti01/Documents/My Files/Projects/Overlap/data/ground_truth/"+gt

    with open(os.path.join(data_dir, "metadata.pkl"), 'rb') as f:
        df = pickle.load(f)
    # Tag every row with the folder it came from.
    df['annotator'] = gt
    metadata_list.append(df)

metadata = pd.concat(metadata_list)

with open("/home/kti01/Documents/My Files/Projects/Overlap/data/ground_truth/metadata_gt_consolidated.pkl", 'wb') as fp:
    pickle.dump(metadata, fp, protocol=pickle.HIGHEST_PROTOCOL)
print('Saved consolidated metadata file')


# Per image: a feature is set if at least one annotator selected it
# (sum over annotators, capped at 1).
metadata_csv = (
    metadata[['image_id'] + char_class_labels]
    .groupby('image_id')
    .sum()
    .clip(upper=1)
    .reset_index()
)

# Attach lesion id and benign/malignant flag, one row per image.
image_info = metadata[['lesion_id', 'image_id', 'benign_malignant']].drop_duplicates('image_id', keep='last')
metadata_csv = image_info.merge(metadata_csv, on='image_id', how='right')

# Add train test info: images listed in the test-set file are 'test', all others 'train'.
test_imgs = pd.read_csv("/home/kti01/Documents/My Files/Projects/Overlap/data/metadata_testset.csv")['image_id'].values
metadata_csv['split'] = [
    'test' if image_id in test_imgs else 'train'
    for image_id in metadata_csv['image_id']
]
metadata_csv.to_csv("/home/kti01/Documents/My Files/Projects/Overlap/data/ground_truth/metadata_gt.csv", index=False)

print('Saved combined gt metadata file')

Saved consolidated metadata file
Saved combined gt metadata file


# Count the number of annotators per image

In [52]:
# The consolidated frame has one row per (annotator, image), so the number of
# rows per image_id is the number of annotators who labelled that image.
data = pd.read_pickle("/home/kti01/Documents/My Files/Projects/Overlap/data/ground_truth/metadata_gt_consolidated.pkl")
annotation_count = (
    pd.Series(Counter(data['image_id']), name='count')
    .rename_axis('image_id')
    .reset_index()
)
# How many images were labelled by 1, 2, 3, ... annotators.
annotation_count['count'].value_counts()

2    2345
1     995
3     271
Name: count, dtype: int64

In [53]:
# Keep one image per lesion (the last row of each lesion_id in `ham`) and
# repeat the annotator-count tally on that subset only.
unique_imgs = ham.drop_duplicates('lesion_id', keep='last')['image_id']
is_unique_lesion_image = annotation_count['image_id'].isin(unique_imgs)
unique_annotation_count = annotation_count.loc[is_unique_lesion_image]
unique_annotation_count['count'].value_counts()

2    1728
3     226
1      27
Name: count, dtype: int64

In [6]:
# Unique-lesion images that have been labelled by exactly one annotator.
annotated_once = unique_annotation_count['count'] == 1
remaining_image_id = unique_annotation_count[annotated_once]
#remaining_image_id.to_csv('remaining_image_id.csv')