In [1]:
import json 
import copy
import pickle
import os
import matplotlib.pyplot as plt
from sklearn.metrics import jaccard_score
from matplotlib.path import Path
from torchvision import transforms
from torch.utils.data import Dataset
from skimage import draw
from skimage import io


import torch
import numpy as np
import pandas as pd
plt.rcParams["figure.figsize"] = (15,8)
from skimage.io import imread, imshow, imsave
from skimage.color import rgb2gray
from skimage.draw import polygon
from skimage import draw
from sklearn.metrics import confusion_matrix
import cv2
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
pd.set_option('display.max_columns', 500)


# Lookup from the explanation text shown in the annotation tool to a short code.
# Each criterion appears twice (English wording, then German wording) and both
# spellings resolve to the same code. The placeholder / "other" options all
# resolve to 'XX'.
mappings = {
    # --- criteria whose codes are listed in mel_class_labels ---
    'Thick reticular or branched lines': 'TRBL',
    'Dicke retikuläre oder verzweigte Linien': 'TRBL',

    'Eccentrically located structureless area (any colour except skin colour, white and grey)': 'ESA',
    'Exzentrisch gelegenes, strukturloses Areal jeglicher Farbe, außer hautfarben, weiß und grau': 'ESA',

    'Grey patterns': 'GP',
    'Graue Muster': 'GP',

    'Polymorphous vessels': 'PV',
    'Polymorphe Gefäße': 'PV',

    'Pseudopods or radial lines at the lesion margin that do not occupy the entire lesional circumference': 'PRL',
    'Pseudopodien oder radiale Linien am Läsionsrand, die nicht den gesamten Läsionsumfang einnehmen': 'PRL',

    'Black dots or globules in the periphery of the lesion': 'BDG',
    'Schwarze Punkte oder Schollen in der Läsionsperipherie': 'BDG',

    'White lines or white structureless area': 'WLSA',
    'Weiße Linien oder weißes strukturloses Areal': 'WLSA',

    'Parallel lines on ridges (acral lesions only)': 'PLR',
    'Parallele Linien auf den Leisten (nur akrale Läsionen)': 'PLR',

    'Pigmentation extends beyond the area of the scar (only after excision)': 'PES',
    'Pigmentierung überschreitet Narbenareal (nur nach Entfernung)': 'PES',

    'Pigmentation invades the openings of hair follicles (facial lesions)': 'PIF',
    'Pigmentierung überschreitet Follikelöffnung (Gesichtsläsionen)': 'PIF',

    # --- criteria whose codes are listed in nv_class_labels ---
    'Only one pattern and only one colour': 'OPC',
    'Nur ein Muster und nur eine Farbe': 'OPC',

    'Symmetrical combination of patterns and;or colours': 'SPC',
    'Symmetrische Kombination von Mustern und;oder Farben': 'SPC',

    'Monomorphic vascular pattern': 'MVP',
    'Monomorphes Gefäßmuster': 'MVP',

    'Pseudopods or radial lines at the lesional margin involving the entire lesional circumference': 'PRLC',
    'Pseudopodien oder radiale Linien am Läsionsrand über den gesamten Läsionsumfang': 'PRLC',

    'Parallel lines in the furrows (acral lesions only)': 'PLF',
    'Parallele Linien in den Furchen (nur akrale Läsionen)': 'PLF',

    'Pigmentation does not extend beyond the area of the scar (only after excision)': 'PDES',
    'Pigmentierung überschreitet Narbe nicht (nur nach Entfernung)': 'PDES',

    'Asymmetric combination of multiple patterns and;or colours in the absence of other melanoma criteria': 'APC',
    'Asymmetrische Kombination mehrerer Muster und;oder Farben ohne weitere Melanomkriterien': 'APC',

    'Melanoma simulator': 'MS',
    'Melanomsimulator': 'MS',

    # --- placeholder and free-text options: no usable explanation ---
    'Please select at least one explanation': 'XX',
    'Bitte wählen Sie mindestens eine Erklärung aus.': 'XX',
    'Andere (bitte angeben)': 'XX',
}

# Ordered lists of explanation codes (the values of `mappings`, without 'XX').
# NOTE(review): 'PLR' is present in mel_class_labels but absent from
# char_class_labels, which otherwise equals mel_class_labels followed by
# nv_class_labels -- confirm whether the omission is intentional.
char_class_labels = [
    'TRBL', 'ESA', 'BDG', 'GP', 'PV', 'PRL', 'WLSA', 'PES', 'PIF',
    'OPC', 'SPC', 'MVP', 'PRLC', 'PLF', 'PDES', 'APC', 'MS',
]
mel_class_labels = [
    'TRBL', 'ESA', 'BDG', 'GP', 'PV', 'PRL', 'WLSA', 'PES', 'PIF', 'PLR',
]
nv_class_labels = [
    'OPC', 'SPC', 'MVP', 'PRLC', 'PLF', 'PDES', 'APC', 'MS',
]


def process_annotation(polygons, image_width=600, image_height=450):
    """Turn one raw annotation into pixel coordinates, explanation codes and a confidence.

    Parameters
    ----------
    polygons : dict
        One annotation record. ``polygons['data']`` holds either a ``'points'``
        list or, when the annotator drew a bounding box by mistake, the
        ``'min'`` / ``'max'`` corners of that box. ``polygons['children']``
        holds the explanation and confidence selections.
        (Coordinates are presumably fractions of the image size -- they are
        multiplied by the image width/height below; confirm against the export.)
    image_width, image_height : int, optional
        Factors applied to the first and second coordinate column.
        Default to 600 and 450, the values that were previously hard-coded.

    Returns
    -------
    polygon : numpy.ndarray
        Integer array with one row per vertex, scaled by the image size and
        truncated (not rounded) to int.
    explanation : list of str
        Explanation codes looked up in ``mappings`` with every ``'XX'``
        placeholder removed; ``['None']`` if nothing else remains.
    confidence : int
        Selected confidence, or ``-1`` when it is missing or left on the
        "please choose" placeholder.

    Raises
    ------
    KeyError
        If no explanation entry is found under any known spelling, or an
        explanation text is not a key of ``mappings``.
    ValueError
        If the confidence selection cannot be converted to ``int``.
    """
    shape = polygons['data']
    # Sometimes the polygon label was defined as bounding box (human error);
    # the 'points' key is missing in that case, so rebuild the four corners.
    if 'points' in shape:
        corners = shape['points']
    else:
        min_corner, max_corner = shape['min'], shape['max']
        corners = [
            min_corner,
            [max_corner[0], min_corner[1]],
            max_corner,
            [min_corner[0], max_corner[1]],
        ]

    # Work in float so that integer input or a float image size both scale cleanly.
    polygon = np.array(corners, dtype=float)
    polygon[:, 0] *= image_width
    polygon[:, 1] *= image_height
    polygon = polygon.astype(int)

    children = polygons['children']

    # Different spellings of "explanation" occur in the exports, sometimes even
    # the English word; take the first spelling that carries a selection.
    for key in ('Erklärung', 'Erklärungen', 'Explanation'):
        try:
            explanation = children[key]['data']['selected']
        except (KeyError, TypeError):
            continue
        break
    else:
        raise KeyError(
            "no explanation selection found under 'Erklärung', 'Erklärungen' or 'Explanation'"
        )

    try:
        confidence = children['Konfidenz']['data']['selected']
    except (KeyError, TypeError):
        confidence = -1
    # An untouched confidence widget reports its placeholder text (or nothing).
    if confidence is None or confidence == 'Bitte wählen Sie eine Konfidenz.':
        confidence = -1
    else:
        confidence = int(confidence)

    if not isinstance(explanation, list):
        explanation = [explanation]
    codes = [mappings[label] for label in explanation]

    # XX has been mapped from "Please choose 1 explanation" and "Other Explanation".
    # Since XX is auto selected in every explanation we're dropping XX.
    # If XX was the only entry, no explanation was chosen: report 'None' instead.
    codes = [code for code in codes if code != 'XX']
    if not codes:
        codes = ['None']

    return polygon, codes, confidence


  from .autonotebook import tqdm as notebook_tqdm


# Process metadata files individually in a loop

In [2]:
#ham = pd.read_csv("/home/kti01/Documents/My Files/Data/HAM10000/HAM10000/metadata")
#ham = ham[ham.dx.isin(['mel', 'nv'])]
#ham = ham[ham.dx_type=='histo']

ham = pd.read_csv("/home/kti01/Documents/My Files/Projects/Overlap/data/metadata_testset.csv").dropna(how='all')
ham['mask'] = ham['mask'].astype(int)

# CHANGE THIS DIR
phase1_dir = "/home/kti01/Documents/My Files/Projects/Overlap/data/phase1/repeated_image_de"

# One sub-directory per participant; the consolidated csv/pkl files that live
# next to them are skipped. Sorted so that runs are reproducible.
participants = sorted(
    entry for entry in os.listdir(phase1_dir)
    if os.path.isdir(os.path.join(phase1_dir, entry))
    and not entry.endswith(('csv', 'pkl'))
)

# Annotation exports expected inside every participant directory.
ANNOTATION_FILES = ('train.json', 'validation.json', 'test.json')

# (annotation category, numeric prediction), in order of priority: for each
# image only the first category that contains annotations is used.
CATEGORY_PREDICTIONS = (
    ('Melanom', 1),
    ('Nävus (belassen)', 0),
    ('Nävus (exzidieren)', 0.5),
)


def _load_polygon_labels(json_path):
    """Read one annotation export and return a frame with one row per drawn polygon.

    Columns: image_id, polygon, explanation (list of codes), confidence, prediction.
    """
    # JSON is UTF-8; be explicit because the keys contain umlauts.
    with open(json_path, encoding='utf-8') as f:
        exported = json.load(f)

    # Image name -> its annotations (a repeated dataId overwrites the earlier one).
    annotations_by_image = {entry['dataId']: entry['annotations'] for entry in exported['labels']}

    columns = {'image_id': [], 'polygon': [], 'explanation': [], 'confidence': [], 'prediction': []}
    for image_name, image_annotations in annotations_by_image.items():
        for category, prediction in CATEGORY_PREDICTIONS:
            shapes = image_annotations[category]
            if len(shapes) == 0:
                continue
            for shape in shapes:
                polygon, explanation, confidence = process_annotation(shape)
                columns['image_id'].append(image_name)
                columns['polygon'].append(polygon)
                columns['explanation'].append(explanation)
                columns['confidence'].append(confidence)
                columns['prediction'].append(prediction)
            break  # lower-priority categories are ignored for this image
    return pd.DataFrame(columns)


def _summarise_annotation_file(labels, ham, participant):
    """Collapse the polygon-level `labels` of one export into one row per image.

    The result carries the mean confidence/prediction, one 0/1 flag per
    explanation code, the `group`/`id` looked up in `ham`, the raw annotation
    records, and one '<code>_annotation' column per explanation holding the
    list of polygons drawn for it (-1 where the code was not used).
    """
    # Image-level confidence/prediction: mean over the image's polygons.
    # (The earlier version joined the exploded explanation dummies before
    # averaging, which counted a polygon once per explanation it carried, and
    # relied on mean() silently dropping object columns, which pandas >= 2.0
    # no longer does.)
    image_stats = labels.groupby('image_id')[['confidence', 'prediction']].mean()

    # 1.0 if the explanation code was used anywhere on the image, else 0.0.
    exploded = labels[['image_id', 'explanation']].explode('explanation')
    flags = pd.crosstab(exploded['image_id'], exploded['explanation']).clip(upper=1).astype(float)
    flags.columns.name = None

    image_df = image_stats.join(flags).reset_index()

    # File names look like '<mask>.<ext>'; the numeric stem is the key into `ham`.
    image_df['mask'] = image_df['image_id'].str.split('.').str[0].astype(np.int64)
    image_df = image_df.sort_values('mask')
    ham_lookup = ham[['mask', 'group', 'image_id']].rename(columns={'image_id': 'id'})
    image_df = pd.merge(image_df, ham_lookup, on='mask', how='left')

    labels = labels.assign(mask=labels['image_id'].str.split('.').str[0].astype(np.int64))
    # NOTE(review): every mask in `labels` also occurs in `image_df`, so this
    # inner merge only changes anything when `ham` has several rows per mask
    # (polygon rows are then repeated). Kept to preserve the existing output.
    labels = pd.merge(labels, image_df[['mask']], on='mask', how='inner')

    # Add the raw polygon records of every image as a column.
    records_by_image = {
        image_id: group.to_dict('records')
        for image_id, group in labels.set_index('image_id').groupby(level=0)
    }
    image_df['annotations'] = [records_by_image[name] for name in image_df['image_id']]
    image_df['image'] = image_df['mask'].astype(str) + '.jpg'

    # One row per (image, explanation) holding the list of polygons that carry
    # that explanation, then spread the explanations out into columns.
    per_explanation = labels[['image_id', 'explanation', 'polygon']].explode('explanation')
    polygons_wide = (
        per_explanation.groupby(['image_id', 'explanation'])['polygon']
        .agg(list)
        .unstack('explanation')
        .fillna(-1)
        .add_suffix('_annotation')
    )
    polygons_wide['participant'] = participant

    return pd.merge(image_df, polygons_wide.reset_index(), on='image_id', how='inner')


def _build_participant_metadata(participant_dir, participant, ham):
    """Combine the train/validation/test exports of one participant into one frame."""
    frames = [
        _summarise_annotation_file(
            _load_polygon_labels(os.path.join(participant_dir, file_name)), ham, participant
        )
        for file_name in ANNOTATION_FILES
    ]
    merged = pd.concat(frames).reset_index(drop=True).fillna(-1)

    # From here on `image_id` is the id taken from `ham`, not the file name.
    merged['image_id'] = merged['id']
    merged = merged.drop('id', axis=1)

    merged = pd.merge(merged, ham[['image_id', 'dx']], on='image_id', how='left').drop_duplicates(subset='mask')
    # Everything that is not 'nv' (including rows without a dx match) counts as 1.
    merged['benign_malignant'] = (merged['dx'] != 'nv').astype(int)
    merged = merged.sort_values('mask').reset_index(drop=True)

    # Manual correction for mask 9898: copy `group` from the first row and
    # `image_id` from the third row of the mask-sorted frame.
    # NOTE(review): the reason for these particular rows is not documented here -- confirm.
    patch_rows = merged.index[merged['mask'] == 9898]
    if len(patch_rows) > 0 and len(merged) > 2:
        row = patch_rows[0]
        merged.at[row, 'group'] = merged.loc[0, 'group']
        merged.at[row, 'image_id'] = merged.loc[2, 'image_id']

    return merged


for participant in participants:
    data_dir = os.path.join(phase1_dir, participant)
    participant_metadata = _build_participant_metadata(data_dir, participant, ham)

    with open(os.path.join(data_dir, 'metadata.pkl'), 'wb') as fp:
        pickle.dump(participant_metadata, fp, protocol=pickle.HIGHEST_PROTOCOL)
    print('Saved metadata for '+participant)
    

Saved metadata for g80
Saved metadata for g50
Saved metadata for g40
Saved metadata for g49
Saved metadata for g55
Saved metadata for g77
Saved metadata for g3
Saved metadata for g42
Saved metadata for g60
Saved metadata for g9
Saved metadata for g44
Saved metadata for g31
Saved metadata for g43
Saved metadata for g53
Saved metadata for g6
Saved metadata for g22
Saved metadata for g71
Saved metadata for g39
Saved metadata for g69
Saved metadata for g61
Saved metadata for g51
Saved metadata for g26
Saved metadata for g28
Saved metadata for g72
Saved metadata for g79
Saved metadata for g15
Saved metadata for g78
Saved metadata for g24
Saved metadata for g5
Saved metadata for g35
Saved metadata for g11
Saved metadata for g29
Saved metadata for g48
Saved metadata for g19
Saved metadata for g10
Saved metadata for g30
Saved metadata for g32
Saved metadata for g36
Saved metadata for g14
Saved metadata for g85
Saved metadata for g76
Saved metadata for g82
Saved metadata for g16
Saved metadata 

# Merge all individual annotators' metadata files into one file

In [3]:
# Everything in phase1_dir that is not a .pkl/.csv file is taken to be a
# participant folder containing a 'metadata.pkl'.
participants = [
    entry for entry in os.listdir(phase1_dir)
    if not (entry.endswith('.pkl') or entry.endswith('.csv'))
]

# Load every participant's frame and stamp it with the folder name.
# (read_pickle executes pickled code: only point this at files written by this notebook.)
metadata_list = [
    pd.read_pickle(os.path.join(phase1_dir, participant, 'metadata.pkl')).assign(participant=participant)
    for participant in participants
]

# Stack all participants; columns that a participant lacks are filled with -1.
metadata_df = pd.concat(metadata_list).fillna(-1)

# Move 'participant' to the front and keep the remaining column order.
other_columns = [col for col in metadata_df.columns if col != 'participant']
metadata_df = metadata_df[['participant'] + other_columns]

# Image-level confidence is a mean; round it to the nearest whole value.
metadata_df['confidence'] = metadata_df['confidence'].round()

In [4]:
# Write the consolidated table twice: as CSV for inspection and as a pickle,
# which keeps the list/array-valued columns intact.
consolidated_csv = os.path.join(phase1_dir, 'metadata_phase1_german.csv')
consolidated_pkl = os.path.join(phase1_dir, 'metadata_phase1_german.pkl')

metadata_df.to_csv(consolidated_csv, index=False)
with open(consolidated_pkl, 'wb') as out_file:
    pickle.dump(metadata_df, out_file, protocol=pickle.HIGHEST_PROTOCOL)
    print('Saved consolidated metadata file')

Saved consolidated metadata file
