In [1]:
import numpy as np

import pandas as pd
import sys
import os

In [2]:
# import plotting libraries
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline
plt.style.use('seaborn') # pretty matplotlib plots

import seaborn as sns
sns.set('notebook', font_scale=1.25, style='whitegrid')



## Setup Paths on your System

In [3]:
PATH_TO_TMED2 = "/Users/mhughes/Box/TMED2/approved_users_only/"

# Utility Functions

Simple functions, reused later

In [4]:
def map_fine_diagnosis_str_to_int(s):
    if s.count("no"):
        return 0
    elif s.count("mild_AS"):
        return 1
    elif s.count("mildtomod_AS"):
        return 2
    elif s.count("moderate_AS"):
        return 3
    elif s.count("severe_AS"):
        return 4
    else:
        raise ValueError("BAD STRING")
        
def convert_int_to_diagnosis_str(i):
    if i == 0:
        return "no_AS"
    elif i == 1:
        return "mild_AS"
    elif i == 2:
        return "mildtomod_AS"
    elif i == 3:
        return "moderate_AS"
    elif i == 4:
        return "severe_AS"
    else:
        raise ValueError("BAD INT VALUE (not in 0-4)")

In [5]:
def make_coarse_label(s):
    if s.count("mild"):
        return 'early_AS'
    elif s.count("no"):
        return 'no_AS'
    else:
        return 'significant_AS'
    
def make_coarse_label_column_from_df(s_arr):
    return [make_coarse_label(s) for s in s_arr]

In [6]:
def make_percentile_func(n):
    def percentile_(x):
        return np.percentile(x, n)
    percentile_.__name__ = 'percentile_%s' % n
    return percentile_

In [7]:
def pretty_print_description(df, colname, percentiles=[0.0,.05,.50,.95,1.00]):
    descr_df = df[colname].describe(percentiles)
    
    b_df = pd.DataFrame([descr_df.values], columns=descr_df.index.values)
    keys = ['count'] + ['%d%%' % (100*p) for p in percentiles]
    print(b_df[keys].to_string(index=False))
    return b_df, keys

## Function to load labeled set summary dataframe from CSV files

In [8]:
def load_fold_specific_dataframes(
        labeled_csv_relpath='DEV479/TMED2_fold0_labeledpart.csv',
        unlabeled_csv_relpath=None,
        count_key = '__count__',
        verbose=False,
        ):
    ''' Load dataframes describing labels and train/test allocations for each patient/study/image
    
    Returns
    -------
    all_image_df : Dataframe with one row for every image (labeled and unlabeled)
    labeled_image_df : Dataframe with one row per labeled image
    study_df : Dataframe with one row per study
    patient_df : Dataframe with one row per patient
    '''
    # Load CSV describing every view-labeled image fully-labeled set
    labeled_df = pd.read_csv(os.path.join(PATH_TO_TMED2, labeled_csv_relpath))

    # Load CSV describing unlabeled images that correspond to the studies in labeled set
    if unlabeled_csv_relpath is None:
        unlabeled_csv_relpath = labeled_csv_relpath.replace("_labeled", "_unlabeled")
    unlabeled_df = pd.read_csv(os.path.join(PATH_TO_TMED2, unlabeled_csv_relpath))
    
    # Convert patient_id and study_id to integers for easy operations
    # Also create a patient_study_id key that uniquely ids each patient-study by concat-ing the two together
    for name, df in [('labeled',labeled_df), ('unlabeled',unlabeled_df)]:
        df[['patient_id_str','study_id_str','_unused_']] = df['query_key'].str.split("s|_", expand=True)
        df['patient_id'] = df['patient_id_str'].astype(int)
        df['study_id'] = df['study_id_str'].astype(int)
        assert df['patient_id'].max() < 10000
        df['patient_study_id'] = df['patient_id'] * 10000 + df['study_id']
        if verbose:
            print(name.upper())
            print("raw num patients: %5d" % df['patient_id'].unique().size)
            print("raw num studies : %5d" % df['patient_study_id'].unique().size)
    
    # Keep only the images with a view label in the fully-labeled set
    labeled_image_df = labeled_df.query("SourceFolder == 'view_and_diagnosis_labeled_set/labeled'").copy()
    labeled_image_df.sort_values(['patient_study_id', 'query_key', 'view_label'], inplace=True)

    # Build dataframe where each row is a unique PATIENT-STUDY with 1+ images in fully-labeled set
    study_keys = ['patient_id','study_id','patient_study_id',
                  'view_classifier_split','diagnosis_classifier_split', 'diagnosis_label']
    study_df = labeled_image_df[study_keys].drop_duplicates(keep='last')
    study_df.sort_values(['patient_study_id'], inplace=True)

    # Build dataframe where each row is a unique PATIENT with 1+ images in fully-labeled set
    patient_keys = ['patient_id','view_classifier_split','diagnosis_classifier_split']
    patient_df = labeled_image_df[patient_keys].drop_duplicates(keep='last')
    patient_df.sort_values(['patient_id'], inplace=True)

    # Verify each row of study_df is uniquely keyed by patient_study_id
    assert study_df['patient_study_id'].unique().size == study_df.shape[0]

    # Verify each row of patient_df is uniquely keyed by patient_id
    assert patient_df['patient_id'].unique().size == patient_df.shape[0]
    
    # Verify view and diagnosis split assignments always the same
    assert np.all(patient_df['view_classifier_split'] == patient_df['diagnosis_classifier_split'])
    assert np.all(study_df['view_classifier_split'] == study_df['diagnosis_classifier_split'])
    
    # Verify each patient's studies always assigned to exactly one split
    for p in patient_df['patient_id']:
        q_df = study_df.query("patient_id == %d" % p)
        vals = q_df['view_classifier_split'].unique()
        assert vals.size == 1
        vals = q_df['diagnosis_classifier_split'].unique()
        assert vals.size == 1

    if verbose:
        # Pretty print the first 10 and last 10 images from labeled set
        def pretty_print_img_df(img_df):
            simple_view_df = img_df[['query_key', 'view_label', 'diagnosis_label', 'diagnosis_classifier_split']]
            print(simple_view_df.head(6).to_string(index=False))
            print("...")
            print(simple_view_df.tail(6).to_string(index=False, header=False))
        print("Loaded from filepath: %s" % labeled_csv_relpath)
        pretty_print_img_df(labeled_image_df)
    
    # Grab unlabeled images that correspond to the labeled studies
    # Make sure these images are put into the right split
    keep_df_list = list()
    for split in ['train', 'val', 'test']:
        q_df = study_df.query("diagnosis_classifier_split == '%s'" % split)
        labeled_study_ids = q_df['patient_study_id'].unique()
        q_unlabeled_img_df = unlabeled_df.loc[
            unlabeled_df['patient_study_id'].isin(labeled_study_ids).values].copy()
        q_unlabeled_img_df['diagnosis_classifier_split'] = split
        keep_df_list.append(q_unlabeled_img_df)
    keep_unlabeled_img_df = pd.concat(keep_df_list)

    # Build combined dataframe of ALL images (labeled and unlabeled)
    # FYI some "query_key" are redundant due to separate indexing in lab/unlab sets
    all_image_df = pd.concat([labeled_image_df, keep_unlabeled_img_df])
    all_image_df.sort_values(['patient_study_id', 'query_key', 'view_label'], inplace=True)

    # Clean up all dataframes (fresh indexing from 0,1,... for each one)
    all_image_df = all_image_df.reset_index(drop=True)
    labeled_image_df = labeled_image_df.reset_index(drop=True)
    study_df = study_df.reset_index(drop=True)
    patient_df = patient_df.reset_index(drop=True)
    
    # Count number of images per study
    all_image_df[count_key] = 1
    labeled_image_df[count_key] = 1
    study_df['n_all_image_per_study'] = all_image_df.groupby(['patient_study_id'], as_index=False).agg(
        {count_key: 'sum'})[count_key]
    study_df['n_labeled_image_per_study'] = labeled_image_df.groupby(['patient_study_id'], as_index=False).agg(
        {count_key: 'sum'})[count_key]
    study_df['diagnosis_label_as_int'] = [map_fine_diagnosis_str_to_int(s) for s in study_df['diagnosis_label'].values]
    
    # Count number of studies per patient and worst diagnosis per patient
    study_df[count_key] = 1
    patient_df['n_study_per_patient'] = study_df.groupby(['patient_id'], as_index=False).agg(
        {count_key: 'sum'})[count_key]
    patient_df['worst_diagnosis_label_as_int'] = study_df.groupby(['patient_id'], as_index=False).agg(
        {'diagnosis_label_as_int': 'max'})['diagnosis_label_as_int']
    patient_df['worst_diagnosis_label'] = [convert_int_to_diagnosis_str(i) for i in patient_df['worst_diagnosis_label_as_int']]

    return all_image_df, labeled_image_df, study_df, patient_df

In [9]:
for fold_id, split_csv in [
        ('fold0', 'DEV479/TMED2_fold0_labeledpart.csv'),
        ('fold1', 'DEV479/TMED2_fold1_labeledpart.csv'),
        ('fold2', 'DEV479/TMED2_fold2_labeledpart.csv'),
        ]:
    
    all_image_df, labeled_image_df, study_df, patient_df = load_fold_specific_dataframes(split_csv)    

    print("============== FOLD %s" % fold_id)
    print("TOTAL PATIENTS")
    print(patient_df.shape[0])
    print("TOTAL STUDIES")
    print(study_df.shape[0])
    print("STUDIES PER PATIENT")
    vals, counts = np.unique(patient_df['n_study_per_patient'], return_counts=1)
    for vv, cc in zip(vals, counts):
        print(" %3d patients contributed %d studies" % (cc, vv))    
    print("TOTAL IMAGES")
    print(all_image_df.shape[0])
    print("TOTAL LABELED IMAGES")
    print(labeled_image_df.shape[0])
    
    with pd.option_context('display.float_format', '{:0.1f}'.format):
        print("NUM ALL IMAGES PER STUDY")
        pretty_print_description(study_df, 'n_all_image_per_study')
        
        print("NUM LABELED IMAGES PER STUDY")
        pretty_print_description(study_df, 'n_labeled_image_per_study')
        
    print()


TOTAL PATIENTS
576
TOTAL STUDIES
598
STUDIES PER PATIENT
 554 patients contributed 1 studies
  22 patients contributed 2 studies
TOTAL IMAGES
43823
TOTAL LABELED IMAGES
17270
NUM ALL IMAGES PER STUDY
 count   0%   5%  50%   95%  100%
 598.0 15.0 48.0 70.0 105.1 181.0
NUM LABELED IMAGES PER STUDY
 count  0%  5%  50%  95%  100%
 598.0 1.0 4.9 19.0 71.0 107.0

TOTAL PATIENTS
576
TOTAL STUDIES
598
STUDIES PER PATIENT
 554 patients contributed 1 studies
  22 patients contributed 2 studies
TOTAL IMAGES
43823
TOTAL LABELED IMAGES
17270
NUM ALL IMAGES PER STUDY
 count   0%   5%  50%   95%  100%
 598.0 15.0 48.0 70.0 105.1 181.0
NUM LABELED IMAGES PER STUDY
 count  0%  5%  50%  95%  100%
 598.0 1.0 4.9 19.0 71.0 107.0

TOTAL PATIENTS
576
TOTAL STUDIES
598
STUDIES PER PATIENT
 554 patients contributed 1 studies
  22 patients contributed 2 studies
TOTAL IMAGES
43823
TOTAL LABELED IMAGES
17270
NUM ALL IMAGES PER STUDY
 count   0%   5%  50%   95%  100%
 598.0 15.0 48.0 70.0 105.1 181.0
NUM LABELED 

# Create tables summarizing cohorts across train/test splits

In [10]:
def summarize_train_test_label_contents_of_fold(
        all_image_df, labeled_image_df, study_df, patient_df,
        percentiles=[10, 20, 50, 80, 90],
        verbose=False,
        count_key = '__count__'
        ):
    row_dict_list = list()
    for diagnosis_label in ['no_AS', 'mild_AS', 'mildtomod_AS', 'moderate_AS', 'severe_AS']:
        for split in ['train', 'val', 'test']:
            q_df = study_df.query(
                "diagnosis_label == '%s' and diagnosis_classifier_split == '%s'" % (diagnosis_label, split))

            row_dict = dict(
                fine_label=diagnosis_label,
                split=split,
                num_patients=q_df['patient_id'].unique().size,
                num_studies=q_df.shape[0],
            )
            # TODO add this key to summarize all images not just labeled ('all', all_image_df)
            for imgtype_key, img_df in [
                    ('all', all_image_df),
                    ('labeled', labeled_image_df),
                    ('frac_labeled', labeled_image_df)]:
                iq_df = img_df.query(
                    "diagnosis_label == '%s' and diagnosis_classifier_split == '%s'" % (
                        diagnosis_label, split)).copy()
                iq_df[count_key] = 1
                n_im_per_study_df = iq_df.groupby(['patient_study_id'], as_index=False).agg(
                    {count_key: 'sum'})[['patient_study_id', count_key]]
                n_im_per_study = n_im_per_study_df[count_key]
                if imgtype_key.startswith("frac"):
                    n_im_per_study /= n_im_per_study_all
                
                if verbose and split == 'train' and diagnosis_label == 'no_AS':
                    print("Num %s images in first 10 studies in TRAIN with label no_AS" % imgtype_key)
                    print(n_im_per_study.values[:10])

                if not imgtype_key.startswith('frac'):
                    row_dict['num_images_%s' % imgtype_key] = iq_df.shape[0]
                    
                for perctile in percentiles:
                    if imgtype_key.startswith('frac'):
                        key = 'frac_images_per_study_%s_%02d' % (imgtype_key[5:], perctile)
                    else:
                        key = 'num_images_per_study_%s_%02d' % (imgtype_key, perctile)
                    row_dict[key] = \
                        np.percentile(n_im_per_study, perctile)
                if imgtype_key == 'all':
                    n_im_per_study_all = np.asarray(n_im_per_study, dtype=np.float64)
                
            row_dict_list.append(row_dict)
    
    summary_df = pd.DataFrame(row_dict_list)
    summary_df['coarse_label'] = make_coarse_label_column_from_df(summary_df['fine_label'])
    summary_df['coarse_label'] = pd.Categorical(summary_df['coarse_label'], categories=['no_AS', 'early_AS', 'significant_AS'])
    summary_df['split'] = pd.Categorical(summary_df['split'], categories=['train', 'val', 'test'])
    
    keys = [
        'num_patients', 'num_studies',
        'num_images_labeled'] + ['num_images_per_study_labeled_%02d' % p for p in percentiles]
    if 'num_images_all' in row_dict:
        keys = keys + [k.replace("_labeled", "_all") for k in keys if k.count('_labeled')]
        keys = keys + [k.replace("num_images_per_study_", "frac_images_per_study_") for k in keys if k.count('num_images_per_study_labeled')]
    keys = np.unique(keys).tolist()
    coarse_df = summary_df[['coarse_label', 'split'] + keys].copy()
    sum_df_by_key = dict()
    for key in keys:
        if key.count("frac_images"):
            aggfunc = 'max'
            margins = False
        elif key.count("per_study"):
            aggfunc = 'sum'
            margins = False # cant aggregate acros per-study stats easily            
        else:
            aggfunc = 'sum'
            margins = True
        my_df = coarse_df.groupby(['coarse_label', 'split'], as_index=False).agg({key: aggfunc})
        sum_df = pd.crosstab(my_df['split'], my_df['coarse_label'], my_df[key], aggfunc=aggfunc, margins=margins)
        sum_df_by_key[key] = sum_df
        if verbose:
            print("=== KEY " + key)
            with pd.option_context('display.float_format', '{:0.2f}'.format):
                print(sum_df)
            print("")
    return sum_df_by_key #, n_im_per_study_all, n_im_per_study_df

In [11]:
def pprint_tables_horizontal_stack(df, col1, col2):
    str1 = df[col1].to_string(index=True).replace("coarse_label", "            ")
    str2 = df[col2].to_string(index=True).replace("coarse_label", "            ")
    L1, L2 = str1.count("\n"), str2.count("\n")
    str2 += "\n" * (L1-L2)
    print("%50s  ||  %s" % (col1.upper(), col2.upper()))
    print("%50s  ||  %s" % ('-' * 50, '-' * len(col2)))
    for line1, line2 in zip(str1.split("\n"), str2.split("\n")):
        print("%40s  ||  %s" % (line1,line2))

## FOLD 0 summary

In [12]:
fold0_df = summarize_train_test_label_contents_of_fold(
    *load_fold_specific_dataframes('DEV479/TMED2_fold0_labeledpart.csv'))

In [13]:
pprint_tables_horizontal_stack(fold0_df, 'num_studies', 'num_images_per_study_labeled_50')

                                       NUM_STUDIES  ||  NUM_IMAGES_PER_STUDY_LABELED_50
--------------------------------------------------  ||  -------------------------------
              no_AS  early_AS  significant_AS  All  ||                no_AS  early_AS  significant_AS
split                                               ||  split                                        
train            76       103             181  360  ||  train          13.0      21.0            95.5
val              25        34              60  119  ||  val            14.0      28.0           102.0
test             25        34              60  119  ||  test           14.0      35.0           114.5
All             126       171             301  598  ||  


## FOLD 1 summary

In [14]:
fold1_df = summarize_train_test_label_contents_of_fold(
    *load_fold_specific_dataframes('DEV479/TMED2_fold1_labeledpart.csv'))

In [15]:
pprint_tables_horizontal_stack(fold1_df, 'num_studies', 'num_images_per_study_labeled_50')

                                       NUM_STUDIES  ||  NUM_IMAGES_PER_STUDY_LABELED_50
--------------------------------------------------  ||  -------------------------------
              no_AS  early_AS  significant_AS  All  ||                no_AS  early_AS  significant_AS
split                                               ||  split                                        
train            76       103             181  360  ||  train          13.0      22.0           100.5
val              25        34              60  119  ||  val            14.0      23.0            97.0
test             25        34              60  119  ||  test           10.0      28.0           110.0
All             126       171             301  598  ||  


## FOLD 2 summary

In [16]:
fold2_df = summarize_train_test_label_contents_of_fold(
    *load_fold_specific_dataframes('DEV479/TMED2_fold2_labeledpart.csv'))

pprint_tables_horizontal_stack(fold2_df, 'num_studies', 'num_images_per_study_labeled_50')

                                       NUM_STUDIES  ||  NUM_IMAGES_PER_STUDY_LABELED_50
--------------------------------------------------  ||  -------------------------------
              no_AS  early_AS  significant_AS  All  ||                no_AS  early_AS  significant_AS
split                                               ||  split                                        
train            76       103             181  360  ||  train          13.5      22.0            99.5
val              24        34              60  118  ||  val            13.0      22.0            95.5
test             26        34              60  120  ||  test           13.5      31.0            98.0
All             126       171             301  598  ||  
