# Imports:

In [2]:
import pandas as pd
import json, glob, os
from pathlib import Path
from typing import Callable, Optional, Tuple, Union
import PIL.Image

# Create Binary Image Labels:

- Create labels for all Tharun Thompson images for binary classification: PTC-like (1) or non-PTC-like (0).

In [88]:
# Collect every file in the Tharun Thompson image folder.
# glob does not guarantee any ordering (it depends on the file system), so sort
# the result to make the label lists and saved JSON files reproducible across machines.
image_paths = sorted(glob.glob("data/TharunThompson/*"))
image_paths[0:5]

['all_imgs/TharunThompson\\100a.jpeg',
 'all_imgs/TharunThompson\\100b.jpeg',
 'all_imgs/TharunThompson\\100c.jpeg',
 'all_imgs/TharunThompson\\100d.jpeg',
 'all_imgs/TharunThompson\\100e.jpeg']

In [89]:
# Load the detailed classification table for the Tharun Thompson images
# and preview its first rows.
tt_class_csv = 'data/tt_class_detailed.csv'
class_df = pd.read_csv(tt_class_csv)
class_df.head()

Unnamed: 0,patient,diagnose,Slices,Resolution (20x/40x),classification,img_lbl,bi_class
0,1,PTC,10,40,PTC-like,1a.jpeg,1.0
1,1,PTC,10,40,PTC-like,1b.jpeg,1.0
2,1,PTC,10,40,PTC-like,1c.jpeg,1.0
3,1,PTC,10,40,PTC-like,1d.jpeg,1.0
4,1,PTC,10,40,PTC-like,1e.jpeg,1.0


In [251]:
# How many images fall into each binary classification group?
class_df['classification'].value_counts()

non-PTC-like    850
PTC-like        646
Name: classification, dtype: int64

In [90]:
# Strip the directory part from each image path, keeping only the file name.
fnames = list(map(os.path.basename, image_paths))
fnames[:5]

['100a.jpeg', '100b.jpeg', '100c.jpeg', '100d.jpeg', '100e.jpeg']

In [91]:
# Binary label (1 = PTC-like, 0 = not) for every image file, in `fnames` order.
# Build the filename -> label lookup once instead of boolean-masking the whole
# frame for each file; keep='first' mirrors the original `.values[0]` choice
# should an image ever be listed more than once in the CSV.
bi_class_by_img = (
    class_df
    .drop_duplicates(subset='img_lbl', keep='first')
    .set_index('img_lbl')['bi_class']
)

# Fail early and name the offending files rather than raising an opaque IndexError.
missing_imgs = [x for x in fnames if x not in bi_class_by_img.index]
if missing_imgs:
    raise KeyError(
        f"{len(missing_imgs)} image file(s) have no row in class_df, e.g. {missing_imgs[:5]}"
    )

# bi_class is stored as a float in the CSV (e.g. 1.0), hence the int() cast.
class_lbls = [int(bi_class_by_img.loc[x]) for x in fnames]
class_lbls[0:5]

[0, 0, 0, 0, 0]

In [92]:
# Patient id and diagnosis for every image file, in `fnames` order.
# Index the table by image filename once so each lookup is a hash access rather
# than a scan of the whole frame; keep='first' mirrors the original `.values[0]`.
tt_rows_by_img = class_df.drop_duplicates(subset='img_lbl', keep='first').set_index('img_lbl')

# Fail early and name the offending files rather than raising an opaque IndexError.
unlabelled_imgs = [x for x in fnames if x not in tt_rows_by_img.index]
if unlabelled_imgs:
    raise KeyError(
        f"{len(unlabelled_imgs)} image file(s) have no row in class_df, e.g. {unlabelled_imgs[:5]}"
    )

# Both values are stored as strings so they serialise to JSON unchanged.
patient_lbls = [str(tt_rows_by_img.at[x, 'patient']) for x in fnames]
print(patient_lbls[0:5])
diagnosis_lbls = [str(tt_rows_by_img.at[x, 'diagnose']) for x in fnames]
print(diagnosis_lbls[0:5])

['100', '100', '100', '100', '100']
['FA', 'FA', 'FA', 'FA', 'FA']


In [93]:
# Pair each filename with its binary label; these [filename, label] pairs are
# the labels used during GAN and model training.
lbl_list = [[fname, lbl] for fname, lbl in zip(fnames, class_lbls)]
lbl_list[:5]

[['100a.jpeg', 0],
 ['100b.jpeg', 0],
 ['100c.jpeg', 0],
 ['100d.jpeg', 0],
 ['100e.jpeg', 0]]

In [94]:
# Per-image metadata rows for model evaluation:
# [filename, binary label, patient id, diagnosis].
meta_list = [
    [fname, lbl, patient, diagnosis]
    for fname, lbl, patient, diagnosis in zip(fnames, class_lbls, patient_lbls, diagnosis_lbls)
]
meta_list[:5]

[['100a.jpeg', 0, '100', 'FA'],
 ['100b.jpeg', 0, '100', 'FA'],
 ['100c.jpeg', 0, '100', 'FA'],
 ['100d.jpeg', 0, '100', 'FA'],
 ['100e.jpeg', 0, '100', 'FA']]

In [95]:
# Bundle the training labels and the evaluation metadata into one dictionary.
lab_dict = {
    'labels': lbl_list,
    'meta': meta_list,
}

In [96]:
# Save the binary-classification labels and metadata to a JSON file.
bi_json_path = Path('data/labels/bi_dataset.json')
# open(..., 'w') does not create missing parent folders, so make sure the
# output directory exists (a fresh checkout may not have it yet).
bi_json_path.parent.mkdir(parents=True, exist_ok=True)
with open(bi_json_path, 'w') as f:
    json.dump(lab_dict, f)

# Create Multi-Class Classification Labels:
- Create labels based on the multi-class diagnosis, encoded as follows:
    
    PTC-like:
        - Classical papillary thyroid carcinoma (PTC, 53 patients) - 0
        - Noninvasive follicular thyroid neoplasm with papillary-like nuclear features (NIFTP, 9 patients) - 1
        - Follicular variant papillary thyroid carcinoma (FVPTC, 9 patients) - 2
    Non-PTC-like:
        - Follicular thyroid adenoma (FA, 53 patients) - 3
        - Follicular thyroid carcinoma (FTC, 32 patients) - 4

In [101]:
# How many images belong to each diagnosis class?
class_df['diagnose'].value_counts()

FA       530
PTC      484
FTC      320
FVPTC     86
NIFTP     76
Name: diagnose, dtype: int64

In [102]:
# Integer class code for each diagnosis abbreviation.
# Codes 0-2 are the PTC-like diagnoses, codes 3-4 the non-PTC-like ones.
multi_class_dict = dict(PTC=0, NIFTP=1, FVPTC=2, FA=3, FTC=4)

In [116]:
# Translate every row's diagnosis into its integer class code
# (raises KeyError for a diagnosis missing from multi_class_dict).
m_c_lbls = [multi_class_dict[diag] for diag in class_df['diagnose']]

# Store the codes on the frame as a new column:
class_df['mc_lbls'] = m_c_lbls

# Sanity check: the per-code counts should match the per-diagnosis image counts above.
class_df['mc_lbls'].value_counts()

3    530
0    484
4    320
2     86
1     76
Name: mc_lbls, dtype: int64

In [118]:
# Align the multi-class labels with the filenames (same order as `fnames`).
# Build the filename -> code lookup once instead of boolean-masking the whole
# frame for each file; keep='first' mirrors the original `.values[0]` choice
# should an image ever be listed more than once in the CSV.
mc_lbl_by_img = (
    class_df
    .drop_duplicates(subset='img_lbl', keep='first')
    .set_index('img_lbl')['mc_lbls']
)

# Fail early and name the offending files rather than raising an opaque IndexError.
mc_missing_imgs = [x for x in fnames if x not in mc_lbl_by_img.index]
if mc_missing_imgs:
    raise KeyError(
        f"{len(mc_missing_imgs)} image file(s) have no row in class_df, e.g. {mc_missing_imgs[:5]}"
    )

mc_fname_lbls = [int(mc_lbl_by_img.loc[x]) for x in fnames]
mc_fname_lbls[0:5]

[3, 3, 3, 3, 3]

In [119]:
# Pair each filename with its multi-class code as [filename, code].
mc_lbl_list = [[fname, code] for fname, code in zip(fnames, mc_fname_lbls)]
mc_lbl_list[:5]

[['100a.jpeg', 3],
 ['100b.jpeg', 3],
 ['100c.jpeg', 3],
 ['100d.jpeg', 3],
 ['100e.jpeg', 3]]

In [120]:
# Wrap the multi-class label pairs in a dictionary under the 'labels' key.
mc_lab_dict = {'labels': mc_lbl_list}

In [122]:
# Save the multi-class labels to a JSON file.
mc_json_path = Path('data/labels/mc_dataset.json')
# open(..., 'w') does not create missing parent folders, so make sure the
# output directory exists (a fresh checkout may not have it yet).
mc_json_path.parent.mkdir(parents=True, exist_ok=True)
with open(mc_json_path, 'w') as f:
    json.dump(mc_lab_dict, f)

# Labelling the External Dataset Images:

In [97]:
# Collect every file in the external (NIKI / TCGA) image folder.
# glob does not guarantee any ordering (it depends on the file system), so sort
# the result to make the saved metadata and test-filename lists reproducible.
ext_paths = sorted(glob.glob("data/niki_tcga/*"))
ext_paths[0:5]

['all_imgs/External\\NIK-A008_0.jpeg',
 'all_imgs/External\\NIK-A008_1.jpeg',
 'all_imgs/External\\NIK-A008_10.jpeg',
 'all_imgs/External\\NIK-A008_11.jpeg',
 'all_imgs/External\\NIK-A008_12.jpeg']

In [98]:
# Load the classification table for the external dataset and preview its first rows.
ext_class_csv = 'data/ext_data_classification.csv'
ext_df = pd.read_csv(ext_class_csv)
ext_df.head()

Unnamed: 0,case_id,diag_short,diagnosis,gender,dataset,classification,binary_class
0,TCGA-BJ-A0Z2,FA,"Follicular adenocarcinoma, NOS",Male,tcga,non-PTC-like,0
1,TCGA-DJ-A4UQ,NSC,Nonencapsulated sclerosing carcinoma,Male,tcga,non-PTC-like,0
2,TCGA-BJ-A192,OA,Oxyphilic adenocarcinoma,Female,tcga,non-PTC-like,0
3,TCGA-L6-A4ET,FVPTC,"Papillary carcinoma, follicular variant",Female,tcga,PTC-like,1
4,TCGA-EM-A3OA,FVPTC,"Papillary carcinoma, follicular variant",Female,tcga,PTC-like,1


In [99]:
# Distinct short diagnosis codes present in the external dataset.
ext_df.diag_short.unique().tolist()

['FA', 'NSC', 'OA', 'FVPTC', 'NIFTP', 'B']

In [100]:
# File name (last path component) of every external image.
ext_fnames = [os.path.split(path)[-1] for path in ext_paths]
ext_fnames[:5]

['NIK-A008_0.jpeg',
 'NIK-A008_1.jpeg',
 'NIK-A008_10.jpeg',
 'NIK-A008_11.jpeg',
 'NIK-A008_12.jpeg']

In [101]:
# Build, in `ext_fnames` order, the binary class (PTC-like = 1, not = 0),
# the short diagnosis code and the patient id of every external image.

# The case id is the part of the filename before the first underscore,
# e.g. 'NIK-A008_0.jpeg' -> 'NIK-A008'. Parse it once and reuse it below.
ext_case_ids = [x.split('_')[0] for x in ext_fnames]

# Index the table by case id once so each lookup is a hash access rather than a
# scan of the whole frame; keep='first' mirrors the original `.values[0]`.
ext_rows_by_case = ext_df.drop_duplicates(subset='case_id', keep='first').set_index('case_id')

# Fail early and name the offending cases rather than raising an opaque IndexError.
unknown_cases = sorted(set(ext_case_ids) - set(ext_rows_by_case.index))
if unknown_cases:
    raise KeyError(
        f"{len(unknown_cases)} case id(s) from the image filenames are not in ext_df, e.g. {unknown_cases[:5]}"
    )

ext_class = [int(ext_rows_by_case.at[case_id, 'binary_class']) for case_id in ext_case_ids]
print(ext_class[0:5])
ext_diag = [ext_rows_by_case.at[case_id, 'diag_short'] for case_id in ext_case_ids]
print(ext_diag[0:5])
# The patient id is the last dash-separated token of the case id ('NIK-A008' -> 'A008').
ext_pat = [case_id.split('-')[-1] for case_id in ext_case_ids]
print(ext_pat[0:5])

[1, 1, 1, 1, 1]
['NIFTP', 'NIFTP', 'NIFTP', 'NIFTP', 'NIFTP']
['A008', 'A008', 'A008', 'A008', 'A008']


In [102]:
# Per-image metadata rows for the external set:
# [filename, binary class, patient id, diagnosis].
ext_meta_list = [
    [fname, bin_class, patient, diagnosis]
    for fname, bin_class, patient, diagnosis in zip(ext_fnames, ext_class, ext_pat, ext_diag)
]
ext_meta_list[:5]

[['NIK-A008_0.jpeg', 1, 'A008', 'NIFTP'],
 ['NIK-A008_1.jpeg', 1, 'A008', 'NIFTP'],
 ['NIK-A008_10.jpeg', 1, 'A008', 'NIFTP'],
 ['NIK-A008_11.jpeg', 1, 'A008', 'NIFTP'],
 ['NIK-A008_12.jpeg', 1, 'A008', 'NIFTP']]

In [103]:
# The external images are used for testing only, never for training,
# so the dictionary holds just a 'meta' entry.
ext_lab_dict = {'meta': ext_meta_list}

In [104]:
# Save the external-dataset metadata to a JSON file.
ext_json_path = Path('data/labels/ext_dataset.json')
# open(..., 'w') does not create missing parent folders, so make sure the
# output directory exists (a fresh checkout may not have it yet).
ext_json_path.parent.mkdir(parents=True, exist_ok=True)
with open(ext_json_path, 'w') as f:
    json.dump(ext_lab_dict, f)

In [105]:
# Every external filename goes under the 'test' key for use at test time.
test_dict = {'test': ext_fnames}

In [106]:
# Save the external test filenames to a JSON file.
ext_test_json_path = Path('data/labels/ext_test_fnames.json')
# open(..., 'w') does not create missing parent folders, so make sure the
# output directory exists (a fresh checkout may not have it yet).
ext_test_json_path.parent.mkdir(parents=True, exist_ok=True)
with open(ext_test_json_path, 'w') as f:
    json.dump(test_dict, f)