In [1]:
import collections
import os
import pickle
import re

import numpy as np
import tqdm

In [2]:
# Read the pickled datalist at `datafile` into `data`.
# NOTE: pickle.load can execute arbitrary code; only use on trusted files.
datafile = '/gpfs/data/geraslab/Nan/data/breast_mml_datalists/20220111/breasts_lists/ffdm_screening_only/full/train'
with open(datafile, mode="rb") as handle:
    data = pickle.load(handle)

In [13]:
import pandas as pd

# One flat record per entry of `data`, keeping the episode id, the three
# label fields and the first element of the FFDM view
# (presumably the laterality letter of the view name — TODO confirm).
df = [
    {
        'episode_id': instance['episode_id'],
        'biopsied': instance['biopsied'],
        'malignant': instance['malignant'],
        'benign': instance['benign'],
        'lateral': instance['ffdm']['view'][0],
    }
    for instance in data
]

In [14]:
# Turn the list of per-instance records into a DataFrame (one row per record).
# Note that `df` is rebound here from a list to a DataFrame.
df = pd.DataFrame(df)

In [20]:
# For every (episode_id, lateral) group, average each label column, then sum
# those group means column-wise (axis 0).
# NOTE(review): the output recorded below this cell has dtype int64, whereas a
# groupby mean normally yields floats — the recorded output may come from an
# earlier version of this cell; re-run to confirm.
df.groupby(['episode_id', 'lateral']).mean().sum(0)

biopsied     6509
malignant     500
benign       3231
dtype: int64

In [2]:
# Read the pickled multimodal datalist into `data_full_set`
# (indexed by position in later cells: [0] and [1]).
# NOTE: pickle.load can execute arbitrary code; only use on trusted files.
datafile = '/gpfs/data/geraslab/Ashen/multimodal/multi_modality_datalist/03092022/datalist.pkl'
with open(datafile, mode="rb") as handle:
    data_full_set = pickle.load(handle)

In [3]:
# Read the pickled FFDM screening dictionary; later cells index it by
# accession number.
# NOTE: pickle.load can execute arbitrary code; only use on trusted files.
single_modal_dict_prefix = '/gpfs/data/geraslab/Ashen/multimodal/single_modality_dictionary/01102022'

ffdm_screening_path = os.path.join(single_modal_dict_prefix, 'ffdm_screening.pkl')
with open(ffdm_screening_path, mode="rb") as handle:
    ffdm_screening_data = pickle.load(handle)

In [47]:
def get_ffdm_info_list(ffdm_exam, diagnostic, lateral):
    """Build one info dict per image of an FFDM exam for a single breast side.

    Args:
        ffdm_exam: mapping describing the exam. Read keys: 'accession_number',
            'original_image_size' (its keys are the view names), one entry per
            view name holding that view's filenames, 'horizontal_flip' and
            'best_center'.
        diagnostic: truthy for diagnostic exams. This appends '_diagnostic' to
            the image prefix, reads 'horizontal_flip' per view and image
            instead of once per exam, and sets 'crop_method' to 'no_crop'.
        lateral: side name; only views starting with its upper-cased first
            character are kept (e.g. 'left' keeps views starting with 'L').

    Returns:
        A list of dicts with keys 'img_prefix', 'accession_number', 'index',
        'view', 'filename', 'horizontal_flip', 'best_center', 'crop_method',
        ordered by view and then by image index within the view.
    """
    root = '/gpfs/data/geraslab/jp4989/data/2021.07.16.combined_ffdm_cropped'
    img_prefix = f"{root}_diagnostic" if diagnostic else root
    acn = ffdm_exam['accession_number']

    records = []
    for view in ffdm_exam['original_image_size'].keys():
        # Skip views that belong to the other breast.
        if not view.startswith(lateral[0].upper()):
            continue
        for index, filename in enumerate(ffdm_exam[view]):
            if diagnostic:
                flip = ffdm_exam['horizontal_flip'][view][index]
            else:
                flip = ffdm_exam['horizontal_flip']
            records.append({
                'img_prefix': img_prefix,
                'accession_number': acn,
                'index': index,
                'view': view,
                'filename': filename,
                'horizontal_flip': flip,
                'best_center': ffdm_exam['best_center'][view][index],
                'crop_method': 'no_crop' if diagnostic else 'best_center',
            })
    return records

def generate_list(mrn, cancer_label, exam, ffdm_data=ffdm_screening_data):
    """Expand one exam into per-image breast records for both sides.

    Args:
        mrn: value stored unchanged under 'mrn' in every record.
        cancer_label: mapping read at '<side>_benign', '<side>_malignant' and
            '<side>_biopsied' for side in ('left', 'right').
        exam: mapping with at least 'acn' and 'study_date'.
        ffdm_data: mapping from accession number to the FFDM exam entry. The
            default is bound to the module-level `ffdm_screening_data` at the
            time this function is defined.

    Returns:
        A list with one dict per image returned by `get_ffdm_info_list`
        (called with diagnostic=False), left-side images first, then right.
    """
    acn = exam['acn']
    return [
        {
            'accession_number': acn,
            'lateral': side,
            'mrn': mrn,
            'study_date': exam['study_date'],
            'benign': cancer_label[f"{side}_benign"],
            'malignant': cancer_label[f"{side}_malignant"],
            'biopsied': cancer_label[f"{side}_biopsied"],
            'ffdm': ffdm_info,
        }
        for side in ('left', 'right')
        for ffdm_info in get_ffdm_info_list(ffdm_data[acn], False, side)
    ]

In [48]:
# Gather the per-image records of every 'ffdm_screening' exam found in
# data_full_set[0] into the training list.
ffdm_images_train = []
for episode in tqdm.tqdm(data_full_set[0]):
    cancer_label = episode['cancer_label']
    for exam in episode['exams']:
        # Only FFDM screening exams contribute images.
        if exam['modality'] != 'ffdm_screening':
            continue
        ffdm_images_train += generate_list(
            episode['mrn'], cancer_label, exam, ffdm_data=ffdm_screening_data
        )


100%|██████████| 520717/520717 [00:09<00:00, 53950.64it/s]


In [54]:
# Gather the per-image records of every 'ffdm_screening' exam found in
# data_full_set[1] into the validation list.
ffdm_images_val = []
for episode in tqdm.tqdm(data_full_set[1]):
    cancer_label = episode['cancer_label']
    for exam in episode['exams']:
        # Only FFDM screening exams contribute images.
        if exam['modality'] != 'ffdm_screening':
            continue
        ffdm_images_val += generate_list(
            episode['mrn'], cancer_label, exam, ffdm_data=ffdm_screening_data
        )

100%|██████████| 23460/23460 [00:00<00:00, 33582.84it/s]


In [32]:
# NOTE(review): neither `malignant_screening_ffdm_train` nor
# `non_malignant_screening_ffdm_train` is assigned in any visible cell, so this
# cell raises NameError on a fresh kernel unless they are defined elsewhere.
len(malignant_screening_ffdm_train), len(non_malignant_screening_ffdm_train)

(1981, 359053)

In [62]:
# Total of the 'malignant' field over all training image records.
sum(record['malignant'] for record in ffdm_images_train)

4481

In [51]:
# Total of the 'biopsied' field over all training image records.
sum(record['biopsied'] for record in ffdm_images_train)

46535

In [52]:
# Total of the 'benign' field over all training image records.
sum(record['benign'] for record in ffdm_images_train)

22496

In [59]:
# Pickle the training image records to <prefix>/<savepath>/<phase>.
prefix = '/gpfs/data/geraslab/Nan/data/breast_mml_datalists/20220111/breasts_lists'
savepath = 'nyu_ffdm_screening'
phase = 'train'

# makedirs(..., exist_ok=True) keeps this cell re-runnable: os.mkdir raises
# FileExistsError once the directory exists, and fails if a parent is missing.
os.makedirs(os.path.join(prefix, savepath), exist_ok=True)
file = os.path.join(prefix, savepath, phase)

print(phase, "#pairs:", len(ffdm_images_train), '\n\t saved at: ', savepath)

with open(file, 'wb') as f:
    pickle.dump(ffdm_images_train, f)

train #pairs: 1561160 
	 saved at:  nyu_ffdm_screening


In [61]:
# Pickle the validation image records to <prefix>/<savepath>/<phase>.
# `prefix` and `savepath` are the values assigned in the cell that saves the
# training split.
phase = 'val'

# Create the output directory if needed so this cell does not depend on the
# training cell having created it; a no-op when it already exists.
os.makedirs(os.path.join(prefix, savepath), exist_ok=True)
file = os.path.join(prefix, savepath, phase)

print(phase, "#pairs:", len(ffdm_images_val), '\n\t saved at: ', savepath)

with open(file, 'wb') as f:
    pickle.dump(ffdm_images_val, f)

val #pairs: 85201 
	 saved at:  nyu_ffdm_screening
