In [1]:
import collections
import datetime
import os
import pickle
import re

import numpy as np
import spacy
import tqdm

# Load spaCy's small English pipeline and keep its default stop-word set;
# the set is used further down to filter stop words out of the token vocabulary.
SPACY_MODEL_NAME = 'en_core_web_sm'
sp = spacy.load(SPACY_MODEL_NAME)
all_stopwords = sp.Defaults.stop_words

In [2]:
def join_list(fields):
    """Merge the first entry of every non-empty field into one sorted, unique array.

    Parameters
    ----------
    fields : iterable of sequences
        Each element is a (possibly empty) sequence. Only its first item is
        read; that item is converted to a numpy array and flattened.

    Returns
    -------
    numpy.ndarray
        The distinct flattened values, as returned by ``np.unique``.
    """
    collected = []
    for field in fields:
        if len(field) == 0:
            # Nothing recorded for this field.
            continue
        collected.extend(np.array(field[0]).flatten())
    return np.unique(collected)

def cleaning_term(s):
    """Normalise a term: non-word characters become spaces, then lowercase and trim.

    Runs of replaced characters are not collapsed, so the result may contain
    consecutive spaces between words.
    """
    without_punctuation = re.sub(r'[^\w]', ' ', s)
    return without_punctuation.lower().strip()

# Directory holding the extracted pathology reports; the token vocabulary is
# also written here later in the notebook.
prefix = "/gpfs/data/geraslab/Nan/mmselfsup/work_dirs/data/"
file = os.path.join(prefix, "extracted_pathology_info.pkl")
# NOTE: pickle.load can execute arbitrary code embedded in the file -- only
# load pickles that come from a trusted source.
with open(file, "rb") as f:
    patho = pickle.load(f)
patho = patho['final_reports_df']

# Per pathology record: one de-duplicated array of all malignant and benign
# terms found for either breast.
# Fix: the last column used to repeat 'list_benign_terms_found_right', so the
# benign terms of the left breast were never included.
# NOTE(review): assumes the column 'list_benign_terms_found_left' exists, by
# analogy with the malignant columns -- confirm against the dataframe.
term_columns = [
    'list_malignant_terms_found_right',
    'list_malignant_terms_found_left',
    'list_benign_terms_found_right',
    'list_benign_terms_found_left',
]
combined_list = patho[term_columns].apply(join_list, axis=1)

# Flatten the per-record arrays into a single list of term occurrences.
all_appeared_terms = [y for x in combined_list.values for y in x]

# get all unique benign/malignant terms (Counter keys keep first-appearance order)
unique_terms = collections.Counter(all_appeared_terms).keys()
unique_terms = [cleaning_term(t) for t in unique_terms]

# clean terms and obtain unique tokens: split each cleaned term on spaces and
# drop stop words and tokens of length <= 1 (this also removes the empty
# strings produced by consecutive spaces).
# The position of a token in `unique_tokens` is the label index used later in
# the notebook, so this ordering must stay stable.
dict_tokens = collections.Counter(
    t
    for term in unique_terms
    for t in term.split(' ')
    if t not in all_stopwords and len(t) > 1
)
unique_tokens = list(dict_tokens.keys())

In [3]:
# Save the token vocabulary as text, one token per line, in list order.
tokens_path = os.path.join(prefix, 'malignant_benign_indicator_tokens.txt')
with open(tokens_path, 'w') as f:
    f.writelines(token + '\n' for token in unique_tokens)

In [4]:
def _load_pickle(path):
    """Read and return the object pickled at `path`."""
    # NOTE: only unpickle files from a trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Training exams come from the "full" list, validation exams from the "balanced" list.
datafile = '/gpfs/data/geraslab/Nan/data/breast_mml_datalists/20220111/breasts_lists/ffdm_screening_only/full/train'
train_data = _load_pickle(datafile)

datafile = '/gpfs/data/geraslab/Nan/data/breast_mml_datalists/20220111/breasts_lists/ffdm_screening_only/balanced/val'
val_data = _load_pickle(datafile)

In [15]:
def get_term_lists(mrn, study_date, patho_info = patho):
    """Collect the pathology terms recorded for one patient relative to a study date.

    Parameters
    ----------
    mrn :
        Patient identifier, compared for equality with ``patho_info['mrn']``.
    study_date :
        Date of the imaging exam. It is subtracted from ``patho_info['date']``,
        so it must support that subtraction (presumably a datetime/Timestamp
        -- TODO confirm against the data lists).
    patho_info : DataFrame
        Pathology records with at least the columns ``mrn`` and ``date``.
        Matching rows are looked up by index label in the module-level
        ``combined_list``, so it must share that index.

    Returns
    -------
    list
        The terms of every record of this patient dated less than 120 days
        after ``study_date``; empty if the patient has no such record.
    """
    # A single equality filter replaces the former `mrn in ...unique()` test,
    # which rebuilt the unique-MRN array on every call. An unknown MRN simply
    # yields an empty selection.
    patient_dates = patho_info.loc[patho_info['mrn'] == mrn, 'date']
    if len(patient_dates) == 0:
        return []

    diff_days = patient_dates - study_date
    # NOTE(review): only an upper bound is applied, so records dated any time
    # BEFORE the study also qualify (negative differences) -- confirm that this
    # one-sided window is intended.
    in_window = diff_days < datetime.timedelta(days=120)

    # Select by the boolean mask directly. The previous `.where(mask).dropna()`
    # additionally discarded in-window records that had a missing value in any
    # other column.
    indices = patient_dates[in_window].index.values

    return [term for i in indices for term in combined_list.loc[i]]

def align_exams_with_patho(data):
    """Build one list of pathology terms per exam in `data`.

    Exams whose 'biopsied' entry is falsy get an empty list; for the others the
    terms are looked up with `get_term_lists` from the exam's 'mrn' and
    'study_date'. Progress is reported with a tqdm bar.

    Returns a list with one entry per exam, in the order of `data`.
    """
    def terms_for(exam):
        if not exam['biopsied']:
            return []
        return get_term_lists(exam['mrn'], exam['study_date'])

    return [terms_for(exam) for exam in tqdm.tqdm(data)]

def label_indices(s, tokens = unique_tokens):
    """Return the positions in `tokens` of the vocabulary words that occur in `s`.

    `s` is normalised with `cleaning_term` -- the same normalisation that was
    applied when the vocabulary was built -- and split into words, and a token
    matches only as a whole word. The former substring test (``t in s``)
    compared lower-cased, punctuation-free tokens against raw text, so it
    missed tokens whenever the text differed in case or punctuation, and it
    also matched tokens embedded in longer words.

    Parameters
    ----------
    s : str
        Free text to label (callers pass the terms joined by spaces).
    tokens : sequence of str
        Token vocabulary; defaults to the module-level ``unique_tokens``.

    Returns
    -------
    list of int
        Indices into `tokens`, in ascending order.
    """
    # split() without an argument also swallows the consecutive spaces that
    # cleaning_term leaves behind where punctuation was removed.
    words = set(cleaning_term(s).split())
    return [i for i, t in enumerate(tokens) if t in words]

In [24]:
def _token_indices_per_exam(term_lists):
    """Map each exam's list of terms to the vocabulary indices found in the joined text."""
    return [label_indices(' '.join(terms)) for terms in term_lists]


# Pathology terms per exam (empty list for exams that were not biopsied).
train_noisy_subcancer_label = align_exams_with_patho(train_data)
val_noisy_subcancer_label = align_exams_with_patho(val_data)

val_token_indices = _token_indices_per_exam(val_noisy_subcancer_label)
train_token_indices = _token_indices_per_exam(train_noisy_subcancer_label)

# Store the indices on every exam record under 'noisy_token_indicies'
# (the key's spelling is kept as-is).
for pair, token_indices in zip(val_data, val_token_indices):
    pair['noisy_token_indicies'] = token_indices

for pair, token_indices in zip(train_data, train_token_indices):
    pair['noisy_token_indicies'] = token_indices

100%|██████████| 323418/323418 [23:28<00:00, 229.57it/s] 


In [34]:
def _dump_pickle(obj, path):
    """Pickle `obj` to `path`, replacing any existing file."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


# These are the same paths the data lists were loaded from in In [4], so the
# original files are overwritten in place with the augmented records.
datafile = '/gpfs/data/geraslab/Nan/data/breast_mml_datalists/20220111/breasts_lists/ffdm_screening_only/full/train'
_dump_pickle(train_data, datafile)

datafile = '/gpfs/data/geraslab/Nan/data/breast_mml_datalists/20220111/breasts_lists/ffdm_screening_only/balanced/val'
_dump_pickle(val_data, datafile)