In [16]:
import pandas as pd
import random
import os
import json
import gc
import datetime
import copy

VTE_path = 'Y:\\Fillmore_Cancer\\cat\\Users\\data\\derived\\vte_ml\\vte_ml\\data_for_chunlei'

In [17]:

def train_dev_test_file_random_split(metafile, train_size=0.8, dev_size=0.1, test_size=0.1):
    '''
    This function randomly splits a json file into train, dev and test datasets, and 
    automatically saved splitted files under train, dev and test folders.
    Note: there is special handling for non-string type diagnosis codes when necessary
    '''
    par_dir = os.path.dirname(metafile)
    train_dir = os.path.join(par_dir, 'train')
    dev_dir = os.path.join(par_dir, 'dev')
    test_dir = os.path.join(par_dir, 'test')
    if not os.path.exists(train_dir):
        os.mkdir(train_dir)
    if not os.path.exists(dev_dir):
        os.mkdir(dev_dir)
    if not os.path.exists(test_dir):
        os.mkdir(test_dir)
    
    print('Loading data: {}'.format(metafile))
    dat = json.load(open(metafile, 'r'))

    # change phecodes datatype to string
    for p in dat.keys():
        for i in range(len(dat[p]['events'])):
            dat[p]['events'][i]['codes'] = str(dat[p]['events'][i]['codes'])

    all = [p for p in dat.keys()]
    random.shuffle(all)
    dev_ind_start = int(len(all) * train_size)
    test_ind_start = int(len(all) * (train_size + dev_size))

    # Train sampling
    train_sample = all[:dev_ind_start]
    train = {p: dat[p] for p in train_sample}
    for p in train:
        train[p]['split_group'] = 'train'
    train_path = os.path.join(train_dir, 'train.json')
    with open(train_path, 'w') as fwt:
        print("Saving to {}\n".format(train_path))
        json.dump(train, fwt, indent=None)
    del train
    gc.collect()

    # Dev sampling
    dev_sample = all[dev_ind_start:test_ind_start]
    dev = {p: dat[p] for p in dev_sample}
    for p in dev:
        dev[p]['split_group'] = 'dev'
    dev_path = os.path.join(dev_dir, 'dev.json')
    with open(dev_path, 'w') as fwd:
        print("Saving to {}\n".format(dev_path))
        json.dump(dev, fwd, indent=None)
    del dev
    gc.collect()

    # Test sampling
    test_sample = all[test_ind_start:]
    test = {p: dat[p] for p in test_sample}
    for p in test:
        test[p]['split_group'] = 'test'

    test_path = os.path.join(test_dir, 'test.json')
    with open(test_path, 'w') as fwtt:
        print("Saving to {}\n".format(test_path))
        json.dump(test, fwtt, indent=None)
    del test
    gc.collect()

In [11]:
phe_path = os.path.join(VTE_path, 'data_20240229_final.json')


In [12]:
phe = json.load(open(phe_path, 'r'))


In [13]:
def combine_phe_lab (phe_json, lab_json):
    '''
    phe_json: 
    lab_json:
    '''
    combined_dict = {}
    phe = json.load(open(phe_json, 'r'))
    lab = json.load(open(lab_json, 'r'))
    phe_cohort = set(phe.keys())
    lab_cohort = set(lab.keys())
    # pdb.set_trace()
    union_cohort = phe_cohort | lab_cohort
    intersect_cohort = phe_cohort & lab_cohort
    
    phe_cohort_only = phe_cohort - lab_cohort
    lab_cohort_only = lab_cohort - lab_cohort
    print(f'There are {len(phe_cohort)} and {len(lab_cohort)} in icd cohort and lab cohort.')
    print(f'Union: {len(union_cohort)}; Intersect: {len(intersect_cohort)}\nICD only: {len(phe_cohort_only)}; lab only {len(lab_cohort_only)}')
    
    for pat in (union_cohort):
        if pat in phe_cohort_only:
            combined_dict[pat] = phe[pat]

        if pat in intersect_cohort:
            pat_dict = {}

            phe_events = phe[pat]['events']
            phe_end_of_data = phe[pat]['end_of_data']
            
            lab_events = lab[pat]['events']
            lab_end_of_data = lab[pat]['end_of_data']
            # In case duplicated events, especially for pancreatic cancer patients    # pdb.set_trace()
            if len(phe_events) > 0 and len(lab_events) > 0:
                phe_events.extend(lab_events)
                # events_set = set(frozenset(e.items()) for e in icd_events if e is not None)
                # events = [dict(s) for s in events_set]
                end_of_data = max(phe_end_of_data, lab_end_of_data)
                pat_dict['events'] = phe_events
                pat_dict['end_of_data'] = end_of_data
                for key in ['birthdate', 'gender', 'split_group', 'indexdate', 'ks_mod_score']:
                    pat_dict[key] = phe[pat][key]
                combined_dict[pat] = pat_dict

    return combined_dict

In [19]:


#lab_path = os.path.join(VTE_path, 'combined\\Labs_tot.json')
#phe_path = os.path.join(VTE_path, 'data_20240229_final.json')
#phe_lab =  combine_icd_lab (phe_path,lab_path )
combined_path = os.path.join(VTE_path, 'combined\\Replicates\\phe_lab.json')
#json.dump(icd_lab, open(combined_path, 'w'))

train_dev_test_file_random_split(combined_path, train_size=0.7, dev_size=0.15, test_size=0.15)


Loading data: Y:\Fillmore_Cancer\cat\Users\data\derived\vte_ml\vte_ml\data_for_chunlei\combined\Replicates\phe_lab.json
Saving to Y:\Fillmore_Cancer\cat\Users\data\derived\vte_ml\vte_ml\data_for_chunlei\combined\Replicates\train\train.json

Saving to Y:\Fillmore_Cancer\cat\Users\data\derived\vte_ml\vte_ml\data_for_chunlei\combined\Replicates\dev\dev.json

Saving to Y:\Fillmore_Cancer\cat\Users\data\derived\vte_ml\vte_ml\data_for_chunlei\combined\Replicates\test\test.json



In [21]:


lab_json = os.path.join(VTE_path, 'combined\\Labs_tot.json')
phe_json = os.path.join(VTE_path, 'data_20240229_final.json')

In [22]:
phe = json.load(open(phe_json, 'r'))
lab = json.load(open(lab_json, 'r'))
phe_cohort = set(phe.keys())
lab_cohort = set(lab.keys())
# pdb.set_trace()
union_cohort = phe_cohort | lab_cohort
intersect_cohort = phe_cohort & lab_cohort

phe_cohort_only = phe_cohort - lab_cohort
lab_cohort_only = lab_cohort - lab_cohort

In [32]:
phe_cohort_only

set()

In [31]:
lab_cohort_only

set()

In [35]:
len(union_cohort)

589045

In [27]:
len(intersect_cohort)

123648

In [29]:
len(lab_cohort)

589045

In [30]:
len(phe_cohort)

123648