In [1]:
import pandas as pd
import random
import os
import json
import gc
import datetime
import copy
# Base directory of the processed data files (a //host/... network path).
# A later cell builds the input JSON path by joining onto this value.
VTE_path ="//vacrrdevmavdi01.vha.med.va.gov/MAVDEV1/Fillmore_Cancer/cat/Users/data_from_vinci/dvt_cancer_20230613/data_for_tf_model/processed"

In [2]:

def train_dev_test_file_random_split(metafile, train_size=0.8, dev_size=0.1, test_size=0.1, seed=None):
    '''
    Randomly split a JSON file into train, dev and test subsets and save each
    subset as ``<split>/<split>.json`` in a folder next to the input file.

    The input must be a JSON object keyed by record id in which every record
    has an ``events`` list whose items carry a ``codes`` field. Before saving,
    every ``codes`` value is converted with ``str()`` (so non-string codes are
    written as strings) and every record gets a ``split_group`` key set to
    ``'train'``, ``'dev'`` or ``'test'``.

    Parameters
    ----------
    metafile : str
        Path to the input JSON file. The ``train``, ``dev`` and ``test``
        folders are created in the same directory if they do not exist.
    train_size : float
        Fraction of records assigned to the train subset.
    dev_size : float
        Fraction of records assigned to the dev subset.
    test_size : float
        Accepted for interface compatibility but not used in the computation:
        the test subset always receives every record left over after the
        train and dev subsets have been taken.
    seed : int or None
        When given, the shuffle uses a private ``random.Random(seed)`` so the
        split is reproducible. When ``None`` (default) the module-level
        ``random.shuffle`` is used, exactly as before.

    Returns
    -------
    None
        Results are written to disk; progress messages are printed.
    '''
    par_dir = os.path.dirname(metafile)
    split_dirs = {name: os.path.join(par_dir, name) for name in ('train', 'dev', 'test')}
    for split_dir in split_dirs.values():
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(split_dir, exist_ok=True)

    print('Loading data: {}'.format(metafile))
    # Context manager so the input handle is closed even if parsing fails.
    with open(metafile, 'r') as fin:
        dat = json.load(fin)

    # change phecodes datatype to string
    for record in dat.values():
        for event in record['events']:
            event['codes'] = str(event['codes'])

    record_ids = list(dat)
    if seed is None:
        random.shuffle(record_ids)
    else:
        random.Random(seed).shuffle(record_ids)

    n_total = len(record_ids)
    # Round away float noise before truncating: e.g. 0.7 + 0.1 evaluates to
    # 0.7999999999999999, which would otherwise floor 10 * 0.8 down to 7.
    dev_ind_start = int(round(n_total * train_size, 6))
    test_ind_start = int(round(n_total * (train_size + dev_size), 6))

    split_bounds = (
        ('train', 0, dev_ind_start),
        ('dev', dev_ind_start, test_ind_start),
        ('test', test_ind_start, n_total),
    )
    for split_name, start, stop in split_bounds:
        subset = {}
        for rid in record_ids[start:stop]:
            dat[rid]['split_group'] = split_name
            subset[rid] = dat[rid]

        out_path = os.path.join(split_dirs[split_name], split_name + '.json')
        with open(out_path, 'w') as fout:
            print("Saving to {}\n".format(out_path))
            json.dump(subset, fout, indent=None)
        del subset
        gc.collect()

In [3]:
# Input JSON to split: X.json inside the Phe_Lab2 subfolder of VTE_path.
X_path = os.path.join(VTE_path, 'Phe_Lab2/X.json')


In [4]:
# Split X.json 60/20/20; the output files are written to train/, dev/ and test/
# subfolders next to the input file.
# NOTE(review): no random seed is set in the visible cells, so each run may
# produce a different split - confirm whether reproducibility is required.
train_dev_test_file_random_split(X_path, train_size=0.6, dev_size=0.2, test_size=0.2)


Loading data: //vacrrdevmavdi01.vha.med.va.gov/MAVDEV1/Fillmore_Cancer/cat/Users/data_from_vinci/dvt_cancer_20230613/data_for_tf_model/processed\Phe_Lab2/X.json
Saving to //vacrrdevmavdi01.vha.med.va.gov/MAVDEV1/Fillmore_Cancer/cat/Users/data_from_vinci/dvt_cancer_20230613/data_for_tf_model/processed\Phe_Lab2\train\train.json

Saving to //vacrrdevmavdi01.vha.med.va.gov/MAVDEV1/Fillmore_Cancer/cat/Users/data_from_vinci/dvt_cancer_20230613/data_for_tf_model/processed\Phe_Lab2\dev\dev.json

Saving to //vacrrdevmavdi01.vha.med.va.gov/MAVDEV1/Fillmore_Cancer/cat/Users/data_from_vinci/dvt_cancer_20230613/data_for_tf_model/processed\Phe_Lab2\test\test.json

