In [None]:
import os

import numpy as np

import src.loadDataset as loadDataset
import src.build_tensors as build_tensors
import src.select_feature_fields as select_feature_fields
import src.build_features_from_tensors as build_features_from_tensors
import src.util as util

In [None]:
##Directory locations used throughout the notebook.
##data_dir: raw clinical flat files; the per-source .csv files are read from here further down.
##The value below is a placeholder and must be edited before running.
data_dir = "/PATH/TO/CLINICAL/DATA/FLATFILES/"
##desc_dir: description files, only handed to select_feature_fields.main. Placeholder as well.
desc_dir = "/PATH/TO/CLINICAL/DATA/DESCRIPTION_FILES/"
##tensor_dir: where build_tensors writes and loadDataset reads; created on demand by the rebuild cell.
tensor_dir = "../test_commpass_ia9_tensors/"
##param_dir: holds the "*_fields.csv" parameter files that are listed and parsed further down.
param_dir = "parameters/"

In [None]:
##When True, the following cell re-runs feature-field selection and rebuilds the tensors
##before any of the checks; when False the checks run against whatever is already on disk.
run_test_data = True

In [None]:
##Optionally regenerate the parameter files and tensors that the checks below inspect.
##`os` is imported once in the notebook's import cell rather than inside this branch.
if run_test_data:
    ##What files are we searching through to build features?
    file_names = [
        "stand_alone_ae",
        "per_patient_visit",
        "stand_alone_treatment_regimen",
        "stand_alone_emergency_dept",
        "stand_alone_admissions",
        "stand_alone_medhx",
        "stand_alone_famhx",
        "per_patient",
    ]
    ##What strings invalidate a field as a feature?  Mostly correspond to meta-data or dates
    invalid_strings = [
        "date", "day", "week", "time", "enr", "interval", "dose", "unit",
        "frequency", "ongoing", "route", "_was", "visit", "censor", "d_pt",
        "ic_", "bmt", "flag", "vj", "mmtx_therapy", "mmtx_type", "dictionary",
    ]
    ##What date are we using as the cutoff for no longer baseline?
    baseline_cutoff = 0
    ##What fraction of the feature's observations must be before treatment to treat it as a baseline feature?
    temporal_frac = 0.5
    select_feature_fields.main(file_names, invalid_strings, baseline_cutoff, temporal_frac, data_dir, param_dir, desc_dir)

    ##Create tensor dir if it doesn't already exist
    if not os.path.exists(tensor_dir):
        os.makedirs(tensor_dir)

    ##What's the min last visit number to include a person in the cohort?
    visit_cutoff = 1
    ##What's the minimum date, i.e. where is time zero for the tensors?
    ##The date-check cells further down hard-code the same value and must be kept in sync with it.
    min_date = -180
    ##Should we use the minimum date or set our own min date?
    calc_min_date = False
    ##The minimum number of times to see a text value to count it as a feature
    min_occurrences = 50
    ##Set debug to True to run with only 50 people and min_occurrences set to 5
    debug = True
    build_tensors.process_commpass(visit_cutoff, min_date, calc_min_date, min_occurrences, data_dir, tensor_dir, param_dir, debug=debug)

In [None]:
##Load the three tensor sets written to tensor_dir.  Each load returns a mapping that is
##read by key; the clinical load also supplies `people`, the reference cohort ordering.
dataset = loadDataset.main("clinical", tensor_dir)
clinical_tensor, clinical_obs_tensor = dataset['tensor'], dataset['obs_tensor']
clinical_feature_names, clinical_feature_types = dataset['feature_names'], dataset['feature_types']
people = dataset['people']

##Initial features: no obs_tensor is read for this set.
dataset = loadDataset.main("initial", tensor_dir)
initial_tensor = dataset['tensor']
initial_feature_names, initial_feature_types = dataset['feature_names'], dataset['feature_types']
##Row i must describe the same person in every tensor set, so the cohorts have to match exactly.
assert np.array_equal(dataset['people'], people)

##Treatment features.
dataset = loadDataset.main("treatment", tensor_dir)
treatment_tensor, treatment_obs_tensor = dataset['tensor'], dataset['obs_tensor']
treatment_feature_names, treatment_feature_types = dataset['feature_names'], dataset['feature_types']
assert np.array_equal(dataset['people'], people)

In [None]:
##Re-read the raw flat files and record, per person, every date and every feature value that
##appears in a field selected by the "*_fields.csv" parameter files.  The later cells compare
##the tensors against these raw values.  `os` and `util` come from the notebook's import cell.

##Sorted so that files are processed (and skipped names printed) in a repeatable order.
files = sorted(fname for fname in os.listdir(param_dir) if fname.endswith(".csv"))

##Raw values are accumulated in sets so repeats across files and rows collapse immediately.
dates = {person: set() for person in people}
features = {person: set() for person in people}
num_features = {person: [] for person in people}
text_features = {person: [] for person in people}
##Filled in by the tensor-inspection cells further down.
tensor_dates = {person: [] for person in people}
tensor_features = {person: [] for person in people}

for fname in files:
    ##Only "<source>_fields.csv" files describe feature fields.  Matching on the full suffix
    ##(rather than a substring) keeps it consistent with the suffix stripped just below.
    if not fname.endswith("_fields.csv"):
        print(fname)
        continue
    source = fname.replace("_fields.csv", "")
    date_dict = util.get_date_fields(param_dir, source)

    ##Keys of date_dict are the fields kept as features; its values are the fields holding dates.
    valid_fields = set(date_dict)
    date_fields = set(date_dict.values())

    ##The raw flat file is the upper-cased source name without any "_baseline" marker.
    data, fields = util.read_clinical_data(data_dir + source.replace("_baseline", "").upper() + ".csv", param_dir)
    date_indices = set(i for i, field in enumerate(fields) if field in date_fields)
    valid_indices = set(i for i, field in enumerate(fields) if field in valid_fields)

    for person in people:
        if person not in data:
            continue
        for line in data[person]:
            for i, value in enumerate(line):
                ##Empty cells carry no information for either collection.
                if value == "":
                    continue
                if i in date_indices:
                    dates[person].add(value)
                if i in valid_indices:
                    features[person].add(value)

for person in people:
    ##Dates become a sorted list of unique ints; a non-integer date string still raises here.
    dates[person] = sorted(set(int(date) for date in dates[person]))
    features[person] = list(features[person])
    ##Reset both buckets together so a person is never split twice.
    num_features[person] = []
    text_features[person] = []
    for value in features[person]:
        try:
            num_features[person].append(float(value))
        except (ValueError, TypeError):
            ##Anything float() rejects is kept as a text feature; other errors now surface.
            text_features[person].append(value)

In [None]:
##Every date present in the tensors should also be present in the raw data for that person.
##The allowed exceptions are 0 and multiples of 90, which sometimes get imputed for missing
##dates.  Treatments are ignored here since they abide by different rules.
##NOTE(review): min_date is hard-coded again here; it has to equal the value the tensors were
##built with (the rebuild cell also uses -180).
min_date = -180

##Indices along the first axis of each person's slice that hold any non-zero entry; the check
##below interprets them as offsets from min_date.
for tensor in (clinical_obs_tensor, initial_tensor):
    for row in range(tensor.shape[0]):
        observed = np.nonzero(tensor[row, :, :])[0]
        tensor_dates[people[row]] += list(set(observed))

for person in people:
    tensor_dates[person] = list(set(tensor_dates[person]))

##NOTE(review): the modulo is applied to the tensor index, not to the raw date.  The two agree
##only because min_date is itself a multiple of 90 -- confirm which one is intended before
##changing min_date.
for person in people:
    raw_dates = dates[person]
    for date in tensor_dates[person]:
        assert date + min_date in raw_dates or date % 90 == 0, date

In [None]:
##Dates in the raw data but not in the tensor should be rare (although not unheard of).
##Some features get thrown out (the original note was cut off here; presumably this happens
##while the tensors are built -- confirm).
##Uses min_date as set in the previous cell.  Raw dates before min_date have no tensor slot
##and are therefore not reported.

for person in people:
    present = tensor_dates[person]
    to_print = [date for date in dates[person]
                if date - min_date >= 0 and date - min_date not in present]

    if to_print:
        ##Single-argument print calls produce the same output as the Python 2 print statement.
        print(person)
        for date in to_print:
            print(date)
        print("")

In [None]:
##Every value stored in the clinical and initial tensors, other than 0 and 1, should be a
##numeric value that occurs in that person's raw data.
tensor_features = {person: [] for person in people}

##Gather the distinct values found in each person's slice of both tensors.
for tensor in (clinical_tensor, initial_tensor):
    for row in range(len(people)):
        tensor_features[people[row]].extend(np.unique(tensor[row, :, :]))

for person in people:
    tensor_features[person] = list(set(tensor_features[person]))

##NOTE(review): this is an exact float comparison against float()-parsed raw strings; it
##presumably relies on the tensors storing values without loss of precision -- confirm dtype.
for person in people:
    raw_numbers = num_features[person]
    for feature in tensor_features[person]:
        if feature == 0. or feature == 1.:
            continue
        assert feature in raw_numbers, feature

In [None]:
##Summarise the clinical tensor's features by count via util.most_common_features,
##first with top=True and then with top=False.
##Single-argument print calls produce the same output as the Python 2 print statement.
print("Most common clinical features")
util.most_common_features(tensor_dir, file_type="clinical", obs=False, sort="count", cutoff=10, top=True)
print("\n Least common clinical features")
util.most_common_features(tensor_dir, file_type="clinical", obs=False, sort="count", cutoff=10, top=False)

In [None]:
##Summarise the initial tensor's features by count via util.most_common_features,
##first with top=True and then with top=False.
##Single-argument print calls produce the same output as the Python 2 print statement.
print("Most common initial features")
util.most_common_features(tensor_dir, file_type="initial", obs=False, sort="count", cutoff=10, top=True)
print("\n Least common initial features")
util.most_common_features(tensor_dir, file_type="initial", obs=False, sort="count", cutoff=10, top=False)

In [None]:
##Summarise the treatment tensor's features by count via util.most_common_features.
##NOTE(review): unlike the clinical and initial summaries, no cutoff is passed and there is no
##top=False call; presumably deliberate -- confirm.
##The single-argument print call produces the same output as the Python 2 print statement.
print("Most common treatment features")
util.most_common_features(tensor_dir, file_type="treatment", obs=False, sort="count", top=True)