In [1]:
# /Users/teliov/TUD/Thesis/Medvice/Notebooks/data/04_06_new_data/data/split
# Add the parent directory to sys.path so that the *thesislib* package can be imported.
import sys
import os

# Absolute path of the parent directory; appended to sys.path only once so
# that re-running this cell does not add duplicate entries.
module_path = os.path.abspath(os.pardir)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [11]:
from thesislib.utils import pathutils, knifeutils
import json
import math
from collections import OrderedDict
from glob import glob

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Location of the symptoms CSV that is read and split further down.
symptoms_csv = pathutils.get_data_file(
    "04_06_new_data/data/partial/symptoms.csv"
)

In [5]:
# Fractions of each condition's rows assigned to the train and test splits.
train_ratio, test_ratio = 0.8, 0.2

In [6]:
# Load the complete symptoms table; it is split into train and test sets in
# the cells that follow.
df = pd.read_csv(filepath_or_buffer=symptoms_csv)

In [7]:
# Group the rows by condition so each pathology can be split on its own.
grouped = df.groupby(by='PATHOLOGY')

In [8]:
# Number of rows available for each PATHOLOGY group.
condition_sizes = grouped.size()

In [9]:
# Write one train/test CSV pair per pathology so that both splits keep the
# per-condition proportions of the full data set.
output_path = pathutils.get_data_file("04_06_new_data/data/partial/data/")
for code, cnd_df in grouped:
    # The first ceil(train_ratio * n) rows of the group (in their existing
    # order, no shuffling) go to train; the remaining rows go to test.
    train_count = int(math.ceil(train_ratio * len(cnd_df)))
    train_df = cnd_df.iloc[:train_count]
    test_df = cnd_df.iloc[train_count:]

    # os.path.join gives the same result whether or not output_path ends with
    # a path separator.
    train_file_name = os.path.join(output_path, "train-%s.csv" % code)
    test_file_name = os.path.join(output_path, "test-%s.csv" % code)

    # to_csv is called with its defaults, so every file carries its own header
    # row and the DataFrame index as the first column.
    train_df.to_csv(train_file_name)
    test_df.to_csv(test_file_name)

In [None]:
# Now we have train and test splits that reflect the distribution of the conditions.
# Next, combine the split files on the terminal using cat:
#     cat file1 file2 ... filen > output
# Then we transform the train set to get it into the format we want.

In [12]:
# Collect the Synthea module JSON files; the number of files is used as the
# number of distinct conditions.
module_glob = pathutils.get_data_file("04_06_new_data/data/modules/*.json")
generating_synthea_modules = glob(module_glob)
num_unique_conditions = len(generating_synthea_modules)

In [13]:
# Build two lookups keyed by condition code: one to the condition's symptom
# list and one to its name, both extracted from the module files.
condition_symptom_map, condition_name_map = {}, {}
for module in generating_synthea_modules:
    extracted = knifeutils.extract_condition_symptom_from_modules(module)
    condition_code, condition_name, symptom_list = extracted
    condition_name_map[condition_code] = condition_name
    condition_symptom_map[condition_code] = symptom_list

In [14]:
# Union of the symptoms of every condition, without duplicates.
symptom_vector = set().union(*condition_symptom_map.values())

In [15]:
# Fix a stable (sorted) symptom order and map the symptom at position i to
# 2**i, so each symptom gets its own power of two.
symptom_vector = sorted(symptom_vector)
power = np.array([2**idx for idx in range(len(symptom_vector))])
symptom_label_map = OrderedDict()
for item, item_power in zip(symptom_vector, power):
    symptom_label_map[item] = item_power

In [16]:
# Integer class label for each condition: its position in the sorted list of
# condition codes.
condition_codes = sorted(condition_name_map.keys())
condition_label_map = OrderedDict(
    (code, idx) for idx, code in enumerate(condition_codes)
)

In [17]:
# Input (concatenated train split) and output directory for the transform.
untransformed_input = pathutils.get_data_file(
    "04_06_new_data/data/partial/data/train.csv"
)
transformed_output = pathutils.get_data_file(
    "04_06_new_data/output/"
)

In [25]:
# Transform the train split using the condition and symptom label maps.
knifeutils.parse_data(
    untransformed_input,
    condition_label_map,
    symptom_label_map,
    transformed_output,
    use_header=False,
)

In [26]:
# The transformed train.csv is now in the format we want, so we can try the
# random forest classifier with its warm start attribute. The file is first
# split into subsets to simulate what we will be dealing with. For now there
# is no cross validation: we simply train on the entire set.
split_pattern = pathutils.get_data_file("04_06_new_data/output/split/x*")
split_files = glob(split_pattern)

In [27]:
# Process the chunks in lexicographic file-name order.
split_files.sort()

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [66]:
# Uniform weight of 1 for every class label. The keys are taken from
# condition_label_map (labels 0..n-1) instead of a hard-coded count, so the
# weights always cover exactly the labels the transform assigns.
class_weights = {label: 1 for label in condition_label_map.values()}

In [67]:
# Random forest set up for warm-start fitting: later fit() calls keep the
# existing trees and only add new ones when n_estimators has been raised.
# Only non-default arguments are passed. The removed `min_impurity_split`
# argument and the removed 'auto' alias for `max_features` are dropped; for a
# classifier 'auto' meant 'sqrt', which is the library default.
rf_clf = RandomForestClassifier(
    n_estimators=140,
    n_jobs=2,
    warm_start=True,
    class_weight=class_weights,
)

In [55]:
# Incremental learning attempt: fit the warm-start forest on one split file at
# a time, adding a fresh batch of trees for every file.
columns_interest = ['LABEL', 'GENDER', 'RACE', 'AGE'] + symptom_vector
all_columns = ['Unnamed: 0'] + columns_interest

# Trees added per split file. The forest size is set absolutely from the batch
# position, so the first batch always trains exactly this many trees and the
# target size does not depend on earlier runs of this cell.
trees_per_batch = 140

for idx, split_file in enumerate(split_files):
    if idx == 0:
        # The first chunk is read with its own header row.
        batch_df = pd.read_csv(split_file, usecols=columns_interest)
    else:
        # Later chunks are read as header-less, so the column names are
        # supplied explicitly (including the leading index column).
        # NOTE(review): assumes the chunks share the first file's column
        # layout — confirm against how the split files are produced.
        batch_df = pd.read_csv(split_file, usecols=columns_interest, names=all_columns)

    ylabels = batch_df.LABEL.values
    xdata = batch_df.drop(columns=['LABEL']).values

    # With warm_start=True, raising n_estimators before fit() trains only the
    # additional trees, on the current batch.
    rf_clf.n_estimators = trees_per_batch * (idx + 1)
    rf_clf.fit(xdata, ylabels)

In [30]:
# we now have a forest that has been fitted on the entire data set.
# we can now test.
# first we do need to get our test set ready.
# for convenience, we will not split the test set into parts and will process as is


In [56]:
# Transform the test split with the same label maps used for the train split.
untransformed_test = pathutils.get_data_file(
    "04_06_new_data/data/partial/data/test.csv"
)
transformed_test = pathutils.get_data_file(
    "04_06_new_data/output/"
)
knifeutils.parse_data(
    untransformed_test,
    condition_label_map,
    symptom_label_map,
    transformed_test,
    use_header=False,
)

In [57]:
# Load the transformed test set, restricted to the label and feature columns.
# os.path.join builds the path correctly whether or not transformed_test ends
# with a path separator.
test_df = pd.read_csv(os.path.join(transformed_test, "test.csv"), usecols=columns_interest)

In [62]:
# Labels as a single-column 2-D array; every remaining column is a feature.
label_column = test_df['LABEL'].values
test_labels = label_column.reshape(len(test_df), -1)
test_data = test_df.drop(columns=['LABEL']).values

In [None]:
# Score of the fitted forest on the held-out test set.
test_score = rf_clf.score(X=test_data, y=test_labels)

In [60]:
# Sanity check: (rows, feature columns) of the test matrix.
test_data.shape

(2803, 49)

In [63]:
# Sanity check: the labels were reshaped into a single column.
test_labels.shape

(2803, 1)

In [71]:
# Alternative approach: concatenate the split symptom files and let the helper
# split them again (the intent is to use StratifiedShuffleSplit).
train_split = 0.8
input_files = pathutils.get_data_file(
    "04_06_new_data/output/split-full/x*"
)
output_path = pathutils.get_data_file(
    "04_06_new_data/output/concat"
)
knifeutils.concatenate_and_split(input_files, output_path, train_split)

True