# Exploratory Data Analysis

In [2]:
import os
import sys
import random
import pandas as pd
import xml.etree.ElementTree as ET
import pickle

from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# single file used to try out the parsing cells below
# NOTE(review): the file name is a run of dots and looks like a placeholder — confirm
test_file = './raw_data/sample/.......'
# directory roots handed to the file listing / crawling helpers further down
sample_path = './raw_data/sample/'
all_path = './raw_data/all/'

In [None]:
# parse the single test file and keep a handle on its root element
tree = ET.parse(test_file)
root = tree.getroot()

In [None]:
# text of the first element matching this path below the root (None when nothing matches)
root.findtext('./id_info/nct_id')

# File iterator generation

In [4]:
def gen_files(root_path, req_exts='xml'):
    """
    lazily yields the path of every file below root_path that has a requested extension
    :param root_path: directory that is walked recursively with os.walk
    :param req_exts: one extension string or a collection of extension strings, written
        without the leading dot; None turns the filter off so that every file is yielded
    :return: generator of paths built as os.path.join(<containing dir>, <file name>)
    """
    # accept a bare string as shorthand for a one-element list
    if isinstance(req_exts, str):
        req_exts = [req_exts]

    for sub_dir, _dirs, files in os.walk(root_path):
        for file in files:
            # the extension is whatever follows the last dot
            # (the whole name when the file name has no dot at all)
            if req_exts is None or file.split('.')[-1] in req_exts:
                yield os.path.join(sub_dir, file)
    
def list_files(root_path, req_exts='xml'):
    """ eager counterpart of gen_files: drains the generator into a list of paths """
    matching_paths = gen_files(root_path, req_exts=req_exts)
    return [*matching_paths]

In [None]:
# paths of the files under the sample directory (list_files defaults to req_exts='xml')
all_fps = list_files(sample_path)

## XML crawling

Methods for obtaining the list of all nodes (or leaves) in the tree

In [5]:
def crawl_tree_tags(root, leaf_only=False):
    """
    returns the tag paths found below root (the root element itself is not listed)
    :param root: element whose children are crawled
    :param leaf_only: forwarded to crawl_child_tags for every child of root
    """
    return [
        tag_path
        for child in root
        for tag_path in crawl_child_tags(child, leaf_only=leaf_only)
    ]

def crawl_child_tags(node, leaf_only=False, parent_path='./'):
    """
    recursively crawls a node, its children, their children etc ...
    :param node: element to start from; iterating it yields its children and
        len() gives the number of children
    :param leaf_only: if true, only nodes without children are reported
    :param parent_path: '/'-terminated path of the node's parent, prefixed to the tags
    :return: list of path strings in document order, e.g. './a/b' for <b> nested in <a>
    """
    cur_path = parent_path + node.tag
    all_nodes = []

    # a leaf is a node that has no child elements
    if not leaf_only or len(node) == 0:
        all_nodes.append(cur_path)

    for c in node:
        all_nodes.extend(crawl_child_tags(c, leaf_only=leaf_only, parent_path=cur_path + '/'))

    return all_nodes

### Trying out the method

In [None]:
# leaves only: tag paths of the elements that have no children
all_nodes = crawl_tree_tags(root, leaf_only=True)
len(all_nodes)

In [None]:
# peek at the first ten collected tag paths
all_nodes[:10]

In [None]:
# every node this time, inner nodes as well as leaves
all_nodes = crawl_tree_tags(root, leaf_only=False)
len(all_nodes)

In [None]:
# peek at the first ten collected tag paths
all_nodes[:10]

# Crawling files for summary stats and for the node2file dict

In [6]:
def crawl_files(root_path, leaf_only=True):
    """
    crawls all of the xml files in the root dir and counts the type of nodes in each
    builds the dictionary that helps us go from node name to list of files,
    used in further investigating specific nodes
    :param root_path: directory that is searched recursively for xml files
    :param leaf_only: if true only leaf nodes are counted, otherwise every node
    :return: tuple (df, node2file, error_files)
        df: one row per node path with the columns total_count, file_count,
            unique_per_file, level, avg_per_file and pct_files
        node2file: node path -> list of files containing it, one entry per occurrence
        error_files: files that could not be read or parsed
    """
    column_names = ['total_count', 'file_count', 'unique_per_file']
    all_fps = list_files(root_path, req_exts='xml')
    node_dict = {}  # for building summary dataframe
    node2file = {}  # for building the node to file list dictionary
    error_files = []

    for fp in tqdm(all_fps):
        # only reading / parsing is guarded, so that programming errors in the
        # bookkeeping below surface instead of silently marking every file as bad
        try:
            root = ET.parse(fp).getroot()
        except (ET.ParseError, OSError, ValueError):
            # ValueError is kept in the net for unsupported encodings — TODO confirm
            error_files.append(fp)
            continue

        # nodes already seen in this file (a set keeps the membership test cheap)
        cur_file_nodes = set()

        for cur_node in crawl_tree_tags(root, leaf_only=leaf_only):
            # --- processing node2file ---
            # note: duplicate nodes are added multiple times on purpose here
            node2file.setdefault(cur_node, []).append(fp)

            # --- processing node_dict ---
            stats = node_dict.setdefault(cur_node, {column_names[0]: 0,
                                                    column_names[1]: 0,
                                                    column_names[2]: True})
            stats[column_names[0]] += 1
            if cur_node in cur_file_nodes:
                # not the first occurrence of this node in this file
                stats[column_names[2]] = False  # not unique
            else:
                # first occurrence of this node in this file
                cur_file_nodes.add(cur_node)
                stats[column_names[1]] += 1

    df = pd.DataFrame.from_dict(node_dict, orient='index', columns=column_names)

    # some convenience calcs on the final dataframe
    df.index.name = 'node'
    # depth of the node = number of '/' separators in its path ('./a/b' -> 2)
    df['level'] = [node.count('/') for node in df.index]
    df['avg_per_file'] = df['total_count'] / df['file_count']
    # share of all listed files, including the ones that failed to parse
    df['pct_files'] = df['file_count'] / len(all_fps)

    return df, node2file, error_files
    

## Crawl samples

In [None]:
# crawl the sample directory counting leaf nodes only; didnotcrawl receives the
# third return value of crawl_files (its error_files list)
df_sample, node2file_sample, didnotcrawl = crawl_files(sample_path, leaf_only=True)

In [None]:
# persist the sample crawl: summary table as csv, node -> files map as pickle
df_sample.to_csv('res_sample.csv')
# the context manager closes the file even when dump() raises
with open('node2file_sample.p', 'wb') as fh:
    pickle.dump(node2file_sample, fh)

## Crawl all files

In [None]:
# same crawl over the full directory, leaf nodes only
# note: didnotcrawl is rebound here, replacing the value from the sample crawl
df, node2file, didnotcrawl = crawl_files(all_path, leaf_only=True)

In [None]:
# persist the full crawl: summary table as csv, node -> files map as pickle
df.to_csv('res.csv')
# the context manager closes the file even when dump() raises
with open('node2file.p', 'wb') as fh:
    pickle.dump(node2file, fh)

## Some plots of stats

In [None]:
# coverage of every field, best-covered first
coverage_desc = df['pct_files'].sort_values(ascending=False).values

plt.figure(figsize=(8, 6))
plt.plot(coverage_desc, label='coverage')
plt.xlabel('field number')
plt.ylabel('pct coverage')
plt.title('Fields sorted by % coverage')
plt.legend()
plt.show()

In [None]:
plt.figure(figsize=(8, 6))
plt.title('coverage by XML depth')
plt.xlabel('field number')
plt.ylabel('pct coverage')

# one curve per nesting level, each sorted from best to worst coverage
for cur_lvl in sorted(df['level'].unique()):
    lvl_coverage = df.loc[df['level'] == cur_lvl, 'pct_files']
    plt.plot(lvl_coverage.sort_values(ascending=False).values, label='depth=' + str(cur_lvl))

plt.legend()
plt.show()

In [None]:
# Memory Usage?
# sys.getsizeof is shallow: this adds up the dict object and each list object,
# not the path strings the lists point to
tot_size = sys.getsizeof(node2file) + sum(sys.getsizeof(file_list) for file_list in node2file.values())

tot_size / 1024 / 1024  # in MB

## Sampling Data Text

In [None]:
def sample_node_text(node, node2file, n=10, apply_fn=None, include_fn=False):
    """
    provided a node string and a node2file map, sample the texts in the raw files
    returns either a list or a dict depending on the include_fn parameter
    :param node: string rep of a node; a KeyError is raised if it is not in node2file
    :param node2file: pre-crawled dict of node string -> files that contain the node
    :param n: number of distinct files to sample; capped at the number available
    :param apply_fn: applies a function to each sampled text
    :param include_fn: if true, returns a dict of filename -> field text instead
    """
    # node2file lists a file once per occurrence of the node, so collapse the
    # repeats (keeping first-seen order) in order to sample over distinct files
    all_files = list(dict.fromkeys(node2file[node]))
    tot_files = len(all_files)

    if tot_files < n:
        print("only %s files with this node were found, returning all of them" % tot_files)
        n = tot_files

    # sample without replacement: no file is picked twice, and asking for
    # n == tot_files really returns every file
    chosen_files = random.sample(all_files, n)
    texts = get_field_from_files(node, chosen_files, apply_fn)
    if include_fn:
        return dict(zip(chosen_files, texts))
    return texts
    
def get_field_from_files(node, chosen_files, apply_fn=None):
    """
    loads each file and attempts to extract the node from that file, returns the text from that file
    with an optional apply_fn to apply to the text
    :param node: string representation of a node
    :param chosen_files: iterable of file locations
    :param apply_fn: applies a transformation function to each sampled text
    :return: list with one entry per file, in the order of chosen_files
    """
    rt_arr = []
    for cur_file in chosen_files:
        raw_text = extract_text_from_file(node, cur_file)
        rt_arr.append(raw_text if apply_fn is None else apply_fn(raw_text))
    return rt_arr

def extract_text_from_file(node, cur_file):
    """
    attempts to extract the text of a node from an xml file
    :param node: string representation of a node (a path handed to Element.findtext)
    :param cur_file: string path of the file to load
    :return: text of the first element matching node; None when nothing matches or
        when the file could not be read / parsed (an error line is printed in that case)
    """
    try:
        root = ET.parse(cur_file).getroot()
        return root.findtext(node)
    except (OSError, SyntaxError) as err:
        # ET.ParseError derives from SyntaxError, which the path parser also
        # raises for a malformed node path
        print("ERROR: could not extract node %s from file %s (%s)" % (node, cur_file, err))
        return None
    

In [None]:
# crawled paths are built from the element tags, so the spelling has to match
# the xml element name exactly (clinical_results)
cur_node = './clinical_results/outcome_list/outcome/group_list/group/title'
sample_text = sample_node_text(cur_node, node2file, 5, include_fn=True)
sample_text

In [None]:
n_samples = 5
res_dict = {}

for cur_node in tqdm(node2file.keys()):
    sample_text = sample_node_text(cur_node, node2file, n_samples)
    # label the samples ex1, ex2, ... in the order they were returned
    res_dict[cur_node] = {'ex' + str(pos): text for pos, text in enumerate(sample_text, start=1)}
    

In [None]:
# res_dict maps node -> {ex1: ..., ex2: ...}; transposing gives one row per node
# and one column per sampled example
df_res = pd.DataFrame(res_dict).T
df_res.index.name = 'node'
df_res.head()