## filestacker - example

In [1]:
#import libraries

from os.path import realpath
# please review file_stacker.py for more information on how pandas dataframes were loaded
from file_stacker import FileStack#files_dict, create_dfs

### Load data

In [3]:
# Directory holding the example files; 'data' is resolved against the
# current working directory.
path_to_files = realpath('data')

# Tuple of file extensions to load.
extensions = ('csv', 'json')

# Build the FileStack object for that directory and extension set.
files = FileStack(
    path=path_to_files,
    extensions=extensions,
)

# Display the dictionary describing the files found in the data directory.
files.files_dict()

{'Dice_1.csv': {'name': 'Dice_1',
  'ext': '.csv',
  'path': 'C:\\Users\\user\\Dropbox\\filestacker\\data',
  'tag': None,
  'delim': 'NA'},
 'Dice_2.csv': {'name': 'Dice_2',
  'ext': '.csv',
  'path': 'C:\\Users\\user\\Dropbox\\filestacker\\data',
  'tag': None,
  'delim': 'NA'}}

In [4]:
# Each file holds the results of 1000 trials of rolling a single die.

# files.dfs() returns two objects:
#   df_list - the list of dataframes loaded from the directory
#   df_map  - a dictionary keyed by dataframe name; each entry records the
#             dataframe's position in df_list ('ix'), its source file
#             ('source') and its row count ('row_count')


df_list, df_map = files.dfs()

In [5]:
# Walk the dataframe map: print each entry's metadata, preview the matching
# dataframe from df_list, then print a separator line.
for name, meta in df_map.items():
    position = meta['ix']  # location of this dataframe inside df_list
    print(
        'name:', name,
        '\nindex:', position,
        '\nsource:', meta['source'],
        '\nrow count:', meta['row_count'],
        '\n',
    )
    display(df_list[position].head())
    print('-' * 20)


name: Dice_1 
index: 0 
source: Dice_1.csv 
row count: 1000 



Unnamed: 0,RESULT
0,2
1,4
2,3
3,4
4,2


--------------------
name: Dice_2 
index: 1 
source: Dice_2.csv 
row count: 1000 



Unnamed: 0,RESULT
0,2
1,4
2,4
3,3
4,2


--------------------


In [6]:
# Stack the dataframes selected by slice(0, 2) (indices 0 and 1) on the 'RESULT'
# column; the resulting shape (2000, 1) shows both were combined row-wise.
files.stack(df_indices=slice(0,2), cols=['RESULT']).shape

(2000, 1)

In [7]:
# Aggregation can be requested in the same call: with agg_cols / agg_funcs
# supplied, the result has one row per RESULT value with its count.
# NOTE(review): in the saved output the count for RESULT == 1 (42) is far below
# the counts for 2-6 (roughly 380-400) - confirm the sample data is meant to be skewed.
files.stack(
    df_indices=slice(0, 2),
    cols=['RESULT'],
    agg_cols=['RESULT'],
    agg_funcs=['count'],
)

Unnamed: 0,RESULT,RESULT_count
0,1,42
1,2,404
2,3,377
3,4,386
4,5,387
5,6,404


In [8]:
# A dictionary of filters can also be applied when stacking; see the
# documentation for the full list of supported operators.

# Keep only the odd die values.
filters = {
    0: dict(column='RESULT', operator='in', value=[1, 3, 5]),
}

In [9]:
# Same count aggregation, now with the filter dictionary applied: only the
# odd values appear in the result.
files.stack(
    df_indices=slice(0, 2),
    cols=['RESULT'],
    agg_cols=['RESULT'],
    agg_funcs=['count'],
    filter_dict=filters,
)

Unnamed: 0,RESULT,RESULT_count
0,1,42
1,3,377
2,5,387


In [None]:
# uncomment to view the documentation for the FileStack.files_dict method
# FileStack.files_dict?