# 1. Benchmarking listing files in a folder

In [65]:
from os import listdir
from os.path import join
import pandas as pd

In [50]:
# Directory used by every benchmark in this notebook: its entries are listed
# in section 1 and read as JSON files in section 2.
folder = 'a.data_folder'

In [57]:
#1. list comprehension
# Baseline: copy the names returned by listdir(folder) into a new list.
# The assignment inside %timeit is not kept in the notebook namespace, which
# is why filelist is built again in a separate, untimed cell.
%timeit filelist = [f for f in listdir(folder)]

1000 loops, best of 3: 992 µs per loop


In [71]:
# Untimed run that keeps the result: the JSON-loading benchmarks in section 2
# iterate over this list.
filelist = [f for f in listdir(folder)]

In [52]:
#2. function
def list_files_in_folder(folder):
    """Return the entry names of `folder` as a new list.

    Deliberately written as an explicit append loop: this is the
    "plain function" variant of the listing benchmark, to be compared
    against the list-comprehension version.
    """
    names = []
    for entry in listdir(folder):
        names.append(entry)
    return names

# Time the explicit-loop function defined above on the same folder.
%timeit filelist = list_files_in_folder(folder)

1000 loops, best of 3: 1.12 ms per loop


In [35]:
#3. Cython
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [53]:
%%cython
from os import listdir

# Declared `cpdef` (not `cdef`) so the compiled function is exported to the
# notebook namespace. A plain `cdef` function is only callable from Cython
# code, so a later `%timeit list_files_in_folder(folder)` would silently keep
# calling the pure-Python function of the same name instead of this one.
cpdef list_files_in_folder(str folder):
    """Return the entry names of `folder` as a list (Cython-typed variant).

    Same explicit append loop as the pure-Python version, with the loop
    variable and the result list statically typed.
    """
    cdef str file
    cdef list filelist
    filelist = []
    for file in listdir(folder):
        filelist.append(file)
    return filelist

In [54]:
# Times whichever list_files_in_folder is currently bound in the notebook
# namespace. A function declared only with `cdef` in the %%cython cell is not
# visible from Python, so that cell must expose a Python-callable (def/cpdef)
# function for this line to measure the compiled version.
%timeit filelist = list_files_in_folder(folder)

1000 loops, best of 3: 1.08 ms per loop


In [61]:
#4. Joblib parallel
from joblib import Parallel, delayed

def joblib_list_files(folder):
    """Return the entry names of `folder` as a new list."""
    names = list(listdir(folder))
    return names

def x(a):
    """Identity function: hand back `a` unchanged.

    Serves as the per-item task dispatched through joblib in the listing
    benchmark, so the work done per task is effectively nothing.
    """
    unchanged = a
    return unchanged

# NOTE(review): listdir(folder) still runs serially in the parent process; what
# is parallelised is one call of the identity function x per entry, so this
# mostly measures joblib's dispatch overhead. joblib_list_files is defined
# above but is not used by this statement.
%timeit joblib_filelist = Parallel(n_jobs=8)(delayed(x)(a) for a in listdir(folder)) 

1 loop, best of 3: 191 ms per loop


# 2. Create an aggregate dataframe from json files

In [92]:
def json_to_df(file,folder):
    """Transform a json file into a Pandas dataframe row.

    Parameters
    ----------
    file : str
        Name of the JSON file inside `folder`.
    folder : str
        Directory containing `file`.

    Returns
    -------
    pandas.DataFrame
        A single-row frame: the file is parsed as a Series
        (``orient='index'``) and transposed, so the Series labels become
        the columns (one per top-level key, assuming the file holds a JSON
        object).
    """
    # Build the path with os.path.join instead of folder + '/' + file: it uses
    # the platform separator, does not double a trailing separator, and keeps
    # an empty `folder` relative instead of pointing at the filesystem root.
    path = join(folder, file)
    series = pd.read_json(path, orient='index', typ='series', dtype=True)
    return series.to_frame().transpose()

In [73]:
#1. list comprehension

# Serial baseline: build one single-row frame per entry of filelist, in this process.
%timeit frames = [json_to_df(f, folder) for f in filelist]

1 loop, best of 3: 3.44 s per loop


In [96]:
#2. Parallel

def create_frames(file,folder):
    """Load one JSON file from `folder` as a single-row dataframe.

    Thin wrapper around json_to_df, used as the per-file task handed to
    joblib's `delayed`.
    """
    row = json_to_df(file, folder)
    return row

# Same per-file work as the serial baseline, dispatched through joblib with
# n_jobs=7; each task calls create_frames on one file.
%timeit frames1 = Parallel(n_jobs=7)(delayed(create_frames)(file,folder) for file in filelist)

1 loop, best of 3: 1.25 s per loop


In [100]:
# Untimed run that actually keeps the list of frames in frames1.
# NOTE(review): uses n_jobs=8 whereas the timed run uses n_jobs=7 - confirm
# which setting is intended.
frames1 = Parallel(n_jobs=8)(delayed(create_frames)(file,folder) for file in filelist)

In [108]:
#3. Parallel process
from parallel_process import parallel_process

def create_frames(file):
    """Load one JSON file as a single-row dataframe (one-argument variant).

    parallel_process is called with just the item list and the callable, so
    the directory is taken from the notebook-level `folder` variable rather
    than from a parameter.

    NOTE: this redefinition replaces the two-argument create_frames defined
    earlier in the notebook; cells that call create_frames(file, folder)
    will fail if they are re-run after this one.
    """
    # `folder` is only read here, so no `global` declaration is required.
    row = json_to_df(file, folder)
    return row

# Time the project-local parallel_process helper on the same per-file task.
# NOTE(review): `[file for file in filelist]` is just a copy of filelist; the
# helper's exact contract is not visible in this notebook.
%timeit frames2 = parallel_process([file for file in filelist],create_frames, n_jobs=7)

100%|██████████| 1.14K/1.14K [00:00<00:00, 3.63Kit/s]
1135it [00:00, 207720.35it/s]
100%|██████████| 1.14K/1.14K [00:00<00:00, 1.24Kit/s]
1135it [00:00, 208676.41it/s]
100%|██████████| 1.14K/1.14K [00:00<00:00, 2.64Kit/s]
1135it [00:00, 184281.15it/s]
100%|██████████| 1.14K/1.14K [00:00<00:00, 1.73Kit/s]
1135it [00:00, 180043.68it/s]

1 loop, best of 3: 1.49 s per loop





# Conclusion

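* Stick to the list comprehension for the first algorithm (listing files): at about 1 ms it is the fastest variant measured, and dispatching through joblib (191 ms) only adds overhead.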
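* Parallelize the second algorithm (building the dataframes): joblib cuts the run from 3.44 s to 1.25 s. `parallel_process` is somewhat slower (1.49 s) because of some overhead from tqdm, but its progress bar at least gives visibility on timing.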
