In [7]:
import os
import sys
import blosc
import time

def read_data(filename):
    with open(filename, 'rb') as f:
        return f.read()

def write_compressed(filepath, bytedata):
    with open(filepath, 'wb') as f:
        f.write(bytedata)

if __name__ == "__main__":
    
    # choose the compression and file extension you want here
    # zstandard gives good compression rates and is fast for clevel = 1
    compression_type = 'zstd'
    compression_level = 1 # fast for zstandard
    file_ext = '.zst' # file extension to append to dax file
    
    # this will delete the dax file once the compressed file is written
    delete = True
    
    # check the data by reading it back in uncompressing, and checking against the original
    # unecessary...
    check = False
    
    
    for root, dirs, files in os.walk(os.getcwd()):
        for file in files:
            if file.endswith('.dax'):
                
                file_path = os.path.join(root, file)
                file_size = os.stat(file_path).st_size
                
                print('working on: ' + str(file_path))
                
                t0 = time.time()
                raw = read_data(file_path)
                
                t1 = time.time()
                compressed = blosc.compress(raw, cname = compression_type, clevel = compression_level)
                t2 = time.time()
                
                compressed_path = file_path + file_ext
                write_compressed(compressed_path, compressed)
                compressed_size = os.stat(compressed_path).st_size
                print('compression time: ' + '{:.2f}'.format(t2-t1))
                print('total time: ' + '{:.2f}'.format(t2-t0))
                
                # delete the uncompressed file after it is compressed 
                if delete:
                    deleted = False
                    if os.path.exists(compressed_path):
                        os.remove(file_path)
                        deleted = True
                        
                # read back in the data to see if it is the same
                if check:
                    decompressed = blosc.decompress(read_data(compressed_path))
                    sameQ = raw == decompressed


                logfile_path = os.path.join(root, 'compression_log.txt')
                
                # write something to a log
                with open(logfile_path, 'a+') as logfile:
                    logfile.write('compressing file: ' + file_path + '\n')
                    logfile.write('compression time: ' + '{:.2f}'.format(t2 - t1) + ' s\n' )
                    logfile.write('total time: ' + '{:.2f}'.format(t2 - t0) + ' s\n' )
                    logfile.write('compression amount: ' + str(int(compressed_size/1024)) + '/' + str(int(file_size/1024)) + '\t' + '{:.2f}'.format(100 * compressed_size/file_size) + '%\n')
                    
                    if check:
                        if  sameQ:
                            logfile.write('compression passed test\n')
                        else:
                            logfile.write('compression failed test\n')
                    if not delete:
                        logfile.write('dax file not removed\n')
                    else:
                        if deleted:
                            logfile.write('dax file removed\n')
                        else:
                            logfile.write('warning check files\n')
                    logfile.write('\n')

In [2]:
src_folder = r'\\10.245.74.218\Raw_data\Bogdan\7_27_2019_IMR90RNA'
tar_folder = r'\\10.245.74.212\Chromatin_NAS_2\20190727_IMR90_intron-DNA-MERFISH'

In [3]:
full_fds = [os.path.join(src_folder, _fl) for _fl in os.listdir(src_folder) if _fl[0]=='H' or _fl[:2]=='Ab']

fov_basenames = [_fl.split(os.extsep)[0] for _fl in sorted(os.listdir(full_fds[0])) if '.dax' in _fl]


In [4]:
from shutil import copyfile

import blosc
import numpy as np
from tqdm import tqdm

In [17]:
%%time
sel_fov_ids = np.arange(5,10)

source_filenames, target_filenames = [], []

for _fov_id in sel_fov_ids:
    _fov_basename = fov_basenames[_fov_id]
    for _fd in full_fds:
        # target_fls 
        _src_fls = [os.path.join(_fd, _fl) for _fl in os.listdir(_fd) if _fov_basename in _fl]
        _tar_fls = [_fl.replace(src_folder, tar_folder) for _fl in _src_fls]
        
        # append
        source_filenames.extend(_src_fls)
        target_filenames.extend(_tar_fls)
print(len(source_filenames))

2905
Wall time: 11.3 s


In [None]:
overwrite = False
verbose = False

for _src_fl, _tar_fl in tqdm(zip(source_filenames, target_filenames)):
    
    _tar_fd = os.path.dirname(_tar_fl)
    if not os.path.exists(_tar_fd):
        if verbose:
            print(f"Create folder: {_tar_fd}")
        os.makedirs(_tar_fd)
    
    if _src_fl.split(os.extsep)[-1] == 'zst':
        _tar_fl = _tar_fl.split('.zst')[0]
        if overwrite or not os.path.exists(_tar_fl):
            if verbose:
                print(f"decopress from {_src_fl} to {_tar_fl}")
            decompressed = blosc.decompress(read_data(_src_fl))
            write_compressed(_tar_fl, decompressed)
    else:
        if overwrite or not os.path.exists(_tar_fl):
            if verbose:
                print(f"copy from {_src_fl} to {_tar_fl}")
            copyfile(_src_fl, _tar_fl)

2657it [6:51:18,  9.27s/it]