# Find Raw Data from Folders of NWB files

### from a folder of NWB files like this:

```
nwb-files-folder/
    arbitrary-subdirectory-structure/
        2023_04_20-13-06-20.nwb
        2023_04_20-15-26-40.nwb
        2023_04_20-17-06-20.nwb
        ...
    folder2/
        2024_04_20-13-06-20.nwb
        ...
    ...
```

### It will generate a `reformat.py` script that you can execute in your raw data folder:


```
raw/
    arbitrary-subdirectory-structure/
        TSeries-04202023-001/
        TSeries-04202023-002/
        TSeries-04202023-003/
        TSeries-04202023-004/
        TSeries-04202023-005/
        2023_04_20/13-06-20/
        2023_04_20/14-28-40/
        2023_04_20/15-26-40/
        2023_04_20/17-06-20/
        ...
    folder2/
        ...
    ...
```
as:
```
python reformat.py ./raw/ REFORMATED_FOLDER
```

### to copy all the raw data related to the NWB files into a folder `to-keep/`

```
REFORMATED_FOLDER/
        2023_04_20/
            13-06-20/
                TSeries-04202023-001/
            15-26-40/
                TSeries-04202023-003/
            17-06-20/
                TSeries-04202023-005/
        2023_04_20/
            ...
```

In [None]:
import sys, os
import numpy as np
sys.path.append(os.path.join(os.path.expanduser('~'), 'work', 'physion', 'src'))
import physion

In [None]:

def find_folder_infos(datafolder,
                      subfolder=None,
                      Nmax=1000,
                      verbose=True):
    """
    List the raw-data folder names associated with the NWB files of a folder.

    The NWB files are found with
    `physion.analysis.read_NWB.scan_folder_for_NWBfiles(datafolder)`,
    optionally restricted to the paths containing `subfolder`, sorted, and
    truncated to the first `Nmax` entries. Each file is then opened with
    `physion.analysis.read_NWB.Data`.

    Parameters
    ----------
    datafolder : folder scanned for NWB files
    subfolder : if not None, only file paths containing this string are kept
    Nmax : maximum number of (sorted) files that are opened
    verbose : print the file name and the extracted folder names
              (also forwarded to `physion.analysis.read_NWB.Data`)

    Returns
    -------
    (TSeries, DayFolders, TimeFolders) : three lists, one entry per file:
        - 'TSeries-' + the third 'TSeries-'-separated piece of the 'ophys'
          processing description, cut at its first '/'
        - the first 10 characters of the NWB identifier
        - the NWB identifier from its 12th character onwards
        (presumably the identifier looks like 'YYYY_MM_DD-HH-MM-SS',
         to be confirmed against the NWB files)
    """
    dataset = physion.analysis.read_NWB.scan_folder_for_NWBfiles(datafolder)

    # optional restriction to the paths containing `subfolder`
    candidates = dataset['files']
    if subfolder is not None:
        candidates = [path for path in candidates if (subfolder in path)]
    nwb_paths = np.sort(candidates)

    TSeries, DayFolders, TimeFolders = [], [], []
    for nwb_path in nwb_paths[:Nmax]:

        if verbose:
            print('- %s' % nwb_path)
        data = physion.analysis.read_NWB.Data(nwb_path, verbose=verbose)

        # name of the imaging folder, read from the ophys description
        description = str(data.nwbfile.processing['ophys'].description)
        tseries_suffix = description.split('TSeries-')[2].split('/')[0]
        TSeries.append('TSeries-' + tseries_suffix)
        if verbose:
            print('     * TSeries-folder: %s' % TSeries[-1])

        # day and time folders, read from the NWB identifier
        identifier = str(data.nwbfile.identifier)
        DayFolders.append(identifier[:10])
        TimeFolders.append(identifier[11:])
        if verbose:
            print('     * Day-folder: %s' % DayFolders[-1])
            print('     * Time-folder: %s' % TimeFolders[-1])

    return TSeries, DayFolders, TimeFolders

def find_subfolder(folder_name, root_folder,
                   day_folder=None):
    """
    Return the path of the first directory named `folder_name` found
    while walking `root_folder` (in `os.walk` order), or None if there is none.

    Parameters
    ----------
    folder_name : exact name of the directory to look for (last path component)
    root_folder : top of the directory tree that is walked
    day_folder : if not None, a candidate is only accepted when this string
                 appears somewhere in its full path

    Returns
    -------
    The matching directory path (str), or None when nothing matches.
    """
    # stop walking at the first match instead of listing the whole tree:
    # only the first hit is ever returned, and the tree can be very large
    for dirpath, _dirnames, _filenames in os.walk(root_folder):
        if os.path.basename(dirpath) != folder_name:
            continue
        if (day_folder is None) or (day_folder in dirpath):
            return dirpath
    return None

In [None]:
# Default locations, all expressed relative to the user's home directory
home_dir = os.path.expanduser('~')
# folder scanned for NWB files
datafolder = os.path.join(home_dir, 'CURATED', 'SST-WT-NR1-GluN3-2023')
# root of the folder tree to search
root_folder = os.path.join(home_dir, 'DATA', 'TADDY')
# name of the destination folder
new_folder = 'to-keep'
# (disabled) earlier usage, kept for reference:
# TSeries, DayFolders, TimeFolders = find_folder_infos(datafolder)
# script = build_bash_script(datafolder, new_folder, Nmax=1000, verbose=False)

In [None]:
# Scan the curated dataset: for every NWB file found there, collect the
# names of the TSeries / day / time folders to look for afterwards.
datafolder = os.path.join(os.path.expanduser('~'),
                          'CURATED', 'SST-WT-NR1-GluN3-2023')
TSeries, DayFolders, TimeFolders = find_folder_infos(datafolder, verbose=False)

In [None]:
# Same lookup on the DATA/TADDY tree
datafolder = os.path.join(os.path.expanduser('~'), 'DATA', 'TADDY')
# scan with verbose=True; the returned value is not used here
physion.analysis.read_NWB.scan_folder_for_NWBfiles(datafolder, verbose=True)
# only the NWB files whose path contains 'Assembled' are considered
folder_names = find_folder_infos(datafolder,
                                 subfolder='Assembled',
                                 verbose=False)
TSeries, DayFolders, TimeFolders = folder_names

In [None]:
# Header of the generated script: it defines the `root_folder`,
# `root_folder2` and `target_folder` variables used by the rest of the script.
# NOTE: both assignments below are executed, so the second one ("on the NAS")
# overwrites the first one ("for local test"). Comment out the NAS block to
# generate a script for the local machine.
# for local test
script = """
root_folder = "/home/yann.zerlaut/DATA/TADDY/"
root_folder2 = "/home/yann.zerlaut/DATA/TADDY/"
target_folder = "/home/yann.zerlaut/DATA/SST-WT-GluN1KO-GluN3KO-2023"
"""
# on the NAS:
script = """
root_folder = "/volume1/Taddy/GluN3_V1_InVivo_Imaging/processed/"
root_folder2 = "/volume1/Taddy/GluN3_V1_InVivo_Imaging/raw/"
target_folder = "/volume1/Taddy/SST-WT-GluN1KO-GluN3KO-2023"
"""


# Body of the generated script (plain Python source, appended as text).
# Once written to disk and executed, the generated script:
#   - creates `target_folder` if needed
#   - defines `find_subfolder`, which returns the first directory with a given
#     name found under a root folder (optionally requiring `day_folder` to
#     appear in its path), or None
#   - defines `ignore_files`, passed to `shutil.copytree` so that regular files
#     are skipped when their directory path contains 'FaceCamera' or
#     'RigCamera', when their name contains both 'TSeries' and '.tif', or when
#     their name contains 'data.bin'
#   - defines `copy_folders`, which copies the time folder to
#     target_folder/dayfolder/timefolder and the imaging folder inside it,
#     only when both have been found
# NOTE: this is a regular (non-raw) string, so every backslash directly
# followed by a newline below is consumed as a line continuation of the
# string itself: in the generated file those statements end up on one line.
# NOTE(review): the imaging folder is searched in `root_folder` and then in
# `root_folder2`, whereas the time folder is only searched in `root_folder`
# -- confirm that this asymmetry is intended.
script+= """
import os, sys, shutil, pathlib

# create target folder if not existing
pathlib.Path(target_folder).mkdir(parents=True, exist_ok=True)

def find_subfolder(folder_name, root_folder,
                   day_folder=None):
    if day_folder is not None:
        List = [f[0] for f in os.walk(root_folder)\
                if (f[0].split(os.path.sep)[-1]==folder_name) and (day_folder in f[0])]
    else:
        List = [f[0] for f in os.walk(root_folder)\
                        if f[0].split(os.path.sep)[-1]==folder_name]
    if len(List)>0:
        return List[0]
    else:
        return None

####################################################
###       copy folders 
####################################################

def do_not_include(Dir, f):
    return ('FaceCamera' in Dir) or ('RigCamera' in Dir) or\
            (('TSeries' in f) and ('.tif' in f)) or\
            ('data.bin' in f)

def ignore_files(Dir, files):
    return [f for f in files if (os.path.isfile(os.path.join(Dir, f)) and\
            do_not_include(Dir, f))]

def copy_folders(tseries, dayfolder, timefolder, i):
    print('- %i) : ' % i, dayfolder, timefolder)
    # create day folder
    pathlib.Path(os.path.join(target_folder, dayfolder)).mkdir(parents=True, exist_ok=True)
    # find imaging
    imaging = find_subfolder(tseries, root_folder)
    if imaging is None:
        imaging = find_subfolder(tseries, root_folder2)
    if imaging is None:
        print(' !! NOT FOUND ', tseries)
    timeF = find_subfolder(timefolder, root_folder, day_folder=dayfolder)

    if (timeF is not None) and (imaging is not None):
        print('    -> copying [...]')
        shutil.copytree(timeF, 
                        os.path.join(target_folder, dayfolder, timefolder),
                        dirs_exist_ok=True,
                        ignore=ignore_files)
        shutil.copytree(imaging, 
                        os.path.join(target_folder, dayfolder, timefolder, tseries),
                        dirs_exist_ok=True,
                        ignore=ignore_files)
    else:
        print('    -> DATA NOT FOUND ! ')
"""

# Append one `copy_folders(...)` call per recording, numbered from 1
call_template = '\ncopy_folders("%s", "%s", "%s", %i)\n'
n = 0
for n, (tseries_name, day_name, time_name) in enumerate(
        zip(TSeries, DayFolders, TimeFolders), start=1):
    script += call_template % (tseries_name, day_name, time_name, n)

# save the generated Python script in the current working directory
script_name = 'rebuild.py'
with open(script_name, 'w') as script_file:
    script_file.write(script)

In [None]:
#cat rebuild.py