# Find Raw Data from Folders of NWB files

### from a folder of NWB files like this:

```
nwb-files-folder/
    arbitrary-subdirectory-structure/
        2023_04_20-13-06-20.nwb
        2023_04_20-15-26-40.nwb
        2023_04_20-17-06-20.nwb
        ...
    folder2/
        2024_04_20-13-06-20.nwb
        ...
    ...
```

### It will generate a `reformat.py` script, that you can execute in your raw data folder:


```
raw/
    arbitrary-subdirectory-structure/
        TSeries-04202023-001/
        TSeries-04202023-002/
        TSeries-04202023-003/
        TSeries-04202023-004/
        TSeries-04202023-005/
        2023_04_20/13-06-20/
        2023_04_20/14-28-40/
        2023_04_20/15-26-40/
        2023_04_20/17-06-20/
        ...
    folder2/
        ...
    ...
```
as:
```
python reformat.py ./raw/ REFORMATED_FOLDER
```

### to copy all the raw data related to the NWB files in a folder `to-keep/`

```
REFORMATED_FOLDER/
        2023_04_20/
            13-06-20/
                TSeries-04202023-001/
            15-26-40/
                TSeries-04202023-003/
            17-06-20/
                TSeries-04202023-005/
        2023_04_20/
            ...
```

In [1]:
import sys, os
import numpy as np
sys.path.append(os.path.join(os.path.expanduser('~'), 'work', 'physion', 'src'))
import physion

236


In [8]:

def find_folder_infos(datafolder, 
                      subfolder='',
                      Nmax=1000,
                      verbose=True):
    # Find NWBs
    Files = []
    for rF, _, Fs in os.walk(datafolder):
        for f in Fs:
            if ('.nwb' in f) and (subfolder in rF):
                Files.append(os.path.join(rF, f))
    Files = np.sort(np.array(Files))

    # find TSeries and day-time folders
    TSeries, DayFolders, TimeFolders = [], [], []
    for i, filename in enumerate(Files[:Nmax]):

        if verbose:
            print('- %s' % filename)
        data = physion.analysis.read_NWB.Data(filename, verbose=verbose)

        Description = str(data.nwbfile.processing['ophys'].description)
        # print('     * %s' % Description)
        TSeries.append('TSeries-'+Description.split('TSeries-')[2].split('/')[0])
        if verbose:
            print('     * TSeries-folder: %s' % TSeries[-1])
        DayFolders.append(str(data.nwbfile.identifier)[:10])
        TimeFolders.append(str(data.nwbfile.identifier)[11:])
        if verbose:
            print('     * Day-folder: %s' % DayFolders[-1])
            print('     * Time-folder: %s' % TimeFolders[-1])

    return TSeries, DayFolders, TimeFolders

def find_subfolder(folder_name, root_folder,
                   day_folder=None):
    if day_folder is not None:
        List = [f[0] for f in os.walk(root_folder)\
                if ((f[0].split(os.path.sep)[-1]==folder_name) and (day_folder in f[0]))]
    else:
        List = [f[0] for f in os.walk(root_folder)\
                        if f[0].split(os.path.sep)[-1]==folder_name]
    if len(List)>0:
        return List[0]
    else:
        return None

In [None]:
datafolder = os.path.join(os.path.expanduser('~') ,'CURATED', 'SST-WT-NR1-GluN3-2023')
root_folder = os.path.join(os.path.expanduser('~') , 'DATA', 'TADDY')
new_folder = 'to-keep'
# TSeries, DayFolders, TimeFolders = find_folder_infos(datafolder)
#script = build_bash_script(datafolder, new_folder, Nmax=1000, verbose=False)

In [None]:
# folder where we look for NWB files
datafolder = os.path.join(os.path.expanduser('~') ,'CURATED', 'SST-WT-NR1-GluN3-2023')
# find names of subfolders to look for:
TSeries, DayFolders, TimeFolders = find_folder_infos(datafolder, 
                                                     verbose=False)

In [None]:
# folder where we look for NWB files
datafolder = os.path.join(os.path.expanduser('~') , 'DATA', 'TADDY')
# find names of subfolders to look for:
TSeries, DayFolders, TimeFolders = find_folder_infos(datafolder, 
                                                     subfolder='Assembled',
                                                     verbose=False)

In [None]:
# on the NAS:
script = """
root_folder = "/volume1/Taddy/GluN3_V1_InVivo_Imaging/processed/"
root_folder2 = "/volume1/Taddy/GluN3_V1_InVivo_Imaging/raw/"
target_folder = "/volume1/Taddy/SST-WT-GluN1KO-GluN3KO-2023"
"""
# for local test
script = """
root_folder = "/home/yann.zerlaut/DATA/TADDY/"
root_folder2 = "/home/yann.zerlaut/DATA/TADDY/"
target_folder = "/home/yann.zerlaut/UNPROCESSED/SST-WT-GluN1KO-GluN3KO-2023"
"""


script+= """
import os, sys, shutil, pathlib

# create target folder if not existing
pathlib.Path(target_folder).mkdir(parents=True, exist_ok=True)

def find_subfolder(folder_name, root_folder,
                   day_folder=None):
    if day_folder is not None:
        List = [f[0] for f in os.walk(root_folder)\
                if (f[0].split(os.path.sep)[-1]==folder_name) and (day_folder in f[0])]
    else:
        List = [f[0] for f in os.walk(root_folder)\
                        if f[0].split(os.path.sep)[-1]==folder_name]
    if len(List)>0:
        return List[0]
    else:
        return None

####################################################
###       copy folders 
####################################################

def do_not_include(Dir, f):
    return ('FaceCamera' in Dir) or ('RigCamera' in Dir) or\
            (('TSeries' in f) and ('.tif' in f)) or\
            ('data.bin' in f)

def ignore_files(Dir, files):
    return [f for f in files if (os.path.isfile(os.path.join(Dir, f)) and\
            do_not_include(Dir, f))]

def copy_folders(tseries, dayfolder, timefolder, i):
    print('- %i) : ' % i, dayfolder, timefolder)
    # create day folder
    pathlib.Path(os.path.join(target_folder, dayfolder)).mkdir(parents=True, exist_ok=True)
    # find imaging
    imaging = find_subfolder(tseries, root_folder)
    if imaging is None:
        imaging = find_subfolder(tseries, root_folder2)
    if imaging is None:
        print(' !! NOT FOUND ', tseries)
    timeF = find_subfolder(timefolder, root_folder, day_folder=dayfolder)

    if (timeF is not None) and (imaging is not None):
        print('    -> copying [...]')
        shutil.copytree(timeF, 
                        os.path.join(target_folder, dayfolder, timefolder),
                        dirs_exist_ok=True,
                        ignore=ignore_files)
        shutil.copytree(imaging, 
                        os.path.join(target_folder, dayfolder, timefolder, tseries),
                        dirs_exist_ok=True,
                        ignore=ignore_files)
    else:
        print('    -> DATA NOT FOUND ! ')
"""

n=0
for i, d, t in zip(TSeries, DayFolders, TimeFolders):
    n+=1
    script += """
copy_folders("%s", "%s", "%s", %i)
""" % (i, d, t, n)

# write as a bash script
script_name = 'rebuild.py'
with open(script_name, 'w') as f:
    f.write(script)

In [None]:
#cat rebuild.py

In [21]:
import pandas

table = pandas.read_excel('/home/yann.zerlaut/UNPROCESSED/SST-WT-GluN1KO-GluN3KO-2023/Dataset.xlsx')

previous_NWBs = [f.replace('/', '-')+'.nwb' for f in table['folder']]
previous_NWBs

['2023_01_05-15-41-05.nwb',
 '2023_01_05-15-57-51.nwb',
 '2023_01_05-16-22-34.nwb',
 '2023_01_05-16-43-36.nwb',
 '2023_01_05-16-57-36.nwb',
 '2023_01_05-17-25-45.nwb',
 '2023_01_05-17-37-19.nwb',
 '2023_01_12-17-24-34.nwb',
 '2023_01_12-18-47-40.nwb',
 '2023_01_12-20-06-33.nwb',
 '2023_01_12-21-01-10.nwb',
 '2023_01_12-21-51-21.nwb',
 '2023_01_13-15-40-51.nwb',
 '2023_01_13-16-56-30.nwb',
 '2023_01_17-16-26-41.nwb',
 '2023_01_17-17-22-58.nwb',
 '2023_01_17-19-13-02.nwb',
 '2023_01_17-20-04-23.nwb',
 '2023_02_14-14-39-37.nwb',
 '2023_02_14-15-01-07.nwb',
 '2023_02_14-15-41-06.nwb',
 '2023_02_14-15-56-46.nwb',
 '2023_02_14-16-24-45.nwb',
 '2023_02_14-16-37-38.nwb',
 '2023_02_14-17-49-20.nwb',
 '2023_02_14-18-05-00.nwb',
 '2023_02_14-18-29-53.nwb',
 '2023_02_14-18-46-26.nwb',
 '2023_02_15-11-53-39.nwb',
 '2023_02_15-12-41-21.nwb',
 '2023_02_15-13-30-47.nwb',
 '2023_02_15-14-05-01.nwb',
 '2023_02_15-15-10-04.nwb',
 '2023_02_15-15-48-06.nwb',
 '2023_02_15-16-40-50.nwb',
 '2023_02_15-17-14-5

In [23]:
datafolder = os.path.join(os.path.expanduser('~') , 'DATA', 'TADDY')

NWBs, PATHS = [], []
newOnly = []
for rF, _, Fs in os.walk(datafolder):
    for f in Fs:
        if ('.nwb' in f) and ('Assembled' in rF):
            PATHS.append(os.path.join(rF, f))
            NWBs.append(f)
            if f not in previous_NWBs:
                newOnly.append(os.path.join(rF, f))
iNWBs = np.argsort(NWBs)
NWBs = np.array(NWBs)[iNWBs]
PATHS = np.array(PATHS)[iNWBs]
print(len(newOnly), len(PATHS), len(PATHS)-len(newOnly))

151 236 85


In [25]:
def find_folder_infos(Files, 
                      verbose=True):

    # find TSeries and day-time folders
    TSeries, DayFolders, TimeFolders = [], [], []
    for i, filename in enumerate(Files):

        if verbose:
            print('- %s' % filename)
        data = physion.analysis.read_NWB.Data(filename, verbose=verbose)

        Description = str(data.nwbfile.processing['ophys'].description)
        # print('     * %s' % Description)
        TSeries.append('TSeries-'+Description.split('TSeries-')[2].split('/')[0])
        if verbose:
            print('     * TSeries-folder: %s' % TSeries[-1])
        DayFolders.append(str(data.nwbfile.identifier)[:10])
        TimeFolders.append(str(data.nwbfile.identifier)[11:])
        if verbose:
            print('     * Day-folder: %s' % DayFolders[-1])
            print('     * Time-folder: %s' % TimeFolders[-1])

    return TSeries, DayFolders, TimeFolders

def find_subfolder(folder_name, root_folder,
                   day_folder=None):
    if day_folder is not None:
        List = [f[0] for f in os.walk(root_folder)\
                if ((f[0].split(os.path.sep)[-1]==folder_name) and (day_folder in f[0]))]
    else:
        List = [f[0] for f in os.walk(root_folder)\
                        if f[0].split(os.path.sep)[-1]==folder_name]
    if len(List)>0:
        return List[0]
    else:
        return None

In [26]:
TSeries, DayFolders, TimeFolders = find_folder_infos(newOnly, verbose=True)

- /home/yann.zerlaut/DATA/TADDY/session11/Assembled/2023_09_14-14-21-55.nwb


  warn("Ignoring cached namespace '%s' version %s because version %s is already loaded."
  warn("Ignoring cached namespace '%s' version %s because version %s is already loaded."
  warn("Ignoring cached namespace '%s' version %s because version %s is already loaded."
  warn("%s '%s': Length of data does not match length of timestamps. Your data may be transposed. "


NWB-file reading time: 1226.9ms
     * TSeries-folder: TSeries-09142023-004
     * Day-folder: 2023_09_14
     * Time-folder: 14-21-55
- /home/yann.zerlaut/DATA/TADDY/session11/Assembled/2023_09_14-11-32-06.nwb
NWB-file reading time: 1089.7ms
     * TSeries-folder: TSeries-09142023-001
     * Day-folder: 2023_09_14
     * Time-folder: 11-32-06
- /home/yann.zerlaut/DATA/TADDY/session11/Assembled/2023_09_14-15-19-42.nwb
NWB-file reading time: 1100.0ms
     * TSeries-folder: TSeries-09142023-005
     * Day-folder: 2023_09_14
     * Time-folder: 15-19-42
- /home/yann.zerlaut/DATA/TADDY/session11/Assembled/2023_09_13-18-10-37.nwb
NWB-file reading time: 1076.0ms
     * TSeries-folder: TSeries-09132023-002
     * Day-folder: 2023_09_13
     * Time-folder: 18-10-37
- /home/yann.zerlaut/DATA/TADDY/session11/Assembled/2023_09_13-17-19-24.nwb
NWB-file reading time: 1063.1ms
     * TSeries-folder: TSeries-09132023-001
     * Day-folder: 2023_09_13
     * Time-folder: 17-19-24
- /home/yann.zerlaut/

  warn("%s '%s': Length of data does not match length of timestamps. Your data may be transposed. "
  warn("%s '%s': Length of data does not match length of timestamps. Your data may be transposed. "
  warn("%s '%s': Length of data does not match length of timestamps. Your data may be transposed. "
  warn("%s '%s': Length of data does not match length of timestamps. Your data may be transposed. "
  warn("%s '%s': Length of data does not match length of timestamps. Your data may be transposed. "
  warn("%s '%s': Length of data does not match length of timestamps. Your data may be transposed. "
  warn("%s '%s': Length of data does not match length of timestamps. Your data may be transposed. "
  warn("%s '%s': Length of data does not match length of timestamps. Your data may be transposed. "
  warn("%s '%s': Length of data does not match length of timestamps. Your data may be transposed. "
  warn("%s '%s': Length of data does not match length of timestamps. Your data may be transposed. "


NWB-file reading time: 956.2ms
     * TSeries-folder: TSeries-08022023-006
     * Day-folder: 2023_08_02
     * Time-folder: 17-21-11
- /home/yann.zerlaut/DATA/TADDY/session8/Assembled/2023_08_01-18-18-15.nwb
NWB-file reading time: 1188.2ms
     * TSeries-folder: TSeries-08012023-003
     * Day-folder: 2023_08_01
     * Time-folder: 18-18-15
- /home/yann.zerlaut/DATA/TADDY/session8/Assembled/2023_08_03-16-34-13.nwb
NWB-file reading time: 967.2ms
     * TSeries-folder: TSeries-08032023-003
     * Day-folder: 2023_08_03
     * Time-folder: 16-34-13
- /home/yann.zerlaut/DATA/TADDY/session8/Assembled/2023_08_02-16-08-01.nwb
NWB-file reading time: 999.8ms
     * TSeries-folder: TSeries-08022023-004
     * Day-folder: 2023_08_02
     * Time-folder: 16-08-01
- /home/yann.zerlaut/DATA/TADDY/session8/Assembled/2023_08_03-17-10-02.nwb
NWB-file reading time: 1097.6ms
     * TSeries-folder: TSeries-08032023-004
     * Day-folder: 2023_08_03
     * Time-folder: 17-10-02
- /home/yann.zerlaut/DATA/TA

  warn("Ignoring cached namespace '%s' version %s because version %s is already loaded."
  warn("Ignoring cached namespace '%s' version %s because version %s is already loaded."


NWB-file reading time: 1103.1ms
     * TSeries-folder: TSeries-06052023-1635-001
     * Day-folder: 2023_06_05
     * Time-folder: 17-31-40
- /home/yann.zerlaut/DATA/TADDY/session5/Assembled/2023_06_07-15-29-08.nwb
NWB-file reading time: 1269.5ms
     * TSeries-folder: TSeries-06072023-003
     * Day-folder: 2023_06_07
     * Time-folder: 15-29-08
- /home/yann.zerlaut/DATA/TADDY/session5/Assembled/2023_06_06-16-16-14.nwb
NWB-file reading time: 1271.8ms
     * TSeries-folder: TSeries-06062023-1459-002
     * Day-folder: 2023_06_06
     * Time-folder: 16-16-14
- /home/yann.zerlaut/DATA/TADDY/session5/Assembled/2023_06_05-17-56-33.nwb
NWB-file reading time: 1363.9ms
     * TSeries-folder: TSeries-06052023-1635-003
     * Day-folder: 2023_06_05
     * Time-folder: 17-56-33
- /home/yann.zerlaut/DATA/TADDY/session5/Assembled/2023_06_08-14-13-08.nwb
NWB-file reading time: 1256.5ms
     * TSeries-folder: TSeries-06082023-1208-003
     * Day-folder: 2023_06_08
     * Time-folder: 14-13-08
- /ho

  warn("%s '%s': Length of data does not match length of timestamps. Your data may be transposed. "
  warn("%s '%s': Length of data does not match length of timestamps. Your data may be transposed. "
  warn("%s '%s': Length of data does not match length of timestamps. Your data may be transposed. "
  warn("%s '%s': Length of data does not match length of timestamps. Your data may be transposed. "
  warn("%s '%s': Length of data does not match length of timestamps. Your data may be transposed. "
  warn("%s '%s': Length of data does not match length of timestamps. Your data may be transposed. "


NWB-file reading time: 1199.8ms
     * TSeries-folder: TSeries-10062023-004
     * Day-folder: 2023_10_06
     * Time-folder: 16-21-50
- /home/yann.zerlaut/DATA/TADDY/session12/Assembled/2023_10_05-17-10-29.nwb
NWB-file reading time: 974.7ms
     * TSeries-folder: TSeries-10052023-005
     * Day-folder: 2023_10_05
     * Time-folder: 17-10-29
- /home/yann.zerlaut/DATA/TADDY/session12/Assembled/2023_10_06-17-00-22.nwb
NWB-file reading time: 1142.3ms
     * TSeries-folder: TSeries-10062023-005
     * Day-folder: 2023_10_06
     * Time-folder: 17-00-22
- /home/yann.zerlaut/DATA/TADDY/session12/Assembled/2023_10_04-12-56-01.nwb
NWB-file reading time: 1061.0ms
     * TSeries-folder: TSeries-10042023-001
     * Day-folder: 2023_10_04
     * Time-folder: 12-56-01
- /home/yann.zerlaut/DATA/TADDY/session12/Assembled/2023_10_03-16-44-25.nwb
NWB-file reading time: 1131.5ms
     * TSeries-folder: TSeries-10032023-004
     * Day-folder: 2023_10_03
     * Time-folder: 16-44-25
- /home/yann.zerlaut/D

In [27]:
script = """
root_folder = "/home/yann.zerlaut/DATA/TADDY/"
target_folder = "/home/yann.zerlaut/UNPROCESSED/TOSAVE"
"""


script+= """
import os, sys, shutil, pathlib

# create target folder if not existing
pathlib.Path(target_folder).mkdir(parents=True, exist_ok=True)

def find_subfolder(folder_name, root_folder,
                   day_folder=None):
    if day_folder is not None:
        List = [f[0] for f in os.walk(root_folder)\
                if (f[0].split(os.path.sep)[-1]==folder_name) and (day_folder in f[0])]
    else:
        List = [f[0] for f in os.walk(root_folder)\
                        if f[0].split(os.path.sep)[-1]==folder_name]
    if len(List)>0:
        return List[0]
    else:
        return None

####################################################
###       copy folders 
####################################################

def do_not_include(Dir, f):
    return ('FaceCamera' in Dir) or ('RigCamera' in Dir) or\
            (('TSeries' in f) and ('.tif' in f)) or\
            ('data.bin' in f)

def ignore_files(Dir, files):
    return [f for f in files if (os.path.isfile(os.path.join(Dir, f)) and\
            do_not_include(Dir, f))]

def copy_folders(tseries, dayfolder, timefolder, i):
    print('- %i) : ' % i, dayfolder, timefolder)
    # create day folder
    pathlib.Path(os.path.join(target_folder, dayfolder)).mkdir(parents=True, exist_ok=True)
    # find imaging
    imaging = find_subfolder(tseries, root_folder)
    if imaging is None:
        imaging = find_subfolder(tseries, root_folder2)
    if imaging is None:
        print(' !! NOT FOUND ', tseries)
    timeF = find_subfolder(timefolder, root_folder, day_folder=dayfolder)

    if (timeF is not None) and (imaging is not None):
        print('    -> copying [...]')
        shutil.copytree(timeF, 
                        os.path.join(target_folder, dayfolder, timefolder),
                        dirs_exist_ok=True,
                        ignore=ignore_files)
        shutil.copytree(imaging, 
                        os.path.join(target_folder, dayfolder, timefolder, tseries),
                        dirs_exist_ok=True,
                        ignore=ignore_files)
    else:
        print('    -> DATA NOT FOUND ! ')
"""

n=0
for i, d, t in zip(TSeries, DayFolders, TimeFolders):
    n+=1
    script += """
copy_folders("%s", "%s", "%s", %i)
""" % (i, d, t, n)

# write as a bash script
script_name = 'rebuild.py'
with open(script_name, 'w') as f:
    f.write(script)

In [28]:
cat rebuild.py


root_folder = "/home/yann.zerlaut/DATA/TADDY/"
target_folder = "/home/yann.zerlaut/UNPROCESSED/TOSAVE"

import os, sys, shutil, pathlib

# create target folder if not existing
pathlib.Path(target_folder).mkdir(parents=True, exist_ok=True)

def find_subfolder(folder_name, root_folder,
                   day_folder=None):
    if day_folder is not None:
        List = [f[0] for f in os.walk(root_folder)                if (f[0].split(os.path.sep)[-1]==folder_name) and (day_folder in f[0])]
    else:
        List = [f[0] for f in os.walk(root_folder)                        if f[0].split(os.path.sep)[-1]==folder_name]
    if len(List)>0:
        return List[0]
    else:
        return None

####################################################
###       copy folders 
####################################################

def do_not_include(Dir, f):
    return ('FaceCamera' in Dir) or ('RigCamera' in Dir) or            (('TSeries' in f) and ('.tif' in f)) or            ('data.bin' in f)

def i