# Tutorial about managing files in batch processing

When analysing a set of experiments, you typically want to collect, match, and group files according to their information content and the experimental conditions.

The Files class will help you.

In [None]:
from pathlib import Path
import tempfile

%matplotlib inline

import matplotlib.pyplot as plt

import locan as lc

In [None]:
# Show version information with the system, dependencies and verbose options switched off.
lc.show_versions(system=False, dependencies=False, verbose=False)

## Some file structure to be analysed

In [None]:
# Work inside a fresh temporary directory that contains a single sub-directory.
# NOTE: mkdtemp() never removes the directory itself; delete it manually when done.
directory = Path(tempfile.mkdtemp())
subdirectory = directory / "sub_directory"
subdirectory.mkdir()

In [None]:
# Create empty placeholder files: four data files inside the sub-directory
# and one metadata file directly in the top-level directory.
data_file_names = (
    "file_group_a_0.data",
    "file_group_a_1.data",
    "file_group_b_2.data",
    "corresponding_file_0.data",
)
files = [directory / "sub_directory" / name for name in data_file_names]
files.append(directory / "metadata.meta")
for file_ in files:
    file_.touch()

In [None]:
# List every entry below the directory whose name contains a dot.
list(directory.rglob("*.*"))

## The Files class

In [None]:
# IPython's trailing "?" displays the documentation of lc.Files.
lc.Files?

## Identify files

In [None]:
# Build a Files instance from everything that matches the recursive glob pattern.
files = lc.Files.from_glob(directory=directory, pattern="**/*.*")
# The collected entries are exposed as a DataFrame.
files.df

For each file, a `Path` object is stored; the next cell displays only the file names:

In [None]:
# Display file names only. A per-column Series.map replaces DataFrame.applymap,
# which is deprecated since pandas 2.1; this form works on old and new pandas alike.
files.df.apply(lambda column: column.map(lambda path: path.name))

In [None]:
# Print the summary that Files provides for the current collection.
files.print_summary()

## Exclude files

In [None]:
# Start again from the complete collection.
files = lc.Files.from_glob(
    directory=directory,
    pattern="**/*.*"
)

# Display file names only. A per-column Series.map replaces DataFrame.applymap,
# which is deprecated since pandas 2.1; this form works on old and new pandas alike.
files.df.apply(lambda column: column.map(lambda path: path.name))

In [None]:
# Build the stoplist by concatenating two collections: everything matching
# "**/*.meta" and everything matching "**/*group_b*.*".
stoplist = lc.Files.concatenate([
    lc.Files.from_glob(directory=files.directory, pattern="**/*.meta"),
    lc.Files.from_glob(directory=files.directory, pattern="**/*group_b*.*")
])
# Display file names only. A per-column Series.map replaces DataFrame.applymap,
# which is deprecated since pandas 2.1; this form works on old and new pandas alike.
stoplist.df.apply(lambda column: column.map(lambda path: path.name))

In [None]:
# Exclude the stoplist entries. The return value is not used; the result is
# read back from files.df, so exclude is expected to update `files` in place.
files.exclude(stoplist=stoplist)
# Display file names only. A per-column Series.map replaces DataFrame.applymap,
# which is deprecated since pandas 2.1; this form works on old and new pandas alike.
files.df.apply(lambda column: column.map(lambda path: path.name))

## Match corresponding files

In [None]:
# Collect the entries selected by the glob pattern together with the regex "group_a_0".
files = lc.Files.from_glob(
    directory=directory,
    pattern="**/*.*",
    regex="group_a_0"
)
# Display file names only. A per-column Series.map replaces DataFrame.applymap,
# which is deprecated since pandas 2.1; this form works on old and new pandas alike.
files.df.apply(lambda column: column.map(lambda path: path.name))

In [None]:
# Collect the candidate partner files, selected by the regex "corresponding".
corresponding_files = lc.Files.from_glob(
    directory=directory,
    pattern="**/*.*",
    regex="corresponding"
)
# Display file names only. A per-column Series.map replaces DataFrame.applymap,
# which is deprecated since pandas 2.1; this form works on old and new pandas alike.
corresponding_files.df.apply(lambda column: column.map(lambda path: path.name))

In [None]:
# Match the entries of corresponding_files against the current collection;
# the outcome is read back from files.df.
files.match_files(files=corresponding_files.df)
# Display file names only. A per-column Series.map replaces DataFrame.applymap,
# which is deprecated since pandas 2.1; this form works on old and new pandas alike.
files.df.apply(lambda column: column.map(lambda path: path.name))

## Match metadata files

In [None]:
# Collect the entries selected by the glob pattern together with the regex "group_a_0".
files = lc.Files.from_glob(
    directory=directory,
    pattern="**/*.*",
    regex="group_a_0"
)
# Display file names only. A per-column Series.map replaces DataFrame.applymap,
# which is deprecated since pandas 2.1; this form works on old and new pandas alike.
files.df.apply(lambda column: column.map(lambda path: path.name))

In [None]:
# Match a "*.meta" file to each entry; the outcome is read back from files.df.
# NOTE(review): judging by the method name the search walks up the parent
# directories - confirm against the locan documentation.
files.match_file_upstream(pattern="*.meta")
# Display file names only. A per-column Series.map replaces DataFrame.applymap,
# which is deprecated since pandas 2.1; this form works on old and new pandas alike.
files.df.apply(lambda column: column.map(lambda path: path.name))

## Group files

In [None]:
# Collect only the entries matching "**/file*.data".
files = lc.Files.from_glob(
    directory=directory,
    pattern="**/file*.data"
)
# Display file names only. A per-column Series.map replaces DataFrame.applymap,
# which is deprecated since pandas 2.1; this form works on old and new pandas alike.
files.df.apply(lambda column: column.map(lambda path: path.name))

In [None]:
# Assign the group identifier "A" to the files selected by the pattern "group_a",
# then show the resulting DataFrame.
files.set_group_identifier(name="A", pattern="group_a")
files.df

In [None]:
# Assign the group identifier "B" to the files selected by the pattern "group_b",
# then show the resulting DataFrame.
files.set_group_identifier(name="B", pattern="group_b")
files.df

In [None]:
# Show the group identifiers reported by the Files instance.
files.group_identifiers()

In [None]:
# Group the collection and inspect the `groups` attribute of the result.
# NOTE(review): presumably a pandas GroupBy keyed by group identifier - confirm in the locan docs.
grouped = files.grouped()
grouped.groups

## Indexing and iterating over files

In [None]:
# Collect only the entries matching "**/file*.data".
files = lc.Files.from_glob(
    directory=directory,
    pattern="**/file*.data"
)
# Display file names only. A per-column Series.map replaces DataFrame.applymap,
# which is deprecated since pandas 2.1; this form works on old and new pandas alike.
files.df.apply(lambda column: column.map(lambda path: path.name))

Slicing Files yields a new Files instance:

In [None]:
# Take the slice 0:3 of the collection.
files[0:3]

Indexing Files yields a Series with the selected row:

In [None]:
# Select the entry at integer index 0.
files[0]

Iterating over Files yields a namedtuple for each row:

In [None]:
# Walk over the collection; print each yielded row followed by its file_path field.
for row in files:
    print(row, row.file_path, sep="\n")