# Data collection

* Use clean RLS photos for training and validation.
* Use my messy photos for testing.
* Work at the multi-label species level, since genera aren't particularly interesting? It's probably still best to start with genera, since that's easier and gives an idea of what's possible. Also, when deploying, it may be better to give a genus-level ID than none at all.
* Allow for incremental development: Start with small datasets and switch to bigger ones down the track. Remember that the supply of unseen test photos is effectively infinite, as I can always go on more dives or label old photos.

**Limitations:**

* Species with few training images are excluded. **TODO**: Maybe drop this limitation by using non-RLS photos for validation? If the test photos are non-RLS, it makes sense for the validation photos to be similar.
* My photos contain a lot of background clutter -- automatically crop to the centre, or use object detection as part of the pipeline?
* My photos contain invertebrates -- remove?

In [2]:
from collections import Counter
from pathlib import Path
import shutil

import pandas as pd
from PIL import Image

In [50]:
def crop_image_file(src: Path, dst: Path, top_bottom_pixels=55):
    """Save a copy of the image at src to dst with equal strips cut from its top and bottom.

    The default strip height is useful for removing the RLS URL.
    """
    with Image.open(src) as image:
        full_width, full_height = image.size
        # Crop box is (left, upper, right, lower): full width, trimmed height.
        crop_box = (0, top_bottom_pixels, full_width, full_height - top_bottom_pixels)
        cropped = image.crop(crop_box)
        cropped.save(dst)


def create_rls_species_dataset(
    m1_csv_path: Path,
    image_dir: Path,
    output_dir: Path,
    num_species: int,
    min_images_per_species: int,
) -> None:
    """Create a directory of cropped images for a random sample of species.

    Files in image_dir are parsed as ``<genus>-<taxon>-<index>.<ext>``. A species
    is eligible when some file of it has an index of at least
    ``min_images_per_species - 1`` (assumes indices are zero-based and
    contiguous -- TODO confirm) and its name appears in the "Taxon" column of
    the CSV at m1_csv_path. From the eligible species, num_species are sampled
    with a fixed random seed, and every image file matching
    ``<genus>-<taxon>-*`` is cropped into output_dir.

    Files whose names do not parse (wrong number of hyphen-separated parts or a
    non-numeric index) are reported with a "Skipping" message and ignored when
    determining eligibility.

    Raises:
        FileExistsError: If output_dir already exists.
        ValueError: If num_species exceeds the number of eligible species.
    """
    species_with_min_images = set()
    for image_path in image_dir.iterdir():
        try:
            genus, taxon, suffix = image_path.name.split("-")
            # Parse the index inside the try so that a non-numeric index is
            # skipped like any other unexpected filename, rather than crashing.
            image_index = int(suffix.split(".")[0])
        except ValueError:
            print(f"Skipping {image_path}")
            continue
        if image_index >= min_images_per_species - 1:
            species_with_min_images.add(f"{genus.capitalize()} {taxon}")

    m1_species = pd.read_csv(m1_csv_path)["Taxon"].drop_duplicates()
    eligible_species = m1_species[m1_species.isin(species_with_min_images)]
    # A fixed seed keeps the sampled species stable across runs.
    sampled_species = eligible_species.sample(num_species, random_state=0)

    output_dir.mkdir(parents=True)
    for species in sampled_species:
        # NOTE(review): this pattern also matches variant files that the scan
        # above skipped (e.g. "<genus>-<taxon>-egg-0.jpg") -- confirm whether
        # such files should be part of the dataset.
        image_glob = f"{species.replace(' ', '-').lower()}-*"
        for src_filename in image_dir.glob(image_glob):
            crop_image_file(src_filename, output_dir / src_filename.name)


# Rebuild the 10-species dataset from scratch. Any previous output is removed first because
# create_rls_species_dataset() creates output_dir with mkdir(parents=True), which fails if
# the directory already exists.
output_dir = Path("/home/yanir/projects/deep-fish/data/rls-species-10-min-images-4")
try:
    shutil.rmtree(output_dir)
except FileNotFoundError:
    # Nothing to clean up when the dataset has not been built before.
    pass
create_rls_species_dataset(
    m1_csv_path=Path("/home/yanir/projects/fish-id/data/dump-20210717/m1.csv"),
    image_dir=Path("/home/yanir/projects/yanirs.github.io/tools/rls/img"),
    output_dir=output_dir,
    num_species=10,
    min_images_per_species=4,
)
# Last expression of the cell: show the generated files, sorted by path.
sorted(output_dir.iterdir())

Skipping /home/yanir/projects/yanirs.github.io/tools/rls/img/heterodontus-portusjacksoni-egg-0.jpg
Skipping /home/yanir/projects/yanirs.github.io/tools/rls/img/chrysiptera-brownriggii-[dark-form]-0.jpg


[PosixPath('/home/yanir/projects/deep-fish/data/rls-species-10-min-images-4/aulostomus-chinensis-0.jpg'),
 PosixPath('/home/yanir/projects/deep-fish/data/rls-species-10-min-images-4/aulostomus-chinensis-1.jpg'),
 PosixPath('/home/yanir/projects/deep-fish/data/rls-species-10-min-images-4/aulostomus-chinensis-2.jpg'),
 PosixPath('/home/yanir/projects/deep-fish/data/rls-species-10-min-images-4/aulostomus-chinensis-3.jpg'),
 PosixPath('/home/yanir/projects/deep-fish/data/rls-species-10-min-images-4/aulostomus-chinensis-4.jpg'),
 PosixPath('/home/yanir/projects/deep-fish/data/rls-species-10-min-images-4/cephalopholis-argus-0.jpg'),
 PosixPath('/home/yanir/projects/deep-fish/data/rls-species-10-min-images-4/cephalopholis-argus-1.jpg'),
 PosixPath('/home/yanir/projects/deep-fish/data/rls-species-10-min-images-4/cephalopholis-argus-2.jpg'),
 PosixPath('/home/yanir/projects/deep-fish/data/rls-species-10-min-images-4/cephalopholis-argus-3.jpg'),
 PosixPath('/home/yanir/projects/deep-fish/data/rl

In [51]:
# Build the larger 100-species dataset with the same minimum of 4 images per species.
# NOTE: there is no cleanup step here, so rerunning this cell fails with FileExistsError
# once the output directory exists (the function calls mkdir without exist_ok).
create_rls_species_dataset(
    m1_csv_path=Path("/home/yanir/projects/fish-id/data/dump-20210717/m1.csv"),
    image_dir=Path("/home/yanir/projects/yanirs.github.io/tools/rls/img"),
    output_dir=Path("/home/yanir/projects/deep-fish/data/rls-species-100-min-images-4"),
    num_species=100,
    min_images_per_species=4,
)

Skipping /home/yanir/projects/yanirs.github.io/tools/rls/img/heterodontus-portusjacksoni-egg-0.jpg
Skipping /home/yanir/projects/yanirs.github.io/tools/rls/img/chrysiptera-brownriggii-[dark-form]-0.jpg


In [12]:
def create_rls_genus_dataset(image_dir: Path, output_dir: Path, num_top_genera):
    """Crop into output_dir all images of the genera that have the most image files.

    The genus of a file is the part of its name before the first hyphen. The
    num_top_genera genera with the highest file counts are selected, and every
    file in image_dir named ``<genus>-*`` is cropped into the newly created
    output_dir under its original name.
    """
    image_counts = Counter(
        entry.name.split("-", maxsplit=1)[0] for entry in image_dir.iterdir()
    )
    output_dir.mkdir(parents=True)
    top_genera = [name for name, _count in image_counts.most_common(num_top_genera)]
    for top_genus in top_genera:
        for image_file in image_dir.glob(f"{top_genus}-*"):
            crop_image_file(image_file, output_dir / image_file.name)


# Rebuild the top-10 genera dataset from scratch. Any previous output is removed first because
# create_rls_genus_dataset() creates output_dir with mkdir(parents=True), which fails if the
# directory already exists.
output_dir = Path("/home/yanir/projects/deep-fish/data/rls-top-10-genera")
try:
    shutil.rmtree(output_dir)
except FileNotFoundError:
    # Nothing to clean up when the dataset has not been built before.
    pass
create_rls_genus_dataset(
    image_dir=Path("/home/yanir/projects/yanirs.github.io/tools/rls/img"),
    output_dir=output_dir,
    num_top_genera=10,
)
# Last expression of the cell: show the generated files, sorted by path.
sorted(output_dir.iterdir())

[PosixPath('/home/yanir/projects/deep-fish/data/rls-top-10-genera/acanthurus-achilles-0.jpg'),
 PosixPath('/home/yanir/projects/deep-fish/data/rls-top-10-genera/acanthurus-achilles-1.jpg'),
 PosixPath('/home/yanir/projects/deep-fish/data/rls-top-10-genera/acanthurus-albipectoralis-0.jpg'),
 PosixPath('/home/yanir/projects/deep-fish/data/rls-top-10-genera/acanthurus-albipectoralis-1.jpg'),
 PosixPath('/home/yanir/projects/deep-fish/data/rls-top-10-genera/acanthurus-auranticavus-0.jpg'),
 PosixPath('/home/yanir/projects/deep-fish/data/rls-top-10-genera/acanthurus-bahianus-0.jpg'),
 PosixPath('/home/yanir/projects/deep-fish/data/rls-top-10-genera/acanthurus-bariene-0.jpg'),
 PosixPath('/home/yanir/projects/deep-fish/data/rls-top-10-genera/acanthurus-bariene-1.jpg'),
 PosixPath('/home/yanir/projects/deep-fish/data/rls-top-10-genera/acanthurus-bariene-2.jpg'),
 PosixPath('/home/yanir/projects/deep-fish/data/rls-top-10-genera/acanthurus-blochii-0.jpg'),
 PosixPath('/home/yanir/projects/deep-

In [48]:
def create_test_dataset(trip_dir: Path, output_dir: Path):
    """Copy labelled photos found under trip_dir into a flat, newly created output_dir.

    Photos named ``<id> - <label>.jpg`` (extension in any letter case) are
    picked up from trip_dir and all of its subdirectories. The label is
    lower-cased and split on " and "; a part is kept as a species only if it is
    exactly two space-separated words, contains no "[" or "(", and its second
    word is neither "todo" nor "sp".

    A photo with at least one kept species is copied to
    ``<id>-<genus>-<taxon>[-AND-<genus>-<taxon>...].jpg``. A photo with none is
    not copied and is mapped to None.

    The source-to-destination mapping is written to ``src_to_dst.csv`` in
    output_dir (without a header) and returned.
    """
    output_dir.mkdir(parents=True)
    mapping = {}
    for photo in trip_dir.glob("**/* - *.[Jj][Pp][Gg]"):
        # Drop the four-character ".jpg" extension before splitting off the ID.
        img_id, label = photo.name[:-4].split(" - ", maxsplit=1)
        kept = []
        for part in label.lower().split(" and "):
            words = part.split(" ")
            if len(words) != 2:
                continue
            genus, taxon = words
            if "[" in part or "(" in part or taxon in ("todo", "sp"):
                continue
            kept.append(f"{genus}-{taxon}")
        if kept:
            dst = f"{img_id}-{'-AND-'.join(kept)}.jpg"
            shutil.copy(photo, output_dir / dst)
        else:
            dst = None
        mapping[photo] = dst
    pd.Series(mapping).to_csv(output_dir / "src_to_dst.csv", header=False)
    return mapping


# Rebuild the test dataset for the October 2020 trip from scratch. Any previous output is
# removed first because create_test_dataset() creates output_dir with mkdir(parents=True),
# which fails if the directory already exists.
output_dir = Path("/home/yanir/projects/deep-fish/data/eviota-202010")
try:
    shutil.rmtree(output_dir)
except FileNotFoundError:
    # Nothing to clean up when the dataset has not been built before.
    pass
# Last expression of the cell: the returned source-to-destination mapping is shown as output.
create_test_dataset(
    trip_dir=Path("/home/yanir/Pictures/202010 Eviota GBR"), output_dir=output_dir
)

{PosixPath('/home/yanir/Pictures/202010 Eviota GBR/20201014 - off transect/PA141263 - Halichoeres prosopeion.JPG'): 'PA141263-halichoeres-prosopeion.jpg',
 PosixPath('/home/yanir/Pictures/202010 Eviota GBR/20201014 - off transect/PA141264 - Halichoeres prosopeion.JPG'): 'PA141264-halichoeres-prosopeion.jpg',
 PosixPath('/home/yanir/Pictures/202010 Eviota GBR/20201014 - GBR99 - IDs/PA141235 - Trimma lantana.JPG'): 'PA141235-trimma-lantana.jpg',
 PosixPath('/home/yanir/Pictures/202010 Eviota GBR/20201014 - GBR99 - IDs/PA141257 - Eviota prasites.JPG'): 'PA141257-eviota-prasites.jpg',
 PosixPath('/home/yanir/Pictures/202010 Eviota GBR/20201014 - GBR99 - IDs/PA141242 - Pictichromis coralensis.JPG'): 'PA141242-pictichromis-coralensis.jpg',
 PosixPath('/home/yanir/Pictures/202010 Eviota GBR/20201014 - GBR99 - IDs/PA141225 - Fusigobius duospilus.JPG'): 'PA141225-fusigobius-duospilus.jpg',
 PosixPath('/home/yanir/Pictures/202010 Eviota GBR/20201014 - GBR99 - IDs/PA141271 - Ucla xenogrammus.JPG'