In [1]:
# Switch to the parent directory so the relative "data/processed/..." paths used
# further down resolve from there.
# NOTE(review): not idempotent - re-running this cell climbs one more level.
cd ..

/home/taiamiti/Projects/micro-plastic/data_engineering


In [2]:
import fiftyone as fo
import os

In [3]:
from enum import IntEnum
from fiftyone import ViewField as F

In [4]:
def load_dataset_from_multiple_sources(datasets_paths):
    """Import several FiftyOne dataset directories into one new dataset.

    Each directory is loaded with ``fo.types.FiftyOneDataset`` and every
    sample it contributes is tagged with the directory's own name, so the
    origin of a sample can later be selected with ``match_tags``.

    Args:
        datasets_paths: iterable of paths to dataset directories.

    Returns:
        A new ``fo.Dataset`` holding the samples of all sources.
    """
    dataset = fo.Dataset()
    for dataset_path in datasets_paths:
        # normpath drops a trailing separator; without it basename() of
        # "root/lot1/" is "" and the samples would receive an empty tag.
        dataset_name = os.path.basename(os.path.normpath(dataset_path))
        dataset.add_dir(
            dataset_dir=dataset_path,
            dataset_type=fo.types.FiftyOneDataset,
            tags=dataset_name,
        )
    return dataset

In [5]:
class EvalProtocol(IntEnum):
    """Evaluation protocols that ``get_data_dict`` knows how to resolve.

    The comments below summarise the sample selection ``get_data_dict``
    performs for each member.

    NOTE(review): ``str()`` of an ``IntEnum`` member returns the integer value
    on Python >= 3.11, so code that builds names from ``str(protocol)`` is
    Python-version dependent.
    """
    # train: "train"-tagged BENI/CBENI samples with island == 'TAK';
    # test: every "test"-tagged BENI/CBENI sample.
    BENI_INTRA_INTER_ILE = 1
    # Samples tagged with the requested subset, restricted to BENI/CBENI.
    BENI_INTRA_ILE = 2
    # train: "train"-tagged samples of 'lot1-20-04-2023-sediments';
    # test: every "test"-tagged SED/CSED sample.
    SED_INTRA_INTER_ILE = 3
    # train: "train"-tagged samples of the lot1 benitiers or sediments lots;
    # test: every "test"-tagged BENI/CBENI/SED/CSED sample.
    SED_BENI_INTRA_INTER_ILE = 4
    # Samples tagged with the requested subset, no further filtering.
    TRAIN_TEST = 5
    # Samples tagged "unlabelled"; only image paths are returned.
    UNLABELLED = 6

In [15]:
def get_data_dict(fo_dataset, subset, protocol):
    """Select the samples of one split under one evaluation protocol.

    Args:
        fo_dataset: FiftyOne dataset (or view) providing ``match_tags`` and
            ``match``.
        subset: one of "train", "test" or "unlabelled".
        protocol: an ``EvalProtocol`` member.

    Returns:
        For ``EvalProtocol.UNLABELLED``: a list of ``{"img": filepath}`` for
        every sample tagged "unlabelled" (``subset`` is ignored).
        Otherwise: a list of ``{"img": filepath, "seg": mask_path}`` for each
        selected sample whose ground-truth mask file exists on disk.

    Raises:
        AssertionError: if ``subset`` is not one of the three allowed values.
        ValueError: if ``protocol`` is unknown, or if "unlabelled" is requested
            for a protocol that only defines distinct train/test selections.
    """
    assert subset in ["train", "test", "unlabelled"], "Wrong subset value"

    beni_types = ("BENI", "CBENI")
    sed_types = ("SED", "CSED")

    # These protocols use different rules for train and test. Previously any
    # non-"train" subset (including "unlabelled") silently received the test
    # selection; reject it explicitly instead.
    split_protocols = (
        EvalProtocol.BENI_INTRA_INTER_ILE,
        EvalProtocol.SED_INTRA_INTER_ILE,
        EvalProtocol.SED_BENI_INTRA_INTER_ILE,
    )
    if protocol in split_protocols and subset not in ("train", "test"):
        raise ValueError(
            f"Protocol {protocol!r} only defines 'train' and 'test' subsets, got {subset!r}"
        )

    if protocol == EvalProtocol.BENI_INTRA_INTER_ILE:
        if subset == "train":
            dataset_view = (fo_dataset.match_tags("train")
                            .match(F('island') == 'TAK')
                            .match(F('sample_type').is_in(beni_types)))
        else:
            dataset_view = fo_dataset.match_tags("test").match(F('sample_type').is_in(beni_types))
    elif protocol == EvalProtocol.BENI_INTRA_ILE:
        dataset_view = fo_dataset.match_tags(subset).match(F('sample_type').is_in(beni_types))
    elif protocol == EvalProtocol.SED_INTRA_INTER_ILE:
        if subset == "train":
            dataset_view = fo_dataset.match_tags("train").match_tags("lot1-20-04-2023-sediments")
        else:
            dataset_view = fo_dataset.match_tags("test").match(F('sample_type').is_in(sed_types))
    elif protocol == EvalProtocol.SED_BENI_INTRA_INTER_ILE:
        if subset == "train":
            dataset_view = fo_dataset.match_tags("train").match_tags(["lot1-20-04-2023-benitiers",
                                                                      "lot1-20-04-2023-sediments"])
        else:
            dataset_view = fo_dataset.match_tags("test").match(
                F('sample_type').is_in(beni_types + sed_types))
    elif protocol == EvalProtocol.TRAIN_TEST:
        dataset_view = fo_dataset.match_tags(subset)
    elif protocol == EvalProtocol.UNLABELLED:
        dataset_view = fo_dataset.match_tags("unlabelled")
        return [{"img": sample.filepath} for sample in dataset_view]
    else:
        raise ValueError("Wrong protocol, must be of type EvalProtocol")

    sel_files = []
    for sample in dataset_view:
        # A sample without a ground-truth label (or without a mask path) is
        # skipped, exactly like a sample whose mask file is missing on disk.
        ground_truth = sample.ground_truth
        mask_path = ground_truth.mask_path if ground_truth is not None else None
        if mask_path and os.path.exists(mask_path):
            sel_files.append({"img": sample.filepath, "seg": mask_path})
    return sel_files


def get_relative_img_paths(ds, subset, protocol, fullpath_prefix):
    """Return the selected image paths with ``fullpath_prefix`` stripped.

    Args:
        ds: dataset forwarded to ``get_data_dict``.
        subset: subset name forwarded to ``get_data_dict``.
        protocol: protocol forwarded to ``get_data_dict``.
        fullpath_prefix: directory prefix to remove from the start of each
            image path; a trailing "/" is tolerated. An empty prefix leaves
            the paths unchanged.

    Returns:
        List of image paths. Paths that do not start with the prefix are
        returned as they are.
    """
    img_paths = [item["img"] for item in get_data_dict(ds, subset, protocol)]
    if not fullpath_prefix:
        return img_paths
    # Only strip a *leading* prefix: str.replace would also remove later
    # occurrences of the same directory sequence inside the path.
    prefix = fullpath_prefix.rstrip("/") + "/"
    return [path[len(prefix):] if path.startswith(prefix) else path for path in img_paths]


def write_annot_file(file_list, save_dir, subset, protocol):
    """Write ``file_list`` to ``<save_dir>/<subset>_<protocol>.txt``, one entry per line.

    For an enum member the protocol part of the file name is
    ``<EnumClass>_<MEMBER>`` (e.g. ``train_EvalProtocol_TRAIN_TEST.txt``).
    It is built from the member's name rather than ``str(protocol)`` because
    ``str()`` of an ``IntEnum`` member yields the bare integer on
    Python >= 3.11, which would change the file names between Python versions.

    Args:
        file_list: iterable of strings to write, joined with newlines
            (no trailing newline is added).
        save_dir: output directory; created if it does not exist.
        subset: subset name used as the file name prefix.
        protocol: enum member (or any object; then ``str(protocol)`` with
            "." replaced by "_" is used).
    """
    from enum import Enum  # local import keeps this block self-contained

    if isinstance(protocol, Enum) and protocol.name is not None:
        protocol_label = f"{type(protocol).__name__}_{protocol.name}"
    else:
        protocol_label = str(protocol).replace(".", "_")

    save_name = f"{subset}_{protocol_label}.txt"
    save_path = os.path.join(save_dir, save_name)
    os.makedirs(save_dir, exist_ok=True)

    with open(save_path, 'w') as fp:
        fp.write('\n'.join(file_list))

In [7]:
# Folder whose sub-directories are the individual datasets to merge.
ds_root = "data/processed/generate_annotated_dataset/"
# os.listdir() returns entries in arbitrary order and also lists plain files:
# sort for a reproducible import order and keep directories only.
ds_paths = [
    os.path.join(ds_root, ds_name)
    for ds_name in sorted(os.listdir(ds_root))
    if os.path.isdir(os.path.join(ds_root, ds_name))
]

In [8]:
# Merge every dataset directory into one dataset; each sample is tagged with
# the name of the directory it came from.
ds = load_dataset_from_multiple_sources(ds_paths)

Importing samples...
 100% |███████████████████| 49/49 [191.0ms elapsed, 0s remaining, 256.6 samples/s]     
Import complete
Importing samples...
 100% |█████████████████| 337/337 [246.3ms elapsed, 0s remaining, 1.4K samples/s]     
Import complete
Importing samples...
 100% |█████████████████| 730/730 [221.8ms elapsed, 0s remaining, 3.3K samples/s]     
Import complete
Importing samples...
 100% |█████████████████| 374/374 [183.5ms elapsed, 0s remaining, 2.0K samples/s]     
Import complete
Importing samples...
 100% |█████████████████| 635/635 [272.8ms elapsed, 0s remaining, 2.3K samples/s]      
Import complete
Importing samples...
 100% |█████████████████| 448/448 [715.4ms elapsed, 0s remaining, 627.8 samples/s]      
Import complete
Importing samples...
 100% |███████████████| 1502/1502 [724.4ms elapsed, 0s remaining, 2.1K samples/s]      
Import complete
Importing samples...
 100% |█████████████████| 190/190 [109.8ms elapsed, 0s remaining, 1.8K samples/s]     
Import complete
Imp

In [10]:
# Show the merged dataset summary (sample count and field schema).
ds

Name:        2023.11.28.21.01.12
Media type:  image
Num samples: 16312
Persistent:  False
Tags:        []
Sample fields:
    id:            fiftyone.core.fields.ObjectIdField
    filepath:      fiftyone.core.fields.StringField
    tags:          fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata:      fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)
    image_path:    fiftyone.core.fields.StringField
    sample_type:   fiftyone.core.fields.StringField
    island:        fiftyone.core.fields.StringField
    station:       fiftyone.core.fields.StringField
    replica:       fiftyone.core.fields.StringField
    distil:        fiftyone.core.fields.StringField
    sample_id:     fiftyone.core.fields.StringField
    image_id:      fiftyone.core.fields.StringField
    filter:        fiftyone.core.fields.StringField
    extra:         fiftyone.core.fields.StringField
    date:          fiftyone.core.fields.StringField
    exposure_time

In [9]:
# Samples tagged "unlabelled" (the selection used by EvalProtocol.UNLABELLED).
ds.match_tags("unlabelled")

Dataset:     2023.12.04.16.37.18
Media type:  image
Num samples: 13428
Sample fields:
    id:            fiftyone.core.fields.ObjectIdField
    filepath:      fiftyone.core.fields.StringField
    tags:          fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata:      fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)
    image_path:    fiftyone.core.fields.StringField
    sample_type:   fiftyone.core.fields.StringField
    island:        fiftyone.core.fields.StringField
    station:       fiftyone.core.fields.StringField
    replica:       fiftyone.core.fields.StringField
    distil:        fiftyone.core.fields.StringField
    sample_id:     fiftyone.core.fields.StringField
    image_id:      fiftyone.core.fields.StringField
    filter:        fiftyone.core.fields.StringField
    extra:         fiftyone.core.fields.StringField
    date:          fiftyone.core.fields.StringField
    exposure_time: fiftyone.core.fields.FloatField
 

In [11]:
# "test"-tagged samples whose sample_type is BENI or CBENI
# (the test selection get_data_dict builds for the BENI protocols).
ds.match_tags("test").match(F('sample_type').is_in(("BENI", "CBENI")))

Dataset:     2023.11.28.21.01.12
Media type:  image
Num samples: 255
Sample fields:
    id:            fiftyone.core.fields.ObjectIdField
    filepath:      fiftyone.core.fields.StringField
    tags:          fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata:      fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)
    image_path:    fiftyone.core.fields.StringField
    sample_type:   fiftyone.core.fields.StringField
    island:        fiftyone.core.fields.StringField
    station:       fiftyone.core.fields.StringField
    replica:       fiftyone.core.fields.StringField
    distil:        fiftyone.core.fields.StringField
    sample_id:     fiftyone.core.fields.StringField
    image_id:      fiftyone.core.fields.StringField
    filter:        fiftyone.core.fields.StringField
    extra:         fiftyone.core.fields.StringField
    date:          fiftyone.core.fields.StringField
    exposure_time: fiftyone.core.fields.FloatField
   

In [12]:
# Number of samples per tag: one tag per source directory plus the
# train / test / unlabelled split tags.
ds.count_sample_tags()

{'lot4-28-06-2023-sediments-part3': 966,
 'lot6-12-08-2023-eau-horizontal': 2226,
 'lot10-09-10-2023-benitiers': 730,
 'lot2-30-05-2023-tak_nai': 635,
 'lot2-30-05-2023-tak_nacl': 374,
 'lot1-20-04-2023-sediments': 337,
 'lot5-04-07-2023-benitiers-part2': 50,
 'lot1-20-04-2023-benitiers': 49,
 'train': 1945,
 'lot3-08-06-2023-benitiers': 448,
 'unlabelled': 13428,
 'lot6-12-08-2023-eau-vertical': 897,
 'lot8-28-09-2023-benitiers': 1911,
 'test': 848,
 'lot4-28-06-2023-sediments-part1': 1502,
 'lot4-28-06-2023-sediments-part2': 190,
 'lot9-09-10-2023-benitiers': 1835,
 'lot5-04-07-2023-benitiers-part1': 2062,
 'lot7-28-09-2023-benitiers': 2100}

In [13]:
# Samples of lot3-08-06-2023-benitiers that are also tagged "unlabelled".
ds.match_tags(['lot3-08-06-2023-benitiers']).match_tags('unlabelled')

Dataset:     2023.11.28.21.01.12
Media type:  image
Num samples: 0
Sample fields:
    id:            fiftyone.core.fields.ObjectIdField
    filepath:      fiftyone.core.fields.StringField
    tags:          fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata:      fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)
    image_path:    fiftyone.core.fields.StringField
    sample_type:   fiftyone.core.fields.StringField
    island:        fiftyone.core.fields.StringField
    station:       fiftyone.core.fields.StringField
    replica:       fiftyone.core.fields.StringField
    distil:        fiftyone.core.fields.StringField
    sample_id:     fiftyone.core.fields.StringField
    image_id:      fiftyone.core.fields.StringField
    filter:        fiftyone.core.fields.StringField
    extra:         fiftyone.core.fields.StringField
    date:          fiftyone.core.fields.StringField
    exposure_time: fiftyone.core.fields.FloatField
    z

In [14]:
# "test"-tagged samples that belong to lot1-20-04-2023-sediments.
ds.match_tags("test").match_tags("lot1-20-04-2023-sediments")

Dataset:     2023.11.28.21.01.12
Media type:  image
Num samples: 102
Sample fields:
    id:            fiftyone.core.fields.ObjectIdField
    filepath:      fiftyone.core.fields.StringField
    tags:          fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata:      fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)
    image_path:    fiftyone.core.fields.StringField
    sample_type:   fiftyone.core.fields.StringField
    island:        fiftyone.core.fields.StringField
    station:       fiftyone.core.fields.StringField
    replica:       fiftyone.core.fields.StringField
    distil:        fiftyone.core.fields.StringField
    sample_id:     fiftyone.core.fields.StringField
    image_id:      fiftyone.core.fields.StringField
    filter:        fiftyone.core.fields.StringField
    extra:         fiftyone.core.fields.StringField
    date:          fiftyone.core.fields.StringField
    exposure_time: fiftyone.core.fields.FloatField
   

In [15]:
# "train"-tagged samples coming from either of the two lot2 directories.
ds.match_tags(['lot2-30-05-2023-tak_nacl', 'lot2-30-05-2023-tak_nai']).match_tags('train')

Dataset:     2023.11.28.21.01.12
Media type:  image
Num samples: 587
Sample fields:
    id:            fiftyone.core.fields.ObjectIdField
    filepath:      fiftyone.core.fields.StringField
    tags:          fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata:      fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)
    image_path:    fiftyone.core.fields.StringField
    sample_type:   fiftyone.core.fields.StringField
    island:        fiftyone.core.fields.StringField
    station:       fiftyone.core.fields.StringField
    replica:       fiftyone.core.fields.StringField
    distil:        fiftyone.core.fields.StringField
    sample_id:     fiftyone.core.fields.StringField
    image_id:      fiftyone.core.fields.StringField
    filter:        fiftyone.core.fields.StringField
    extra:         fiftyone.core.fields.StringField
    date:          fiftyone.core.fields.StringField
    exposure_time: fiftyone.core.fields.FloatField
   

In [16]:
# "train"-tagged samples from lot1 sediments or either lot2 directory.
ds.match_tags(['lot1-20-04-2023-sediments', 'lot2-30-05-2023-tak_nacl', 'lot2-30-05-2023-tak_nai']).match_tags('train')

Dataset:     2023.11.28.21.01.12
Media type:  image
Num samples: 822
Sample fields:
    id:            fiftyone.core.fields.ObjectIdField
    filepath:      fiftyone.core.fields.StringField
    tags:          fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata:      fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)
    image_path:    fiftyone.core.fields.StringField
    sample_type:   fiftyone.core.fields.StringField
    island:        fiftyone.core.fields.StringField
    station:       fiftyone.core.fields.StringField
    replica:       fiftyone.core.fields.StringField
    distil:        fiftyone.core.fields.StringField
    sample_id:     fiftyone.core.fields.StringField
    image_id:      fiftyone.core.fields.StringField
    filter:        fiftyone.core.fields.StringField
    extra:         fiftyone.core.fields.StringField
    date:          fiftyone.core.fields.StringField
    exposure_time: fiftyone.core.fields.FloatField
   

In [17]:
# "unlabelled"-tagged samples from any of the three lot4 sediment parts.
ds.match_tags(['lot4-28-06-2023-sediments-part1', 'lot4-28-06-2023-sediments-part2', 'lot4-28-06-2023-sediments-part3']).match_tags('unlabelled')

Dataset:     2023.11.28.21.01.12
Media type:  image
Num samples: 2020
Sample fields:
    id:            fiftyone.core.fields.ObjectIdField
    filepath:      fiftyone.core.fields.StringField
    tags:          fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata:      fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)
    image_path:    fiftyone.core.fields.StringField
    sample_type:   fiftyone.core.fields.StringField
    island:        fiftyone.core.fields.StringField
    station:       fiftyone.core.fields.StringField
    replica:       fiftyone.core.fields.StringField
    distil:        fiftyone.core.fields.StringField
    sample_id:     fiftyone.core.fields.StringField
    image_id:      fiftyone.core.fields.StringField
    filter:        fiftyone.core.fields.StringField
    extra:         fiftyone.core.fields.StringField
    date:          fiftyone.core.fields.StringField
    exposure_time: fiftyone.core.fields.FloatField
  

In [18]:
# "train"-tagged samples whose sample_type is SED or CSED.
ds.match_tags('train').match(F('sample_type').is_in(("SED", "CSED")))

Dataset:     2023.11.28.21.01.12
Media type:  image
Num samples: 1262
Sample fields:
    id:            fiftyone.core.fields.ObjectIdField
    filepath:      fiftyone.core.fields.StringField
    tags:          fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata:      fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)
    image_path:    fiftyone.core.fields.StringField
    sample_type:   fiftyone.core.fields.StringField
    island:        fiftyone.core.fields.StringField
    station:       fiftyone.core.fields.StringField
    replica:       fiftyone.core.fields.StringField
    distil:        fiftyone.core.fields.StringField
    sample_id:     fiftyone.core.fields.StringField
    image_id:      fiftyone.core.fields.StringField
    filter:        fiftyone.core.fields.StringField
    extra:         fiftyone.core.fields.StringField
    date:          fiftyone.core.fields.StringField
    exposure_time: fiftyone.core.fields.FloatField
  

In [19]:
# Per-tag sample counts again (same call as the earlier count_sample_tags cell).
ds.count_sample_tags()

{'lot6-12-08-2023-eau-horizontal': 2226,
 'lot10-09-10-2023-benitiers': 730,
 'lot2-30-05-2023-tak_nai': 635,
 'lot1-20-04-2023-sediments': 337,
 'lot2-30-05-2023-tak_nacl': 374,
 'lot5-04-07-2023-benitiers-part2': 50,
 'train': 1945,
 'lot1-20-04-2023-benitiers': 49,
 'unlabelled': 13428,
 'lot3-08-06-2023-benitiers': 448,
 'lot6-12-08-2023-eau-vertical': 897,
 'lot8-28-09-2023-benitiers': 1911,
 'test': 848,
 'lot4-28-06-2023-sediments-part1': 1502,
 'lot4-28-06-2023-sediments-part2': 190,
 'lot9-09-10-2023-benitiers': 1835,
 'lot7-28-09-2023-benitiers': 2100,
 'lot5-04-07-2023-benitiers-part1': 2062,
 'lot4-28-06-2023-sediments-part3': 966}

In [20]:
# NOTE(review): scratch arithmetic with unexplained constants; the result
# (374.0) equals the 'lot2-30-05-2023-tak_nacl' tag count - confirm the intent
# or delete this cell.
1496/4

374.0

In [21]:
# Re-display the merged dataset summary.
ds

Name:        2023.11.28.21.01.12
Media type:  image
Num samples: 16312
Persistent:  False
Tags:        []
Sample fields:
    id:            fiftyone.core.fields.ObjectIdField
    filepath:      fiftyone.core.fields.StringField
    tags:          fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata:      fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.ImageMetadata)
    image_path:    fiftyone.core.fields.StringField
    sample_type:   fiftyone.core.fields.StringField
    island:        fiftyone.core.fields.StringField
    station:       fiftyone.core.fields.StringField
    replica:       fiftyone.core.fields.StringField
    distil:        fiftyone.core.fields.StringField
    sample_id:     fiftyone.core.fields.StringField
    image_id:      fiftyone.core.fields.StringField
    filter:        fiftyone.core.fields.StringField
    extra:         fiftyone.core.fields.StringField
    date:          fiftyone.core.fields.StringField
    exposure_time

In [22]:
# BENI/CBENI samples, keeping only "detections" labels with score > 0.2.
beni_view = ds.match(F('sample_type').is_in(("BENI", "CBENI"))).filter_labels("detections", F("score") > 0.2)
# SED/CSED samples, keeping only "detections" labels with score > 0.35.
# NOTE(review): the 0.2 / 0.35 thresholds are unexplained magic numbers - document their origin.
sed_view = ds.match(F('sample_type').is_in(("SED", "CSED"))).filter_labels("detections", F("score") > 0.35)
# Concatenate both filtered views.
merged_view = beni_view + sed_view
# Drop samples tagged 'bad_gt' (bool=False keeps the samples that do NOT carry the tag).
# NOTE(review): the active export and annotation-list cells below use `ds`,
# not this view (its uses there are commented out).
merged_view_clean = merged_view.match_tags('bad_gt', bool=False)

In [23]:
# Launch the FiftyOne App on the full dataset and open it in a browser tab
# (auto=False: do not render the App in the cell output).
session = fo.launch_app(ds, auto=False)
session.open_tab()

Session launched. Run `session.show()` to open the App in a cell output.


<IPython.core.display.Javascript object>

In [None]:
# BENI/CBENI samples for which no detection survives the score > 0.2 filter:
# only_matches=False keeps samples even when all their labels are filtered
# out, then the final match keeps those left with zero detections.
view_filtered_out = (ds.match(F('sample_type').is_in(("BENI", "CBENI")))
         .filter_labels("detections", F("score") > 0.2, only_matches=False)
         .match(F('detections.detections').length() == 0)
        )

In [None]:
# Show the summary of the samples left without any detection.
view_filtered_out

In [None]:
# Point the running App session at the filtered-out samples for visual inspection.
session.view = view_filtered_out

In [None]:
# "test"-tagged BENI/CBENI samples within the cleaned, score-filtered view.
merged_view_clean.match_tags("test").match(F('sample_type').is_in(("BENI", "CBENI")))

In [10]:
# Output directory: receives the exported dataset and the per-protocol
# annotation list files written by write_annot_file.
save_dir = "data/processed/prepare_dataset_for_openmmseg2/"

In [25]:
# Alternative kept for reference: export only the cleaned, score-filtered view.
# NOTE(review): decide which export is intended and delete the other.
# merged_view_clean.export(
#     export_dir=save_dir,
#     dataset_type=fo.types.ImageSegmentationDirectory,
#     label_field='detections',
#     export_media=True,
#     rel_dir=os.path.abspath(ds_root)
# )

# Export the full dataset (images + 'detections' labels) as an image
# segmentation directory; rel_dir is set to the absolute dataset root.
ds.export(
    export_dir=save_dir,
    dataset_type=fo.types.ImageSegmentationDirectory,
    label_field='detections',
    export_media=True,
    rel_dir=os.path.abspath(ds_root)    
)

 100% |█████████████| 16312/16312 [7.3m elapsed, 0s remaining, 54.7 samples/s]      


In [11]:
# Absolute dataset root; stripped from sample file paths by
# get_relative_img_paths to obtain the paths written to the annotation lists.
fullpath_prefix = os.path.abspath(ds_root)

In [16]:
# Write one annotation list per (subset, protocol) pair into save_dir.
# NOTE: lists are built from the full `ds`, not from `merged_view_clean`.
for protocol in EvalProtocol:
    if protocol == EvalProtocol.UNLABELLED:
        # The unlabelled pool has no train/test split: write a single list and
        # skip the split loop, which would otherwise also emit train_/test_
        # files that merely duplicate the unlabelled images.
        file_list = get_relative_img_paths(ds, "unlabelled", protocol, fullpath_prefix)
        write_annot_file(file_list, save_dir, "unlabelled", protocol)
        continue
    for subset in ["train", "test"]:
        file_list = get_relative_img_paths(ds, subset, protocol, fullpath_prefix)
        write_annot_file(file_list, save_dir, subset, protocol)