# Example 2 - ROI features

In this notebook, we split the annotated 2019 echograms into train and test sets, run ROI detection on each echogram, and give every ROI a label, a set of features, and a cropped image. The results are saved separately for the train and test sets so that later steps can reuse them.

In [11]:
import glob
import os
import pickle
import random
from multiprocessing import Pool

import numpy as np
import pandas as pd

from src.crop_ROI import ROICropper
from src.detect_ROI import ROIDetector
from src.match_annotations import OverlapAnnotation
from src.read_echogram import EchogramReader
from src.ROI_features import FeatureExtractor
from src.transform_annotations import AnnotationTransformer

%matplotlib inline

## Step 1. Load annotations & separate train & test

In this step, we load the annotation table and collect the names of the annotated echogram files.

In [12]:
# Read the mask-annotation table and discard every row that has a missing value
annotations_dir = "../csv/"
annotations = pd.read_csv(f"{annotations_dir}annotation_df_masks.csv").dropna(how='any')
# Integer code for each annotation class name
label_map = {
    'Unclassified regions': 1,
    'krill_schools': 2,
    'fish_school': 3,
    'AH_School': 4,
}

In [13]:
# Sorted array of the distinct file names found in the annotation table
filename_li = np.sort(annotations['file_dir'].unique())

In [14]:
# number of distinct annotated echogram files
len(filename_li)

861

Next, separate train & test data. 

In [15]:
# Restore the annotation dictionary (keyed by echogram file name) from its pickle file
pkl_dir = "pkl/"
with open(f"{pkl_dir}annotations_dict_new.pickle", 'rb') as pkl_file:
    annotations_dict = pickle.load(pkl_file)

In [16]:
# Echogram file names are the keys of the annotation dictionary. They are
# materialised as a real list (not a live dict_keys view) so the value matches
# its name and can be indexed or sampled directly.
echogram_li = list(annotations_dict)

In [17]:
# Hold out 50 echograms for testing: 10 hand-picked ones plus 40 drawn at random.
random.seed(0)
test_examples = [
    'D20190927-T072325', 'D20191016-T184753', 'D20191016-T213424',
    'D20191018-T081659', 'D20191018-T110329', 'D20191020-T145420',
    'D20191024-T103607', 'D20191024-T172924', 'D20191102-T144417',
    'D20191102-T160647',
]  # 10 hand-picked echograms
# Sort the candidate pool before sampling. Iterating a set of strings has no
# stable order between interpreter sessions (string hashes are randomised per
# process), so sampling from list(set(...)) would yield a different split after
# every kernel restart even though the seed is fixed.
candidate_pool = sorted(set(echogram_li) - set(test_examples))
other_test_examples = random.sample(candidate_pool, k=40)
# combine into test
test_echogram_li = test_examples + other_test_examples

In [18]:
# Every echogram that was not held out for testing goes to training (original order kept)
train_echogram_li = list(filter(lambda name: name not in test_echogram_li, echogram_li))

In [19]:
# sizes of the train and test splits
len(train_echogram_li), len(test_echogram_li)

(811, 50)

In [20]:
# Persist both splits so that later steps reuse exactly the same train/test echograms
pkl_dir = "pkl/"
with open(f"{pkl_dir}train_echogram_li.pickle", 'wb') as train_file:
    pickle.dump(train_echogram_li, train_file, protocol=pickle.HIGHEST_PROTOCOL)
with open(f"{pkl_dir}test_echogram_li.pickle", 'wb') as test_file:
    pickle.dump(test_echogram_li, test_file, protocol=pickle.HIGHEST_PROTOCOL)

## Step 2. Get ROI features, label, and npy (saved)

In this step, we run ROI detection on each file, extract ROI features, match the ROIs against the annotations to obtain labels, and crop the ROI images (saved as NumPy arrays). An ROI id can be added if necessary.

In [25]:
# add raw and bottom file dir
raw_dir = "../data/HB1906_EK60/rawfiles/"  # globbed for *.raw files in the next cell
bot_dir = "../data/HB1906_EK60/botfiles/"  # globbed for *.bot files in the next cell
freq_li = [18, 38, 120, 200]  # handed to EchogramReader; presumably frequencies in kHz — TODO confirm

In [26]:
# Collect the raw and bottom files, each list in sorted order.
# NOTE(review): the two lists are later paired position by position, which
# presumably relies on every .raw file having a matching .bot file — confirm.
raw_paths = glob.glob(raw_dir + '*.raw')
raw_paths.sort()
bot_paths = glob.glob(bot_dir + '*.bot')
bot_paths.sort()

In [27]:
# directory handed to ROIDetector and OverlapAnnotation as their fig_dir argument
fig_dir = "figures/"

In [28]:
# set up parameters
threshold = -66  # passed to ROIDetector; presumably an Sv threshold in dB — TODO confirm
kernel_size = 3  # passed to ROIDetector
overlap_ratio = 0.4  # passed to OverlapAnnotation.assign_label when labelling ROIs

Try parallel processing! The run was originally estimated at 4 hours; after upgrading IPython it is much faster, at an estimated 2 hours.

In [29]:
def ROI(a):
    """Detect, label, and crop the ROIs of a single echogram.

    Parameters
    ----------
    a : tuple
        A two-element pair ``(i, j)`` forwarded to ``EchogramReader``; the
        caller passes a raw-file path and its bottom-file path.

    Returns
    -------
    list or None
        The per-ROI feature mappings produced by ``FeatureExtractor``, each
        with a ``'label'`` entry added. ``None`` is returned when ``i`` is not
        in ``filename_li``, when the echogram belongs to neither the train nor
        the test split, or when labelling/cropping raises an exception (the
        failure is printed together with the echogram name).
    """
    i, j = a
    if i not in filename_li:
        return None
    echogram = EchogramReader(i, j, freq_li)
    filename, Sv_npy, surface_idx, bottom_idx, time, depth, positions = echogram()
    annotations_idx, labels = annotations_dict[filename]  # get annotation xy indices directly
    # detect ROIs
    roi = ROIDetector(filename, Sv_npy, surface_idx, bottom_idx, fig_dir, threshold, kernel_size)
    img_shape, contours = roi()
    # ROI features
    features = FeatureExtractor(filename, contours, Sv_npy, bottom_idx, time, depth, positions)
    contours_sel, contours_features = features()  # return a list
    try:
        overlap = OverlapAnnotation(filename, img_shape, annotations_idx, labels, contours_sel, fig_dir)
        contours_labels = overlap.assign_label(overlap_ratio)  # get label
        for idx, contour in enumerate(contours_features):
            contour['label'] = contours_labels[idx]  # add
        # ROI npy, select dir. The test split is checked first so it keeps
        # precedence, and an echogram in neither split is reported explicitly
        # rather than failing later on an unbound npy_dir.
        if filename in test_echogram_li:
            npy_dir = "npy_new/test/"
        elif filename in train_echogram_li:
            npy_dir = "npy_new/train/"
        else:
            print(f"{filename}: in neither the train nor the test split, skipped")
            return None
        crop = ROICropper(filename, contours_sel, contours_labels, Sv_npy, npy_dir)
        crop()
        return contours_features
    except Exception as e:
        # Best effort: one bad echogram must not abort the whole parallel run,
        # but say which file failed and with what kind of error.
        print(f"{filename}: {type(e).__name__}: {e}")
        return None

In [None]:
# Run ROI over every (raw, bottom) file pair in parallel, one worker per CPU.
# The context manager shuts the workers down even if a task raises, so no
# worker processes are left behind when this cell fails.
with Pool(os.cpu_count()) as pool:
    res_li = pool.map(ROI, zip(raw_paths, bot_paths))  # a list of results

In [46]:
# Flatten the per-echogram feature lists into one table, skipping the
# echograms for which ROI returned None.
res = [item for sublist in res_li if sublist is not None for item in sublist]
df_roi_features = pd.DataFrame(res)

In [47]:
# Save the ROI feature table (*_new variant, for abundance estimates).
# NOTE(review): this output was annotated as "kernel_size == 1", yet
# kernel_size is set to 3 earlier in this notebook — confirm which is current.
pkl_dir = "pkl/"
df_roi_features.to_pickle(f"{pkl_dir}df_roi_features_new.pkl")

In [48]:
# stop accepting new tasks, then wait for the worker processes to exit
pool.close()
pool.join()