In [35]:
import shutil

from dataset_br.images_invalid import get_list_of_images_invalid
%reload_ext autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [56]:
import itertools
import numpy as np
import os
import pandas as pd
import pathlib
import shutil
import sqlalchemy as sa
import sys

# Append the grandparent of the absolute 'main.ipynb' path (i.e. the parent of
# the current working directory) to the import search path.
# NOTE(review): presumably this is what lets the project-local imports below
# resolve -- confirm against the repository layout.
sys.path.append(str(pathlib.Path(os.path.abspath('main.ipynb')).parent.parent))

from tables import County, DataTrustedIdentifier, InfoImage
from database import connect

from unaccent import unaccent

## Database connection

In [37]:
# Open the database connection; `connect()` (from the local `database` module)
# returns the engine together with a session.
engine, session = connect()
# Turn off the engine's statement echo so later queries do not flood the
# notebook output with log lines like the ones emitted during connect().
engine.echo = False

2023-03-21 14:42:37,163 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2023-03-21 14:42:37,164 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-03-21 14:42:37,165 INFO sqlalchemy.engine.Engine select current_schema()
2023-03-21 14:42:37,166 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-03-21 14:42:37,167 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2023-03-21 14:42:37,167 INFO sqlalchemy.engine.Engine [raw sql] {}


In [38]:
# Recursively collect every regular '*.jpeg' file under the 512px RGB image
# directory, as plain string paths.
images_root = pathlib.Path('/home/xandao/Documentos/RGB/512')
jpeg_files = filter(pathlib.Path.is_file, images_root.rglob('*.jpeg'))
list_images = list(map(str, jpeg_files))
print('count of images is a %d' % len(list_images))

count of images is a 105208


In [39]:
# Every distinct County row whose `uf` equals 'PR'.
county = (
    session.query(County)
    .filter(County.uf == 'PR')
    .distinct()
    .all()
)

# For each returned row, wrap its uf / state / county value in
# `unaccent(lower(...))` so it can be compared against columns normalised the
# same way.
list_uf = [unaccent(sa.func.lower(row.uf)) for row in county]
list_state = [unaccent(sa.func.lower(row.state)) for row in county]
list_county = [unaccent(sa.func.lower(row.county)) for row in county]

# IN-predicates over the normalised DataTrustedIdentifier columns.
# `state_province` is matched against both the uf values and the state values.
uf_unaccent_lower = (
    unaccent(sa.func.lower(DataTrustedIdentifier.state_province))
    .in_(list_uf)
)
state_unaccent_lower = (
    unaccent(sa.func.lower(DataTrustedIdentifier.state_province))
    .in_(list_state)
)
county_unaccent_lower = (
    unaccent(sa.func.lower(DataTrustedIdentifier.county))
    .in_(list_county)
)

In [40]:
# Selected columns: the specific epithet and, per epithet, the aggregated
# array of `seq` values of its records.
columns = [DataTrustedIdentifier.specific_epithet, sa.func.array_agg(DataTrustedIdentifier.seq)]

# Epithets with at least five matching records: trusted country 'Brasil',
# a non-null epithet, and a state/province matching the uf or state lists.
species_with_five_more_records = (
    session.query(*columns)
    .filter(sa.and_(
        DataTrustedIdentifier.country_trusted == 'Brasil',
        # NOTE(review): this predicate was listed twice in the original; the
        # redundant copy was dropped. If the second one was meant to test a
        # different column, add that condition here.
        DataTrustedIdentifier.specific_epithet.is_not(None),
        sa.or_(uf_unaccent_lower, state_unaccent_lower),
    ))
    .distinct()
    .group_by(DataTrustedIdentifier.specific_epithet)
    .order_by(DataTrustedIdentifier.specific_epithet)
    .having(sa.func.count(DataTrustedIdentifier.specific_epithet) >= 5)
    .all()
)

print('count of species: %d' % len(species_with_five_more_records))

count of species: 54


In [41]:
# Image variant to look up in InfoImage.
color = 'rgb'
image_size = '512'

# Parallel lists: list_path[k] holds the image paths kept for list_species[k].
list_species, list_path = [], []

for species, list_seq in species_with_five_more_records:
    # One result row per seq_id, each carrying the array of distinct image
    # paths stored for that seq_id in the requested size and colour mode.
    rows = (
        session.query(sa.func.array_agg(sa.distinct(InfoImage.path_image)))
        .filter(sa.and_(InfoImage.seq_id.in_(list_seq),
                        InfoImage.image_size == image_size,
                        InfoImage.color_mode == color))
        .group_by(InfoImage.seq_id)
        .all()
    )

    # Strip the one-element row tuples, leaving a list of path arrays.
    paths_per_seq = list(itertools.chain.from_iterable(rows))
    if len(paths_per_seq) < 5:
        continue

    # Keep a single image per seq_id: the first path in sorted order.
    first_paths = [min(paths) for paths in paths_per_seq]

    # Drop repeated paths and require at least five distinct images.
    unique_paths = np.unique(first_paths)
    if len(unique_paths) >= 5:
        list_species.append(species)
        list_path.append(unique_paths.tolist())

print('count of species: %d' % len(list_species))

count of species: 54


In [42]:
# Fetch the images flagged as invalid via the helper imported from
# `dataset_br.images_invalid`; the rendered table shows a `barcode` and a
# `reason` column for each entry.
list_images_invalid = get_list_of_images_invalid()
# Display the result as a table.
pd.DataFrame(list_images_invalid)

Unnamed: 0,barcode,reason
0,INPA0248526,horizontal
1,INPA0248523,horizontal
2,INPA0248528,horizontal
3,NY01421575_01,horizontal
4,HUFSJ001689_v00,not exsicate
5,HUFSJ001133_v00,not exsicate
6,HUFSJ002198_v00,not exsicate
7,HUFSJ003255_v00,not exsicate
8,HVASF000487_v01,not exsicate
9,INPA0019084_nd,label


In [44]:
# Barcodes of the images flagged as invalid, materialised once instead of
# being looked up again for every single path.
invalid_barcodes = list(list_images_invalid['barcode'])

# list_path_correct[k]: paths of list_species[k] with invalid images removed.
# list_count_path[k]:   how many paths remain for that species.
# (The original also assigned `sum = 0`, which was never used and shadowed the
# builtin `sum` for the rest of the kernel session; it has been removed.)
list_path_correct = []
list_count_path = []
for species_name, p in zip(list_species, list_path):
    # A path is dropped when any invalid barcode occurs as a substring of it.
    matching = [path for path in p if not any(barcode in path for barcode in invalid_barcodes)]

    # Report every species that lost images, together with the removed paths.
    if len(p) != len(matching):
        print('specie: %s before: %d after: %d' % (species_name, len(p), len(matching)))
        diff = list(set(p) ^ set(matching))
        print('diff: %s' % str(diff))

    list_path_correct.append(matching)
    list_count_path.append(len(matching))

In [49]:
# Summary table: one row per species with its image paths and their count.
# `list_count_path` holds the sizes of the filtered `list_path_correct` lists,
# so the 'paths' column uses that same filtered list. Pairing the counts with
# the unfiltered `list_path` would make 'count' disagree with the paths shown
# whenever an invalid image was removed.
df = pd.DataFrame({
    'species': list_species,
    'paths': list_path_correct,
    'count': list_count_path
})
df

Unnamed: 0,species,paths,count
0,aduncum,[/home/xandao/Documentos/RGB/512/w_pred_mask/A...,33
1,alata,[/home/xandao/Documentos/RGB/512/w_pred_mask/C...,18
2,amalago,[/home/xandao/Documentos/RGB/512/w_pred_mask/A...,69
3,amplum,[/home/xandao/Documentos/RGB/512/w_pred_mask/C...,6
4,arboreum,[/home/xandao/Documentos/RGB/512/w_pred_mask/C...,35
5,arifolia,[/home/xandao/Documentos/RGB/512/w_pred_mask/A...,11
6,barbarana,[/home/xandao/Documentos/RGB/512/w_pred_mask/H...,10
7,blanda,[/home/xandao/Documentos/RGB/512/w_pred_mask/A...,25
8,caldasianum,[/home/xandao/Documentos/RGB/512/w_pred_mask/M...,9
9,caldense,[/home/xandao/Documentos/RGB/512/w_pred_mask/C...,26


In [60]:
# Output layout: <out>/<color>/<image_size>/<threshold>/f<k>/
out = './'
threshold = '5'
path_out = os.path.join(out, color, image_size, threshold)

# One numbered folder (f1, f2, ...) per species, in `list_species` order.
#
# Fixes relative to the original cell:
#   * the path list was read from index 0 of the (species, paths) pair, i.e.
#     the species *name*, so the inner loop iterated over its characters and
#     shutil.copy was called with single letters instead of image paths;
#   * the images are taken from `list_path_correct`, the lists with the invalid
#     images already removed, instead of the unfiltered `list_path`.
for i, list_p in enumerate(list_path_correct, start=1):
    path_final = os.path.join(path_out, 'f%d' % i)

    # exist_ok avoids the separate existence check and makes re-runs safe.
    os.makedirs(path_final, exist_ok=True)

    for p in list_p:
        shutil.copy(p, path_final)

('aduncum', ['/home/xandao/Documentos/RGB/512/w_pred_mask/ALCB002468.jpeg', '/home/xandao/Documentos/RGB/512/w_pred_mask/CEN00041534.jpeg', '/home/xandao/Documentos/RGB/512/w_pred_mask/CEN00041548.jpeg', '/home/xandao/Documentos/RGB/512/w_pred_mask/CEN00044966.jpeg', '/home/xandao/Documentos/RGB/512/w_pred_mask/CEN00044989.jpeg', '/home/xandao/Documentos/RGB/512/w_pred_mask/CEN00044991.jpeg', '/home/xandao/Documentos/RGB/512/w_pred_mask/CEN00045021.jpeg', '/home/xandao/Documentos/RGB/512/w_pred_mask/CEN00045026.jpeg', '/home/xandao/Documentos/RGB/512/w_pred_mask/CEN00045028.jpeg', '/home/xandao/Documentos/RGB/512/w_pred_mask/CEN00045030.jpeg', '/home/xandao/Documentos/RGB/512/w_pred_mask/CEN00047028.jpeg', '/home/xandao/Documentos/RGB/512/w_pred_mask/EVB000213.jpeg', '/home/xandao/Documentos/RGB/512/w_pred_mask/FURB38506.jpeg', '/home/xandao/Documentos/RGB/512/w_pred_mask/FURB38507.jpeg', '/home/xandao/Documentos/RGB/512/w_pred_mask/FURB38518.jpeg', '/home/xandao/Documentos/RGB/512/w_p

## Close connection

In [8]:
# Release database resources: close the session first, then dispose of the
# engine's connection pool.
session.close()
engine.dispose()