In [18]:
from dataset_br.images_invalid import get_list_of_images_invalid
%reload_ext autoreload
%load_ext autoreload
%autoreload 2

import itertools
import numpy as np
import os
import pandas as pd
import sqlalchemy as sa
import sys

# Extend the import search path. abspath('main.ipynb') is resolved against the
# current working directory, so the two dirname() calls yield the parent of that
# directory. Presumably this is where the project modules imported below
# (tables, database, unaccent) live -- TODO confirm.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('main.ipynb'))))

from tables import County, DataTrustedIdentifier, InfoImage
from database import connect
from sqlalchemy import and_, or_
from sqlalchemy.orm import Session

from unaccent import unaccent

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Database connection

In [10]:
# Declare the type of `session` up front; the targets of a tuple-unpacking
# assignment cannot carry annotations themselves.
session: Session
# connect() comes from the project's `database` module and is unpacked into an
# (engine, session) pair.
engine, session = connect()
# Turn off the engine's SQL echo for the rest of the notebook.
engine.echo=False

2023-03-20 00:49:36,843 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2023-03-20 00:49:36,844 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-03-20 00:49:36,845 INFO sqlalchemy.engine.Engine select current_schema()
2023-03-20 00:49:36,845 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-03-20 00:49:36,846 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2023-03-20 00:49:36,846 INFO sqlalchemy.engine.Engine [raw sql] {}


## BR dataset

In [11]:
def _lower_unaccent(value):
    """Wrap ``value`` in SQL ``lower()`` and then in the project's ``unaccent``."""
    return unaccent(sa.func.lower(value))


# Every distinct County row; its uf / state / county values define what is accepted.
county = session.query(County).distinct().all()

# Normalised (lowercased, unaccented) SQL expressions for each County attribute.
list_uf = [_lower_unaccent(row.uf) for row in county]
list_state = [_lower_unaccent(row.state) for row in county]
list_county = [_lower_unaccent(row.county) for row in county]

# IN (...) conditions that compare the normalised record fields with the lists above.
# state_province is tested against both the uf values and the state values.
uf_unaccent_lower = _lower_unaccent(DataTrustedIdentifier.state_province).in_(list_uf)
state_unaccent_lower = _lower_unaccent(DataTrustedIdentifier.state_province).in_(list_state)
county_unaccent_lower = _lower_unaccent(DataTrustedIdentifier.county).in_(list_county)

In [12]:
# Columns returned for every matching record.
columns = [DataTrustedIdentifier.specific_epithet, DataTrustedIdentifier.barcode]

# Records trusted as 'Brasil', with a non-null specific epithet, whose
# state_province matches a County uf or state value.
only_state_filter = and_(
    DataTrustedIdentifier.country_trusted == 'Brasil',
    DataTrustedIdentifier.specific_epithet.is_not(None),
    or_(uf_unaccent_lower, state_unaccent_lower),
)

query_only_state = (
    session.query(*columns)
    .filter(only_state_filter)
    .distinct()
    .all()
)

print('count of records using only field state_province: %d' % len(query_only_state))

count of records using only field state_province: 11914


In [13]:
# Build '%<state>%' LIKE patterns in SQL from the lowercased, unaccented state names.
like_pattern = sa.func.concat('%', unaccent(sa.func.lower(County.state)), '%')
query_lower_unaccent_like = session.query(like_pattern).distinct().all()
list_state_like = [row[0] for row in query_lower_unaccent_like]

# True when the normalised locality matches any of the patterns above.
locality_unaccent_lower = unaccent(sa.func.lower(DataTrustedIdentifier.locality)).like(sa.func.any_(list_state_like))

# Same selection as the state_province-only query, with the locality match added
# as a third alternative. `columns` is the list defined in the previous cell.
state_or_locality = or_(uf_unaccent_lower, state_unaccent_lower, locality_unaccent_lower)

query_has_state_in_locality = (
    session.query(*columns)
    .filter(and_(DataTrustedIdentifier.country_trusted == 'Brasil',
                 DataTrustedIdentifier.specific_epithet.is_not(None),
                 state_or_locality))
    .distinct()
    .all()
)

print('count of records using only fields state_province and locality: %d' % len(query_has_state_in_locality))

count of records using only fields state_province and locality: 11927


## Get species with five or more records (this step was also run for thresholds of ten and twenty records)

In [14]:
# One result row per species: the epithet and an array with the `seq` of each of its records.
columns = [DataTrustedIdentifier.specific_epithet, sa.func.array_agg(DataTrustedIdentifier.seq)]

# Records trusted as 'Brasil', with a non-null epithet and a state_province that
# matches a County uf or state value.
brazil_with_state = and_(
    DataTrustedIdentifier.country_trusted == 'Brasil',
    DataTrustedIdentifier.specific_epithet.is_not(None),
    or_(uf_unaccent_lower, state_unaccent_lower),
)

# Number of grouped rows per species, used for the HAVING threshold.
records_per_species = sa.func.count(DataTrustedIdentifier.specific_epithet)

species_with_five_more_records = (
    session.query(*columns)
    .filter(brazil_with_state)
    .distinct()
    .group_by(DataTrustedIdentifier.specific_epithet)
    .order_by(DataTrustedIdentifier.specific_epithet)
    .having(records_per_species >= 5)
    .all()
)

print('count of species: %d' % len(species_with_five_more_records))

count of species: 247


## Get only one image per record. This step was also run for grayscale images and for image sizes 256 and 400

In [20]:
# Minimum number of records / distinct images a species needs in order to be kept.
MIN_IMAGES_PER_SPECIES = 5

list_species = []
list_path = []
color = 'rgb'
image_size='512'

# Each row is (specific_epithet, array of the seq values of that species' records).
for species, list_seq in species_with_five_more_records:
    # One result row per record (seq_id), holding the array of its distinct image paths
    # for the selected image size and colour mode.
    query = session.query(sa.func.array_agg(sa.distinct(InfoImage.path_image))) \
        .filter(and_(InfoImage.seq_id.in_(list_seq),
                     InfoImage.image_size == image_size,
                     InfoImage.color_mode == color)) \
        .group_by(InfoImage.seq_id) \
        .all()

    # Not enough records with images: the species cannot reach the threshold.
    if len(query) < MIN_IMAGES_PER_SPECIES:
        continue

    # Keep a single image per record: the smallest path in sort order
    # (row[0] unwraps the one-column result row).
    first_path_per_record = [min(row[0]) for row in query]

    # Duplicated paths are collapsed, so the threshold is checked again on the
    # unique paths. np.unique also returns them sorted.
    unique_paths = np.unique(first_path_per_record).tolist()
    if len(unique_paths) >= MIN_IMAGES_PER_SPECIES:
        list_species.append(species)
        list_path.append(unique_paths)

print('count of species: %d' % len(list_species))

count of species: 235


## List of invalid images

In [19]:
# Load the invalid-image entries through the project helper imported from
# dataset_br.images_invalid.
list_images_invalid = get_list_of_images_invalid()
# Last expression of the cell: render the entries as a table (barcode, reason).
pd.DataFrame(list_images_invalid)

Unnamed: 0,barcode,reason
0,INPA0248526,horizontal
1,INPA0248523,horizontal
2,INPA0248528,horizontal
3,NY01421575_01,horizontal
4,HUFSJ001689_v00,not exsicate
5,HUFSJ001133_v00,not exsicate
6,HUFSJ002198_v00,not exsicate
7,HUFSJ003255_v00,not exsicate
8,HVASF000487_v01,not exsicate
9,INPA0019084_nd,label


## Remove paths of invalid images

In [22]:
list_path_correct = []
list_count_path = []

# Read the barcode column once instead of re-reading it for every single path.
# (The former `sum = 0` was unused and shadowed the builtin `sum` for later cells.)
invalid_barcodes = list(list_images_invalid['barcode'])

# list_species and list_path are filled in lockstep, so they can be walked together.
for species, paths in zip(list_species, list_path):
    # Keep a path only when no invalid barcode occurs in it.
    # NOTE(review): this is a substring test, so a barcode that is a prefix of
    # another barcode would also remove the longer one -- confirm this is intended.
    matching = [path for path in paths if not any(barcode in path for barcode in invalid_barcodes)]

    # Report which paths were dropped for this species.
    if len(paths) != len(matching):
        print('specie: %s before: %d after: %d' % (species, len(paths), len(matching)))
        diff = list(set(paths) ^ set(matching))
        print('diff: %s' % str(diff))

    list_path_correct.append(matching)
    list_count_path.append(len(matching))

specie: aleyreanum before: 54 after: 53
diff: ['/home/xandao/Documentos/RGB/512/w_pred_mask/INPA0023115.jpeg']
specie: arboreum before: 621 after: 620
diff: ['/home/xandao/Documentos/RGB/512/w_pred_mask/INPA0248528.jpeg']
specie: bartlingianum before: 146 after: 145
diff: ['/home/xandao/Documentos/RGB/512/w_pred_mask/NY01421575_01.jpeg']
specie: belterraense before: 14 after: 13
diff: ['/home/xandao/Documentos/RGB/512/w_pred_mask/NL-U1484137.jpeg']
specie: blanda before: 114 after: 113
diff: ['/home/xandao/Documentos/RGB/512/w_pred_mask/HUFSJ001133_v00.jpeg']
specie: callosum before: 60 after: 59
diff: ['/home/xandao/Documentos/RGB/512/w_pred_mask/INPA0022379_nd.jpeg']
specie: decora before: 39 after: 38
diff: ['/home/xandao/Documentos/RGB/512/w_pred_mask/HUFSJ002198_v00.jpeg']
specie: hispidum before: 332 after: 331
diff: ['/home/xandao/Documentos/RGB/512/w_pred_mask/INPA0032742_nd.jpeg']
specie: hostmannianum before: 146 after: 144
diff: ['/home/xandao/Documentos/RGB/512/w_pred_mask/

In [64]:
# Summary per species. `paths` uses the filtered lists (invalid images removed)
# so that it agrees with `count`, which holds the length of those filtered lists.
pd.DataFrame({
    'species': list_species,
    'paths': list_path_correct,
    'count': list_count_path
})

Unnamed: 0,species,paths,count
0,abutiloides,[/home/xandao/Documentos/RGB/512/w_pred_mask/C...,8
1,aduncum,[/home/xandao/Documentos/RGB/512/w_pred_mask/A...,377
2,aequale,[/home/xandao/Documentos/RGB/512/w_pred_mask/C...,30
3,alata,[/home/xandao/Documentos/RGB/512/w_pred_mask/C...,81
4,alatabaccum,[/home/xandao/Documentos/RGB/512/w_pred_mask/I...,28
...,...,...,...
230,viminifolium,[/home/xandao/Documentos/RGB/512/w_pred_mask/C...,29
231,vitaceum,[/home/xandao/Documentos/RGB/512/w_pred_mask/I...,13
232,wachenheimii,[/home/xandao/Documentos/RGB/512/w_pred_mask/N...,5
233,warmingii,[/home/xandao/Documentos/RGB/512/w_pred_mask/M...,6


In [23]:
# Total number of images kept: the sum of the per-species counts in list_count_path.
print('total of images: %d' % np.sum(list_count_path))

total of images: 10512


## Close connection

In [None]:
# Release the session first, then the engine's connection pool.
session.close()
# A SQLAlchemy Engine has no close() method; dispose() is the call that closes
# the pooled connections.
engine.dispose()