In [1]:
%reload_ext autoreload
%load_ext autoreload
%autoreload 2
import os
import pathlib
import pandas as pd
import shutil
import sqlalchemy as sa
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('main.ipynb'))))

from tables import County, DataTrustedIdentifier, TrustedIdentifier
from database import connect, create_table_if_not_exists, table_is_empty, \
    insert_new_identifier_trusted, find_and_replace_broken_characters, get_all_records_of_trusted_identifier, \
    insert_new_data_trusted_identifier, get_all_identifiers_ilike, get_all_records_with_diff_brasil, \
    update_country_trusted_based_original_field, has_brasil_in_country_trusted, \
    get_all_records_with_brasil_in_country_trusted, state_province_in_list_uf_or_list_state, update_country_trusted, \
    has_state_in_locality
from sqlalchemy import and_, or_
from sqlalchemy.orm import Session

from unaccent import unaccent

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
session: Session
engine, session = connect()
engine.echo=False

list_identified_trusted = [
    {'full_name': 'Aline Vieira de Melo Silva', 'searched_name': ['Silva']},
    {'full_name': 'Carmen Lúcia Falcão Ichaso', 'searched_name': ['Ichaso']},
    {'full_name': 'Daniele Monteiro Ferreira', 'searched_name': ['Monteiro']},
    {'full_name': 'Daniel Ruschel', 'searched_name': ['Ruschel']},
    {'full_name': 'Elsie Franklin Guimarães', 'searched_name': ['Guimar']},
    {'full_name': 'Eric J Tepe', 'searched_name': ['Tepe']},
    {'full_name': 'Erika Von Sohsten de Souza Medeiros', 'searched_name': ['Medeiros']},
    {'full_name': 'George Azevedo de Queiroz', 'searched_name': ['Queiroz']},
    {'full_name': 'Micheline Carvalho-Silva', 'searched_name': ['Carvalho']},
    {'full_name': 'Ricardo de la Merced Callejas Posada', 'searched_name': ['Callejas']},
    {'full_name': 'Truman George Yuncker', 'searched_name': ['Yuncker']},
    {'full_name': 'William Trelease', 'searched_name': ['Trelease']}
]

def text_bold(string):
    return '\033[1m' + string + '\033[0m'

2023-03-15 23:00:31,044 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2023-03-15 23:00:31,044 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-03-15 23:00:31,046 INFO sqlalchemy.engine.Engine select current_schema()
2023-03-15 23:00:31,046 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-03-15 23:00:31,047 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2023-03-15 23:00:31,047 INFO sqlalchemy.engine.Engine [raw sql] {}


## identificadores confiaveis
### tem que ser distinct de value_founded, por causa das identificacoes com mais de uma pessoa

In [3]:
identifiers_trusted = session.query(TrustedIdentifier.value_founded)\
    .filter(TrustedIdentifier.trusted)\
    .distinct()

print('quantidade de identificadores confiaveis: %d' % identifiers_trusted.count())

create_table_if_not_exists(engine, TrustedIdentifier)

if table_is_empty(session, TrustedIdentifier):
    for identifier in list_identified_trusted:
        for trusted_identifier in identifier['searched_name']:
            query = get_all_identifiers_ilike(trusted_identifier, session)
            insert_new_identifier_trusted(identifier, query, trusted_identifier, session)

list_identifier_trusted = [q[0] for q in identifiers_trusted]

quantidade de identificadores confiaveis: 187


In [4]:
query_diff_identifier = get_all_records_of_trusted_identifier(list_identifier_trusted, session)

print('quantidade de registros encontrados com as variacoes dos nome dos identificadores: %d' % query_diff_identifier.count())

if table_is_empty(session, DataTrustedIdentifier):
    insert_new_data_trusted_identifier(session, query_diff_identifier)

count_data_trusted_identifier = session.query(DataTrustedIdentifier).count()
print('quantidade dados na tabela data_trusted_identifier: %d' % count_data_trusted_identifier)

quantidade de registros encontrados com as variacoes dos nome dos identificadores: 13182
quantidade dados na tabela data_trusted_identifier: 13182


## substitui caracteres nao codificados

In [5]:
list_special_characters = [{'find': 'Ã¡', 'replace': 'á'},
                           {'find': 'Ãº', 'replace': 'ú'},
                           {'find': 'Ã', 'replace': 'í'},
                           {'find': 'Ã³', 'replace': 'ó'},
                           {'find': 'Ã±', 'replace': 'ñ'},
                           {'find': 'Ã©', 'replace': 'é'}]

for attribute in [DataTrustedIdentifier.state_province, DataTrustedIdentifier.county]:
    for special_character in list_special_characters:
        find_and_replace_broken_characters(attribute, session, special_character)

## substitui as variacoes de BR, pela palavra certa

In [6]:
list_diff_br = ['Brasil', 'BRASIL', 'Brasil/Bolivia', 'Brasilia', 'brazil', 'Brazil', 'BRazil', 'BRAZIL', '[Brésil]', 'Brésil']

if has_brasil_in_country_trusted(session):
    records_with_diff_brasil = get_all_records_with_diff_brasil(list_diff_br, session)
    print('quantidade de registros encontrados com as variacoes dos nome dos identificadores + %s: %d' % (text_bold('variacoes de Brasil'), len(records_with_diff_brasil)))
    update_country_trusted_based_original_field(list_diff_br, session)


records_with_brasil_in_country_trusted = get_all_records_with_brasil_in_country_trusted(session)
print('%s: %d' % (text_bold('quantidade de registros com Brasil no campo confiavel'), len(records_with_brasil_in_country_trusted)))

[1mquantidade de registros com Brasil no campo confiavel[0m: 12144


## adiciona BR no campo confiavel (encontra estados e cidades na tabela county, e que nao tem no campo country variacoes BR)

In [7]:
county = session.query(County).distinct().all()

list_uf = [unaccent(sa.func.lower(c.uf)) for c in county]
list_state = [unaccent(sa.func.lower(c.state)) for c in county]
list_county = [unaccent(sa.func.lower(c.county)) for c in county]

update_country_trusted(list_county, list_state, list_uf, session)

  session.query(DataTrustedIdentifier)\


## BR dataset

In [8]:
query_lower_unaccent_like = session.query(sa.func.concat('%', unaccent(sa.func.lower(County.state)), '%')).distinct().all()
list_state_like = [q[0] for q in query_lower_unaccent_like]
columns = [DataTrustedIdentifier.specific_epithet, DataTrustedIdentifier.barcode]

In [9]:
query_only_state = session.query(*columns)\
    .filter(and_(DataTrustedIdentifier.country_trusted =='Brasil',
                DataTrustedIdentifier.specific_epithet.is_not(None),
                state_province_in_list_uf_or_list_state(list_state, list_uf)))\
    .distinct()\
    .all()

print('count of samples using only %s: %d' % (text_bold('state_province'), len(query_only_state)))

count of samples using only [1mstate_province[0m: 11914


In [10]:
query_has_state_in_locality = session.query(*columns)\
    .filter(and_(DataTrustedIdentifier.country_trusted=='Brasil',
                 DataTrustedIdentifier.specific_epithet.is_not(None),
                 or_(state_province_in_list_uf_or_list_state(list_state, list_uf),
                     has_state_in_locality(list_state_like)
                     )))\
    .distinct()\
    .all()

print('count of samples using only %s: %d' % (text_bold('state_province e locality'), len(query_has_state_in_locality)))

count of samples using only [1mstate_province e locality[0m: 11927


In [11]:
def check_if_image_exists(list_images, query):
    list_species_images = dict()
    for i, q in enumerate(query):
        print('row %d of %d' % (i, len(query)))
        species = q[0]
        barcode = q[1]
        result = sorted([image for image in list_images if barcode in str(image.stem)])
        if len(result) > 0:
            if not species in list_species_images.keys():
                list_species_images[species] = []

            result = [result[0]]
            for r in result:
                list_species_images[species].append(r.name)
    return list_species_images

def copy_images(input, list_images, species, threshold, output):
    for image in list_images:
        if not os.path.exists(os.path.join(output, threshold, species)):
            os.makedirs(os.path.join(output, threshold, species))

        src = os.path.join(input, image)
        dst = os.path.join(output, threshold, species, image)
        shutil.copy(src, dst)

def copy_and_separate_by_minimum(input, list_images_by_species, output):
    for t in ['5', '10', '20']:
        if not os.path.exists(os.path.join(output, t)):
            os.makedirs(os.path.join(output, t))

    for species, list_barcode in list_images_by_species.items():
        if len(list_barcode) >= 5:
            copy_images(input, list_barcode, species, '5', output)
        if len(list_barcode) >= 10:
            copy_images(input, list_barcode, species, '10', output)
        if len(list_barcode) >= 20:
            copy_images(input, list_barcode, species, '20', output)

def get_total_images(dst):
    return len([image for image in pathlib.Path(dst).rglob('*') if image.is_file()])

def get_total_species(dst):
    return len([dir for dir in pathlib.Path(dst).glob('*') if dir.is_dir()])

In [19]:
query = query_only_state
info = {'color':[], 'image_size': [], 'threshold':[], 'total_species':[], 'total_images': []}

for color in ['RGB']:
    for image_size in ['512']:
        path_fotos = '/home/xandao/Documentos/%s/%s/w_pred_mask' % (color, image_size)
        list_images = [file for file in pathlib.Path(path_fotos).glob('*')]
        dst = os.path.join('out', color, image_size)
        list_species_images = check_if_image_exists(list_images, query)
        copy_and_separate_by_minimum(path_fotos, list_species_images, dst)

        for t in ['5', '10', '20']:
            info['color'].append(color)
            info['image_size'].append(image_size)
            info['threshold'].append(t)
            info['total_species'].append(get_total_species(os.path.join(dst, t)))
            info['total_images'].append(get_total_images(os.path.join(dst, t)))

row 0 of 11914
row 1 of 11914
row 2 of 11914
row 3 of 11914
row 4 of 11914
row 5 of 11914
row 6 of 11914
row 7 of 11914
row 8 of 11914
row 9 of 11914
row 10 of 11914
row 11 of 11914
row 12 of 11914
row 13 of 11914
row 14 of 11914
row 15 of 11914
row 16 of 11914
row 17 of 11914
row 18 of 11914
row 19 of 11914
row 20 of 11914
row 21 of 11914
row 22 of 11914
row 23 of 11914
row 24 of 11914
row 25 of 11914
row 26 of 11914
row 27 of 11914
row 28 of 11914
row 29 of 11914
row 30 of 11914
row 31 of 11914
row 32 of 11914
row 33 of 11914
row 34 of 11914
row 35 of 11914
row 36 of 11914
row 37 of 11914
row 38 of 11914
row 39 of 11914
row 40 of 11914
row 41 of 11914
row 42 of 11914
row 43 of 11914
row 44 of 11914
row 45 of 11914
row 46 of 11914
row 47 of 11914
row 48 of 11914
row 49 of 11914
row 50 of 11914
row 51 of 11914
row 52 of 11914
row 53 of 11914
row 54 of 11914
row 55 of 11914
row 56 of 11914
row 57 of 11914
row 58 of 11914
row 59 of 11914
row 60 of 11914
row 61 of 11914
row 62 of 11914
ro

In [13]:
pd.DataFrame(info)

Unnamed: 0,color,image_size,threshold,total_species,total_images
0,RGB,512,5,235,10528
1,RGB,512,10,160,10004
2,RGB,512,20,105,9247


### list of invalid images

In [20]:
list_images_removed = {
    'barcode': ['INPA0248526', 'INPA0248523', 'INPA0248528', 'NY01421575_01', 'HUFSJ001689_v00', 'HUFSJ001133_v00', 'HUFSJ002198_v00', 'HUFSJ003255_v00', 'HVASF000487_v01', 'INPA0019084_nd', 'INPA0022379_nd', 'INPA0032742_nd', 'INPA0023115', 'NL-U1484137', 'INPA0012286', 'INPA0146998'],
    'reason': ['horizontal', 'horizontal', 'horizontal', 'horizontal', 'not exsicate', 'not exsicate', 'not exsicate', 'not exsicate', 'not exsicate', 'label', 'label', 'label', 'letter', 'letter', 'letter', 'incomplete']
}

pd.DataFrame(list_images_removed)

Unnamed: 0,barcode,reason
0,INPA0248526,horizontal
1,INPA0248523,horizontal
2,INPA0248528,horizontal
3,NY01421575_01,horizontal
4,HUFSJ001689_v00,not exsicate
5,HUFSJ001133_v00,not exsicate
6,HUFSJ002198_v00,not exsicate
7,HUFSJ003255_v00,not exsicate
8,HVASF000487_v01,not exsicate
9,INPA0019084_nd,label


In [21]:
list_images_separated = [image for image in pathlib.Path('out').rglob('*.jpeg') if image.is_file()]
for barcode in list_images_removed['barcode']:
    path_to_image_remove = list(filter(lambda x: barcode == str(x.stem), list_images_separated))
    for p in path_to_image_remove:
        os.remove(p)

### after remove invalid images

In [25]:
info = {'color':[], 'image_size': [], 'threshold':[], 'total_species':[], 'total_images': []}

for color in ['RGB']:
    for image_size in ['512']:
        dst = os.path.join('out', color, image_size)

        for t in ['5', '10', '20']:
            info['color'].append(color)
            info['image_size'].append(image_size)
            info['threshold'].append(t)
            info['total_species'].append(get_total_species(os.path.join(dst, t)))
            info['total_images'].append(get_total_images(os.path.join(dst, t)))

pd.DataFrame(info)

Unnamed: 0,color,image_size,threshold,total_species,total_images
0,RGB,512,5,235,10512
1,RGB,512,10,160,9988
2,RGB,512,20,105,9235
