In [2]:
%reload_ext autoreload
%load_ext autoreload
%autoreload 2
import pandas as pd
import pathlib
import sqlalchemy as sa

import os, sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('main.ipynb'))))

from tables import County, DataTrustedIdentifier, TrustedIdentifier
from database import connect, create_table_if_not_exists, table_is_empty, \
    insert_new_identifier_trusted, find_and_replace_broken_characters, get_all_records_of_trusted_identifier, \
    insert_new_data_trusted_identifier, get_all_identifiers_ilike, get_all_records_with_diff_brasil, \
    update_country_trusted_based_original_field, has_brasil_in_country_trusted, \
    get_all_records_with_brasil_in_country_trusted, state_province_in_list_uf_or_list_state, update_country_trusted, \
    has_state_in_locality
from sqlalchemy import and_, or_
from sqlalchemy.orm import Session

from unaccent import unaccent

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Open the database connection and silence SQLAlchemy's statement echo.
session: Session
engine, session = connect()
engine.echo = False

# directory with all 52k images
path_fotos = '/home/xandao/Documentos/dataset_gimp/dataset-52k-sp-2021/fotos'
list_images = [entry for entry in pathlib.Path(path_fotos).rglob('*') if entry.is_file()]

# Trusted identifiers: full name paired with the single name fragment that is
# searched for. Each entry becomes {'full_name': ..., 'searched_name': [fragment]}.
list_identified_trusted = [
    {'full_name': full_name, 'searched_name': [fragment]}
    for full_name, fragment in (
        ('Aline Vieira de Melo Silva', 'Silva'),
        ('Carmen Lúcia Falcão Ichaso', 'Ichaso'),
        ('Daniele Monteiro Ferreira', 'Monteiro'),
        ('Daniel Ruschel', 'Ruschel'),
        ('Elsie Franklin Guimarães', 'Guimar'),
        ('Eric J Tepe', 'Tepe'),
        ('Erika Von Sohsten de Souza Medeiros', 'Medeiros'),
        ('George Azevedo de Queiroz', 'Queiroz'),
        ('Micheline Carvalho-Silva', 'Carvalho'),
        ('Ricardo de la Merced Callejas Posada', 'Callejas'),
        ('Truman George Yuncker', 'Yuncker'),
        ('William Trelease', 'Trelease'),
    )
]

def text_bold(string):
    """Return ``string`` wrapped in the ANSI escape codes that turn bold on and reset it."""
    bold_on, reset = '\033[1m', '\033[0m'
    return ''.join((bold_on, string, reset))

# Report how many image files were collected under path_fotos.
print(len(list_images))

2023-03-15 14:06:19,213 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2023-03-15 14:06:19,213 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-03-15 14:06:19,215 INFO sqlalchemy.engine.Engine select current_schema()
2023-03-15 14:06:19,215 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-03-15 14:06:19,216 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2023-03-15 14:06:19,216 INFO sqlalchemy.engine.Engine [raw sql] {}
52606


## identificadores confiáveis
### tem que ser distinct de value_founded, por causa das identificações com mais de uma pessoa

In [4]:
# Make sure the table exists before anything reads from it; previously the
# `.count()` below ran first and would fail against a database without the table.
create_table_if_not_exists(engine, TrustedIdentifier)

# Seed the table on first use: for every searched name fragment, look up the
# matching identifiers and store them as variations of the trusted person.
if table_is_empty(session, TrustedIdentifier):
    for identifier in list_identified_trusted:
        for trusted_identifier in identifier['searched_name']:
            query = get_all_identifiers_ilike(trusted_identifier, session)
            insert_new_identifier_trusted(identifier, query, trusted_identifier, session)

# Distinct on value_founded because one identification may list several people.
identifiers_trusted = session.query(TrustedIdentifier.value_founded)\
    .filter(TrustedIdentifier.trusted)\
    .distinct()

# Counted after seeding so the number reflects what the following cells will use.
print('quantidade de identificadores confiaveis: %d' % identifiers_trusted.count())

list_identifier_trusted = [q[0] for q in identifiers_trusted]

quantidade de identificadores confiaveis: 187


In [5]:
# Records whose identifier matches any of the trusted name variations.
query_diff_identifier = get_all_records_of_trusted_identifier(list_identifier_trusted, session)

print(
    'quantidade de registros encontrados com as variacoes dos nome dos identificadores: %d'
    % query_diff_identifier.count()
)

# Fill the working table only when it holds no rows yet.
if table_is_empty(session, DataTrustedIdentifier):
    insert_new_data_trusted_identifier(session, query_diff_identifier)

count_data_trusted_identifier = session.query(DataTrustedIdentifier).count()
print(
    'quantidade dados na tabela data_trusted_identifier: %d'
    % count_data_trusted_identifier
)

quantidade de registros encontrados com as variacoes dos nome dos identificadores: 13182
quantidade dados na tabela data_trusted_identifier: 13182


## substitui caracteres mal codificados

In [6]:
# Mis-encoded (mojibake) sequences and the characters they should become.
# The entries are applied one after another in list order, so the bare 'Ã'
# pattern -- a prefix of every other pattern -- must come last; otherwise it
# would rewrite the 'Ã' in 'Ã³', 'Ã±' and 'Ã©' before those entries get a
# chance to match (assuming the helper does a substring replacement).
list_special_characters = [{'find': 'Ã¡', 'replace': 'á'},
                           {'find': 'Ãº', 'replace': 'ú'},
                           {'find': 'Ã³', 'replace': 'ó'},
                           {'find': 'Ã±', 'replace': 'ñ'},
                           {'find': 'Ã©', 'replace': 'é'},
                           # NOTE(review): the usual mojibake for 'í' is 'Ã' followed by an
                           # invisible soft hyphen (U+00AD) -- confirm which form this literal holds.
                           {'find': 'Ã', 'replace': 'í'}]

# Apply every replacement to both free-text location columns.
for attribute in [DataTrustedIdentifier.state_province, DataTrustedIdentifier.county]:
    for special_character in list_special_characters:
        find_and_replace_broken_characters(attribute, session, special_character)

## substitui as variações de BR pela palavra certa

In [7]:
# Spellings of the country name that are passed to the update helper below.
list_diff_br = [
    'Brasil', 'BRASIL', 'Brasil/Bolivia', 'Brasilia',
    'brazil', 'Brazil', 'BRazil', 'BRAZIL',
    '[Brésil]', 'Brésil',
]

if has_brasil_in_country_trusted(session):
    records_with_diff_brasil = get_all_records_with_diff_brasil(list_diff_br, session)
    print(
        'quantidade de registros encontrados com as variacoes dos nome dos identificadores + %s: %d'
        % (text_bold('variacoes de Brasil'), len(records_with_diff_brasil))
    )
    update_country_trusted_based_original_field(list_diff_br, session)

# Report how many records carry 'Brasil' in the trusted country field.
records_with_brasil_in_country_trusted = get_all_records_with_brasil_in_country_trusted(session)
print(
    '%s: %d'
    % (
        text_bold('quantidade de registros com Brasil no campo confiavel'),
        len(records_with_brasil_in_country_trusted),
    )
)

[1mquantidade de registros com Brasil no campo confiavel[0m: 12144


## adiciona BR no campo confiável (encontra estados e cidades na tabela county e que não têm variações de BR no campo country)

In [8]:
# Every distinct County row; its uf/state/county values feed the update below.
county = session.query(County).distinct().all()

# Each value is wrapped as unaccent(lower(value)) before being handed to the helper.
list_uf = [unaccent(sa.func.lower(row.uf)) for row in county]
list_state = [unaccent(sa.func.lower(row.state)) for row in county]
list_county = [unaccent(sa.func.lower(row.county)) for row in county]

update_country_trusted(list_county, list_state, list_uf, session)

  session.query(DataTrustedIdentifier)\


In [15]:
from database import make_operation
#
# eq_um = unaccent(sa.func.lower(DataTrustedIdentifier.state_province))==unaccent(sa.func.lower(County.state))
# eq_dois = unaccent(sa.func.lower(DataTrustedIdentifier.state_province))==unaccent(sa.func.lower(County.uf))
#
# session.query(DataTrustedIdentifier.specific_epithet, County.state).filter(or_(eq_um, eq_dois)).distinct().all()
# make_operation(session)

In [16]:
# session.close()

## Amostras do BR

In [9]:
# '%<state>%' patterns built from the lower-cased, unaccented state names.
query_lower_unaccent_like = (
    session.query(sa.func.concat('%', unaccent(sa.func.lower(County.state)), '%'))
    .distinct()
    .all()
)
list_state_like = [row[0] for row in query_lower_unaccent_like]
columns = [DataTrustedIdentifier.specific_epithet, DataTrustedIdentifier.barcode]

# Selection 1: trusted country is Brasil, an epithet is present, and
# state_province passes the uf/state helper.
query_only_state = (
    session.query(*columns)
    .filter(and_(
        DataTrustedIdentifier.country_trusted == 'Brasil',
        DataTrustedIdentifier.specific_epithet.is_not(None),
        state_province_in_list_uf_or_list_state(list_state, list_uf),
    ))
    .distinct()
    .all()
)

# Selection 2: same as above, but a record also qualifies through the
# locality helper fed with the LIKE patterns.
query_has_state_in_locality = (
    session.query(*columns)
    .filter(and_(
        DataTrustedIdentifier.country_trusted == 'Brasil',
        DataTrustedIdentifier.specific_epithet.is_not(None),
        or_(
            state_province_in_list_uf_or_list_state(list_state, list_uf),
            has_state_in_locality(list_state_like),
        ),
    ))
    .distinct()
    .all()
)

print(
    'quantidade de registros usando somente %s: %d'
    % (text_bold('state_province'), len(query_only_state))
)
print(
    'quantidade de registros usando somente %s: %d'
    % (text_bold('state_province e locality'), len(query_has_state_in_locality))
)

quantidade de registros usando somente [1mstate_province[0m: 11914
quantidade de registros usando somente [1mstate_province e locality[0m: 11927


In [33]:
# NOTE(review): `result` is not assigned in any cell shown here -- presumably it
# comes from a cell that was removed or not exported; confirm before a fresh run.
len(result)

247

In [44]:
# File stems (name without extension) of every path in list_images.
li = [str(image_path.stem) for image_path in list_images]

In [49]:
# Stems to look for under the exported image directory.
rm = ["INPA0248526", "INPA0248523", "INPA0248528", "NY01421575_01", "HUFSJ001689_v00", "HUFSJ001133_v00", "HUFSJ002198_v00", "HUFSJ003255_v00", "HVASF000487_v01", "INPA0019084_nd", "INPA0022379_nd", "INPA0032742_nd", "INPA0023115", "NL-U1484137", "INPA0012286", "INPA0146998"]


def has_matching_file(root, stem):
    """Return True when at least one path named ``stem.<anything>`` exists under ``root``.

    ``root`` is searched recursively. The previous check wrapped the ``rglob``
    generator in a list literal, so its length was always 1 and every stem was
    reported as present regardless of what was on disk.
    """
    return any(pathlib.Path(root).rglob(stem + '.*'))


# Print the index and stem of every entry that really has a file on disk.
for i, f in enumerate(rm):
    if has_matching_file('out/RGB/512/5', f):
        print(i, f)

0 INPA0248526
1 INPA0248523
2 INPA0248528
3 NY01421575_01
4 HUFSJ001689_v00
5 HUFSJ001133_v00
6 HUFSJ002198_v00
7 HUFSJ003255_v00
8 HVASF000487_v01
9 INPA0019084_nd
10 INPA0022379_nd
11 INPA0032742_nd
12 INPA0023115
13 NL-U1484137
14 INPA0012286
15 INPA0146998


In [46]:
# For every barcode, keep the first image stem that contains it.
# NOTE(review): `result` is not assigned in any cell shown here, and q[1] is
# iterated as if it were a collection of barcodes -- confirm both.
ff = []
for q in result:
    for barcode in q[1]:
        aa = [stem for stem in li if barcode in stem]
        if aa:
            ff.append(aa[0])
len(ff)

10583

In [40]:

import numpy as np

# Collapse the matched image stems collected in `ff` into their sorted unique values.
a = np.unique(ff)

array(['ALCB002267', 'ALCB002268', 'ALCB002269', ..., 'VIES042095',
       'VIES044708', 'VIES044720'], dtype='<U15')

In [26]:
from images import copy_all_images
from images import separate_images_per_threshold

# Use the state-only selection as the set of records to export.
query = query_only_state

# One pass per (colour mode, image size) combination.
for color, image_size in [(c, s) for c in ['RGB'] for s in ['512']]:
    path_fotos = '/home/xandao/Documentos/%s/%s/w_pred_mask' % (color, image_size)
    list_images = list(pathlib.Path(path_fotos).glob('*'))
    dst = 'out/%s/%s' % (color, image_size)
    copy_all_images(dst, list_images, query)
    separate_images_per_threshold(dst)

## regiões

In [None]:
# Count the matching records for each region and accumulate grand totals.
# The loop previously ended with a `break`, so only the first region ('Norte')
# was processed and the printed "totals" were really that single region's counts.
total = 0
total_locality = 0
for regiao in ['Norte', 'Nordeste', 'Centro-Oeste', 'Sudeste', 'Sul']:
    # County rows restricted to the current region.
    county = session.query(County)\
        .filter(County.regiao==regiao)\
        .distinct()\
        .all()

    list_uf = [unaccent(sa.func.lower(c.uf)) for c in county]
    list_state = [unaccent(sa.func.lower(c.state)) for c in county]
    list_county = [unaccent(sa.func.lower(c.county)) for c in county]

    # '%<state>%' patterns for the states of this region.
    query_lower_unaccent_like = session.query(sa.func.concat('%', unaccent(sa.func.lower(County.state)), '%'))\
        .filter(County.regiao==regiao)\
        .distinct()\
        .all()

    list_state_like = [q[0] for q in query_lower_unaccent_like]
    columns = [DataTrustedIdentifier.specific_epithet, DataTrustedIdentifier.barcode, DataTrustedIdentifier.country, DataTrustedIdentifier.state_province, DataTrustedIdentifier.county]

    # Records matched through state_province only.
    query_only_state = session.query(*columns)\
        .filter(and_(DataTrustedIdentifier.country_trusted =='Brasil',
                    DataTrustedIdentifier.specific_epithet.is_not(None),
                    state_province_in_list_uf_or_list_state(list_state, list_uf)))\
        .distinct()\
        .all()

    # Records matched through state_province or through the locality helper.
    query_has_state_in_locality = session.query(*columns)\
        .filter(and_(DataTrustedIdentifier.country_trusted=='Brasil',
                     DataTrustedIdentifier.specific_epithet.is_not(None),
                     or_(state_province_in_list_uf_or_list_state(list_state, list_uf),
                         has_state_in_locality(list_state_like)
                         )))\
        .distinct()\
        .all()

    print('quantidade de registros usando somente %s, na regiao %s: %d' % (text_bold('state_province'), text_bold(regiao), len(query_only_state)))
    print('quantidade de registros usando somente %s, na regiao %s: %d' % (text_bold('state_province e locality'), text_bold(regiao), len(query_has_state_in_locality)))
    total += len(query_only_state)
    total_locality += len(query_has_state_in_locality)

print('total sem %s: %d' % (text_bold('locality'), total))
print('total comm %s: %d' % (text_bold('locality'), total_locality))

In [None]:
# Export the images of each region into its own output directory.
# copy_all_images / separate_images_per_threshold are imported in an earlier cell.
for regiao in ['Norte', 'Nordeste', 'Centro-Oeste', 'Sudeste', 'Sul']:
    # County rows restricted to the current region.
    county = session.query(County)\
        .filter(County.regiao==regiao)\
        .distinct()\
        .all()

    list_uf = [unaccent(sa.func.lower(c.uf)) for c in county]
    list_state = [unaccent(sa.func.lower(c.state)) for c in county]
    list_county = [unaccent(sa.func.lower(c.county)) for c in county]

    # '%<state>%' patterns for the states of this region.
    query_lower_unaccent_like = session.query(sa.func.concat('%', unaccent(sa.func.lower(County.state)), '%'))\
        .filter(County.regiao==regiao)\
        .distinct()\
        .all()

    list_state_like = [q[0] for q in query_lower_unaccent_like]
    columns = [DataTrustedIdentifier.specific_epithet, DataTrustedIdentifier.barcode, DataTrustedIdentifier.country, DataTrustedIdentifier.state_province, DataTrustedIdentifier.county]

    # Records of this region, matched through state_province.
    query_only_state = session.query(*columns)\
        .filter(and_(DataTrustedIdentifier.country_trusted =='Brasil',
                    DataTrustedIdentifier.specific_epithet.is_not(None),
                    state_province_in_list_uf_or_list_state(list_state, list_uf)))\
        .distinct()\
        .all()

    for color in ['RGB']:
        for image_size in ['256']:
            path_fotos = '/home/xandao/%s/%s/w_pred_mask' % (color, image_size)
            list_images = [file for file in pathlib.Path(path_fotos).glob('*.jpeg')]
            dst = 'out2/%s/%s/%s' % (color, image_size, regiao)
            # Pass the per-region selection computed above. The previous code
            # passed the global `query` left over from an earlier cell, so
            # every region directory would have received the same records.
            copy_all_images(dst, list_images, query_only_state)
            separate_images_per_threshold(dst)