In [1]:
%reload_ext autoreload
%load_ext autoreload
%autoreload 2
import os
import sqlalchemy as sa
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath('main.ipynb'))))

from tables import County, DataTrustedIdentifier, TrustedIdentifier, DataSP, InfoImage
from database import connect, create_table_if_not_exists, table_is_empty, \
    insert_new_identifier_trusted, find_and_replace_broken_characters, \
    insert_new_data_trusted_identifier, update_country_trusted_based_original_field, has_brasil_in_country_trusted, update_country_trusted, get_all_identifiers_ilike
from sqlalchemy.orm import Session

from unaccent import unaccent

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## database connection

In [2]:
# connect() is the project helper imported from `database`; it hands back the
# engine together with an ORM session.
engine, session = connect()
session: Session  # annotation only: lets editors/type checkers know the session type

# Turn off SQL statement echoing for the remaining cells.
engine.echo = False

2023-03-17 23:38:48,026 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2023-03-17 23:38:48,027 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-03-17 23:38:48,028 INFO sqlalchemy.engine.Engine select current_schema()
2023-03-17 23:38:48,029 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-03-17 23:38:48,030 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2023-03-17 23:38:48,030 INFO sqlalchemy.engine.Engine [raw sql] {}


## full name of identifier

In [3]:
# Trusted identifiers: each entry pairs the person's full name with the name
# fragment used when searching for spelling variations of that person.
# 'searched_name' is kept as a list so that an identifier can carry several
# search fragments.
list_identifier_trusted = [
    {'full_name': complete_name, 'searched_name': [search_key]}
    for complete_name, search_key in (
        ('Aline Vieira de Melo Silva', 'Silva'),
        ('Carmen Lúcia Falcão Ichaso', 'Ichaso'),
        ('Daniele Monteiro Ferreira', 'Monteiro'),
        ('Daniel Ruschel', 'Ruschel'),
        ('Elsie Franklin Guimarães', 'Guimar'),
        ('Eric J Tepe', 'Tepe'),
        ('Erika Von Sohsten de Souza Medeiros', 'Medeiros'),
        ('George Azevedo de Queiroz', 'Queiroz'),
        ('Micheline Carvalho-Silva', 'Carvalho'),
        ('Ricardo de la Merced Callejas Posada', 'Callejas'),
        ('Truman George Yuncker', 'Yuncker'),
        ('William Trelease', 'Trelease'),
    )
]

def text_bold(string):
    """Return ``string`` wrapped in the ANSI escape codes for bold text.

    The value is prefixed with ESC[1m (bold on) and suffixed with ESC[0m
    (reset), so it renders bold in terminals and notebook outputs that
    interpret ANSI sequences.

    Args:
        string: the value to emphasise. Non-string values (e.g. a row count)
            are converted with ``str()``, so callers do not have to.

    Returns:
        str: the bold-wrapped text.
    """
    return '\033[1m' + str(string) + '\033[0m'

## trusted identifiers
### must be distinct on value_founded, because some identifications list more than one person

In [4]:
# Create and seed the table BEFORE querying it. The count below reads from
# TrustedIdentifier, so running it first (as the cell originally did) depends on
# the table already existing and, on the seeding run, reports the pre-seed count.
create_table_if_not_exists(engine, TrustedIdentifier)

if table_is_empty(session, TrustedIdentifier):
    # At this point list_identifier_trusted is still the list of
    # {'full_name', 'searched_name'} dicts defined in the previous cell.
    for identifier in list_identifier_trusted:
        for trusted_identifier in identifier['searched_name']:
            # Look up the name variations for this search fragment and store them.
            query = get_all_identifiers_ilike(trusted_identifier, session)
            insert_new_identifier_trusted(identifier, query, trusted_identifier, session)

# Distinct value_founded of the rows flagged as trusted (distinct because one
# identification can list more than one person).
identifiers_trusted = session.query(TrustedIdentifier.value_founded)\
    .filter(TrustedIdentifier.trusted)\
    .distinct()

print('count identifiers trusted: %s' % text_bold(str(identifiers_trusted.count())))

# NOTE: this rebinds list_identifier_trusted from a list of dicts to a plain list
# of name strings; the next cell relies on the new meaning for its IN (...) filter.
# Re-running this cell against an empty table therefore requires re-running the
# cell that defines the original list first.
list_identifier_trusted = [q[0] for q in identifiers_trusted]

count identifiers trusted: [1m187[0m


In [5]:
# DataSP rows whose identified_by is one of the trusted name variations
# collected in the previous cell.
query_diff_identifier = session.query(DataSP).filter(
    DataSP.identified_by.in_(list_identifier_trusted)
)

count_diff_identifier = query_diff_identifier.count()
print('count of records with variations in the name of the identifiers: %s' % text_bold(str(count_diff_identifier)))

# Same creation order as before: info_image first, then data_trusted_identifier.
for orm_table in (InfoImage, DataTrustedIdentifier):
    create_table_if_not_exists(engine, orm_table)

# Only populate on the first run, so re-running the cell does not insert again.
if table_is_empty(session, DataTrustedIdentifier):
    insert_new_data_trusted_identifier(session, query_diff_identifier)

count_data_trusted_identifier = session.query(DataTrustedIdentifier).count()
print('count of records in table data_trusted_identifier: %s' % text_bold(str(count_data_trusted_identifier)))

count of records with variations in the name of the identifiers: [1m13182[0m
create table: [1minfo_image[0m
count of records in table data_trusted_identifier: [1m13182[0m


## replace mis-encoded characters

In [6]:
# Accented characters that appear "broken" in the data because their UTF-8 bytes
# were decoded as Latin-1 (e.g. 'á' -> 'Ã¡', 'ó' -> 'Ã³').
#
# The broken forms are derived instead of typed by hand: the broken form of 'í'
# is 'Ã' followed by U+00AD (soft hyphen), which is invisible in most editors and
# easily lost. A bare 'Ã' pattern would also match the first character of the
# sequences handled after it ('Ã³', 'Ã±', 'Ã©') and any legitimate 'Ã'.
#
# Resulting pairs, in processing order:
#   'Ã¡'->'á', 'Ãº'->'ú', 'Ã'+U+00AD->'í', 'Ã³'->'ó', 'Ã±'->'ñ', 'Ã©'->'é'
list_special_characters = [
    {'find': accented.encode('utf-8').decode('latin-1'), 'replace': accented}
    for accented in 'áúíóñé'
]

# Apply every replacement to both location columns.
for attribute in [DataTrustedIdentifier.state_province, DataTrustedIdentifier.county]:
    for special_character in list_special_characters:
        find_and_replace_broken_characters(attribute, session, special_character)

## replace variations of the word Brazil

In [7]:
# Spellings found in the original `country` column that are treated as Brazil.
list_diff_br = ['Brasil', 'BRASIL', 'Brasil/Bolivia', 'Brasilia', 'brazil', 'Brazil', 'BRazil', 'BRAZIL', '[Brésil]', 'Brésil']

# NOTE(review): the exact condition checked by has_brasil_in_country_trusted is
# defined in database.py and not visible here — confirm it is meant to gate the update.
if has_brasil_in_country_trusted(session):
    records_with_diff_brasil = session.query(DataTrustedIdentifier)\
        .filter(DataTrustedIdentifier.country.in_(list_diff_br))\
        .all()
    print('quantidade de registros encontrados com as variacoes dos nome dos identificadores + %s: %d' % (text_bold('variacoes de Brasil'), len(records_with_diff_brasil)))
    update_country_trusted_based_original_field(list_diff_br, session)


# Records whose normalized country column now holds 'Brasil'.
records_with_brasil_in_country_trusted = session.query(DataTrustedIdentifier)\
        .filter(DataTrustedIdentifier.country_trusted == 'Brasil')\
        .distinct()\
        .all()
# The label names country_trusted (the column actually filtered above); it used to
# say county_trusted, which reads as a different column.
print('%s: %d' % (text_bold('count of records with Brazil in column country_trusted'), len(records_with_brasil_in_country_trusted)))

quantidade de registros encontrados com as variacoes dos nome dos identificadores + [1mvariacoes de Brasil[0m: 11206
[1mcount of records with Brazil in column county_trusted[0m: 11206


## insert BR in column country_trusted

In [9]:
# Every distinct row of the County reference table.
county = session.query(County).distinct().all()


def _unaccent_lower(value):
    """Wrap ``value`` in ``unaccent(lower(value))`` using the imported helpers."""
    return unaccent(sa.func.lower(value))


# One normalized entry per County row for each of the three attributes.
# NOTE(review): these are presumably SQL expressions rather than plain strings,
# since sa.func.lower builds a SQL function call — confirm against update_country_trusted.
list_uf = [_unaccent_lower(row.uf) for row in county]
list_state = [_unaccent_lower(row.state) for row in county]
list_county = [_unaccent_lower(row.county) for row in county]

update_country_trusted(list_county, list_state, list_uf, session)

  session.query(DataTrustedIdentifier)\


## close connection

In [8]:
# Close the ORM session first, then dispose of the engine it was using.
session.close()
engine.dispose()