In [1]:
%reload_ext autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import os
import pandas as pd
import sqlalchemy as sa
import sys

# Resolve 'main.ipynb' against the current working directory and put the
# directory one level above the notebook's folder on the import path
# (presumably where the `database` and `models` modules live - verify).
notebook_path = os.path.abspath('main.ipynb')
parent_directory = os.path.dirname(os.path.dirname(notebook_path))
sys.path.append(parent_directory)

import database as db
from models import County, DataTrustedIdentifier, TrustedIdentifier, DataSP, Image, create_data_trusted_identifier

## Database connection

In [3]:
# Open the database connection through the project's `database` helper; it
# hands back the SQLAlchemy engine and the session used by the cells below.
engine, session = db.connect()
# Stop the engine from echoing SQL statements for the rest of the notebook.
# The statements issued while connecting are still logged because echo is
# switched off only after `db.connect()` has returned.
engine.echo = False

2023-03-28 19:55:51,173 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2023-03-28 19:55:51,173 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-03-28 19:55:51,174 INFO sqlalchemy.engine.Engine select current_schema()
2023-03-28 19:55:51,175 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-03-28 19:55:51,176 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2023-03-28 19:55:51,176 INFO sqlalchemy.engine.Engine [raw sql] {}


## Replace mis-encoded characters

In [4]:
# Accented characters that appear mis-encoded in the data, paired with the
# broken sequence each one shows up as (its UTF-8 bytes read as Latin-1).
#
# The broken sequences are derived from the correct characters instead of being
# typed by hand: the broken form of 'í' is 'Ã' followed by an invisible soft
# hyphen (U+00AD), which is easily lost when the source is edited or copied.
# A bare 'Ã' is a prefix of every other broken sequence, so replacing it would
# corrupt the sequences that are processed after it.
# NOTE(review): if the stored data has already lost the soft hyphen, a bare 'Ã'
# fallback would have to be applied after all other replacements - confirm.
correct_characters = ['á', 'ú', 'í', 'ó', 'ñ', 'é']

list_unenconded_characters = {
    'error': [character.encode('utf-8').decode('latin-1') for character in correct_characters],
    'correct': correct_characters,
}

# Show the mapping as a table for a quick visual check.
pd.DataFrame(list_unenconded_characters)

Unnamed: 0,error,correct
0,Ã¡,á
1,Ãº,ú
2,Ã,í
3,Ã³,ó
4,Ã±,ñ
5,Ã©,é


In [5]:
# Repair the mis-encoded characters in the `state_province` and `county`
# columns of every DataTrustedIdentifier row.
#
# For each column the SQL `replace()` calls are nested in list order, which
# yields the same final value as applying them one statement at a time. Both
# columns are then rewritten by a single UPDATE inside a single transaction,
# instead of one full-table UPDATE and commit per (column, character) pair,
# so a failure can no longer leave the table half-repaired.
replacement_pairs = list(zip(list_unenconded_characters['error'],
                             list_unenconded_characters['correct']))

repaired_values = {}
for column in [DataTrustedIdentifier.state_province, DataTrustedIdentifier.county]:
    repaired_column = column
    for special_character_to_find, special_character_to_replace in replacement_pairs:
        repaired_column = sa.func.replace(repaired_column,
                                          special_character_to_find,
                                          special_character_to_replace)
    repaired_values[column] = repaired_column

# synchronize_session=False: the UPDATE runs purely in the database and
# objects already loaded in the session are not refreshed.
session.query(DataTrustedIdentifier) \
    .update(values=repaired_values, synchronize_session=False)
session.commit()

## Set `country_trusted` to "Brasil" where `country` is a spelling variation of Brazil

In [6]:
# Mark records as Brazilian when their `country` value is one of the known
# spellings of Brazil. The update runs only when no record has
# `country_trusted == 'Brasil'` yet, so re-running the cell does nothing.
count_of_brazil_in_country_trusted = session.query(DataTrustedIdentifier) \
    .filter(DataTrustedIdentifier.country_trusted == 'Brasil') \
    .count()

# Exact `country` values treated as Brazil.
# NOTE(review): 'Brasil/Bolivia' and 'Brasilia' are mapped to Brasil as well -
# confirm that this is intended.
list_variations_br = ['Brasil', 'BRASIL', 'Brasil/Bolivia', 'Brasilia', 'brazil', 'Brazil', 'BRazil', 'BRAZIL', '[Brésil]', 'Brésil']

if count_of_brazil_in_country_trusted == 0:
    # One filtered query serves both the count and the update.
    query_variations_brasil = session.query(DataTrustedIdentifier) \
        .filter(DataTrustedIdentifier.country.in_(list_variations_br))

    # Count in the database rather than loading every matching record into
    # memory just to take the length of the list.
    print('count of records with variations of Brazil: %d' % query_variations_brasil.count())

    query_variations_brasil \
        .update({'country_trusted': 'Brasil'}, synchronize_session=False)
    session.commit()

count of records with variations of Brazil: 11206


## Find records that match a Brazilian state and county, then update the `country_trusted` column

In [7]:
# Fetch the distinct County rows used to recognise Brazilian locations.
query = session.query(County).distinct().all()

# Project helper (not shown here); its three results are used below as filter
# criteria - presumably matches on state abbreviation, state name and county
# name (TODO confirm against `database.get_list_uf_state_county`).
(uf_unaccented_lower,
 state_unaccented_lower,
 county_unaccented_lower) = db.get_list_uf_state_county(query)

# A record qualifies when it has no trusted country yet, matches a state by
# either criterion, and matches a county. Criteria passed to `filter()`
# together are combined with AND.
matches_state = sa.or_(uf_unaccented_lower, state_unaccented_lower)
records_without_country = session.query(DataTrustedIdentifier).filter(
    DataTrustedIdentifier.country_trusted.is_(None),
    matches_state,
    county_unaccented_lower,
)
records_without_country.update({'country_trusted': 'Brasil'}, synchronize_session=False)
session.commit()

  session.query(DataTrustedIdentifier) \


In [8]:
# Report how many records are marked as Brazilian after both update steps.
count_of_brazil_in_country_trusted = (
    session.query(DataTrustedIdentifier)
    .filter_by(country_trusted='Brasil')
    .count()
)

print('count of Brasil in country trusted: %d' % count_of_brazil_in_country_trusted)

count of Brasil in country trusted: 12144


## Close connection

In [9]:
# Close the session first, then dispose of the engine it was created with.
session.close()
engine.dispose()