In [17]:
import numpy
import os
import pandas
import re
import sqlalchemy.ext.declarative
import sqlalchemy.orm
import sqlalchemy.orm.decl_api
import time

from api import *
from file import create_outfile
from tables import *
from unaccent import unaccent

# cfg

In [18]:
cfg = {
    "user": f"{os.environ['POSTGRE_USER']}",
    "password": f"{os.environ['POSTGRE_PASSWORD']}",
    "port": "5432",
    "database": "herbario"
}
filename = "original.csv"
filename_george = "dados-george.csv"


def connect(cfg):
    list_hosts = ["localhost", "192.168.0.160"]
    for host in list_hosts:
        try:
            engine = sqlalchemy.create_engine(f"postgresql+psycopg2://{cfg['user']}:{cfg['password']}@{host}:{cfg['port']}/{cfg['database']}", echo=True, pool_pre_ping=True)
            Session = sqlalchemy.orm.sessionmaker(bind=engine)
            Session.configure(bind=engine)
            session = Session()
            if engine.connect():
                return engine, session
        except Exception as e:
            print(f"problems with host {host}")


engine, session = connect(cfg)

problems with host localhost
2022-07-22 14:32:48,765 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2022-07-22 14:32:48,781 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-07-22 14:32:48,781 INFO sqlalchemy.engine.Engine select current_schema()
2022-07-22 14:32:48,781 INFO sqlalchemy.engine.Engine [raw sql] {}
2022-07-22 14:32:48,781 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2022-07-22 14:32:48,781 INFO sqlalchemy.engine.Engine [raw sql] {}


# Load data

In [19]:
dataframe = pandas.read_csv("original.csv", sep=";", low_memory=False, skipinitialspace=True)
dataframe_george = pandas.read_csv(filename_george, sep=";", low_memory=False, skipinitialspace=True)

# database

In [20]:
def make_operation(session):
    try:
        session.commit()
        session.flush()
    except Exception as e:
        session.rollback()
        print(e)
        raise
    finally:
        session.close()

def create_table_if_not_exists(table_name):
    if not sqlalchemy.inspect(engine).has_table(table_name, schema=cfg["database"]):
        Base.metadata.create_all(engine)

def create_datasp(info):
    return DataSP(seq=info["seq"],
                  modified=info["modified"], institution_code=info["institutionCode"],
                  collection_code=info["collectionCode"], catalog_number=info["catalogNumber"],
                  basis_of_record=info["basisOfRecord"], kingdom=info["kingdom"], phylum=info["phylum"],
                  classe=info["class"], order=info["order"], family=info["family"],
                  genus=info["genus"],
                  specific_epithet=info["specificEpithet"],
                  infraspecific_epithet=info["infraspecificEpithet"],
                  scientific_name=info["scientificName"],
                  scientific_name_authorship=info["scientificNameAuthorship"],
                  identified_by=info["identifiedBy"], year_identified=info["yearIdentified"],
                  month_identified=info["monthIdentified"], day_identified=info["dayIdentified"],
                  type_status=info["typeStatus"],
                  recorded_by=info["recordedBy"], record_number=info["recordNumber"],
                  field_number=info["fieldNumber"], year=info["year"], month=info["month"],
                  day=info["day"], event_time=info["eventTime"],
                  continent_ocean=info["continentOcean"], country=info["country"],
                  state_province=info["stateProvince"], county=info["county"], locality=info["locality"],
                  decimal_longitude=info["decimalLongitude"],
                  decimal_latitude=info["decimalLatitude"], verbatim_longitude=info["verbatimLongitude"],
                  verbatim_latitude=info["verbatimLatitude"],
                  coordinate_precision=info["coordinatePrecision"],
                  bounding_box=info["boundingBox"],
                  minimum_elevation_in_meters=info["minimumElevationInMeters"],
                  maximum_elevation_in_meters=info["maximumElevationInMeters"],
                  minimum_depth_in_meters=info["minimumDepthInMeters"],
                  maximum_depth_in_meters=info["maximumDepthInMeters"], sex=info["sex"],
                  preparation_type=info["preparationType"],
                  individual_count=info["individualCount"],
                  previous_catalog_number=info["previousCatalogNumber"],
                  relationship_type=info["relationshipType"],
                  related_catalog_item=info["relatedCatalogItem"],
                  occurrence_remarks=info["occurrenceRemarks"], barcode=info["barcode"],
                  imagecode=info["imagecode"], geo_flag=info["geoFlag"])

def create_county(json):
    uf, uf_name = get_uf(json)
    return County(id=get_id(json), county=get_county_name(json), uf=uf, uf_name=uf_name)

In [21]:
def get_key(json, key):
    if key in json:
        return json[key]
    raise KeyError(f"key {key} not found")


def get_id(json):
    if "id" in json:
        return json["id"]
    raise KeyError(f"key id not found")


def get_county_name(json):
    if "nome" in json:
        return json["nome"]
    raise KeyError(f"key nome not found")


def get_uf(json):
    if "microrregiao" in json:
        if "mesorregiao" in json["microrregiao"]:
            if "UF" in json["microrregiao"]["mesorregiao"]:
                return json["microrregiao"]["mesorregiao"]["UF"]["sigla"], json["microrregiao"]["mesorregiao"]["UF"]["nome"]
            raise KeyError("key UF not found")
        raise KeyError("key mesorregiao not found")
    raise KeyError("key microrregiao not found")

In [22]:
create_table_if_not_exists("county")
create_table_if_not_exists("data")

2022-07-22 14:32:49,394 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where n.nspname=%(schema)s and relname=%(name)s
2022-07-22 14:32:49,394 INFO sqlalchemy.engine.Engine [generated in 0.00106s] {'schema': 'herbario', 'name': 'county'}
2022-07-22 14:32:49,394 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-07-22 14:32:49,410 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
2022-07-22 14:32:49,410 INFO sqlalchemy.engine.Engine [generated in 0.00092s] {'name': 'data'}
2022-07-22 14:32:49,410 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
2022-07-22 14:32:49,410 INFO sqlalchemy.engine.Engine [cached since 0.006581s ago] {'name': 'county'}
2022-07-22 14:32:49,410 INFO sqlalchemy.engine.E

In [23]:
if session.query(County).count() == 0:
    response = get_municipies()

    for i, county in enumerate(response.json()):
        session.add(create_county(county))
        make_operation(session)


if session.query(DataSP).count() == 0:

    for row in dataframe.iterrows():
        session.add(create_datasp(row[1]))
        make_operation(session)


if session.query(DataSP).filter(DataSP.george == True).count() == 0:
    for row in dataframe.iterrows():
        if row[1]["GEORGE"].lower() == "sim":
            session.query(DataSP).filter(DataSP.seq == row[1]["seq"]).update({"george": True}, synchronize_session=False)
            make_operation(session)

2022-07-22 14:32:49,457 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-07-22 14:32:49,457 INFO sqlalchemy.engine.Engine SELECT count(*) AS count_1 
FROM (SELECT county.id AS county_id, county.county AS county_county, county.uf AS county_uf, county.uf_name AS county_uf_name 
FROM county) AS anon_1
2022-07-22 14:32:49,457 INFO sqlalchemy.engine.Engine [generated in 0.00105s] {}
2022-07-22 14:32:49,472 INFO sqlalchemy.engine.Engine SELECT count(*) AS count_1 
FROM (SELECT data.seq AS data_seq, data.modified AS data_modified, data.institution_code AS data_institution_code, data.collection_code AS data_collection_code, data.catalog_number AS data_catalog_number, data.basis_of_record AS data_basis_of_record, data.kingdom AS data_kingdom, data.phylum AS data_phylum, data.classe AS data_classe, data."order" AS data_order, data.family AS data_family, data.genus AS data_genus, data.specific_epithet AS data_specific_epithet, data.infraspecific_epithet AS data_infraspecific_epithet, data.scie

# Preprocess (dataframe)

In [24]:
def convert_header_to_snake_case(dataframe):
    return  {column_name: re.sub(r'(?<!^)(?=[A-Z])', '_', column_name).lower() for column_name in get_columns_dataframe(dataframe)}


def change_header(dataframe):
    dataframe.rename(columns=convert_header_to_snake_case(dataframe), inplace=True)


def get_columns_numeric(dataframe):
    list_of_columns_numeric = list([])
    for columns_dataframe in get_columns_dataframe(dataframe):
        for columns_table in get_columns_table(DataSP):
            if check_if_column_is_numeric(columns_dataframe, columns_table):
                list_of_columns_numeric.append(columns_dataframe)
    return list_of_columns_numeric


def get_columns_table(table):
    return table.__table__.columns


def get_columns_dataframe(dataframe):
    return list([*dataframe.columns])


def check_if_column_is_numeric(columns_dataframe, columns_table):
    return str(columns_dataframe) in str(columns_table) and ("int" in str(columns_table.type).lower() or "float" in str(columns_table.type).lower())


def replace_nan_to_null(dataframe):
    return dataframe.replace({numpy.nan: None})


def replace_values_not_numeric(dataframe):
    for column in list([*get_columns_numeric(dataframe)]):
        dataframe[column] = pandas.to_numeric(getattr(dataframe, column), errors='coerce').fillna(-1)
    return dataframe


def preprocess(dataframe):
    return replace_nan_to_null(replace_values_not_numeric(dataframe))

In [25]:
# data_piperaceae = session.query(DataSP).all()
# data_uf = session.query(sqlalchemy.func.lower(unaccent(County.uf))).distinct().all()
# data_uf_name = session.query(sqlalchemy.func.lower(unaccent(County.uf_name))).distinct().all()
# data_county = session.query(unaccent(County.county)).all()

data_piperaceae = session.query(DataSP).limit(5).all()
data_uf = session.query(sqlalchemy.func.lower(unaccent(County.uf))).distinct().limit(5).all()
data_uf_name = session.query(sqlalchemy.func.lower(unaccent(County.uf_name))).distinct().limit(5).all()
data_county = session.query(unaccent(County.county)).limit(5).all()

2022-07-22 14:32:49,542 INFO sqlalchemy.engine.Engine SELECT data.seq AS data_seq, data.modified AS data_modified, data.institution_code AS data_institution_code, data.collection_code AS data_collection_code, data.catalog_number AS data_catalog_number, data.basis_of_record AS data_basis_of_record, data.kingdom AS data_kingdom, data.phylum AS data_phylum, data.classe AS data_classe, data."order" AS data_order, data.family AS data_family, data.genus AS data_genus, data.specific_epithet AS data_specific_epithet, data.infraspecific_epithet AS data_infraspecific_epithet, data.scientific_name AS data_scientific_name, data.scientific_name_authorship AS data_scientific_name_authorship, data.identified_by AS data_identified_by, data.year_identified AS data_year_identified, data.month_identified AS data_month_identified, data.day_identified AS data_day_identified, data.type_status AS data_type_status, data.recorded_by AS data_recorded_by, data.record_number AS data_record_number, data.field_numb

In [26]:
def remove_set(data):
    return list([s for s, in data])

data_uf = remove_set(data_uf)
data_uf_name = remove_set(data_uf_name)
data_county = remove_set(data_county)
data_country = ["brazil", "brasil"]
list_of_all_data = list([{"type": "country", "data": data_country}, {"type": "state", "data": data_uf_name}, {"type": "city", "data": data_county}])

In [27]:
def column_is_string_or_varchar_or_text(column):
    return "string" in str(column.type).lower() or "varchar" in str(column.type).lower() or "text" in str(column.type).lower()

import sys
# de, da, do, dos
def remove_white_spaces(string):
    return remove_word_started_lowercase(string).split(" ")


def remove_hyphen(string):
    return remove_word_started_lowercase(string).split("-")


def remove_word_started_lowercase(string):
    return re.sub(r"\b[a-z]+\s*", "", string)

list_seq = list([])
list_of_columns_valid = list([column.key for column in get_columns_table(DataSP) if column_is_string_or_varchar_or_text(column)])

start = time.process_time()

for data in list_of_all_data:
    for d in data["data"]:
        for columns in list_of_columns_valid[len(list_of_columns_valid)-3:]:
            for data_formatted in (remove_white_spaces(d), remove_hyphen(d)):
                if len(data_formatted) > 0:
                    q = session.query(DataSP.seq).filter(sqlalchemy.and_(*[sqlalchemy.func.lower(unaccent(getattr(DataSP, columns))).ilike(f"%{s.lower()}%") for s in data_formatted])).all()
                else:
                    q = session.query(DataSP.seq).filter(sqlalchemy.func.lower(unaccent(getattr(DataSP, columns))).ilike(f"%{d}%")).all()
                if len(q) > 0:
                    x = {"seq": remove_set(q), "column": columns, "new_column": f"my_{data['type']}", "value_searched": d}
                    list_seq.append(x)
                    for y in x["seq"]:
                        session.query(DataSP).filter(DataSP.seq == y).update(values=dict({x["new_column"]: x["value_searched"]}), synchronize_session=False)
                        make_operation(session)
end = time.process_time()
elapsed_time = end-start
print(f"elapsed_time: {elapsed_time} seconds")

2022-07-22 14:32:49,611 INFO sqlalchemy.engine.Engine SELECT data.seq AS data_seq 
FROM data 
WHERE lower(unaccent(data.my_country)) ILIKE %(lower_1)s
2022-07-22 14:32:49,611 INFO sqlalchemy.engine.Engine [no key 0.00107s] {'lower_1': '%%'}
2022-07-22 14:32:49,642 INFO sqlalchemy.engine.Engine SELECT data.seq AS data_seq 
FROM data 
WHERE lower(unaccent(data.my_country)) ILIKE %(lower_1)s
2022-07-22 14:32:49,642 INFO sqlalchemy.engine.Engine [no key 0.00139s] {'lower_1': '%%'}
2022-07-22 14:32:49,657 INFO sqlalchemy.engine.Engine SELECT data.seq AS data_seq 
FROM data 
WHERE lower(unaccent(data.my_state)) ILIKE %(lower_1)s
2022-07-22 14:32:49,657 INFO sqlalchemy.engine.Engine [no key 0.00101s] {'lower_1': '%%'}
2022-07-22 14:32:49,673 INFO sqlalchemy.engine.Engine SELECT data.seq AS data_seq 
FROM data 
WHERE lower(unaccent(data.my_state)) ILIKE %(lower_1)s
2022-07-22 14:32:49,673 INFO sqlalchemy.engine.Engine [no key 0.00077s] {'lower_1': '%%'}
2022-07-22 14:32:49,695 INFO sqlalchemy.


KeyboardInterrupt



In [None]:
create_outfile(list_seq)

In [None]:
session.close()
engine.dispose()