In [82]:
import numpy
import os
import pandas
import re
import sqlalchemy
import sqlalchemy.ext.declarative
import sqlalchemy.orm
import sqlalchemy.orm.decl_api

# cfg

In [83]:
cfg = {
    "user": f"{os.environ['POSTGRE_USER']}",
    "password": f"{os.environ['POSTGRE_PASSWORD']}",
    "host": "192.168.1.6",
    "port": "5432",
    "database": "herbario"
}
filename = "original.csv"
filename_george = "dados-george.csv"
engine = sqlalchemy.create_engine(f"postgresql+psycopg2://{cfg['user']}:{cfg['password']}@{cfg['host']}:{cfg['port']}/{cfg['database']}", echo=True, pool_pre_ping=True)
Session = sqlalchemy.orm.sessionmaker(bind=engine)
Session.configure(bind=engine)
session = Session()

# Load data

In [84]:
dataframe = pandas.read_csv(filename, sep=";", low_memory=False, skipinitialspace=True)
dataframe_george = pandas.read_csv(filename_george, sep=";", low_memory=False, skipinitialspace=True)

# "Tables"

In [85]:
Base = sqlalchemy.ext.declarative.declarative_base()

class DataSP(Base):
    __tablename__ = "data"

    seq = sqlalchemy.Column(sqlalchemy.BigInteger, primary_key=True)
    modified = sqlalchemy.Column(sqlalchemy.DateTime, nullable=True)
    institution_code = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    collection_code = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    catalog_number = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    basis_of_record = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    kingdom = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    phylum = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    classe = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    order = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    family = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    genus = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    specific_epithet = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    infraspecific_epithet = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    scientific_name = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    scientific_name_authorship = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    identified_by = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    year_identified = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    month_identified = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    day_identified = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    type_status = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    recorded_by = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    record_number = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    field_number = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    year = sqlalchemy.Column(sqlalchemy.BigInteger, nullable=True)
    month = sqlalchemy.Column(sqlalchemy.BigInteger, nullable=True)
    day = sqlalchemy.Column(sqlalchemy.BigInteger, nullable=True)
    event_time = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    continent_ocean = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    country = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    state_province = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    county = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    locality = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    decimal_longitude = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    decimal_latitude = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    verbatim_longitude = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    verbatim_latitude = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    coordinate_precision = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    bounding_box = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    minimum_elevation_in_meters = sqlalchemy.Column(sqlalchemy.BigInteger, nullable=True)
    maximum_elevation_in_meters = sqlalchemy.Column(sqlalchemy.BigInteger, nullable=True)
    minimum_depth_in_meters = sqlalchemy.Column(sqlalchemy.BigInteger, nullable=True)
    maximum_depth_in_meters = sqlalchemy.Column(sqlalchemy.BigInteger, nullable=True)
    sex = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    preparation_type = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    individual_count = sqlalchemy.Column(sqlalchemy.BigInteger, nullable=True)
    previous_catalog_number = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    relationship_type = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    related_catalog_item = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    occurrence_remarks = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    barcode = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    imagecode = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    geo_flag = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    george = sqlalchemy.Column(sqlalchemy.Boolean, nullable=True)
    my_country = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    my_state = sqlalchemy.Column(sqlalchemy.String, nullable=True)
    my_city = sqlalchemy.Column(sqlalchemy.String, nullable=True)

    def __repr__(self):
        return "DataSP(seq=%s, modified=%s, institution_code=%s, collection_code=%s, catalog_number=%s, " \
               "basis_of_record=%s, kingdom=%s, phylum=%s, classe=%s, order=%s, family=%s, genus=%s, " \
               "specific_epithet=%s, infraspecific_epithet=%s, scientific_name=%s, scientific_name_authorship=%s, " \
               "identified_by=%s, year_identified=%s, month_identified=%s, day_identified=%s, type_status=%s, " \
               "recorded_by=%s, record_number=%s, field_number=%s, year=%s, month=%s, day=%s, event_time=%s, " \
               "continent_ocean=%s, country=%s, state_province=%s, county=%s, locality=%s, decimal_longitude=%s, " \
               "decimal_latitude=%s, verbatim_longitude=%s, verbatim_latitude=%s, coordinate_precision=%s, " \
               "bounding_box=%s, minimum_elevation_in_meters=%s, maximum_elevation_in_meters=%s, " \
               "minimum_depth_in_meters=%s, maximum_depth_in_meters=%s, sex=%s, preparation_type=%s, " \
               "individual_count=%s, previous_catalog_number=%s, relationship_type=%s, related_catalog_item=%s, " \
               "occurrence_remarks=%s, barcode=%s, imagecode=%s, geo_flag=%s) "

# database

In [97]:
def make_operation(session):
    try:
        # always commit changes!
        session.commit()
    except Exception as e:
        session.rollback()
        print(e)
        raise
    finally:
        session.close()

def create_table_if_not_exists(table_name):
    if not sqlalchemy.inspect(engine).has_table(table_name, schema=cfg["database"]):
        Base.metadata.create_all(engine)

def create_datasp(info):
    return DataSP(seq=info["seq"],
                  modified=info["modified"], institution_code=info["institutionCode"],
                  collection_code=info["collectionCode"], catalog_number=info["catalogNumber"],
                  basis_of_record=info["basisOfRecord"], kingdom=info["kingdom"], phylum=info["phylum"],
                  classe=info["class"], order=info["order"], family=info["family"],
                  genus=info["genus"],
                  specific_epithet=info["specificEpithet"],
                  infraspecific_epithet=info["infraspecificEpithet"],
                  scientific_name=info["scientificName"],
                  scientific_name_authorship=info["scientificNameAuthorship"],
                  identified_by=info["identifiedBy"], year_identified=info["yearIdentified"],
                  month_identified=info["monthIdentified"], day_identified=info["dayIdentified"],
                  type_status=info["typeStatus"],
                  recorded_by=info["recordedBy"], record_number=info["recordNumber"],
                  field_number=info["fieldNumber"], year=info["year"], month=info["month"],
                  day=info["day"], event_time=info["eventTime"],
                  continent_ocean=info["continentOcean"], country=info["country"],
                  state_province=info["stateProvince"], county=info["county"], locality=info["locality"],
                  decimal_longitude=info["decimalLongitude"],
                  decimal_latitude=info["decimalLatitude"], verbatim_longitude=info["verbatimLongitude"],
                  verbatim_latitude=info["verbatimLatitude"],
                  coordinate_precision=info["coordinatePrecision"],
                  bounding_box=info["boundingBox"],
                  minimum_elevation_in_meters=info["minimumElevationInMeters"],
                  maximum_elevation_in_meters=info["maximumElevationInMeters"],
                  minimum_depth_in_meters=info["minimumDepthInMeters"],
                  maximum_depth_in_meters=info["maximumDepthInMeters"], sex=info["sex"],
                  preparation_type=info["preparationType"],
                  individual_count=info["individualCount"],
                  previous_catalog_number=info["previousCatalogNumber"],
                  relationship_type=info["relationshipType"],
                  related_catalog_item=info["relatedCatalogItem"],
                  occurrence_remarks=info["occurrenceRemarks"], barcode=info["barcode"],
                  imagecode=info["imagecode"], geo_flag=info["geoFlag"])

def insert(dataframe):
    if len(dataframe.index) - 1 > session.query(DataSP).count():
        for row in dataframe.iterrows():
            session.add(create_datasp(row[1]))
            make_operation(session)

def update(dataframe):
    for row in dataframe.iterrows():
        if row[1]["GEORGE"].lower() == "sim":
            session.query(DataSP).filter(DataSP.seq == row[1]["seq"]).update({"george": True}, synchronize_session=False)
            make_operation(session)

# Preprocess

In [101]:
def convert_header_to_snake_case(dataframe):
    dict = {}
    for a in list([*dataframe.columns]):
        dict[a] = re.sub(r'(?<!^)(?=[A-Z])', '_', a).lower()
    dataframe.rename(columns=dict, inplace=True)

def get_columns_numeric(dataframe):
    l = []
    for a in list([*dataframe.columns]):
        for c in DataSP.__table__.columns:
            if str(a) in str(c) and ("int" in str(c.type).lower() or "float" in str(c.type).lower()):
                l.append(a)
    return l

def replace_nan_to_null(dataframe):
    return dataframe.replace({numpy.nan: None})

def replace_values_not_numeric(dataframe):
    for column in list([*get_columns_numeric(dataframe)]):
        dataframe[column] = pandas.to_numeric(dataframe.__getattr__(column), errors='coerce').fillna(-1)
    return dataframe

def preprocess(dataframe):
    return replace_nan_to_null(replace_values_not_numeric(dataframe))


insert(preprocess(dataframe))

2022-07-15 10:44:41,997 INFO sqlalchemy.engine.Engine INSERT INTO data (seq, modified, institution_code, collection_code, catalog_number, basis_of_record, kingdom, phylum, classe, "order", family, genus, specific_epithet, infraspecific_epithet, scientific_name, scientific_name_authorship, identified_by, year_identified, month_identified, day_identified, type_status, recorded_by, record_number, field_number, year, month, day, event_time, continent_ocean, country, state_province, county, locality, decimal_longitude, decimal_latitude, verbatim_longitude, verbatim_latitude, coordinate_precision, bounding_box, minimum_elevation_in_meters, maximum_elevation_in_meters, minimum_depth_in_meters, maximum_depth_in_meters, sex, preparation_type, individual_count, previous_catalog_number, relationship_type, related_catalog_item, occurrence_remarks, barcode, imagecode, geo_flag, george, my_country, my_state, my_city) VALUES (%(seq)s, %(modified)s, %(institution_code)s, %(collection_code)s, %(catalog


KeyboardInterrupt



2022-07-15 10:49:23,936 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-07-15 10:49:23,941 INFO sqlalchemy.engine.Engine SELECT count(*) AS count_1 
FROM (SELECT data.seq AS data_seq, data.modified AS data_modified, data.institution_code AS data_institution_code, data.collection_code AS data_collection_code, data.catalog_number AS data_catalog_number, data.basis_of_record AS data_basis_of_record, data.kingdom AS data_kingdom, data.phylum AS data_phylum, data.classe AS data_classe, data."order" AS data_order, data.family AS data_family, data.genus AS data_genus, data.specific_epithet AS data_specific_epithet, data.infraspecific_epithet AS data_infraspecific_epithet, data.scientific_name AS data_scientific_name, data.scientific_name_authorship AS data_scientific_name_authorship, data.identified_by AS data_identified_by, data.year_identified AS data_year_identified, data.month_identified AS data_month_identified, data.day_identified AS data_day_identified, data.type_status AS data_typ

In [100]:
def missing_integer(seq):
    return list([i for i in range(1, max(seq)) if i not in seq])

create_table_if_not_exists("data")


session.close()
engine.dispose()

2022-07-15 10:36:00,472 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where n.nspname=%(schema)s and relname=%(name)s
2022-07-15 10:36:00,474 INFO sqlalchemy.engine.Engine [generated in 0.00139s] {'schema': 'herbario', 'name': 'data'}
2022-07-15 10:36:00,474 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-07-15 10:36:00,482 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
2022-07-15 10:36:00,482 INFO sqlalchemy.engine.Engine [generated in 0.00071s] {'name': 'data'}
2022-07-15 10:36:00,484 INFO sqlalchemy.engine.Engine COMMIT
