In [1]:
%load_ext autoreload
%autoreload 2

import duckdb
import os
import pandas as pd
import pathlib

from ibge import *

In [3]:
# Open the DuckDB database file that every query in this notebook runs against.
db_path = "plants.db"
conn = duckdb.connect(db_path)

In [4]:
# List the tables currently present in the database as a DataFrame.
show_tables_sql = "SHOW TABLES"
conn.execute(show_tables_sql).df()

Unnamed: 0,name
0,flora_especies_ameacadas
1,ibge
2,reflora
3,sibbr


### Removo algumas colunas e mantenho apenas registros do Brasil e da família Piperaceae

In [5]:
# Build the working table: a subset of `sibbr` columns restricted to Piperaceae
# records from Brazil.
#
# CREATE OR REPLACE rebuilds the table in a single statement, so re-running the
# cell cannot leave the database without `sibbr_piperaceae` the way a separate
# DROP followed by a failing CREATE would.
#
# Rows whose `county` holds 'Brasil' are kept as well: some records carry the
# country name in the wrong column, and a later cell moves it to `country`.
conn.execute("""
    CREATE OR REPLACE TABLE sibbr_piperaceae AS
    SELECT s.recordID, s.DatasetID,
            s.decimalLatitude, s.country,
            s.county, s.stateProvince,
            s.decimalLongitude,
            s.minimumElevationInMeters, s.maximumElevationInMeters,
            s.year, s.day, s.month,
            s.scientificName, s.taxonRank,
            s.kingdom, s.phylum, s.Class, s.order, s.family, s.genus, s.Species, s.Subspecies,
            s.institutionCode
    FROM sibbr s
    WHERE (s.country IN ('Brazil', 'Brasil') OR s.county = 'Brasil')
      AND s.family = 'Piperaceae'
""")

<_duckdb.DuckDBPyConnection at 0x7fcedd894f70>

In [6]:
# Report how many rows ended up in the filtered table.
count_sql = "SELECT COUNT(*) FROM sibbr_piperaceae"
result = conn.execute(count_sql).fetchone()  # one-row tuple: (count,)
print(f"Registros carregados na tabela: {result[0]:,}")

Registros carregados na tabela: 32,138


In [7]:
# Peek at a handful of rows to sanity-check the selected columns.
preview_sql = "SELECT * FROM sibbr_piperaceae as s LIMIT 5"
conn.execute(preview_sql).df()

Unnamed: 0,recordID,DatasetID,decimalLatitude,country,county,stateProvince,decimalLongitude,minimumElevationInMeters,maximumElevationInMeters,year,...,taxonRank,kingdom,phylum,Class,order,family,genus,Species,Subspecies,institutionCode
0,fff8a0ca-032d-4887-bb99-a91470ca6faf,dr1172,-14.771874,Brasil,,Bahia,-39.227366,,,1981.0,...,species,Plantae,Tracheophyta,Magnoliopsida,Piperales,Piperaceae,Pothomorphe,Pothomorphe umbellata,,GBIF
1,fff7094d-b956-4b5e-9018-80c891e1651a,dr839,,Brasil,,,,,,1998.0,...,species,Plantae,Tracheophyta,Magnoliopsida,Piperales,Piperaceae,Piper,Piper mollicomum,,JBRJ
2,fffaafb0-782f-4d79-b549-03fdab995ef5,dr839,,Brasil,,,,,,1995.0,...,species,Plantae,Tracheophyta,Magnoliopsida,Piperales,Piperaceae,Piper,Piper mollicomum,,JBRJ
3,fffa47ce-ede4-4310-8742-e1b957c759b4,dr1172,-15.15,Brasil,,Bahia,-39.083333,,,2000.0,...,species,Plantae,Tracheophyta,Magnoliopsida,Piperales,Piperaceae,Piper,Piper aduncum,,GBIF
4,fff984db-c42a-494a-8248-ef162f06f7ea,dr839,-21.647778,Brasil,,Minas Gerais,-43.873611,,,2022.0,...,species,Plantae,Tracheophyta,Magnoliopsida,Piperales,Piperaceae,Piper,Piper lhotzkyanum,,JBRJ


### Variações de país

In [8]:
# List the distinct spellings of the country name that start with "B".
country_variants_sql = "SELECT DISTINCT(s.country) FROM sibbr_piperaceae as s WHERE s.country LIKE 'B%'"
conn.execute(country_variants_sql).df()

Unnamed: 0,country
0,Brasil
1,Brazil


In [9]:
# Fold the 'Brasil' spelling into 'Brazil', then list the remaining "B..."
# country values to confirm only one spelling is left.
normalize_country_sql = "UPDATE sibbr_piperaceae SET country='Brazil' WHERE country='Brasil';"
remaining_variants_sql = "SELECT DISTINCT(s.country) FROM sibbr_piperaceae as s WHERE s.country LIKE 'B%'"
conn.execute(normalize_country_sql)
conn.execute(remaining_variants_sql).df()

Unnamed: 0,country
0,Brazil


## Variações de estado
### Padronizo os nomes dos estados com a grafia acentuada do IBGE

In [29]:
# Sample a few distinct state names as they appear before normalisation.
state_sample_sql = "SELECT DISTINCT(s.stateProvince) FROM sibbr_piperaceae as s LIMIT 5"
conn.execute(state_sample_sql).df()

Unnamed: 0,stateProvince
0,Alagoas
1,Paraiba
2,Pernambuco
3,
4,Para


In [31]:
import unicodedata 

def normalize_str(s):
    """Return ``s`` lower-cased with its accent marks stripped.

    The text is decomposed with Unicode NFD so that each accented letter
    becomes a base letter followed by combining marks; characters in the
    "Mn" (nonspacing mark) category are then dropped and the remainder is
    lower-cased.

    Args:
        s: The string to normalise, or ``None``.

    Returns:
        The normalised string, or ``None`` when ``s`` is ``None``.
    """
    if s is None:
        return None
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars).lower()

# Expose normalize_str to SQL under the name remove_accent().
# DuckDB rejects registering a second Python function under a name that is
# already registered on the connection, so drop any earlier registration first;
# this keeps the cell re-runnable without reopening the connection.
try:
    conn.remove_function("remove_accent")
except duckdb.Error:
    # Nothing registered under that name yet (first run on this connection).
    pass
conn.create_function("remove_accent", normalize_str, return_type="VARCHAR")

<_duckdb.DuckDBPyConnection at 0x7fcedd894f70>

In [40]:
# Replace each state name with the spelling stored in `ibge.nome_estado`,
# matching the two sides on their accent-free, lower-cased form.
#
# The join goes against the DISTINCT state names rather than the raw `ibge`
# table: if `ibge` repeats a state name across many rows (presumably one row
# per municipality - confirm against its schema), a plain UPDATE ... FROM would
# give every target row several candidate matches and pick one arbitrarily.
conn.execute("""UPDATE sibbr_piperaceae as s
    SET stateProvince = e.nome_estado
    FROM (SELECT DISTINCT nome_estado FROM ibge) AS e
    WHERE remove_accent(lower(s.stateProvince)) = remove_accent(lower(e.nome_estado))
""")
# Sample a few distinct state names after the update.
conn.execute("SELECT DISTINCT(s.stateProvince) FROM sibbr_piperaceae as s LIMIT 5").df()

Unnamed: 0,stateProvince
0,São Paulo
1,Acre
2,Sergipe
3,Maranhão
4,Piauí


## Variações de cidade
### Movo "Brasil" da coluna `county` para a coluna correta (`country`)

In [41]:
# Show records where the country name was entered in the `county` column.
misplaced_country_sql = "SELECT s.county, s.stateProvince, s.country FROM sibbr_piperaceae as s WHERE s.county LIKE 'Brasil' LIMIT 5"
conn.execute(misplaced_country_sql).df()

Unnamed: 0,county,stateProvince,country
0,Brasil,São Paulo,Brazil
1,Brasil,Minas Gerais,Brazil


In [42]:
# Move the misplaced value: set `country` to 'Brazil' and clear `county` on the
# rows where `county` held 'Brasil', then display a few Brazilian rows.
fix_county_sql = "UPDATE sibbr_piperaceae SET country='Brazil', county=NULL WHERE county='Brasil';"
brazil_rows_sql = "SELECT s.county, s.stateProvince, s.country FROM sibbr_piperaceae as s WHERE s.country LIKE 'Brazil' LIMIT 5"
conn.execute(fix_county_sql)
conn.execute(brazil_rows_sql).df()

Unnamed: 0,county,stateProvince,country
0,,Bahia,Brazil
1,,,Brazil
2,,,Brazil
3,,Bahia,Brazil
4,,Minas Gerais,Brazil
