In [97]:
%load_ext autoreload
%autoreload 2
import duckdb
import os
import pandas as pd
import pathlib

import splink

from ibge import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Dados do speciesLink

In [75]:
splink.download_specieslink()
print("total files: %d" % len(os.listdir("./parquet")))

total files: 520


In [98]:
# Initialize DuckDB connection
conn = duckdb.connect("splink")

local_file = "./parquet"

conn.execute("DROP TABLE IF EXISTS specieslink")
conn.execute(f"""
        CREATE TABLE specieslink AS
        SELECT * FROM read_parquet('{local_file}', union_by_name=True)
    """)

result = conn.execute("SELECT COUNT(*) FROM specieslink").fetchone()
print(f"Registros carregados na tabela: {result[0]:,}")

Registros carregados na tabela: 104,000


In [99]:
conn.execute("DESCRIBE specieslink").df()

Unnamed: 0,column_name,column_type,null,key,default,extra
0,barcode,VARCHAR,YES,,,
1,collectioncode,VARCHAR,YES,,,
2,scientificname,VARCHAR,YES,,,
3,kingdom,VARCHAR,YES,,,
4,family,VARCHAR,YES,,,
5,genus,VARCHAR,YES,,,
6,yearcollected,VARCHAR,YES,,,
7,monthcollected,VARCHAR,YES,,,
8,daycollected,VARCHAR,YES,,,
9,country,VARCHAR,YES,,,


In [100]:
conn.execute("SELECT * FROM specieslink LIMIT 5").df()

Unnamed: 0,barcode,collectioncode,scientificname,kingdom,family,genus,yearcollected,monthcollected,daycollected,country,...,infraspecificepithet,typestatus,relationshiptype,relatedcatalogitem,coordinateprecision,phylum,class,order,fieldnumber,individualcount
0,CONV005839,CONVOLVULACEAE_BR,Bonamia ferruginea,Plantae,Convolvulaceae,Bonamia,1974,7,29,Brazil,...,,,,,,,,,,
1,,CPAP,,Plantae,Poaceae,,1989,9,23,Brasil,...,,,,,,,,,,
2,CONV005840,CONVOLVULACEAE_BR,Bonamia ferruginea,Plantae,Convolvulaceae,Bonamia,1974,7,29,Brazil,...,,,,,,,,,,
3,,CPAP,,Plantae,Poaceae,,1991,4,11,Brasil,...,,,,,,,,,,
4,CONV005829,CONVOLVULACEAE_BR,Bonamia ferruginea,Plantae,Convolvulaceae,Bonamia,1975,8,4,Brazil,...,,,,,,,,,,


### Dados do IBGE

In [101]:
response = get_data_ibge()
data = [{"id": r["id"], "nome_cidade": r["nome"], "sigla": get_sigla(r), "nome_estado": get_nome_uf(r)} for r in response.json()]

df = pd.DataFrame.from_records(data)
conn.execute("DROP TABLE IF EXISTS ibge")
conn.execute("""
    CREATE TABLE ibge AS
    SELECT * FROM df
""")

result = conn.execute("SELECT COUNT(*) FROM ibge").fetchone()
print(f"Registros carregados na tabela: {result[0]:,}")

Registros carregados na tabela: 5,571


In [102]:
conn.execute("SELECT * FROM ibge LIMIT 5").df()

Unnamed: 0,id,nome_cidade,sigla,nome_estado
0,1100015,Alta Floresta D'Oeste,RO,Rondônia
1,1100023,Ariquemes,RO,Rondônia
2,1100031,Cabixi,RO,Rondônia
3,1100049,Cacoal,RO,Rondônia
4,1100056,Cerejeiras,RO,Rondônia


### Dados queimadas

In [96]:
queimadas = [f for f in pathlib.Path("./queimadas").rglob("*.csv")]

dfs = []
for q in queimadas:
    df = pd.read_csv(q)
    df = df[:-3] # remove as três últimas linhas
    df = df.iloc[:, :-1] # remove a última coluna
    df.rename(columns={df.columns[0]: "Ano"}, inplace=True)
    estado = q.name.replace('historico_estado_', '').replace('.csv', '').replace('_', ' ').title()
    df['Estado'] = estado
    dfs.append(df)

dfs = pd.concat(dfs, ignore_index=True)
dfs.head()

Unnamed: 0,Ano,Janeiro,Fevereiro,Março,Abril,Maio,Junho,Julho,Agosto,Setembro,Outubro,Novembro,Dezembro,Estado
0,1998,,,,,,451.0,597.0,580.0,525.0,433.0,487.0,121.0,Sao Paulo
1,1999,7.0,47.0,78.0,291.0,500.0,522.0,683.0,1210.0,910.0,763.0,400.0,47.0,Sao Paulo
2,2000,36.0,25.0,27.0,102.0,300.0,605.0,770.0,915.0,581.0,647.0,96.0,23.0,Sao Paulo
3,2001,26.0,25.0,35.0,61.0,211.0,345.0,539.0,587.0,423.0,441.0,168.0,52.0,Sao Paulo
4,2002,19.0,19.0,111.0,178.0,319.0,562.0,645.0,909.0,483.0,699.0,129.0,77.0,Sao Paulo


In [103]:
conn.execute("DROP TABLE IF EXISTS queimadas")
conn.execute("""
    CREATE TABLE queimadas AS
    SELECT * FROM dfs
""")

result = conn.execute("SELECT COUNT(*) FROM queimadas").fetchone()
print(f"Registros carregados na tabela: {result[0]:,}")

Registros carregados na tabela: 756
