In [2]:
import pandas as pd
import numpy as np
import os
from bs4 import BeautifulSoup
import requests
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile
from tqdm import tqdm
import shutil
from simpledbf import Dbf5

In [3]:
# Lift pandas' display truncation for both columns and rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [9]:
# Abbreviations of the 27 Brazilian federative units (UFs), in alphabetical order.
ufs = (
    'AC AL AM AP BA CE DF ES GO '
    'MA MG MS MT PA PB PE PI PR '
    'RJ RN RO RR RS SC SE SP TO'
).split()

In [10]:
# Directory that receives the extracted CNEFE files (one sub-folder per UF).
# The previous value was an absolute path inside one user's profile, which made
# the notebook unusable on any other machine. Set the CNEFE_DIR environment
# variable to choose a location; otherwise the directory the notebook is run
# from is used.
cwd = os.environ.get('CNEFE_DIR') or os.getcwd()

In [11]:
# Columns (and their order) written to the exported CSV files:
# municipality attributes first, then the territorial codes, then the CEP.
cols = (
    ['NM_MUN', 'SIGLA_UF', 'AREA_KM2']
    + ['CD_UF', 'CD_MUNICIPIO', 'CD_DISTRITO', 'CD_SUBDISTRITO', 'CD_SETOR']
    + ['SITUACAO_SETOR', 'CEP']
)

In [12]:
def scraping_cnefe(save_unzip, ufs):
    """
    Download the CNEFE (Cadastro Nacional de Endereços para Fins Estatísticos)
    files published by IBGE and build a census-sector -> CEP lookup table.

    For every UF the repository's directory listing is scraped for ``.zip``
    links; each archive is downloaded, extracted into ``<save_unzip>/<UF>``
    and the extracted ``.TXT`` table is parsed by character position. The
    extracted files are left on disk.

    Parameters
    ----------
        save_unzip: str
            Directory where the extracted files are saved (one sub-folder per UF).
        ufs: list
            UF abbreviations to collect; each one is a sub-directory of the repository.

    Returns
    -------
        pandas.DataFrame
            Columns CD_UF, CD_MUNICIPIO, CD_DISTRITO, CD_SUBDISTRITO, CD_SETOR,
            SITUACAO_SETOR and CEP, de-duplicated within each source file.
            An empty DataFrame is returned when no table could be loaded.

    Raises
    ------
        requests.HTTPError
            If the directory listing of a UF cannot be fetched.
    """
    # Local import: keeps the function self-contained without touching the notebook's import cell.
    from urllib.parse import urljoin

    url = 'https://ftp.ibge.gov.br/Censos/Censo_Demografico_2010/Cadastro_Nacional_de_Enderecos_Fins_Estatisticos'
    # Frames are collected and concatenated once at the end; concatenating inside
    # the loop re-copies everything accumulated so far on every file.
    frames = []

    for uf in tqdm(ufs):
        uf_url = url + '/' + uf
        response = requests.get(uf_url, timeout=60)
        # Fail loudly on an HTTP error instead of parsing an error page as an empty listing.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # href=True skips anchors that have no href attribute (calling .endswith on None would crash).
        hrefs = [node.get('href') for node in soup.find_all('a', href=True)]
        # The first .zip link of each listing is skipped, exactly as before.
        # NOTE(review): confirm what that first archive is and why it must be excluded.
        zip_hrefs = [href for href in hrefs if href.endswith('.zip')][1:]

        uf_dir = os.path.join(save_unzip, uf)

        for href in zip_hrefs:
            # urljoin builds a proper URL for both relative and absolute hrefs.
            zip_url = urljoin(uf_url + '/', href)
            with urlopen(zip_url) as zipresp:
                with ZipFile(BytesIO(zipresp.read())) as zfile:
                    zfile.extractall(uf_dir)

            # Assumes each archive holds a .TXT member named after the archive itself.
            table_name = os.path.basename(href).replace('.zip', '.TXT')
            table_path = os.path.join(uf_dir, table_name)

            try:
                raw = _read_cnefe_table(table_path)
            except Exception as exc:
                # Best effort: report the file and skip it. Previously execution fell
                # through and either raised NameError or re-appended the previous
                # file's rows.
                print('Não foi possível carregar a tabela {}: {}'.format(table_path, exc))
                continue

            frames.append(_extract_setor_cep(raw))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


def _read_cnefe_table(path):
    """Read one extracted CNEFE text file as a single-column frame of raw lines."""
    read_kwargs = dict(header=None, names=['VARIAVEL'], dtype=str)
    try:
        return pd.read_csv(path, **read_kwargs)
    except UnicodeDecodeError:
        # 'ANSI' is a codec alias only on Windows. latin-1 is portable and
        # single-byte, so the character offsets used for slicing stay aligned.
        return pd.read_csv(path, encoding='latin-1', **read_kwargs)


def _extract_setor_cep(raw):
    """Slice the raw lines into sector codes, urban/rural situation and CEP."""
    linhas = raw['VARIAVEL']
    # First 16 characters hold the geographic code; blanks are padded with zeros.
    codigo = linhas.str[:16].str.replace(' ', '0', regex=False)
    setores = pd.DataFrame({
        'CD_UF': codigo.str[:2],
        'CD_MUNICIPIO': codigo.str[:7],
        'CD_DISTRITO': codigo.str[:9],
        'CD_SUBDISTRITO': codigo.str[:11],
        'CD_SETOR': codigo.str[:15],
        # 16th character: '1' is labelled URBANO, anything else RURAL.
        'SITUACAO_SETOR': np.where(codigo.str[15:16] == '1', 'URBANO', 'RURAL'),
        # Everything from character 550 onward is taken as the CEP.
        'CEP': linhas.str[550:].str.replace(' ', '0', regex=False),
    })
    return setores.drop_duplicates()

In [13]:
# Full download + parse for every UF; the recorded run below took about 1h15.
df = scraping_cnefe(save_unzip=cwd, ufs=ufs)

100%|██████████| 27/27 [1:14:44<00:00, 166.09s/it]


In [37]:
# Preview the first rows of the sector -> CEP table (head() shows 5 rows by default).
df.head()

Unnamed: 0,CD_UF,CD_MUNICIPIO,CD_DISTRITO,CD_SUBDISTRITO,CD_SETOR,SITUACAO_SETOR,CEP
0,12,1200013,120001305,12000130500,120001305000001,URBANO,69945000
1,12,1200013,120001305,12000130500,120001305000002,URBANO,69945000
2,12,1200013,120001305,12000130500,120001305000003,URBANO,69945000
3,12,1200013,120001305,12000130500,120001305000004,RURAL,69945000
4,12,1200013,120001305,12000130500,120001305000005,RURAL,69945000


In [30]:
# (rows, columns) of the sector -> CEP table returned by scraping_cnefe.
df.shape

(1269844, 8)

In [42]:
# Keep one row per distinct CEP; the row count is the number of unique CEPs.
ceps_unicos = df.drop_duplicates(subset='CEP')
ceps_unicos.shape

(541905, 7)

In [18]:
# Attribute table of the 2019 municipal mesh. Download the archive from
# ftp://geoftp.ibge.gov.br/organizacao_do_territorio/malhas_territoriais/malhas_municipais/municipio_2019/Brasil/BR/br_municipios_20200807.zip
# and place the .dbf file in the notebook's working directory.
malha_dbf_path = 'BR_Municipios_2019.dbf'
dbf = Dbf5(malha_dbf_path)

In [19]:
# Load the DBF into a DataFrame and rename its municipality key to the name used in df.
malha_mun = dbf.to_dataframe().rename(columns={'CD_MUN': 'CD_MUNICIPIO'})

In [20]:
# (rows, columns) of the municipal mesh attribute table.
malha_mun.shape

(5572, 4)

In [21]:
# Preview the mesh table: municipality code, name, UF abbreviation and area.
malha_mun.head()

Unnamed: 0,CD_MUNICIPIO,NM_MUN,SIGLA_UF,AREA_KM2
0,1100015,Alta Floresta D'Oeste,RO,7067.025
1,1100023,Ariquemes,RO,4426.571
2,1100031,Cabixi,RO,1314.352
3,1100049,Cacoal,RO,3792.892
4,1100056,Cerejeiras,RO,2783.3


In [22]:
# Attach municipality name, UF abbreviation and area to every sector/CEP row.
# validate='many_to_one' makes pandas raise MergeError if CD_MUNICIPIO is
# duplicated in malha_mun, which would otherwise silently multiply rows.
result = pd.merge(df, malha_mun, how='left', on='CD_MUNICIPIO', validate='many_to_one')

In [24]:
# (rows, columns) after the merge; the row count is expected to equal df's
# as long as CD_MUNICIPIO is unique in malha_mun.
result.shape

(1269844, 11)

In [25]:
# Missing values per column; non-zero counts in the mesh columns would mean
# municipality codes that found no match in malha_mun.
result.isna().sum()

CD_UF                0
CD_MUNICIPIO         0
CD_DISTRITO          0
CD_SUBDISTRITO       0
CD_SETOR             0
CD_SITUACAO_SETOR    0
SITUACAO_SETOR       0
CEP                  0
NM_MUN               0
SIGLA_UF             0
AREA_KM2             0
dtype: int64

In [26]:
# Check that every export column exists in result (a missing one raises KeyError).
result.loc[:, cols].columns

Index(['NM_MUN', 'SIGLA_UF', 'AREA_KM2', 'CD_UF', 'CD_MUNICIPIO',
       'CD_DISTRITO', 'CD_SUBDISTRITO', 'CD_SETOR', 'SITUACAO_SETOR', 'CEP'],
      dtype='object')

In [58]:
# First 500,000 rows of the export columns, written without the index.
parte_1 = result[cols].iloc[:500000]
parte_1.to_csv('./setor_cep_ibge_part1.csv', index=False, encoding='iso-8859-1')

In [59]:
# Remaining rows (from position 500,000 onward) of the export columns, written without the index.
parte_2 = result[cols].iloc[500000:]
parte_2.to_csv('./setor_cep_ibge_part2.csv', index=False, encoding='iso-8859-1')

In [45]:
# Complete sector -> CEP lookup in a single file, written without the index.
result.loc[:, cols].to_csv(
    './depara_setor_cep_ibge.csv',
    encoding='iso-8859-1',
    index=False,
)