In [1]:
import urllib
import urllib.request
from bs4 import BeautifulSoup
import requests
from pathlib import Path

In [20]:
def obem_dados_censo(
        url: str = 'https://www.gov.br/inep/pt-br/acesso-a-informacao/dados-abertos/microdados/censo-escolar',

) -> Path:
    '''
    Download every School Census (Censo Escolar) archive linked from the INEP page.

    The page at ``url`` is fetched and parsed; every ``<a class="external-link">``
    whose ``href`` ends in ``.zip`` is treated as one archive. Each archive is
    saved under ``dados/censo_escolar``, named after the last ``_``-separated
    piece of its URL (``..._censo_escolar_2023.zip`` -> ``2023.zip``).

    :param url: address of the INEP page that lists the census downloads
    :return: path of the directory the archives were written to
    :raises urllib.error.URLError: if the listing page cannot be fetched
    :raises requests.RequestException: if an archive download fails, times out
        or answers with an HTTP error status
    '''

    # Read the listing page and release the connection right away.
    with urllib.request.urlopen(url) as pagina:
        html = pagina.read()
    soup = BeautifulSoup(html, features='html.parser')

    # Build {file name: download URL}, e.g.
    # {'2023.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escolar_2023.zip',
    #  '2022.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escolar_2022.zip', ...}
    # - href=True skips anchors that carry the class but no href (would raise KeyError).
    # - Only .zip targets are kept so unrelated external links are not saved as archives.
    # - The name is taken from the last path segment so it can never contain a "/".
    links = {
        tag['href'].split("/")[-1].split("_")[-1]: tag['href']
        for tag in soup.find_all("a", {"class": "external-link"}, href=True)
        if tag['href'].lower().endswith(".zip")
    }

    caminho_saida = Path("dados/censo_escolar")

    # parents=True creates missing parent directories;
    # exist_ok=True makes the call a no-op when the directory already exists.
    caminho_saida.mkdir(parents=True, exist_ok=True)

    for nome_arquivo, endereco in links.items():
        # stream=True defers fetching the body, so the archive is written chunk by
        # chunk instead of being held entirely in memory (it is NOT a parallel download).
        with requests.get(endereco, stream=True, timeout=60) as resposta:
            # Fail loudly instead of saving an HTML error page as a .zip file.
            resposta.raise_for_status()
            # Each archive gets its own file; a fixed name would overwrite the previous one.
            with open(caminho_saida / nome_arquivo, "wb") as arq:
                for bloco in resposta.iter_content(chunk_size=1024 * 1024):
                    arq.write(bloco)

    return caminho_saida

In [21]:
# Trigger the School Census download defined in the previous cell.
obem_dados_censo()

{'2023.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escolar_2023.zip', '2022.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escolar_2022.zip', '2021.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escolar_2021.zip', '2020.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escolar_2020.zip', '2019.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escolar_2019.zip', '2018.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escolar_2018.zip', '2017.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escolar_2017.zip', '2016.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escolar_2016.zip', '2015.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escolar_2015.zip', '2014.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escolar_2014.zip', '2013.zip': 'https://download.inep.gov.br/dados_abertos/microdados_censo_escola

In [52]:
# (Re)load the autoreload extension in mode 2 so edits to imported modules are
# picked up without restarting the kernel, then import the project's INEP ETL module.
%reload_ext autoreload
%autoreload 2
import src.aquisicao.inep.base_inep as etl_base_inep 

In [53]:
# Build the ETL object. The first argument is exposed as `caminho_entrada` (see the
# output of the following cell). NOTE(review): the other two look like an output
# directory and a dataset key - confirm against BaseINEPETL's signature.
inep_etl = etl_base_inep.BaseINEPETL("dados/censo_escolar", "saida/aquisicao", "censo-escolar")

In [55]:
# Display the input path stored on the ETL object.
inep_etl.caminho_entrada

WindowsPath('dados/censo_escolar')

In [56]:
import pandas as pd

In [61]:
# Load escolas.CSV into a DataFrame; the file is pipe-delimited and read as Latin-1.
dados = pd.read_csv('dados/censo_escolar/escolas.CSV', encoding="latin-1", sep="|")

In [64]:
# (rows, columns) of the loaded frame.
dados.shape

(1870, 238)

In [63]:
# Peek at the first four rows.
dados.head(4)

Unnamed: 0,NU_ANO_CENSO,CO_ENTIDADE,NO_ENTIDADE,CO_ORGAO_REGIONAL,TP_SITUACAO_FUNCIONAMENTO,DT_ANO_LETIVO_INICIO,DT_ANO_LETIVO_TERMINO,CO_REGIAO,CO_MESORREGIAO,CO_MICRORREGIAO,...,IN_ESP_EXCLUSIVA_MEDIO_INTEGR,IN_ESP_EXCLUSIVA_MEDIO_NORMAL,IN_COMUM_EJA_FUND,IN_COMUM_EJA_MEDIO,IN_COMUM_EJA_PROF,IN_ESP_EXCLUSIVA_EJA_FUND,IN_ESP_EXCLUSIVA_EJA_MEDIO,IN_ESP_EXCLUSIVA_EJA_PROF,IN_COMUM_PROF,IN_ESP_EXCLUSIVA_PROF
0,2020,11002719,EMEI - PEQUENO MESTRE,9,1,06/02/2020,18/12/2020,1,1101,11001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2020,11015616,EEEFM CEL JORGE TEIXEIRA DE OLIVEIRA,7,1,06/02/2020,30/12/2020,1,1102,11004,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020,11017856,CEEJA MARECHAL RONDON,7,1,05/02/2020,24/12/2020,1,1102,11004,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020,11022108,EMEIF SAGRADA FAMILIA,11,1,17/02/2020,31/12/2020,1,1102,11005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
# Peek at the last rows (tail() defaults to five).
dados.tail()

Unnamed: 0,NU_ANO_CENSO,CO_ENTIDADE,NO_ENTIDADE,CO_ORGAO_REGIONAL,TP_SITUACAO_FUNCIONAMENTO,DT_ANO_LETIVO_INICIO,DT_ANO_LETIVO_TERMINO,CO_REGIAO,CO_MESORREGIAO,CO_MICRORREGIAO,...,IN_ESP_EXCLUSIVA_MEDIO_INTEGR,IN_ESP_EXCLUSIVA_MEDIO_NORMAL,IN_COMUM_EJA_FUND,IN_COMUM_EJA_MEDIO,IN_COMUM_EJA_PROF,IN_ESP_EXCLUSIVA_EJA_FUND,IN_ESP_EXCLUSIVA_EJA_MEDIO,IN_ESP_EXCLUSIVA_EJA_PROF,IN_COMUM_PROF,IN_ESP_EXCLUSIVA_PROF
1865,2020,53011520,CED DARCY RIBEIRO,17.0,1,10/02/2020,28/01/2021,5,5301,53001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1866,2020,53016009,CEPI IPE-BRANCO,8.0,1,10/02/2020,23/12/2020,5,5301,53001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1867,2020,53016793,EC 203 DO RECANTO DAS EMAS,19.0,1,10/02/2020,29/01/2021,5,5301,53001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1868,2020,53017820,COL COC JARDIM BOTANICO,99.0,1,29/01/2020,18/12/2020,5,5301,53001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1869,2020,53018060,CEAV JR AGUAS CLARAS II - CED ALMEIDA VIEIRA J...,99.0,1,03/02/2020,19/12/2020,5,5301,53001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
# Five randomly chosen rows.
# NOTE(review): no random_state is passed, so a re-run shows different rows.
dados.sample(5)

Unnamed: 0,NU_ANO_CENSO,CO_ENTIDADE,NO_ENTIDADE,CO_ORGAO_REGIONAL,TP_SITUACAO_FUNCIONAMENTO,DT_ANO_LETIVO_INICIO,DT_ANO_LETIVO_TERMINO,CO_REGIAO,CO_MESORREGIAO,CO_MICRORREGIAO,...,IN_ESP_EXCLUSIVA_MEDIO_INTEGR,IN_ESP_EXCLUSIVA_MEDIO_NORMAL,IN_COMUM_EJA_FUND,IN_COMUM_EJA_MEDIO,IN_COMUM_EJA_PROF,IN_ESP_EXCLUSIVA_EJA_FUND,IN_ESP_EXCLUSIVA_EJA_MEDIO,IN_ESP_EXCLUSIVA_EJA_PROF,IN_COMUM_PROF,IN_ESP_EXCLUSIVA_PROF
983,2020,31146013,EE BOM JESUS DE AGUADA NOVA,43.0,1,10/02/2020,13/12/2020,3,3103,31012,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
560,2020,26022303,ESCOLA MUNICIPAL MANOEL GOMES,12.0,1,04/02/2020,23/12/2020,2,2601,26003,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
154,2020,15531929,E M E I E F PAROQUIAL CURUPIRA,15.0,1,10/02/2020,18/12/2020,1,1506,15022,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1112,2020,32006675,EMEF JOAO VENTURIM SOBRINHO,7.0,1,05/02/2020,22/12/2020,3,3201,32002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1229,2020,33154473,EM PROF LETICIA PECANHA DE AGUIAR,11012.0,1,04/02/2020,22/12/2020,3,3302,33004,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
# Summary of the frame: index range, column count, dtypes and memory usage.
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1870 entries, 0 to 1869
Columns: 238 entries, NU_ANO_CENSO to IN_ESP_EXCLUSIVA_PROF
dtypes: float64(223), int64(11), object(4)
memory usage: 3.4+ MB


In [68]:
import zipfile

In [69]:
# Open the 2020 archive for reading (ZipFile's default mode).
# NOTE(review): the handle is never closed in the cells visible here.
arq_zip = zipfile.ZipFile('dados/censo_escolar/2020.zip')

In [73]:
# List the path of every member stored in the archive.
arq_zip.namelist()

['microdados_educacao_basica_2020/',
 'microdados_educacao_basica_2020/ANEXOS/',
 'microdados_educacao_basica_2020/ANEXOS/ANEXO I - Dicionário de Dados e Tabelas Auxiliares/',
 'microdados_educacao_basica_2020/ANEXOS/ANEXO I - Dicionário de Dados e Tabelas Auxiliares/Dicionário de Dados da Educaç╞o Básica.xlsx',
 'microdados_educacao_basica_2020/ANEXOS/ANEXO I - Dicionário de Dados e Tabelas Auxiliares/Tabelas Auxiliares.xlsx',
 'microdados_educacao_basica_2020/ANEXOS/ANEXO I - Dicionário de Dados e Tabelas Auxiliares/~$Dicionário de Dados da Educaç╞o Básica.xlsx',
 'microdados_educacao_basica_2020/ANEXOS/ANEXO II -  Questionários do Censo da Educaç╞o Basica/',
 'microdados_educacao_basica_2020/ANEXOS/ANEXO II -  Questionários do Censo da Educaç╞o Basica/Cadastro_Aluno.pdf',
 'microdados_educacao_basica_2020/ANEXOS/ANEXO II -  Questionários do Censo da Educaç╞o Basica/Cadastro_Escola.pdf',
 'microdados_educacao_basica_2020/ANEXOS/ANEXO II -  Questionários do Censo da Educaç╞o Basica/Ca

In [74]:
# Open turmas.CSV inside the archive as a file-like object, without extracting it to disk.
buffer = arq_zip.open('microdados_educacao_basica_2020/DADOS/turmas.CSV')

In [75]:
# Parse the in-archive CSV straight from the buffer (pipe-delimited, read as Latin-1).
# NOTE(review): this rebinds `dados`, replacing the frame loaded from escolas.CSV earlier.
dados = pd.read_csv(buffer, encoding="latin-1", sep='|')