In [1]:
import re

import numpy as np
import pandas as pd
import os

import time #FOR TESTS
from IPython.display import clear_output# for tests

from sqlalchemy import create_engine, Column, Integer, String, Date, ForeignKey, UniqueConstraint, MetaData, Table
from sqlalchemy.orm import DeclarativeBase, Session
from sqlalchemy.orm import sessionmaker


from dotenv import dotenv_values

from municipio_id import municipio_map

## DATA FILTER CLASS

- transformar tudo para o tipo de dado do banco de dados, se possivel com SQLALCHEMY

In [2]:
class InepFilters():
    # csv_path is path source folder + file name
    def __init__(self, csv_path) -> None:
        self.__file_path = csv_path
        source_path, self.__file_name = os.path.split(csv_path)
        if not '__' in self.__file_name:
            pattern = r'(\d{4})_(\w+)_(\w+)_(\w+)\.csv'
        else:
            pattern = r'(\d{4})_(\w+)__(\w+)\.csv'
        self.__filters = re.findall(pattern, self.__file_name)[0]
        
        self.__correspondents = {
            'EnsinoFundamental':'EF',
            'AnosIniciais':'1',
            'AnosFinais':'2',
            'TodososValoresdeColunas':'todos',
            'EnsinoMdio':'EM',
            'CorRaa':'cor_raca',
            'DependnciaAdministrativa':'dependencia_administrativa'
        }

        self.__year()
        self.__teaching_stage()
        self.__category()

    def __year(self):
        self.__year = int(self.__filters[0])

    def __teaching_stage(self):
        self.__teaching_stage = self.__correspondents[self.__filters[1]]
        if self.__teaching_stage == 'EF':
            level = self.__correspondents[self.__filters[2]]
            if level != 'todos':
                self.__teaching_stage += level

    def __category(self):
        self.__category = self.__correspondents[self.__filters[-1]]
    
    #getters
    def get_year(self):
        return self.__year
    
    def get_teaching_stage(self):
        return self.__teaching_stage
    
    def get_category(self):
        return self.__category
    
    def get_file_name(self):
        return self.__file_name
    
    def get_file_path(self):
        return self.__file_path

    def get_df(self) -> pd.DataFrame:
        return pd.read_csv(self.__file_path, sep=';')

    """ def __municipio_id(self):
        pass """



## Get path from categories and reading and group all file by path

In [3]:
def transform_file_path(category_folder_name: str) -> list:
    source_path_data = os.path.abspath(os.path.join(os.getcwd(), "../oracle_data"))
    category_dir = os.path.abspath(os.path.join(source_path_data, category_folder_name))
    category_files = [os.path.join(category_dir, i) for i in os.listdir(category_dir)]
    return category_files

cor_raca_path_files = transform_file_path("CorRaa")
depend_admin_path_files = transform_file_path("DependnciaAdministrativa")
print(len(depend_admin_path_files))


12


## GEN LIST INSTANCE FILTER BY PATH_FILE

In [4]:
cor_raca_objs = [InepFilters(i) for i in cor_raca_path_files]
depend_admin_objs = [InepFilters(i) for i in depend_admin_path_files]
all_category_objs = cor_raca_objs + depend_admin_objs


# OFICIAL WORKING AND INSERT DATA

In [5]:
""" def separate_table_filters_and_registrations():
    for matricula_data in all_category_objs:
        df_full_table = make_full_table(matricula_data)
        display(df_full_table) #FOR TESTS
        time.sleep(1) #FOR TESTS
        clear_output() #FOR TESTS
        

def make_full_table(data_filters: InepFilters) -> pd.DataFrame:
        df = data_filters.get_df()
        df.rename(columns={'Categoria 1': data_filters.get_category(), 'Matrículas':'quantidade'}, inplace=True)
        df.drop(columns=[
            'Etapa de Ensino - Superior', 'Etapa de Ensino', 'Localidade da Escola', 'Categoria 1 - Ordenação'], inplace=True)
        df['etapa_de_ensino'] = data_filters.get_teaching_stage()
        df['ano'] =  data_filters.get_year()
        df['municipio_id'] = df['Município'].map(municipio_map)
        return df

separate_table_filters_and_registrations()
 """

" def separate_table_filters_and_registrations():\n    for matricula_data in all_category_objs:\n        df_full_table = make_full_table(matricula_data)\n        display(df_full_table) #FOR TESTS\n        time.sleep(1) #FOR TESTS\n        clear_output() #FOR TESTS\n        \n\ndef make_full_table(data_filters: InepFilters) -> pd.DataFrame:\n        df = data_filters.get_df()\n        df.rename(columns={'Categoria 1': data_filters.get_category(), 'Matrículas':'quantidade'}, inplace=True)\n        df.drop(columns=[\n            'Etapa de Ensino - Superior', 'Etapa de Ensino', 'Localidade da Escola', 'Categoria 1 - Ordenação'], inplace=True)\n        df['etapa_de_ensino'] = data_filters.get_teaching_stage()\n        df['ano'] =  data_filters.get_year()\n        df['municipio_id'] = df['Município'].map(municipio_map)\n        return df\n\nseparate_table_filters_and_registrations()\n "

### testing data (DISPOSABLE)

In [6]:
arquivo1 = cor_raca_objs[2]

In [7]:
print(arquivo1.get_category()) 
df = arquivo1.get_df()
#df = df.sort_values(by='Categoria 1 - Ordenação')
df.rename(columns={'Categoria 1': arquivo1.get_category(), 'Matrículas':'quantidade'}, inplace=True)
df.drop(columns=[
    'Etapa de Ensino - Superior', 'Etapa de Ensino', 'Localidade da Escola', 'Categoria 1 - Ordenação'], inplace=True)
df['etapa_de_ensino'] = arquivo1.get_teaching_stage()
df['ano'] = arquivo1.get_year()
df['municipio_id'] = df['Município'].map(municipio_map)
display(df)
print((df['UF'] == 'MG').all()) # verifica se todos os valores da coluna tem tal valor

cor_raca


Unnamed: 0,País,Região,UF,Município,quantidade,cor_raca,etapa_de_ensino,ano,municipio_id
0,Brasil,Sudeste,MG,Buritis,5,Amarela,EF2,2021,3109303
1,Brasil,Sudeste,MG,Buritis,310,Branca,EF2,2021,3109303
2,Brasil,Sudeste,MG,Buritis,219,Não declarada,EF2,2021,3109303
3,Brasil,Sudeste,MG,Buritis,1106,Parda,EF2,2021,3109303
4,Brasil,Sudeste,MG,Buritis,57,Preta,EF2,2021,3109303
...,...,...,...,...,...,...,...,...,...
1962,Brasil,Sudeste,MG,Nova Lima,1798,Branca,EF2,2021,3144805
1963,Brasil,Sudeste,MG,Nova Lima,8,Indígena,EF2,2021,3144805
1964,Brasil,Sudeste,MG,Nova Lima,2024,Não declarada,EF2,2021,3144805
1965,Brasil,Sudeste,MG,Nova Lima,2077,Parda,EF2,2021,3144805


True


## Organizing table for insert

In [8]:
def make_full_table(data_filters: InepFilters) -> pd.DataFrame:
        df = data_filters.get_df()
        df.rename(columns={'Categoria 1': data_filters.get_category(), 'Matrículas':'quantidade'}, inplace=True)
        df.drop(columns=[
            'Etapa de Ensino - Superior', 'Etapa de Ensino', 'Localidade da Escola', 'Categoria 1 - Ordenação'], inplace=True)
        df['etapa_de_ensino'] = data_filters.get_teaching_stage()
        df['ano'] =  data_filters.get_year()
        df['municipio_id'] = df['Município'].map(municipio_map)
        return df

## Settings database and create SQLALCHEMY engine

In [9]:
config = dotenv_values("./.env")
username = config.get("DATABASE_USERNAME")
password = config.get("DATABASE_PASSWORD")
dbname = config.get("DATABASE_NAME")
port = config.get("DATABASE_PORT")
host = config.get("DATABASE_HOST")

engine = create_engine(f"postgresql+psycopg2://{username}:{password}@{host}:{port}/{dbname}", echo=True)

# inserting in database -> refactore

# based in: https://chatgpt.com/share/d66f961d-ab64-4613-82d1-aa50d7802423

## necessity apply design patterns: [Design Patterns](https://refactoring.guru/)

In [10]:
class Base(DeclarativeBase):
    ...

In [11]:
class Municipio(Base):
    __tablename__ = 'Municipio'
    id = Column(Integer, primary_key=True, autoincrement=True)
    nome = Column(String, nullable=False)

class Filtro(Base):
    __tablename__ = 'Filtro'

    id = Column(Integer, primary_key=True, autoincrement=True)
    municipio_id = Column(Integer, ForeignKey('Municipio.id'), nullable=False)
    etapa_de_ensino = Column(String(5), nullable=False)
    ano = Column(Integer, nullable=False)
    
    __table_args__ = (UniqueConstraint('municipio_id', 'etapa_de_ensino', 'ano', name='unique_municipio_etapa_ano'),)

class MatriculaRaca(Base):
    __tablename__ = 'MatriculaRaca'
    id = Column(Integer, primary_key=True, autoincrement=True)
    id_filtro = Column(Integer, ForeignKey('Filtro.id'), nullable=False)
    cor_raca = Column(String(20), nullable=True)
    quantidade = Column(Integer, nullable=True)

class MatriculaDependenciaAdministrativa(Base):
    __tablename__ = 'MatriculaDependenciaAdministrativa'
    id = Column(Integer, primary_key=True, autoincrement=True)
    id_filtro = Column(Integer, ForeignKey('Filtro.id'), nullable=False)
    dependencia_administrativa = Column(String(20), nullable=True)
    quantidade = Column(Integer, nullable=True)

# Configuração do banco de dados
Base.metadata.create_all(engine)

2024-08-02 18:01:43,105 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2024-08-02 18:01:43,106 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-08-02 18:01:43,164 INFO sqlalchemy.engine.Engine select current_schema()
2024-08-02 18:01:43,165 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-08-02 18:01:43,222 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2024-08-02 18:01:43,223 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-08-02 18:01:43,278 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-08-02 18:01:43,283 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s, %(param_3)s, %(param_4)s, %(param_5)s]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname

### Verifica se o filtro existe. Se não existir, cria e retorna o ID

In [12]:
def get_or_create_filtro(session: Session, municipio_id: int, etapa_de_ensino: str, ano: int):
    # Verificar se o filtro já existe
    filtro = session.query(Filtro).filter_by(municipio_id=municipio_id, etapa_de_ensino=etapa_de_ensino, ano=ano).first()
    # Se não existir, criar um novo filtro
    if not filtro:
        filtro = Filtro(municipio_id=municipio_id, etapa_de_ensino=etapa_de_ensino, ano=ano)
        session.add(filtro)
        session.commit()
        session.refresh(filtro)  # Atualizar a instância do filtro para obter o ID gerado
    
    return filtro.id

In [13]:
def insert_csv_data_in_database(data: pd.DataFrame, category: str):
    with Session(engine) as session:
        # Processar cada linha do DataFrame
        for index, row in data.iterrows():
            # Obter ou criar o filtro e obter seu ID
            id_filtro = get_or_create_filtro(session, row['municipio_id'], row['etapa_de_ensino'], row['ano'])

            # Inserir na tabela Matriculas_Por_Raca
            if(category == "cor_raca"):
                matricula = MatriculaRaca(id_filtro=id_filtro, 
                                          cor_raca=row['cor_raca'], 
                                          quantidade=row['quantidade'])
            
            else:
                matricula = MatriculaDependenciaAdministrativa(id_filtro=id_filtro, 
                                                               dependencia_administrativa=row['dependencia_administrativa'], 
                                                               quantidade=row['quantidade'])

            session.add(matricula)

        # Commitar todas as inserções
        session.commit()

In [14]:
def alternate_table_and_insert():
    for matricula_data in all_category_objs:
        df_full_table = make_full_table(matricula_data)
        insert_csv_data_in_database(df_full_table, matricula_data.get_category())

In [15]:
alternate_table_and_insert()

2024-08-02 18:01:43,585 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-08-02 18:01:43,590 INFO sqlalchemy.engine.Engine SELECT "Filtro".id AS "Filtro_id", "Filtro".municipio_id AS "Filtro_municipio_id", "Filtro".etapa_de_ensino AS "Filtro_etapa_de_ensino", "Filtro".ano AS "Filtro_ano" 
FROM "Filtro" 
WHERE "Filtro".municipio_id = %(municipio_id_1)s AND "Filtro".etapa_de_ensino = %(etapa_de_ensino_1)s AND "Filtro".ano = %(ano_1)s 
 LIMIT %(param_1)s
2024-08-02 18:01:43,591 INFO sqlalchemy.engine.Engine [generated in 0.00162s] {'municipio_id_1': 3109303, 'etapa_de_ensino_1': 'EF2', 'ano_1': 2022, 'param_1': 1}
2024-08-02 18:01:43,656 INFO sqlalchemy.engine.Engine INSERT INTO "Filtro" (municipio_id, etapa_de_ensino, ano) VALUES (%(municipio_id)s, %(etapa_de_ensino)s, %(ano)s) RETURNING "Filtro".id
2024-08-02 18:01:43,657 INFO sqlalchemy.engine.Engine [generated in 0.00110s] {'municipio_id': 3109303, 'etapa_de_ensino': 'EF2', 'ano': 2022}
2024-08-02 18:01:43,690 INFO sqlalchemy.engine

KeyboardInterrupt: 