In [117]:
import parametros as par
import dask.dataframe as dd
from dask.distributed import Client
import time
import asyncio
import pandas as pd
from loguru import logger
#from pandera.typing.dask import DataFrame, Series
#import pandera as pa
from sqlalchemy import table, column, select, types, Float
from sqlalchemy.dialects import oracle
from datetime import datetime
import sys


In [118]:
# Connect to the Dask distributed scheduler; the address is hard-coded in this notebook.
client = Client("tcp://10.128.0.48:8786")

In [119]:
# Project parameter object: later cells read the source path, DB URL and schemas from it.
p = par.ParamApdw()

In [120]:
# Input location for the raw load; it is forwarded to `dd.read_csv` by `read_raw_data`.
data = p.arquivo_origem

In [121]:
# Database connection URL passed as `con`/URI to every dask SQL read and write in this notebook.
url_db = p.url_db

In [136]:
# Register a log-file sink named after the current minute, capturing TRACE level and above.
# `logger.add` stays as the last expression, so the cell still echoes the returned handler id.
timestamp = datetime.today().strftime('%Y-%m-%d_%H:%M')
logger.add(f'out_{timestamp}.log', level="TRACE")

3

In [137]:
def read_oracle(colunas=None, meta=None):
    """Build a dask DataFrame over the Oracle table ``tb_raw_data``.

    The read mode is chosen from the arguments:

    * ``meta`` given: the table is read with ``meta`` as the frame metadata and
      ``cod`` is kept as the index (no ``reset_index``). ``meta`` wins when both
      arguments are supplied.
    * neither given: all columns are read, dtypes come from ``p.schema_base``
      (keys lower-cased) and ``cod`` is moved back into a regular column.
    * only ``colunas`` given: just those columns are read, with the matching
      subset of ``p.schema_base`` dtypes, and ``cod`` is moved back into a column.

    Args:
        colunas: optional collection of lower-case column names to read.
        meta: optional metadata forwarded to ``dd.read_sql_table``.

    Returns:
        The dask DataFrame, or ``None`` if building it fails. The failure is
        logged at ERROR level together with its traceback.
    """
    try:
        logger.info("Iniciando leitura do Oracle")
        if meta is not None:
            # Honour the caller's metadata. The argument used to be ignored in favour of
            # p.meta_schema, which is what the only visible caller passes anyway.
            df = dd.read_sql_table(table_name='tb_raw_data', con=url_db, index_col='cod', meta=meta)
            logger.success("Leitura dos Metadados concluida.")
            return df
        if colunas is None:
            df = dd.read_sql_table(table_name='tb_raw_data', con=url_db, index_col='cod', dtype={
                k.lower(): p.schema_base[k] for k in p.schema_base
                }).reset_index()
            logger.success("Leitura de toda a tabela concluida.")
            return df
        df = dd.read_sql_table(table_name='tb_raw_data', columns=colunas, con=url_db, index_col='cod', dtype={
            k.lower(): p.schema_base[k] for k in p.schema_base if k.lower() in colunas
            }).reset_index()
        logger.success("Leitura das colunas concluida.")
        return df
    except Exception:
        # `except Exception` lets KeyboardInterrupt/SystemExit propagate; logger.exception
        # emits the same ERROR message but attaches the traceback so the cause is visible.
        logger.exception("Erro na leitura do Oracle")
        return None
        

LEITURA BRUTA DE DADOS DO DIRETÓRIO
====================================================================================

In [138]:
def read_raw_data(data):
    """Build a dask DataFrame from the pipe-separated raw input.

    Args:
        data: input location, passed unchanged to ``dd.read_csv``.

    Returns:
        The dask DataFrame cast to ``p.raw_schema_base``, or ``None`` if building
        it fails. The failure is logged at ERROR level together with its traceback.
    """
    try:
        logger.info("Lendo dados")
        df = dd.read_csv(data, sep="|", assume_missing=True).astype(dtype=p.raw_schema_base)
        logger.success("Leitura concluida")
        return df
    except Exception:
        # Keep the best-effort contract (no raise), but record the traceback and
        # stop swallowing KeyboardInterrupt/SystemExit as the bare `except:` did.
        logger.exception("Erro na leitura do arquivo")
        return None
    

In [139]:
def pipeline_raw_oracle(df):
    """Write the raw dask DataFrame to the Oracle table ``tb_raw_data``.

    The ``'Unnamed: 11'`` column is dropped, ``DAT_ULT_AT`` is parsed as
    ``%Y-%m-%d`` dates (unparseable values are coerced to missing), and the result
    replaces any existing ``tb_raw_data`` table. Column SQL types are looked up in
    ``p.oracle_types`` for every ``p.schema_base`` key that is also present in
    ``p.raw_schema_base``.

    Args:
        df: dask DataFrame as produced by ``read_raw_data``.

    Returns:
        None. Failures are logged at ERROR level with their traceback and are not re-raised.
    """
    try:
        start_time = time.time()
        logger.info("Iniciando Inserção no Oracle")
        # NOTE(review): 'Unnamed: 11' is presumably an artefact column of the source file — confirm.
        # The to_sql result is not kept: nothing downstream reads it.
        (
            df.drop('Unnamed: 11', axis=1)
              .assign(DAT_ULT_AT=dd.to_datetime(df['DAT_ULT_AT'], format='%Y-%m-%d', errors='coerce'))
              .to_sql("tb_raw_data", url_db, if_exists='replace', index=False,
                      compute=True, parallel=True, chunksize=500, dtype={
                          k: p.oracle_types[v] for (k, v) in p.schema_base.items() if k in (p.raw_schema_base.keys())
                      }
                     )
        )
        elapsed_time = time.time() - start_time
        logger.success(f"Inserção concluida, {round(elapsed_time,2)} segs ")
    except Exception:
        # Same ERROR message as before, now with the traceback attached; a bare
        # `except:` would also have swallowed KeyboardInterrupt/SystemExit.
        logger.exception("Erro no envio bruto para o Oracle")


In [140]:
# Read the raw input and load it into Oracle as `tb_raw_data`.
df = read_raw_data(data)
pipeline_raw_oracle(df)

FUNÇÃO RELATÓRIO DE VALORES FEVEREIRO
====================================================================================

In [130]:
def soma_valores_fevereiro(df, colunas):
    """Sum the value columns for ``cod == 1`` in February 2023 and store the result.

    Duplicated rows are removed, ``dat_ult_at`` is parsed as ``%Y-%m-%d`` dates
    (unparseable values are coerced to missing), rows are filtered to ``cod == 1``
    with ``dat_ult_at`` between 2023-02-01 and 2023-02-28, and the four ``vlr_*``
    columns are summed per ``(cod, dat_ult_at)``. The aggregate replaces the
    ``tb_soma_valores_fevereiro`` table, with the group keys written as index columns.

    Args:
        df: dask DataFrame with lower-case column names (as returned by ``read_oracle``).
        colunas: lower-case column names used to pick the SQL types from
            ``p.schema_base`` / ``p.oracle_types``.

    Returns:
        None. Failures are logged at ERROR level with their traceback and are not re-raised.
    """
    try:
        start_time = time.time()
        logger.info("Iniciando Processo SomaValoresFevereiro")
        df = df.drop_duplicates()
        # Select the aggregated columns with a list: indexing a groupby with several
        # bare labels builds a tuple key, which pandas 2.x rejects.
        value_columns = ['vlr_doc_seg_emit', 'vlr_premio_cobra', 'vlr_sinistro_avi', 'vlr_sinistro_pag']
        (
            df.assign(dat_ult_at=dd.to_datetime(df['dat_ult_at'], format='%Y-%m-%d', errors='coerce'))
              .query("cod==1 & dat_ult_at.between('2023-02-01','2023-02-28')")
              .groupby(['cod', 'dat_ult_at'])[value_columns]
              .sum()
              .to_sql("tb_soma_valores_fevereiro", url_db, if_exists='replace', index=True,
                      compute=True, parallel=True, chunksize=500, dtype={
                          k.lower(): p.oracle_types[v] for (k, v) in p.schema_base.items() if k.lower() in colunas
                      })
        )
        elapsed_time = time.time() - start_time
        logger.success(f"Inserção concluida,{round(elapsed_time,2)} ")
    except Exception:
        # Same ERROR message as before, now with the traceback attached; a bare
        # `except:` would also have swallowed KeyboardInterrupt/SystemExit.
        logger.exception("Erro na task SomaValoresFevereiro")

In [131]:
# Read only the columns the February report needs, then build and store the report.
colunas = [
    'cod',
    'dat_ult_at',
    'vlr_doc_seg_emit',
    'vlr_premio_cobra',
    'vlr_sinistro_avi',
    'vlr_sinistro_pag',
]
df = read_oracle(colunas)
soma_valores_fevereiro(df, colunas)

FUNÇÃO CONSULTA TABELA DE REFERÊNCIA
====================================================================================

In [132]:
def consulta_tab_ref(df, colunas):
    """Left-join ``df`` with the reference table on ``cod`` and store the result.

    The reference table ``table_ref`` is read with ``cod`` as its index, joined to
    ``df``, missing ``nome_cliente`` values are replaced by a placeholder text, and
    the result replaces the ``tb_consulta_ref`` table (index written as a column).

    Args:
        df: dask DataFrame holding ``cod`` either as a column (as returned by
            ``read_oracle(colunas)``) or already as its index.
        colunas: lower-case column names used to pick the SQL types from
            ``p.schema_base`` / ``p.oracle_types``.

    Returns:
        None. Failures are logged at ERROR level with their traceback and are not re-raised.
    """
    try:
        start_time = time.time()
        logger.info("Iniciando Processo ConsultaTabRef")
        df_ref = dd.read_sql(sql='table_ref', con=url_db, index_col='cod')
        # df_ref is indexed by cod, but a frame from read_oracle(colunas) has been
        # reset_index()-ed, so its index is positional. Joining index-to-index would
        # match row positions against cod values, so re-index on cod first. Frames
        # that already carry cod as their index are left untouched.
        if 'cod' in df.columns:
            df = df.set_index('cod')
        df_merge = dd.merge(df, df_ref, how='left', left_index=True, right_index=True)
        (
            df_merge.fillna({
                'nome_cliente': "Cliente não encontrado"
            }).to_sql("tb_consulta_ref", url_db, if_exists='replace', index=True,
                      compute=True, parallel=True, chunksize=500, dtype={
                          k.lower(): p.oracle_types[v] for (k, v) in p.schema_base.items() if k.lower() in colunas
                      })
        )
        elapsed_time = time.time() - start_time
        logger.success(f"Inserção concluida,{round(elapsed_time,2)} ")
    except Exception:
        # Same ERROR message as before, now with the traceback attached; a bare
        # `except:` would also have swallowed KeyboardInterrupt/SystemExit.
        logger.exception("Erro na task ConsultaTabRef")


In [133]:
# Read the columns used by the reference lookup, then join and store the result.
colunas = [
    'cod',
    'dat_ult_at',
    'vlr_doc_seg_emit',
    'vlr_premio_cobra',
    'vlr_sinistro_avi',
    'vlr_sinistro_pag',
]
df = read_oracle(colunas)
consulta_tab_ref(df, colunas)


FUNÇÃO SEPARA COD RA
====================================================================================


In [134]:
def filtra_cod(cod):
    """Write the rows of the notebook-level ``df`` whose ``cod_ra`` equals ``cod`` to their own table.

    The filtered frame is persisted on the cluster through ``client`` and then
    written to ``tb_codra_dask_<cod>``, replacing any existing table. The frame's
    index is not written (``index=False``).

    Args:
        cod: the ``cod_ra`` value to filter on; also used as the table-name suffix.

    Returns:
        None. Failures are logged at ERROR level with their traceback and are not re-raised.
    """
    try:
        start_time = time.time()
        logger.info(f"Iniciando inserção tabela {cod}")
        # Reads the global `df` and `client` set up by the calling cell.
        df_codra = client.persist(df[df['cod_ra'] == cod])
        df_codra.to_sql(f"tb_codra_dask_{cod}", url_db, if_exists='replace', index=False,
                        compute=True, parallel=True, chunksize=500, dtype={
                            k.lower(): p.oracle_types[v] for (k, v) in p.schema_base.items()
                        })
        elapsed_time = time.time() - start_time
        logger.success(f"Inserção da tabela {cod} concluida,{elapsed_time} ")
    except Exception:
        # This runs in worker threads, so a bare message gives no clue about what failed.
        # logger.exception keeps the message and attaches the traceback; `except Exception`
        # no longer swallows KeyboardInterrupt/SystemExit.
        logger.exception("Erro na task FiltraCodRa")

async def escreve_tabelas():
    """Run ``filtra_cod`` once per distinct ``cod_ra`` of the notebook-level ``df``.

    Each call is wrapped with ``asyncio.to_thread`` and all of them are awaited
    together; nothing is returned.
    """
    pending = []
    for cod in df.cod_ra.unique():
        pending.append(asyncio.to_thread(filtra_cod, cod))
    await asyncio.gather(*pending)

In [135]:
#for i in range(7):
df=read_oracle(meta=p.meta_schema)
df=client.persist(df)    
await escreve_tabelas()

Pandera 
=

In [None]:
# Validation schema: column '00AAAJ' must hold strings drawn from the allowed list.
# NOTE(review): `pa` is only bound if the commented-out `import pandera as pa`
# in the import cell is restored — as written this cell raises NameError.
schema = pa.DataFrameSchema(
    {
        '00AAAJ': pa.Column(str, checks=pa.Check.isin(['aaaa'])),
    }
)

In [None]:
# Read the table with the metadata schema held by the parameter object. The bare name
# `meta_schema` is not defined anywhere in this notebook; other cells use `p.meta_schema`.
df = dd.read_sql_table(table_name='tb_raw_data', con=url_db, index_col='cod', meta=p.meta_schema)
df = client.persist(df)

In [None]:
# Start from an empty frame so `df_erros` is always bound, and never stale from an
# earlier run, when validation passes; the following cell reads it unconditionally.
df_erros = pd.DataFrame()
try:
    schema.validate(df.compute())
except pa.errors.SchemaError as err:
    # Keep the failing cases reported by the schema error for inspection.
    df_erros = err.failure_cases
    

In [None]:
# Inspect the validation failures collected by the previous cell.
# NOTE(review): `info` is referenced without being called, so the cell displays the
# bound method's repr rather than the `DataFrame.info()` summary — confirm which was intended.
df_erros.info