In [0]:
# Sample values for the three parameter dictionaries used by this notebook.
# The same names are reassigned from `notebook_params` further down, so these
# literals presumably only serve as documentation / interactive defaults.

# Source-file description: namespace, folder and the three sub-folders to ingest.
file = {
  'namespace': '/oni',
  'type_raw': '/usr',
  'file_folder': '/observatorio_nacional/monitor_investimentos/painel_monitor',
  'file_1': '/empresas_auxiliar/',
  'file_2': '/dim_cnae/',
  'file_3': '/estrutura_setorial/',
  'prm_path': '',
  'extension': 'txt',
  'column_delimiter': '',
  'encoding': '',
  'null_value': '',
}

# Azure Data Factory run metadata; only 'adf_trigger_time' is read later in this notebook.
adf = {
  'adf_factory_name': 'cnibigdatafactory',
  'adf_pipeline_name': 'raw_trs_tb_email',
  'adf_pipeline_run_id': '61fc4f3c-c592-426d-bb36-c85cb184bb82',
  'adf_trigger_id': '92abb4ec-2b1f-44e0-8245-7bc165f91016',
  'adf_trigger_name': '92abb4ec-2b1f-44e0-8245-7bc165f91016',
  'adf_trigger_time': '2024-05-07T00:58:48.0960873Z',
  'adf_trigger_type': 'PipelineActivity',
}

# Data-lake layout: root folder of each zone plus the raw "system" name.
dls = {
  'folders': {
    'landing': '/tmp/dev/uld',
    'error': '/tmp/dev/err',
    'staging': '/tmp/dev/stg',
    'log': '/tmp/dev/log',
    'raw': '/tmp/dev/raw',
    'archive': '/tmp/dev/ach',
  },
  'systems': {'raw': 'usr'},
  'path_prefix': '',
}

# Text widgets with their default values (user parameters, environment, storage account).
dbutils.widgets.text('user_parameters', '{"null": "null"}')
dbutils.widgets.text('env', 'dev')
dbutils.widgets.text('storage', '{"url": "https://cnibigdatadlsgen2.dfs.core.windows.net", "container": "datalake"}')

In [0]:
# cni_connectors library, which provides access to the data in the data lake.
from cni_connectors import adls_connector as adls_conn

# Library created to make declaring the tests easier (import currently disabled).
### from datatest.gx_context_provider import GXContextProvider

# This call is required in every notebook: it is the only way to reach the data in the data lake.
# It returns the ADLS base URI (prefixed to every path below) and the notebook
# parameters object (read below as var_file / var_dls / var_adf).
var_adls_uri, notebook_params = adls_conn.connect_adls()

from pyspark.sql.functions import udf, from_utc_timestamp, current_timestamp, lit, input_file_name, monotonically_increasing_id, substring_index
from trs_control_field import trs_control_field as tcf
import pyspark.sql.functions as f
import crawler.functions as cf
from pyspark.sql import SparkSession
import time
import pandas as pd
from pyspark.sql.functions import col, when, explode, lit
import json
from unicodedata import normalize 
import datetime
import re
from core.string_utils import normalize_replace



### Raw specific parameter section

In [0]:
# Configuration dictionaries for this transformation, sent by ADF and exposed
# through widgets. The source and destination directories of the tables are
# built from the values they carry.
# These parameters are required by the ingestion steps below.
file, dls, adf = (
  notebook_params.var_file,
  notebook_params.var_dls,
  notebook_params.var_adf,
)

In [0]:
# Root folders of the landing (uld) and raw zones, taken from the data-lake layout.
uld, raw = (dls['folders'][zone] for zone in ('landing', 'raw'))

# Name of the raw "system" entry in the data-lake layout.
usr = dls['systems']['raw']

In [0]:
# Landing-zone relative paths: <landing root><namespace><file folder><sub-folder>,
# one per source sub-folder declared in `file`.
uld_path_1, uld_path_2, uld_path_3 = (
  f"{uld}{file['namespace']}{file['file_folder']}{file[file_key]}"
  for file_key in ('file_1', 'file_2', 'file_3')
)

# Full landing URIs: the ADLS base URI followed by the relative path.
adl_uld_1, adl_uld_2, adl_uld_3 = (
  f"{var_adls_uri}{relative_path}"
  for relative_path in (uld_path_1, uld_path_2, uld_path_3)
)

In [0]:
# Raw-zone relative paths: <raw root><type_raw><namespace><file folder><sub-folder>,
# one per source sub-folder declared in `file`.
raw_path_1, raw_path_2, raw_path_3 = (
  f"{raw}{file['type_raw']}{file['namespace']}{file['file_folder']}{file[file_key]}"
  for file_key in ('file_1', 'file_2', 'file_3')
)

# Full raw URIs: the ADLS base URI followed by the relative path.
adl_raw_1, adl_raw_2, adl_raw_3 = (
  f"{var_adls_uri}{relative_path}"
  for relative_path in (raw_path_1, raw_path_2, raw_path_3)
)

### Apply transformations and save dataframe

In [0]:
def _normalize_column_name(name):
  """Return an upper-case, ASCII-only version of a CSV header name.

  Accents are stripped through NFKD decomposition, spaces and hyphens become
  underscores, and commas, semicolons, braces, parentheses, newlines, tabs and
  equals signs are removed (presumably because Spark rejects those characters
  in Parquet column names -- confirm).
  """
  ascii_name = normalize('NFKD', name).encode('ASCII', 'ignore').decode('ASCII')
  underscored = ascii_name.replace(' ', '_').replace('-', '_').upper()
  return re.sub(r'[,;{}()\n\t=-]', '', underscored)


def _read_landing_csv(path):
  """Read the ';'-separated, UTF-8 CSV with header at `path` and normalize its column names.

  The read uses FAILFAST mode, so a malformed record raises instead of being
  silently dropped, and the schema is inferred from the data.
  """
  landing_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("encoding", "UTF-8")
    .option('sep', ';')
    .load(path, mode="FAILFAST", ignoreLeadingWhiteSpace=True, ignoreTrailingWhiteSpace=True, inferSchema=True)
  )
  # Rename every column in a single projection instead of one
  # withColumnRenamed call (and one extra plan node) per column.
  return landing_df.toDF(*[_normalize_column_name(column) for column in landing_df.columns])


# One dataframe per landing sub-folder, all loaded with the same rules.
df_1 = _read_landing_csv(adl_uld_1)
df_2 = _read_landing_csv(adl_uld_2)
df_3 = _read_landing_csv(adl_uld_3)

In [0]:
# Destination URIs (adl_raw_1..3, built together with the raw paths) listed in
# the same order as the dataframes that are written to them.
adl_raw = [adl_raw_1, adl_raw_2, adl_raw_3]
sparkframe = [df_1, df_2, df_3]

# The ADF trigger time is identical for every table, so it is parsed once
# instead of on every loop iteration.
dh_insercao_raw = adf['adf_trigger_time']
if dh_insercao_raw is not None:
  # Keep only the part before the first '.', e.g.
  # '2024-05-07T00:58:48.0960873Z' -> '2024-05-07T00:58:48'.
  dh_insercao_raw = dh_insercao_raw.split(".")[0]

for source_df, target_uri in zip(sparkframe, adl_raw):
  output_df = source_df
  if dh_insercao_raw is not None:
    # Control columns are only appended when a trigger time is available;
    # otherwise the dataframe is written unchanged.
    output_df = cf.append_control_columns(source_df, dh_insercao_raw=dh_insercao_raw)
  # Each run fully replaces the previous contents of the raw location.
  output_df.write.parquet(path=target_uri, mode='overwrite')
