In [2]:
import os

# Prepara ambiente

### Para este teste estou usando a versão spark-3.1.2-bin-hadoop3.2

In [3]:
os.environ['SPARK_HOME'] = '/c/spark-3.1.2-bin-hadoop3.2'

### instala o pydeequ 

https://github.com/awslabs/python-deequ

In [4]:
!pip install pydeequ
!pip install pyspark



In [5]:
from pyspark.sql import SparkSession, Row

import pydeequ

spark = (SparkSession
            .builder
            .config("spark.jars.packages", pydeequ.deequ_maven_coord)
            .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
            .getOrCreate())

Please set env variable SPARK_VERSION
21/12/07 09:23:11 WARN Utils: Your hostname, DBC-0001023 resolves to a loopback address: 127.0.1.1; using 172.27.112.1 instead (on interface eth3)
21/12/07 09:23:11 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/c/spark-3.1.2-bin-hadoop3.2/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/taua/.ivy2/cache
The jars for the packages stored in: /home/taua/.ivy2/jars
com.amazon.deequ#deequ added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a9cceebf-c6a1-4605-a612-c8de5fe956c5;1.0
	confs: [default]
	found com.amazon.deequ#deequ;1.2.2-spark-3.0 in central
	found org.scalanlp#breeze_2.12;0.13.2 in central
	found org.scalanlp#breeze-macros_2.12;0.13.2 in central
	found org.scala-lang#scala-reflect;2.12.1 in central
	found com.github.fommil.netlib#core;1.1.2 in central
	found net.sf.opencsv#opencsv;2.3 in central
	found com.github.rwl#jtransforms;2.4.0 in central
	found junit#junit;4.8.2 in central
	found org.apache.commons#commons-math3;3.2 in central
	found org.spire-math#spire_2.12;0.13.0 in central
	found org.spire-math#spire-macros_2.12;0.13.0 in central
	found org.typelevel#machinist_2.12;0.6.1 in central
	found com.chuusai#shapeless_2.12;2.3.2 in central
	found org.typelevel#macro-compat_2.12;1.1.1 in ce

### Executando um teste para verificar se a sessão spark esta correta

In [6]:
df_test = spark.sparkContext.parallelize([
    Row(coluna1="Banana",  valor=1.50, quantidade=5),
    Row(coluna1="Maça",    valor=1.85, quantidade=6),
    Row(coluna1="Laranja", valor=3.00, quantidade=None)]).toDF()


df_test.toPandas()



Unnamed: 0,coluna1,valor,quantidade
0,Banana,1.5,5.0
1,Maça,1.85,6.0
2,Laranja,3.0,


### Agora que sabemos que nossa sessão spark esta correta, vamos importar um fonte de dados e criar nosso dataframe

## Importando dados 

Para este exemplo vou utilizar um parte dos dados de empresas brasileiras.
Estes dados são disponibilizados em .zip pelo governo federal

https://www.gov.br/receitafederal/pt-br/assuntos/orientacao-tributaria/cadastros/consultas/dados-publicos-cnpj

São vários zips, para facilitar salver um dos arquivos sem ./fonte_de_dados

In [7]:
from pyspark.sql.types import StructType,StructField, StringType

schema = StructType([ \
    StructField("cnpj_basico",StringType(),True), \
    StructField("razao_social",StringType(),True), \
    StructField("natureza_juridica",StringType(),True), \
    StructField("qualificacao_do_responsavel", StringType(), True), \
    StructField("capital_social", StringType(), True), \
    StructField("porte_da_empresa", StringType(), True),
    StructField("ente_federativo_responsavel", StringType(), True),
  ])


#detalhes sobre o layout em https://www.gov.br/receitafederal/pt-br/assuntos/orientacao-tributaria/cadastros/consultas/arquivos/novolayoutdosdadosabertosdocnpj-dez2021.pdf

In [8]:
df_dados_de_empresa = spark.read.csv('./fonte_de_dados/',sep=';',schema=schema)
df_dados_de_empresa.show()




+---------------------+--------------------+--------------------+---------------------------+--------------+----------------+---------------------------+
|          cnpj_basico|        razao_social|   natureza_juridica|qualificacao_do_responsavel|capital_social|porte_da_empresa|ente_federativo_responsavel|
+---------------------+--------------------+--------------------+---------------------------+--------------+----------------+---------------------------+
| PK   �zSmU�...|                null|                null|                       null|          null|            null|                       null|
| ����Mr�|� H�x�...|                null|                null|                       null|          null|            null|                       null|
| ,y+3��::���4��...|                null|                null|                       null|          null|            null|                       null|
| ?���U�f�m�`Ɏ�tX�...|                null|                null|             



### Analisando o dataframe

In [9]:
df_dados_de_empresa.describe().show()



+-------+--------------------+--------------------+--------------------+---------------------------+--------------------+--------------------+---------------------------+
|summary|         cnpj_basico|        razao_social|   natureza_juridica|qualificacao_do_responsavel|      capital_social|    porte_da_empresa|ente_federativo_responsavel|
+-------+--------------------+--------------------+--------------------+---------------------------+--------------------+--------------------+---------------------------+
|  count|              530545|              185662|               65729|                      23732|                8604|                3164|                       1164|
|   mean|           21881.425|   5.261654135338346|   6.571428571428571|          7.769230769230769|   5.888888888888889|                 8.0|                        9.0|
| stddev|   391311.5359487437|   8.003453059740613|    9.33143237944212|         10.017932638971695|   2.666666666666667|  1.4142135623730951|   

#### Vamos criar uma função para validar o CNPJ

In [10]:
from itertools import cycle

LENGTH_CNPJ = 14

def is_cnpj_valido(cnpj: str) -> bool:
    if len(cnpj) != LENGTH_CNPJ:
        return False

    if cnpj in (c * LENGTH_CNPJ for c in "1234567890"):
        return False

    cnpj_r = cnpj[::-1]
    for i in range(2, 0, -1):
        cnpj_enum = zip(cycle(range(2, 10)), cnpj_r[i:])
        dv = sum(map(lambda x: int(x[1]) * x[0], cnpj_enum)) * 10 % 11
        if cnpj_r[i - 1:i] != str(dv % 10):
            return False

    return True

#### Agora vamos criar uma nova coluna que vai receber o resultado da validação do CNPJ

In [11]:
from pyspark.sql.functions import col, lit
from pyspark.sql.functions import udf

is_cnpj_valido_lambda = udf(lambda x: is_cnpj_valido(x), StringType())
df_dados_de_empresa = df_dados_de_empresa.withColumn("cnpj_valido",is_cnpj_valido_lambda(col('cnpj_basico')))

### Data analysis

Before we define checks on the data, we want to calculate some statistics on the dataset; we call them metrics. As with Deequ, PyDeequ supports a rich set of metrics. For more information, see Test data quality at scale with Deequ or the GitHub repo. In the following example, we use the AnalysisRunner to capture the metrics you’re interested in:

In [12]:
from pydeequ.analyzers import *
analysisResult = AnalysisRunner(spark) \
    .onData(df_dados_de_empresa) \
    .addAnalyzer(Size()) \
    .addAnalyzer(Completeness("cnpj_basico")) \
    .addAnalyzer(Completeness("razao_social")) \
    .addAnalyzer(Completeness("natureza_juridica")) \
    .addAnalyzer(Completeness("qualificacao_do_responsavel")) \
    .addAnalyzer(Completeness("capital_social")) \
    .addAnalyzer(Completeness("porte_da_empresa")) \
    .addAnalyzer(Completeness("ente_federativo_responsavel")) \
    .run()
analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)
analysisResult_df.show()



+-------+--------------------+------------+--------------------+
| entity|            instance|        name|               value|
+-------+--------------------+------------+--------------------+
| Column|qualificacao_do_r...|Completeness|0.044543964923410735|
| Column|    porte_da_empresa|Completeness|0.005938694801014308|
| Column|        razao_social|Completeness|   0.348479757947509|
| Column|   natureza_juridica|Completeness| 0.12337056592157694|
|Dataset|                   *|        Size|            532777.0|
| Column|         cnpj_basico|Completeness|  0.9958106299633805|
| Column|      capital_social|Completeness| 0.01614934578632336|
| Column|ente_federativo_r...|Completeness|0.002184778997591863|
+-------+--------------------+------------+--------------------+



Com base em uma analise inicial do dataframe, já podemos observar que existem problemas nos campos de porte_da_empresa e ente_federativo_responsavel

In [13]:
print(f'Registros com ente_federativo_responsavel igual a null: {df_dados_de_empresa.filter(df_dados_de_empresa.ente_federativo_responsavel.isNull()).count()}')
print(f'Registros com porte_da_empresa igual a null: {df_dados_de_empresa.filter(df_dados_de_empresa.porte_da_empresa.isNull()).count()}')



Registros com ente_federativo_responsavel igual a null: 531613




Registros com porte_da_empresa igual a null: 529613




### Defining and running tests for data

```
After analyzing and understanding the data, we want to verify that the properties we have derived also hold for new versions of the dataset. By defining assertions on the data distribution as part of a data pipeline, we can ensure that every processed dataset is of high quality, and that any application consuming the data can rely on it.
```

### Algumas validações que vamos adicionar

```
CNPJ preenchido e unico

Razão Social preenchido

Código do porte da empresa precisa ser algum código válido:
    00  – NÃO INFORMADO 
    01  -  MICRO EMPRESA 
    03 - EMPRESA DE PEQUENO PORTE 
    05 - DEMAIS
    
Se o CNPJ é válido de acordo com a função que criamos    
```

In [15]:
from pydeequ.checks import *
from pydeequ.verification import *
check = Check(spark, CheckLevel.Warning, "Review Check")

checkResult = VerificationSuite(spark)\
    .onData(df_dados_de_empresa)\
    .addCheck(check.hasSize(lambda x: x >= 3)\
        .isComplete("cnpj_basico")\
        .isComplete("razao_social")\
        .isUnique("cnpj_basico")\
        .isContainedIn("porte_da_empresa", ["00", "01", "03", "05"]) \
        .isContainedIn("cnpj_valido", ['true']))\
    .run()

checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
checkResult_df.toPandas()

Python Callback server started!


21/12/07 09:24:22 ERROR Executor: Exception in task 4.0 in stage 12.0 (TID 54)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/c/spark-3.1.2-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 604, in main
    process()
  File "/c/spark-3.1.2-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 596, in process
    serializer.dump_stream(out_iter, outfile)
  File "/c/spark-3.1.2-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/serializers.py", line 211, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "/c/spark-3.1.2-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/serializers.py", line 132, in dump_stream
    for obj in iterator:
  File "/c/spark-3.1.2-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/serializers.py", line 200, in _batched
    for item in iterator:
  File "/c/spark-3.1.2-bin-hadoop3.2/python/lib/pyspark.zip/pyspark/worker.py", line 450, in mapper
    result = tuple(f(*[a[o] for o in ar

Unnamed: 0,check,check_level,check_status,constraint,constraint_status,constraint_message
0,Review Check,Warning,Warning,SizeConstraint(Size(None)),Failure,org.apache.spark.SparkException: Job aborted d...
1,Review Check,Warning,Warning,CompletenessConstraint(Completeness(cnpj_basic...,Failure,org.apache.spark.SparkException: Job aborted d...
2,Review Check,Warning,Warning,CompletenessConstraint(Completeness(razao_soci...,Failure,org.apache.spark.SparkException: Job aborted d...
3,Review Check,Warning,Warning,UniquenessConstraint(Uniqueness(List(cnpj_basi...,Failure,Value: 0.9712107361298288 does not meet the co...
4,Review Check,Warning,Warning,ComplianceConstraint(Compliance(porte_da_empre...,Failure,org.apache.spark.SparkException: Job aborted d...
5,Review Check,Warning,Warning,ComplianceConstraint(Compliance(cnpj_valido co...,Failure,org.apache.spark.SparkException: Job aborted d...


## Conclusão

```
Conseguimos importar uma fonte de dados e fazer uma analise e validação inicial. Isso pode contribuir para a qualidade dos dados que estamos trabalhando.
```