In [1]:
%run utils.ipynb

In [2]:
spark = get_spark(catalog="iceberg", storage="storage.io")
spark

In [3]:
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("s3a://ecommerce/raw/orders.csv")
)

df.printSchema()
df.show(2)

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------

In [3]:
import os
os.environ["SPARK_VERSION"] = "3.5"

from pydeequ.checks import *
from pydeequ.profiles import *
from pydeequ.analyzers import *
from pydeequ.suggestions import *
from pydeequ.verification import *
from pydeequ.repository import *

In [8]:
analysis = (
    AnalysisRunner(spark)
    .onData(df)
    .addAnalyzer(Size())
    .addAnalyzer(Completeness("order_id"))
    .run()
)

print(analysis)

analysis_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysis)
analysis_df.show()

AnalyzerContext(Map(Size(None) -> DoubleMetric(Dataset,Size,*,Success(99441.0),None), Completeness(order_id,None,None) -> DoubleMetric(Column,Completeness,order_id,Success(1.0),Some((order_id IS NOT NULL)))))
+-------+--------+------------+-------+
| entity|instance|        name|  value|
+-------+--------+------------+-------+
|Dataset|       *|        Size|99441.0|
| Column|order_id|Completeness|    1.0|
+-------+--------+------------+-------+





In [10]:
profiler = (
    ColumnProfilerRunner(spark)
    .onData(df)
    .run()
)

print(profiler)

for col, profile in profiler.profiles.items():
    print(profile)

<pydeequ.profiles.ColumnProfilesBuilder object at 0x748649cbe1d0>
StandardProfiles for column: order_id: {
    "completeness": 1.0,
    "approximateNumDistinctValues": 99770,
    "dataType": "String",
    "isDataTypeInferred": false,
    "typeCounts": {
        "Boolean": 0,
        "Fractional": 0,
        "Integral": 0,
        "Unknown": 0,
        "String": 99441
    },
    "histogram": null
}
StandardProfiles for column: order_delivered_customer_date: {
    "completeness": 0.9701833247855512,
    "approximateNumDistinctValues": 93654,
    "dataType": "String",
    "isDataTypeInferred": false,
    "typeCounts": {},
    "histogram": null
}
StandardProfiles for column: customer_id: {
    "completeness": 1.0,
    "approximateNumDistinctValues": 102585,
    "dataType": "String",
    "isDataTypeInferred": false,
    "typeCounts": {
        "Boolean": 0,
        "Fractional": 0,
        "Integral": 0,
        "Unknown": 0,
        "String": 99441
    },
    "histogram": null
}
StandardPr

In [19]:
constraint = (
    ConstraintSuggestionRunner(spark)
    .onData(df)
    .addConstraintRule(DEFAULT())
    .run()
)

pl.DataFrame(constraint["constraint_suggestions"])

constraint_name,column_name,current_value,description,suggesting_rule,rule_description,code_for_constraint
str,str,str,str,str,str,str
"""CompletenessConstraint(Complet…","""order_id""","""Completeness: 1.0""","""'order_id' is not null""","""CompleteIfCompleteRule()""","""If a column is complete in the…",""".isComplete(""order_id"")"""
"""UniquenessConstraint(Uniquenes…","""order_id""","""ApproxDistinctness: 1.00330849…","""'order_id' is unique""","""UniqueIfApproximatelyUniqueRul…","""If the ratio of approximate nu…",""".isUnique(""order_id"")"""
"""CompletenessConstraint(Complet…","""order_delivered_customer_date""","""Completeness: 0.97018332478555…","""'order_delivered_customer_date…","""RetainCompletenessRule()""","""If a column is incomplete in t…",""".hasCompleteness(""order_delive…"
"""CompletenessConstraint(Complet…","""customer_id""","""Completeness: 1.0""","""'customer_id' is not null""","""CompleteIfCompleteRule()""","""If a column is complete in the…",""".isComplete(""customer_id"")"""
"""UniquenessConstraint(Uniquenes…","""customer_id""","""ApproxDistinctness: 1.03161673…","""'customer_id' is unique""","""UniqueIfApproximatelyUniqueRul…","""If the ratio of approximate nu…",""".isUnique(""customer_id"")"""
…,…,…,…,…,…,…
"""CompletenessConstraint(Complet…","""order_status""","""Completeness: 1.0""","""'order_status' is not null""","""CompleteIfCompleteRule()""","""If a column is complete in the…",""".isComplete(""order_status"")"""
"""ComplianceConstraint(Complianc…","""order_status""","""Compliance: 0.9702034372140264""","""'order_status' has value range…","""FractionalCategoricalRangeRule…","""If we see a categorical range …",""".isContainedIn(""order_status"",…"
"""CompletenessConstraint(Complet…","""order_delivered_carrier_date""","""Completeness: 0.98206977001438…","""'order_delivered_carrier_date'…","""RetainCompletenessRule()""","""If a column is incomplete in t…",""".hasCompleteness(""order_delive…"
"""CompletenessConstraint(Complet…","""order_purchase_timestamp""","""Completeness: 1.0""","""'order_purchase_timestamp' is …","""CompleteIfCompleteRule()""","""If a column is complete in the…",""".isComplete(""order_purchase_ti…"


In [17]:
from pyspark.sql.functions import concat, substring, lit

check = Check(spark, CheckLevel.Warning, "Check")

verification = (
    VerificationSuite(spark).onData(df)
    .addCheck(
        check
        .isUnique("order_id")
        .isComplete("order_id")
        .isComplete("customer_id")
        .isComplete("order_purchase_timestamp")
        .isComplete("order_estimated_delivery_date")
        .isContainedIn("order_status", ["approved" ,"processing" ,"invoiced" ,"canceled" ,"delivered" ,"shipped" ,"unavailable" ,"created"])
    ).run()
)

verification_df = VerificationResult.checkResultsAsDataFrame(spark, verification)

verification_df.show(truncate=False)

+-----+-----------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+------------------+
|check|check_level|check_status|constraint                                                                                                                                                                                                                                                                                                    |constraint_status|constraint_message|
+-----+-----------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [5]:
repository = FileSystemMetricsRepository(spark, "s3a://ecommerce/metrics/metrics.orders.json")
result_key = ResultKey(spark, ResultKey.current_milli_time(), {"tag": "orders"})

analysis = (
    AnalysisRunner(spark)
    .onData(df)
    .addAnalyzer(Size())
    .addAnalyzer(Completeness("order_id"))
    .useRepository(repository)
    .saveOrAppendResult(result_key)
    .run()
)

print(analysis)

analysis_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysis)
analysis_df.show()

AnalyzerContext(Map(Size(None) -> DoubleMetric(Dataset,Size,*,Success(99441.0),None), Completeness(order_id,None,None) -> DoubleMetric(Column,Completeness,order_id,Success(1.0),Some((order_id IS NOT NULL)))))
+-------+--------+------------+-------+
| entity|instance|        name|  value|
+-------+--------+------------+-------+
|Dataset|       *|        Size|99441.0|
| Column|order_id|Completeness|    1.0|
+-------+--------+------------+-------+





In [10]:
import json

storage = get_storage("storage.io")
response = storage.get_object("ecommerce", "metrics/metrics.orders.json")
json.loads(response.read())

[{'resultKey': {'dataSetDate': 1722937954894, 'tags': {'tag': 'orders'}},
  'analyzerContext': {'metricMap': [{'analyzer': {'analyzerName': 'Size'},
     'metric': {'metricName': 'DoubleMetric',
      'entity': 'Dataset',
      'instance': '*',
      'name': 'Size',
      'value': 99441.0}},
    {'analyzer': {'analyzerName': 'Completeness', 'column': 'order_id'},
     'metric': {'metricName': 'DoubleMetric',
      'entity': 'Column',
      'instance': 'order_id',
      'name': 'Completeness',
      'value': 1.0}}]}}]

In [18]:
check = Check(spark, CheckLevel.Error, "Check")

verification = (
    VerificationSuite(spark).onData(df)
    .addCheck(check.isUnique("order_status"))
    .run()
)

verification_df = VerificationResult.checkResultsAsDataFrame(spark, verification)

verification_df.show(truncate=False)

+-----+-----------+------------+--------------------------------------------------------------+-----------------+----------------------------------------------------+
|check|check_level|check_status|constraint                                                    |constraint_status|constraint_message                                  |
+-----+-----------+------------+--------------------------------------------------------------+-----------------+----------------------------------------------------+
|Check|Error      |Error       |UniquenessConstraint(Uniqueness(List(order_status),None,None))|Failure          |Value: 0.0 does not meet the constraint requirement!|
+-----+-----------+------------+--------------------------------------------------------------+-----------------+----------------------------------------------------+



In [33]:
verification_df.filter(verification_df.check_status == "Error").count()

1

In [2]:
spark = get_spark(catalog="iceberg", storage="lakehouse.io")
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("s3a://ecommerce/staging/orders/orders.2017-01-05.csv")
)

df.printSchema()
df.show(2)

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------

In [4]:
import os
os.environ["SPARK_VERSION"] = "3.5"

from pydeequ.checks import *
from pydeequ.profiles import *
from pydeequ.analyzers import *
from pydeequ.suggestions import *
from pydeequ.verification import *
from pydeequ.repository import *

from pyspark.sql.functions import concat, substring, lit

check = Check(spark, CheckLevel.Error, "orders")

verification = (
    VerificationSuite(spark).onData(df)
    .addCheck(
        check
        .isUnique("order_id")
        .isComplete("order_id")
        .isComplete("customer_Fid")
        .isComplete("order_purchase_timestamp")
        .isComplete("order_estimated_delivery_date")
        .isContainedIn("order_status", ["approved" ,"processing" ,"invoiced" ,"canceled" ,"delivered" ,"shipped" ,"unavailable" ,"created"])
    ).run()
)

verification_df = VerificationResult.checkResultsAsDataFrame(spark, verification)

verification_df.show(truncate=False)

+------+-----------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+------------------------------------------------+
|check |check_level|check_status|constraint                                                                                                                                                                                                                                                                                                    |constraint_status|constraint_message                              |
+------+-----------+------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------

In [6]:
spark.sql("select * from iceberg.ecommerce.orders").show()

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+----+-----+---+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|year|month|day|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+----+-----+---+
|09f58c00f941827ab...|7c94da97db6fe83e1...|   delivered|     2017-01-05 16:05:07|2017-01-07 03:35:34|         2017-01-11 15:47:40|          2017-01-16 15:43:31|          2017-02-13 00:00:00|2017|    1|  5|
|0bda8164c1a12b6a3...|3f402674c608ea670...|   delivered|     2017-01-05 13:36:07|2017-01-07 03:45:47|         2017-01-11 16:09:00|          2017-01-16 17:27:34|          2017-0

In [26]:
schema = (
    "review_id string,"
    "order_id string,"
    "review_score int,"
    "review_comment_title string,"
    "review_comment_message string,"
    "review_creation_date timestamp,"
    "review_answer_timestamp timestamp"
)

df = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("header", "true")
    .load("s3a://ecommerce/raw/order_reviews.csv")
)

df.printSchema()
df.show(2)

root
 |-- review_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- review_score: string (nullable = true)
 |-- review_comment_title: string (nullable = true)
 |-- review_comment_message: string (nullable = true)
 |-- review_creation_date: string (nullable = true)
 |-- review_answer_timestamp: string (nullable = true)

+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|           review_id|            order_id|review_score|review_comment_title|review_comment_message|review_creation_date|review_answer_timestamp|
+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|7bc2406110b926393...|73fc7af87114b3971...|           4|                NULL|                  NULL| 2018-01-18 00:00:00|    2018-01-18 21:46:59|
|80e641a11e56f04c1...|a548910a1c6147796...|           5|                

In [44]:
schema = (
    "review_id string,"
    "order_id string,"
    "review_score int,"
    "review_comment_title string,"
    "review_comment_message string,"
    "review_creation_date timestamp,"
    "review_answer_timestamp timestamp"
)

df = (
    spark.read
    .format("csv")
    .schema(schema)
    .option("header", "true")
    .option("inferSchema", "false")
    .load("s3a://ecommerce/raw/order_reviews.csv")
)

df.printSchema()
df.show(2)

root
 |-- review_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- review_score: integer (nullable = true)
 |-- review_comment_title: string (nullable = true)
 |-- review_comment_message: string (nullable = true)
 |-- review_creation_date: timestamp (nullable = true)
 |-- review_answer_timestamp: timestamp (nullable = true)

+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|           review_id|            order_id|review_score|review_comment_title|review_comment_message|review_creation_date|review_answer_timestamp|
+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|7bc2406110b926393...|73fc7af87114b3971...|           4|                NULL|                  NULL| 2018-01-18 00:00:00|    2018-01-18 21:46:59|
|80e641a11e56f04c1...|a548910a1c6147796...|           5|         

In [45]:
df.show()

+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|           review_id|            order_id|review_score|review_comment_title|review_comment_message|review_creation_date|review_answer_timestamp|
+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|7bc2406110b926393...|73fc7af87114b3971...|           4|                NULL|                  NULL| 2018-01-18 00:00:00|    2018-01-18 21:46:59|
|80e641a11e56f04c1...|a548910a1c6147796...|           5|                NULL|                  NULL| 2018-03-10 00:00:00|    2018-03-11 03:05:13|
|228ce5500dc1d8e02...|f9e4b658b201a9f2e...|           5|                NULL|                  NULL| 2018-02-17 00:00:00|    2018-02-18 14:36:24|
|e64fb393e7b32834b...|658677c97b385a9be...|           5|                NULL|  Recebi bem antes ...| 2017-04-21 00:00:00|   

In [24]:
df.select("review_score").na.drop().show()

+------------+
|review_score|
+------------+
+------------+



In [46]:
print(df.na.drop(subset=["review_score"]).count())
df.na.drop(subset=["review_score"]).show()

99225
+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|           review_id|            order_id|review_score|review_comment_title|review_comment_message|review_creation_date|review_answer_timestamp|
+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|7bc2406110b926393...|73fc7af87114b3971...|           4|                NULL|                  NULL| 2018-01-18 00:00:00|    2018-01-18 21:46:59|
|80e641a11e56f04c1...|a548910a1c6147796...|           5|                NULL|                  NULL| 2018-03-10 00:00:00|    2018-03-11 03:05:13|
|228ce5500dc1d8e02...|f9e4b658b201a9f2e...|           5|                NULL|                  NULL| 2018-02-17 00:00:00|    2018-02-18 14:36:24|
|e64fb393e7b32834b...|658677c97b385a9be...|           5|                NULL|  Recebi bem antes ...| 2017-04-21 00:00:

In [47]:
idf = df.filter(col("review_comment_title").cast("integer").isNotNull())
print(idf.count())
idf.show()

323
+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|           review_id|            order_id|review_score|review_comment_title|review_comment_message|review_creation_date|review_answer_timestamp|
+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|14613df954694f243...|1e91914c9706a5d46...|           4|                  4 |   Se fosse vidro t...|                NULL|                   NULL|
|382d855ffdd4f5e66...|f506fd1443d94ad16...|           5|                  10|  Cada vez que comp...| 2018-06-03 00:00:00|    2018-06-04 00:38:19|
|0a9d200bdef258e70...|f9dba21dfbb520ed9...|           5|             1000000|             Muito bom| 2018-05-11 00:00:00|    2018-05-12 11:07:23|
|3814f35a6ebd64d27...|19d1e5344936559f5...|           4|                   5|                  NULL| 2018-07-11 00:00:00

In [39]:
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, NumericType

_df = (
    df
    .filter(col("review_comment_title").dtype == NumericType), 
)

_df.show()

AttributeError: type object 'NumericType' has no attribute '_get_object_id'

In [48]:
profiler = (
    ColumnProfilerRunner(spark)
    .onData(df)
    .run()
)

for col, profile in profiler.profiles.items():
    print(profile)

NumericProfiles for column: review_score: {
    "completeness": 0.9526026765999117,
    "approximateNumDistinctValues": 6,
    "dataType": "Integral",
    "isDataTypeInferred": false,
    "typeCounts": {},
    "histogram": [
        [
            "4",
            19142,
            0.18377143296019663
        ],
        [
            "NullValue",
            4937,
            0.04739732340008832
        ],
        [
            "5",
            57328,
            0.5503734567308616
        ],
        [
            "1",
            11424,
            0.10967531345404274
        ],
        [
            "0",
            1,
            9.600430099268447e-06
        ],
        [
            "2",
            3151,
            0.030250955242794877
        ],
        [
            "3",
            8179,
            0.07852191778191663
        ]
    ],
    "kll": "None",
    "mean": 4.086379440665155,
    "maximum": 5.0,
    "minimum": 0.0,
    "sum": 405471.0,
    "stdDev": 1.3476279911235343

In [56]:
from pyspark.sql.functions import col

cdf = df.withColumn("review_comment_title", col("review_comment_title").cast("integer"))

cdf.show()

+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|           review_id|            order_id|review_score|review_comment_title|review_comment_message|review_creation_date|review_answer_timestamp|
+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|7bc2406110b926393...|73fc7af87114b3971...|           4|                NULL|                  NULL| 2018-01-18 00:00:00|    2018-01-18 21:46:59|
|80e641a11e56f04c1...|a548910a1c6147796...|           5|                NULL|                  NULL| 2018-03-10 00:00:00|    2018-03-11 03:05:13|
|228ce5500dc1d8e02...|f9e4b658b201a9f2e...|           5|                NULL|                  NULL| 2018-02-17 00:00:00|    2018-02-18 14:36:24|
|e64fb393e7b32834b...|658677c97b385a9be...|           5|                NULL|  Recebi bem antes ...| 2017-04-21 00:00:00|   

In [60]:
profiler = (
    ColumnProfilerRunner(spark)
    .onData(cdf)
    .run()
)

for _, profile in profiler.profiles.items():
    print(profile)

NumericProfiles for column: review_score: {
    "completeness": 0.9526026765999117,
    "approximateNumDistinctValues": 6,
    "dataType": "Integral",
    "isDataTypeInferred": false,
    "typeCounts": {},
    "histogram": [
        [
            "4",
            19142,
            0.18377143296019663
        ],
        [
            "NullValue",
            4937,
            0.04739732340008832
        ],
        [
            "5",
            57328,
            0.5503734567308616
        ],
        [
            "1",
            11424,
            0.10967531345404274
        ],
        [
            "0",
            1,
            9.600430099268447e-06
        ],
        [
            "2",
            3151,
            0.030250955242794877
        ],
        [
            "3",
            8179,
            0.07852191778191663
        ]
    ],
    "kll": "None",
    "mean": 4.086379440665155,
    "maximum": 5.0,
    "minimum": 0.0,
    "sum": 405471.0,
    "stdDev": 1.3476279911235343

In [62]:
df.filter(col("review_score").isNull()).show()

+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|           review_id|            order_id|review_score|review_comment_title|review_comment_message|review_creation_date|review_answer_timestamp|
+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|,2018-02-16 00:00...|                NULL|        NULL|                NULL|                  NULL|                NULL|                   NULL|
|A entrega foi efe...|                NULL|        NULL|                NULL|                  NULL|                NULL|                   NULL|
|O produto já come...|                NULL|        NULL|                NULL|                  NULL|                NULL|                   NULL|
|     sem problemas."| 2018-03-23 00:00:00|        NULL|                NULL|                  NULL|                NULL|   

In [63]:
df.filter(col("review_score").isNull()).count()

4937

In [72]:
df.filter(col("review_creation_date").isNull()).show(truncate=False)

+---------------------------------------------------+--------------------------------+------------+--------------------+----------------------------------------------------------------------+--------------------+-----------------------+
|review_id                                          |order_id                        |review_score|review_comment_title|review_comment_message                                                |review_creation_date|review_answer_timestamp|
+---------------------------------------------------+--------------------------------+------------+--------------------+----------------------------------------------------------------------+--------------------+-----------------------+
|4b49719c8a200003f700d3d986ea1a19                   |9d6f15f95d01e79bd1349cc208361f09|4           |NULL                |Mas um pouco ,travando...pelo valor ta Boa.                           |NULL                |NULL                   |
|,2018-02-16 00:00:00,2018-02-20 10:52:22           

In [65]:
df.filter(col("review_creation_date").isNull()).count()

8832

In [66]:
df.show()

+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|           review_id|            order_id|review_score|review_comment_title|review_comment_message|review_creation_date|review_answer_timestamp|
+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|7bc2406110b926393...|73fc7af87114b3971...|           4|                NULL|                  NULL| 2018-01-18 00:00:00|    2018-01-18 21:46:59|
|80e641a11e56f04c1...|a548910a1c6147796...|           5|                NULL|                  NULL| 2018-03-10 00:00:00|    2018-03-11 03:05:13|
|228ce5500dc1d8e02...|f9e4b658b201a9f2e...|           5|                NULL|                  NULL| 2018-02-17 00:00:00|    2018-02-18 14:36:24|
|e64fb393e7b32834b...|658677c97b385a9be...|           5|                NULL|  Recebi bem antes ...| 2017-04-21 00:00:00|   

In [73]:
print(df.filter("review_score is null or order_id is null").count())
df.filter("review_score is null or order_id is null or length(review_id) <> 32").show(truncate=False)

4937
+------------------------------------------------------+--------------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|review_id                                             |order_id                  |review_score|review_comment_title|review_comment_message|review_creation_date|review_answer_timestamp|
+------------------------------------------------------+--------------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|,2018-02-16 00:00:00,2018-02-20 10:52:22              |NULL                      |NULL        |NULL                |NULL                  |NULL                |NULL                   |
|A entrega foi efetuada muito antes do prazo dado.     |NULL                      |NULL        |NULL                |NULL                  |NULL                |NULL                   |
|O produto já começou a ser usado e até o presente     |NULL     

In [78]:
cdf = df.filter(
    "order_id is not null and "
    "review_id is not null and "
    "review_score is not null and "
    "length(review_id) = 32"
)

print(cdf.count())
cdf.show()

99224
+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|           review_id|            order_id|review_score|review_comment_title|review_comment_message|review_creation_date|review_answer_timestamp|
+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|7bc2406110b926393...|73fc7af87114b3971...|           4|                NULL|                  NULL| 2018-01-18 00:00:00|    2018-01-18 21:46:59|
|80e641a11e56f04c1...|a548910a1c6147796...|           5|                NULL|                  NULL| 2018-03-10 00:00:00|    2018-03-11 03:05:13|
|228ce5500dc1d8e02...|f9e4b658b201a9f2e...|           5|                NULL|                  NULL| 2018-02-17 00:00:00|    2018-02-18 14:36:24|
|e64fb393e7b32834b...|658677c97b385a9be...|           5|                NULL|  Recebi bem antes ...| 2017-04-21 00:00:

In [80]:
cdf = df.filter(
    "order_id is not null and "
    "review_id is not null and "
    "review_score is not null and "
    "review_creation_date is not null and "
    "length(review_id) = 32"
)

print(cdf.count())
cdf.show()

95307
+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|           review_id|            order_id|review_score|review_comment_title|review_comment_message|review_creation_date|review_answer_timestamp|
+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|7bc2406110b926393...|73fc7af87114b3971...|           4|                NULL|                  NULL| 2018-01-18 00:00:00|    2018-01-18 21:46:59|
|80e641a11e56f04c1...|a548910a1c6147796...|           5|                NULL|                  NULL| 2018-03-10 00:00:00|    2018-03-11 03:05:13|
|228ce5500dc1d8e02...|f9e4b658b201a9f2e...|           5|                NULL|                  NULL| 2018-02-17 00:00:00|    2018-02-18 14:36:24|
|e64fb393e7b32834b...|658677c97b385a9be...|           5|                NULL|  Recebi bem antes ...| 2017-04-21 00:00:

In [79]:
cdf = df.filter(
    "order_id is not null and "
    "review_id is not null and "
    "review_score is not null and "
    "length(review_id) = 32"
)

check = Check(spark, CheckLevel.Warning, "order_reviews")

check_results = (
    VerificationSuite(spark)
    .onData(cdf)
    .addCheck(
        check
        .isComplete("order_id")
        .isComplete("review_id")
        .isComplete("review_creation_date")
        .hasDataType("review_score", ConstrainableDataTypes.Integral)
    )
    .run()
)

verification_df = VerificationResult.checkResultsAsDataFrame(spark, check_results)

verification_df.show(truncate=False)

+-------------+-----------+------------+---------------------------------------------------------------------------------------+-----------------+------------------------------------------------------------------+
|check        |check_level|check_status|constraint                                                                             |constraint_status|constraint_message                                                |
+-------------+-----------+------------+---------------------------------------------------------------------------------------+-----------------+------------------------------------------------------------------+
+-------------+-----------+------------+---------------------------------------------------------------------------------------+-----------------+------------------------------------------------------------------+



### order_payments

In [81]:
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("s3a://ecommerce/raw/order_payments.csv")
)

df.printSchema()
df.show(2)

root
 |-- order_id: string (nullable = true)
 |-- payment_sequential: integer (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: integer (nullable = true)
 |-- payment_value: double (nullable = true)

+--------------------+------------------+------------+--------------------+-------------+
|            order_id|payment_sequential|payment_type|payment_installments|payment_value|
+--------------------+------------------+------------+--------------------+-------------+
|b81ef226f3fe1789b...|                 1| credit_card|                   8|        99.33|
|a9810da82917af2d9...|                 1| credit_card|                   1|        24.39|
+--------------------+------------------+------------+--------------------+-------------+
only showing top 2 rows



In [88]:
cdf = df

check = Check(spark, CheckLevel.Warning, "order_payments")

check_results = (
    VerificationSuite(spark)
    .onData(cdf)
    .addCheck(
        check
        .isComplete("order_id")
        .hasDataType("payment_value", ConstrainableDataTypes.Fractional)
        .hasDataType("payment_sequential", ConstrainableDataTypes.Integral)
        .hasDataType("payment_installments", ConstrainableDataTypes.Integral)
        .isContainedIn("payment_type", ["credit_card", "debit_card", "voucher", "boleto", "not_defined"])
    )
    .run()
)

verification_df = VerificationResult.checkResultsAsDataFrame(spark, check_results)

verification_df.show(truncate=False)

+--------------+-----------+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+------------------+
|check         |check_level|check_status|constraint                                                                                                                                                                                                                                          |constraint_status|constraint_message|
+--------------+-----------+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+------------------+
+--------------+-----------+