In [1]:
import pydeequ

In [2]:
from pyspark.sql import SparkSession

In [4]:
import sagemaker_pyspark

In [5]:
# Join the jar paths reported by sagemaker_pyspark into one colon-separated
# classpath string for the Spark driver configuration.
sagemaker_jars = sagemaker_pyspark.classpath_jars()
classpath = ":".join(sagemaker_jars)

In [6]:
# Create (or reuse) the Spark session. The three settings put the SageMaker
# jars on the driver classpath, request the Deequ package by its Maven
# coordinate, and exclude the f2j artifact named by pydeequ.
session_builder = SparkSession.builder
for conf_key, conf_value in (
    ("spark.driver.extraClassPath", classpath),
    ("spark.jars.packages", pydeequ.deequ_maven_coord),
    ("spark.jars.excludes", pydeequ.f2j_maven_coord),
):
    session_builder = session_builder.config(conf_key, conf_value)
spark = session_builder.getOrCreate()

In [7]:
# Load the tab-separated reviews file; the first row supplies the column names.
reviews_path = "amazon_reviews_us_Camera_v1_00.tsv"
df = spark.read.csv(reviews_path, sep="\t", header=True)

In [8]:
# Print the column names and types of the loaded reviews DataFrame.
# No schema or schema inference was requested when reading the file, and the
# output below shows every column typed as string.
df.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- helpful_votes: string (nullable = true)
 |-- total_votes: string (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: string (nullable = true)



In [9]:
from pydeequ.checks import *
from pydeequ.verification import *
# Warning-level Check that later cells attach data-quality constraints to.
check_description = "Check amazon file"
check = Check(spark, CheckLevel.Warning, check_description)

In [12]:
# Expose the reviews DataFrame to Spark SQL under the view name "table".
df.createOrReplaceTempView("table")

# Keep the query result itself in `res_sql`. DataFrame.show() only prints and
# returns None, so assigning its return value (as this cell previously did)
# left `res_sql` bound to None instead of the one-row sample.
res_sql = spark.sql('SELECT review_date from table limit 1')
res_sql.show()

+-----------+
|review_date|
+-----------+
| 2015-08-31|
+-----------+



In [62]:
# Build a fresh Check inside this cell instead of chaining onto the shared
# `check` object. PyDeequ's constraint methods add to the Check they are called
# on, so re-running this cell against the shared object would append another
# copy of every constraint and inflate the results table.
review_check = (
    Check(spark, CheckLevel.Warning, "Check amazon file")
    # Every review must have an id, and ids must not repeat.
    .isComplete("review_id")
    .isUnique("review_id")
    # verified_purchase may only take the values "N" or "Y".
    .isContainedIn("verified_purchase", ["N", "Y"])
    # total_votes is read as a string; require its values to look integral.
    .hasDataType("total_votes", ConstrainableDataTypes.Integral)
    # Raw string so `\d` reaches the regex engine without an invalid-escape
    # warning; the pattern text is unchanged. The assertion requires the
    # matching fraction to equal 1, i.e. every row matches.
    # NOTE(review): the pattern is not anchored with ^...$ — confirm whether a
    # date embedded in a longer string should also count as valid.
    .hasPattern(
        "review_date",
        r"([12]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01]))",
        lambda x: x == 1,
    )
)

# Run all constraints of the check against the reviews DataFrame.
checkResult_1 = VerificationSuite(spark).onData(df).addCheck(review_check).run()

In [64]:
# Turn the per-constraint outcomes of the verification run into a DataFrame.
checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult_1)

# Preview the outcomes, then register them as the "check_res" view for SQL.
checkResult_df.show()
checkResult_df.createOrReplaceTempView("check_res")

+-----------------+-----------+------------+--------------------+-----------------+------------------+
|            check|check_level|check_status|          constraint|constraint_status|constraint_message|
+-----------------+-----------+------------+--------------------+-----------------+------------------+
+-----------------+-----------+------------+--------------------+-----------------+------------------+
only showing top 20 rows



In [65]:
# Show the metrics computed while evaluating the constraints.
# Uses `checkResult_1`, the verification result produced by the earlier cell.
# The previous argument `checkResult` is not defined anywhere in the visible
# notebook and raises NameError on a fresh kernel.
# NOTE(review): this rebinds `checkResult_df`, which an earlier cell used for
# the constraint results; the name is kept so later cells keep working, but a
# distinct name would make the notebook easier to follow.
checkResult_df = VerificationResult.successMetricsAsDataFrame(spark, checkResult_1)
checkResult_df.show()

+------+--------------------+--------------------+--------------------+
|entity|            instance|                name|               value|
+------+--------------------+--------------------+--------------------+
|Column|           review_id|        Completeness|                 1.0|
|Column|           review_id|          Uniqueness|                 1.0|
|Column|         total_votes|      Histogram.bins|                 5.0|
|Column|         total_votes|Histogram.abs.Boo...|                 0.0|
|Column|         total_votes|Histogram.ratio.B...|                 0.0|
|Column|         total_votes|Histogram.abs.Fra...|                 0.0|
|Column|         total_votes|Histogram.ratio.F...|                 0.0|
|Column|         total_votes|Histogram.abs.Int...|           1801972.0|
|Column|         total_votes|Histogram.ratio.I...|  0.9999988901060726|
|Column|         total_votes|Histogram.abs.Unk...|                 2.0|
|Column|         total_votes|Histogram.ratio.U...|1.109893927437