In [None]:
import time
import pandas as pd

In [None]:
# Shared benchmark configuration and input data used by the cells below.
nval = 5  # number of validation runs to average the runtime over
sf = "3M"  # dataset size label; interpolated into file names and printed labels
# pandas frame validated by the duckdq, great_expectations and hooqu benchmarks.
# NOTE(review): this loads the "_small" file, whereas the pydeequ cell reads
# hotel_search_logs_{sf}.csv (no suffix) — confirm both contain the same rows,
# otherwise the reported runtimes are not comparable.
df = pd.read_csv(f"../data/hotel_search_logs_{sf}_small.csv")

## duckdq

In [None]:
import duckdq

In [None]:
# Benchmark duckdq: run the same 10-constraint suite `nval` times against the
# pandas frame `df` and print the mean duration of one run in seconds.
runtimes = 0
for i in range(nval):
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock changes.
    start = time.perf_counter()
    checkResult = (
        duckdq.VerificationSuite()
        .on_data(df)
        .add_check(
            duckdq.Check(duckdq.CheckLevel.EXCEPTION, "Check Error")
            .is_complete("date_time")
            .is_complete("site_name")
            .is_complete("posa_continent")
            .is_complete("user_location_country")
            .is_complete("user_location_city")
            .is_unique("orig_destination_distance")
            .is_unique("user_id")
            .is_unique("is_mobile")
            .is_unique("is_package")
            .is_unique("channel")
        )
        .run()
    )
    end = time.perf_counter()
    runtimes += end - start
print(f"DDQ (Validation, {sf} rows): {runtimes/nval}")

## great_expectations

In [None]:
import great_expectations as ge

In [None]:
# Benchmark great_expectations: evaluate the same completeness / uniqueness
# expectations `nval` times on `df` and print the mean duration in seconds.
# The column lists are built outside the timed region.
ge_not_null_columns = (
    "date_time",
    "site_name",
    "posa_continent",
    "user_location_country",
    "user_location_city",
)
ge_unique_columns = (
    "orig_destination_distance",
    "user_id",
    "is_mobile",
    "is_package",
    "channel",
)

runtimes = 0
for i in range(nval):
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock changes.
    start = time.perf_counter()
    # The pandas -> great_expectations wrapping is part of the timed work,
    # exactly as in the original measurement.
    ge_df = ge.from_pandas(df)
    for ge_column in ge_not_null_columns:
        ge_df.expect_column_values_to_not_be_null(ge_column)
    for ge_column in ge_unique_columns:
        ge_df.expect_column_values_to_be_unique(ge_column)
    end = time.perf_counter()
    runtimes += end - start
print(f"GE (Validation, {sf} rows): {runtimes/nval}")

## pydeequ

In [None]:
import findspark
findspark.init()
import pydeequ
from pyspark.sql import SparkSession, Row
from pydeequ.verification import VerificationSuite
from pydeequ.checks import Check, CheckLevel

In [None]:
# Time how long it takes to obtain a SparkSession configured with the Deequ
# Maven coordinates exposed by pydeequ.
# NOTE(review): getOrCreate() returns an already-running session if one
# exists, so re-running this cell in a live kernel measures session reuse,
# not a cold start. Restart the kernel for a representative number.
start = time.perf_counter()  # monotonic clock, suited to interval timing
spark = (
    SparkSession.builder
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate()
)
end = time.perf_counter()
print(f"DQ (Startup): {end-start}")

In [None]:
# Time loading the CSV into Spark. Despite its name, `pd_df` is a Spark
# DataFrame (it comes from spark.read), not a pandas one.
# NOTE(review): this reads hotel_search_logs_{sf}.csv, while the pandas frame
# `df` is loaded from the "_small" variant — confirm both hold the same rows.
start = time.perf_counter()  # monotonic clock, suited to interval timing
pd_df = spark.read.csv(f"../data/hotel_search_logs_{sf}.csv", header=True)
# Mark the frame for caching, then run count() so the read actually executes
# inside the timed window instead of lazily during the first validation run.
pd_df.cache()
pd_df.count()
end = time.perf_counter()
print(f"DQ (Data Transfer, {sf} rows): {end-start}")

In [None]:
# Benchmark pydeequ: run the same 10-constraint suite `nval` times against the
# cached Spark DataFrame `pd_df` and print the mean duration in seconds.
#
# The pydeequ classes are referenced through their modules rather than the
# bare names `VerificationSuite` / `Check` / `CheckLevel`: the hooqu import
# cell rebinds those same names, so re-running this cell after it would
# otherwise silently construct hooqu objects.
runtimes = 0
for i in range(nval):
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock changes.
    start = time.perf_counter()
    checkResult = (
        pydeequ.verification.VerificationSuite(spark)
        .onData(pd_df)
        .addCheck(
            pydeequ.checks.Check(spark, pydeequ.checks.CheckLevel.Error, "Check Error")
            .isComplete("date_time")
            .isComplete("site_name")
            .isComplete("posa_continent")
            .isComplete("user_location_country")
            .isComplete("user_location_city")
            .isUnique("orig_destination_distance")
            .isUnique("user_id")
            .isUnique("is_mobile")
            .isUnique("is_package")
            .isUnique("channel")
        )
        .run()
    )
    end = time.perf_counter()
    runtimes += end - start
print(f"DQ (Validation, {sf} rows): {runtimes/nval}")

## hooqu

In [None]:
from hooqu.checks import Check, CheckLevel, CheckStatus
from hooqu.verification_suite import VerificationSuite
from hooqu.constraints import ConstraintStatus

In [None]:
# Benchmark hooqu: run the same 10-constraint suite `nval` times against the
# pandas frame `df` and print the mean duration of one run in seconds.
# NOTE(review): `VerificationSuite`, `Check` and `CheckLevel` here must be the
# hooqu classes; the pydeequ import cell binds the same names, so the hooqu
# import cell has to be the most recently executed of the two.
runtimes = 0
for i in range(nval):
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock changes.
    start = time.perf_counter()
    basic_check = (
        Check(CheckLevel.ERROR, "Basic Check")
        .is_complete("date_time")
        .is_complete("site_name")
        .is_complete("posa_continent")
        .is_complete("user_location_country")
        .is_complete("user_location_city")
        .is_unique("orig_destination_distance")
        .is_unique("user_id")
        .is_unique("is_mobile")
        .is_unique("is_package")
        .is_unique("channel")
    )
    verification_result = (
        VerificationSuite()
        .on_data(df)
        .add_check(basic_check)
        .run()
    )
    end = time.perf_counter()
    runtimes += end - start
print(f"HQ (Validation, {sf} rows): {runtimes/nval}")