In [0]:
#%pip install great_expectations
import great_expectations as ge

# Validate the bronze COVID table with the legacy (pre-1.0) pandas Dataset API.

# 1) Load the Delta table into pandas.
#    NOTE: toPandas() pulls the whole table onto the driver; for a large table
#    validate the Spark DataFrame directly instead.
df = spark.read.format("delta").load("/tmp/bronze/covid_nyt").toPandas()
context = ge.get_context()

# 2) Create the suite, replacing any earlier version so the cell can be re-run.
suite = context.create_expectation_suite("covid_quality", overwrite_existing=True)

# 3) Wrap the pandas DataFrame in a GE Dataset bound to that suite.
#    from_pandas() takes the suite object (`expectation_suite=`); it has no
#    `expectation_suite_name` parameter.
gedf = ge.from_pandas(df, expectation_suite=suite)

# 4) Declare the expectations (each call also evaluates immediately).
gedf.expect_column_values_to_not_be_null("date")
gedf.expect_column_values_to_be_between("cases", min_value=0)
# Uniqueness of the (date, county, state) combination is a compound-column
# check; expect_column_values_to_be_unique only accepts a single column name.
gedf.expect_compound_columns_to_be_unique(column_list=["date", "county", "state"])

# 5) Validate every expectation recorded on the dataset.
#    A legacy Dataset cannot produce a batch request for a checkpoint, so the
#    validation is run directly on the dataset.
results = gedf.validate()
print(results.statistics)

In [0]:
from great_expectations.dataset.sparkdf_dataset import SparkDFDataset

# Validate the bronze table in Spark using the legacy SparkDFDataset wrapper,
# so the data never has to be collected into pandas.

# Load the table as a Spark DataFrame.
sdf = spark.table("bronze.covid_nyt")

# Wrap it in a GE Dataset that records expectations under "covid_quality".
gedf_spark = SparkDFDataset(sdf, expectation_suite_name="covid_quality")

# Declare the expectations.
gedf_spark.expect_column_values_to_not_be_null("date")
gedf_spark.expect_column_values_to_be_between("cases", min_value=0)
# The (date, county, state) combination must be unique: that is a
# compound-column expectation, not the single-column uniqueness check.
gedf_spark.expect_compound_columns_to_be_unique(column_list=["date", "county", "state"])

# Run validation directly on the dataset.
# validate() returns an ExpectationSuiteValidationResult, whose summary counts
# are exposed through the `statistics` attribute.
results = gedf_spark.validate()
print(results.statistics)


In [0]:
%pip install --upgrade "great_expectations[spark]"

In [0]:
import great_expectations as ge
from great_expectations.core.batch import RuntimeBatchRequest

# Validate the bronze table through a Data Context, a runtime batch request,
# and a Validator (the 0.x "V3" workflow).

# 1) Get (or initialize) your Data Context
context = ge.get_context()

# 2) Define a RuntimeBatchRequest that hands the Spark DataFrame to GE.
#    A Spark execution engine takes in-memory data through `batch_data`;
#    the `query` runtime parameter is only understood by SQL datasources.
batch_request = RuntimeBatchRequest(
    datasource_name="my_spark_datasource",            # name of your Spark datasource in great_expectations.yml
    data_connector_name="default_runtime_data_connector",
    data_asset_name="bronze_covid_nyt",                # arbitrary internal name
    runtime_parameters={"batch_data": spark.table("bronze.covid_nyt")},
    # NOTE(review): the key must match a batch identifier configured on the
    # data connector - confirm against the datasource config.
    batch_identifiers={"pipeline_stage": "bronze"},
)

# 3) Create the suite, or reset it if it already exists, so the cell can be
#    re-run; create_expectation_suite() without overwrite_existing raises
#    when the suite is already in the store.
context.add_or_update_expectation_suite(expectation_suite_name="covid_quality")

validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name="covid_quality"
)

# 4) Declare your expectations
validator.expect_column_values_to_not_be_null("date")
validator.expect_column_values_to_be_between("cases", min_value=0)
# Uniqueness of the (date, county, state) combination is a compound-column
# expectation; expect_column_values_to_be_unique takes one column name only.
validator.expect_compound_columns_to_be_unique(column_list=["date", "county", "state"])

# 5) Execute validation
#    validate() returns an ExpectationSuiteValidationResult; its summary counts
#    are in the `statistics` attribute.
results = validator.validate()
print(results.statistics)

# Without Great Expectations, the same three checks would be hand-written SQL
# (each query returns the offending rows; an empty result means the check passes):
#   SELECT * FROM bronze.covid_nyt WHERE date IS NULL;
#   SELECT * FROM bronze.covid_nyt WHERE cases < 0;
#   SELECT date, county, state, COUNT(*) FROM bronze.covid_nyt GROUP BY date, county, state HAVING COUNT(*) > 1;

