In [None]:
import great_expectations as gx
import pandas as pd

from great_expectations.data_context import FileDataContext

# Create a file-backed GX project rooted in the current working directory.
project_root = "."
context = FileDataContext.create(project_root_dir=project_root)

# Print the context for a quick look at the GX-related configuration.
print(context)

In [None]:
# So far we have worked with a single file. To work with a folder of files
# (or a database with several tables) GX introduces a few new terms:
#   * a data source contains one or more data assets;
#   * each data asset is divided into one or more batches.

# First, register the folder as a data source.
# `add_or_update_...` is used instead of `add_...` so that re-running this cell
# against the persisted file context does not fail when "my_ds_7" is already
# registered.
context.sources.add_or_update_pandas_filesystem(
    name="my_ds_7", base_directory="../data/2024"
)

In [None]:
# Fetch the data source registered above from the context, by name.
datasource_name = "my_ds_7"
my_ds = context.datasources[datasource_name]

In [None]:
# Regular expression selecting which files of the data source belong to the asset.
# The dot before the extension is escaped: an unescaped "." matches any
# character, so the pattern would also accept names such as
# "yellow_tripdata_2024-01_parquet". A raw string keeps the backslash literal.
my_batching_regex = r"yellow_tripdata_2024-.*\.parquet"

# Create the data asset (one or more files from our data source).
my_asset = my_ds.add_parquet_asset(
    name="my_tripdata_data_asset", batching_regex=my_batching_regex
)

# Build a batch request without any filter, i.e. covering all batches
# available in the asset, and resolve it to the list of batches.
my_batch_request = my_asset.build_batch_request()
batches = my_asset.get_batch_list_from_batch_request(my_batch_request)

In [None]:
# List the batch spec of every batch resolved from the batch request,
# one per line.
for batch in batches:
    spec = batch.batch_spec
    print(spec)

In [None]:
# Name shared by the suite registration and the validator below.
suite_name = "my_asset_expectation_suite"

# Register the expectation suite (added, or updated if it already exists).
context.add_or_update_expectation_suite(suite_name)

# Bind the suite to the asset's batch request to obtain a validator.
asset_validator = context.get_validator(
    expectation_suite_name=suite_name,
    batch_request=my_batch_request,
)

# Preview the first rows of the data behind the validator.
asset_validator.head()

In [None]:
# Add the same expectations that were used for the single-file validation.
# NOTE(review): the column names are assumed to match the asset's schema
# exactly (including case) — confirm against the parquet files.

# Columns that must not contain null values.
not_null_columns = (
    "vendorid",
    "ratecodeid",
    "dolocationid",
    "pulocationid",
    "payment_type",
    "pickup_latitude",
    "pickup_longitude",
    "dropoff_latitude",
    "dropoff_longitude",
)
for column_name in not_null_columns:
    asset_validator.expect_column_values_to_not_be_null(column_name)

# Range checks, expressed as column -> (min_value, max_value).
value_ranges = {
    "trip_distance": (0, 100),
    "extra": (0, 3),
}
for column_name, (lower, upper) in value_ranges.items():
    asset_validator.expect_column_values_to_be_between(
        column_name, min_value=lower, max_value=upper
    )

# Save the suite, explicitly asking not to discard failed expectations.
asset_validator.save_expectation_suite(discard_failed_expectations=False)

In [None]:
# As with the single file, a checkpoint is used to run the validation.
checkpoint_name = "yellow_tripdata_asset_checkpoint"

# Define the checkpoint (added, or updated if it already exists) on top of
# the validator built for the asset.
checkpoint = context.add_or_update_checkpoint(
    validator=asset_validator,
    name=checkpoint_name,
)

# Run the checkpoint and keep its result.
checkpoint_result = checkpoint.run()

# Open a quick view of the validation result.
context.view_validation_result(checkpoint_result)