In [16]:
import great_expectations as gx
import pandas as pd

from great_expectations.data_context import FileDataContext

# Create (or open) the file-backed Data Context rooted at the current directory.
# NOTE(review): the printed config below already lists the "my_ds_7" datasource,
# so this appears to be reusing a project that was persisted by an earlier run.
context = FileDataContext.create(project_root_dir=".")

# Printing the context dumps its whole configuration
# (stores, data docs sites, datasources, ...).
print(context)

{
  "anonymous_usage_statistics": {
    "enabled": true,
    "explicit_url": false,
    "usage_statistics_url": "https://stats.greatexpectations.io/great_expectations/v1/usage_statistics",
    "data_context_id": "2ae80661-d081-4ae0-b8bb-311b43d38161",
    "explicit_id": true
  },
  "checkpoint_store_name": "checkpoint_store",
  "config_variables_file_path": "uncommitted/config_variables.yml",
  "config_version": 3.0,
  "data_docs_sites": {
    "local_site": {
      "class_name": "SiteBuilder",
      "show_how_to_buttons": true,
      "store_backend": {
        "class_name": "TupleFilesystemStoreBackend",
        "base_directory": "uncommitted/data_docs/local_site/"
      },
      "site_index_builder": {
        "class_name": "DefaultSiteIndexBuilder"
      }
    }
  },
  "datasources": {},
  "evaluation_parameter_store_name": "evaluation_parameter_store",
  "expectations_store_name": "expectations_store",
  "fluent_datasources": {
    "my_ds_7": {
      "type": "pandas_filesystem",
   

In [None]:
# So far we have worked with a single file. How do we handle a folder, or a database with several tables?
# GX uses a few more terms for this: a data source contains data assets, and each asset is split into batches.

# First, register a folder as a data source.
# add_or_update_* is used instead of add_* so this cell can be re-run safely: the
# project configuration is persisted on disk and may already contain "my_ds_7"
# (it does in the configuration printed above).
context.sources.add_or_update_pandas_filesystem(
    name="my_ds_7", base_directory="../data/2024"
)

In [18]:
# Look up the "my_ds_7" datasource by name on the context.
my_ds = context.datasources["my_ds_7"]

In [None]:
# One file per batch. Written as a raw string with the dot before the extension
# escaped, so only names like "yellow_tripdata_2024-01.parquet" match (the previous
# pattern's bare "." and greedy ".*" also accepted unrelated file names).
# The named group lets a batch request select a single month later on, e.g.
# my_asset.build_batch_request(options={"month": "01"}).
my_batching_regex = r"yellow_tripdata_2024-(?P<month>\d{2})\.parquet"

# Create the data asset (one or more files from our data source)
my_asset = my_ds.add_parquet_asset(
    name="my_tripdata_data_asset", batching_regex=my_batching_regex
)

# Build a batch request without options so that every matching file is included
my_batch_request = my_asset.build_batch_request()
batches = my_asset.get_batch_list_from_batch_request(my_batch_request)

In [20]:
# Print the spec of every batch to confirm which files the batch request resolved to.
for batch in batches:
    print(batch.batch_spec)

{'path': '../data/2024/yellow_tripdata_2024-01.parquet', 'reader_method': 'read_parquet', 'reader_options': {}}
{'path': '../data/2024/yellow_tripdata_2024-02.parquet', 'reader_method': 'read_parquet', 'reader_options': {}}


In [21]:
# Name of the expectation suite attached to the multi-file asset.
suite_name = "my_asset_expectation_suite"

# Make sure the suite exists (created or overwritten) before asking for a validator.
context.add_or_update_expectation_suite(suite_name)

# Validator bound to the batch request covering the whole asset.
asset_validator = context.get_validator(
    expectation_suite_name=suite_name,
    batch_request=my_batch_request,
)

# Preview a few rows of the data the validator is working on.
asset_validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,dolocationid,pulocationid,ratecodeid,vendorid,congestion_surcharge,extra,fare_amount,improvement_surcharge,mta_tax,passenger_count,...,tip_amount,tolls_amount,total_amount,dropoff_datetime,pickup_datetime,trip_distance,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude
0,236,68,1.0,2,2.5,1.0,20.5,1.0,0.5,1.0,...,1.28,0.0,26.78,2024-02-01 00:19:58,2024-02-01 00:04:45,4.39,37.922526,-96.761538,-35.0153,138.63557
1,243,48,1.0,2,2.5,1.0,31.0,1.0,0.5,1.0,...,9.0,0.0,45.0,2024-02-01 01:10:53,2024-02-01 00:56:31,7.71,36.103413,-84.131863,35.255744,-80.860263
2,261,132,2.0,2,2.5,0.0,70.0,1.0,0.5,2.0,...,0.0,6.94,82.69,2024-02-01 00:43:12,2024-02-01 00:07:50,28.69,40.642948,-73.779373,40.7119,-74.012527
3,163,161,1.0,1,2.5,3.5,9.3,1.0,0.5,1.0,...,2.85,0.0,17.15,2024-02-01 00:10:47,2024-02-01 00:01:49,1.1,30.302121,-81.619652,35.86313,-78.636702
4,79,246,1.0,1,2.5,3.5,15.6,1.0,0.5,1.0,...,0.0,0.0,20.6,2024-02-01 00:51:15,2024-02-01 00:37:35,2.6,37.915937,-96.78586,40.729269,-73.987361


In [22]:
# Apply the same expectations as in the single-file example, this time to the asset.

# Columns that must never contain nulls.
required_columns = [
    "vendorid",
    "ratecodeid",
    "dolocationid",
    "pulocationid",
    "payment_type",
    "pickup_latitude",
    "pickup_longitude",
    "dropoff_latitude",
    "dropoff_longitude",
]
for column_name in required_columns:
    asset_validator.expect_column_values_to_not_be_null(column_name)

# Columns whose values must fall between (min_value, max_value).
# NOTE(review): the head() preview shows "extra" values of 3.5, above the max of 3,
# so this expectation presumably fails on purpose for the demo -- confirm.
value_ranges = {
    "trip_distance": (0, 100),
    "extra": (0, 3),
}
for column_name, (lowest, highest) in value_ranges.items():
    asset_validator.expect_column_values_to_be_between(
        column_name, min_value=lowest, max_value=highest
    )

# Persist the suite; failed expectations are kept rather than discarded.
asset_validator.save_expectation_suite(discard_failed_expectations=False)

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

In [23]:
# As with the single-file example, validate the data through a checkpoint.
# Define (or overwrite) the checkpoint, bound to the asset validator.
checkpoint = context.add_or_update_checkpoint(
    name="yellow_tripdata_asset_checkpoint",
    validator=asset_validator
)

# Run the checkpoint and keep its validation result
checkpoint_result = checkpoint.run()

# Take a quick look at the validation result
context.view_validation_result(checkpoint_result)

Calculating Metrics:   0%|          | 0/62 [00:00<?, ?it/s]