In [38]:
import great_expectations as gx
import pandas as pd
import warnings
# Suppress the warning whose message starts with the `result_format` notice,
# so it does not clutter the cell output.
warnings.filterwarnings(
    "ignore",
    message="`result_format` configured at the Validator-level*",
)

# Read the raw transactions file into a DataFrame.
df = pd.read_csv("./data/transactions.csv")

# Format patterns used later by the regex expectations:
#   amount    -> optional "-", one or more digits, ".", one or more digits
#   currency  -> exactly three uppercase ASCII letters
#   timestamp -> digits laid out as "YYYY-MM-DD HH:MM:SS"
amount_pattern = r"^-?\d+\.\d+$"
currency_pattern = r"^[A-Z]{3}$"
timestamp_pattern = r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$"

# Clean "amount": view every value as text, strip embedded spaces, parse the
# text as a number, and store the column as float.
df["amount"] = (
    df["amount"]
    .astype(str)
    .str.replace(" ", "")
    .pipe(pd.to_numeric)
    .astype(float)
)

# Clean "currency": view every value as text and strip embedded spaces.
df["currency"] = (
    df["currency"]
    .astype(str)
    .str.replace(" ", "")
)


# Obtain a GX context for this session.
context = gx.get_context()

# Register the data path GX validates against:
# pandas data source -> dataframe asset -> whole-dataframe batch definition.
data_source = context.data_sources.add_pandas(name="pandas")
data_asset = data_source.add_dataframe_asset(name="transactions_data")
batch_definition = data_asset.add_batch_definition_whole_dataframe(
    name="batch_def"
)

# Bind the cleaned DataFrame to the batch definition.
batch = batch_definition.get_batch(batch_parameters={"dataframe": df})

# Named, initially empty suite; the validator below collects expectations in it.
suite = gx.core.expectation_suite.ExpectationSuite(name="transactions_suite")
validator = context.get_validator(batch=batch, expectation_suite=suite)

# Range rule: amount must lie between 0.01 and 100000.
validator.expect_column_values_to_be_between(
    "amount",
    min_value=0.01,
    max_value=100000,
)

# Format rules: each listed column must match its regex pattern.
for column_name, pattern in (
    ("amount", amount_pattern),
    ("currency", currency_pattern),
    ("timestamp", timestamp_pattern),
):
    validator.expect_column_values_to_match_regex(column_name, regex=pattern)

# Run every expectation with the "COMPLETE" result format, then show the
# full validation report.
results = validator.validate(result_format="COMPLETE")
print(results)

# Split the transactions into invalid and valid rows based on the validation
# results. A row is invalid when at least one expectation reports its index
# in "unexpected_index_list" (amount out of range, or a format mismatch).

# Collect the reported row labels once each, in first-seen order. The same row
# can fail several expectations; without de-duplication it would be repeated
# in invalid_df and inflate the invalid count.
unexpected_transactions = list(
    dict.fromkeys(
        invalid_index
        for result in results["results"]
        # A missing or empty list means no per-row failures were reported.
        for invalid_index in (result["result"].get("unexpected_index_list") or [])
    )
)

# Derive both frames from a single label-based mask so they always partition
# df exactly: every row lands in one frame, none in both. This also keeps the
# original row order and avoids mixing positional (iloc) and label-based
# (drop) lookups on the same list.
invalid_mask = df.index.isin(unexpected_transactions)
invalid_df = df[invalid_mask]  # Rows that failed at least one expectation
valid_df = df[~invalid_mask]  # Rows that passed every expectation

print(f"Invalid Transactions: {len(invalid_df)}")
print(f"Valid Transactions: {len(valid_df)}")


  df = pd.read_csv("./data/transactions.csv")
Calculating Metrics: 100%|██████████| 8/8 [00:00<00:00, 83.28it/s]  
Calculating Metrics: 100%|██████████| 8/8 [00:00<00:00, 45.99it/s]  
Calculating Metrics: 100%|██████████| 8/8 [00:00<00:00, 61.12it/s]  
Calculating Metrics: 100%|██████████| 8/8 [00:00<00:00, 52.94it/s]  
Calculating Metrics: 100%|██████████| 29/29 [00:00<00:00, 48.92it/s]

{
  "success": false,
  "results": [
    {
      "success": false,
      "expectation_config": {
        "type": "expect_column_values_to_be_between",
        "kwargs": {
          "batch_id": "pandas-transactions_data",
          "column": "amount",
          "min_value": 0.01,
          "max_value": 100000.0
        },
        "meta": {}
      },
      "result": {
        "element_count": 100000,
        "unexpected_count": 3,
        "unexpected_percent": 0.003,
        "partial_unexpected_list": [
          3353381.0,
          2217915.0,
          3700879.0
        ],
        "missing_count": 0,
        "missing_percent": 0.0,
        "unexpected_percent_total": 0.003,
        "unexpected_percent_nonmissing": 0.003,
        "partial_unexpected_counts": [
          {
            "value": 2217915.0,
            "count": 1
          },
          {
            "value": 3353381.0,
            "count": 1
          },
          {
            "value": 3700879.0,
            "count": 1
   


