In [1]:
from great_expectations.data_context import FileDataContext

# Create the file-backed Great Expectations context, rooted at the current directory
context = FileDataContext.create(project_root_dir='./')

In [3]:
# Register a pandas Datasource; the name must be unique among Datasources.
# add_or_update_pandas (rather than add_pandas) keeps this cell re-runnable:
# the context is file-backed, so on a second run the name is already
# registered and a plain add would be rejected.
datasource_name = 'csv-data-clean'
datasource = context.sources.add_or_update_pandas(datasource_name)

# Register the cleaned CSV as a data asset of that Datasource.
# NOTE(review): the validator preview shows `Unnamed: 0.1` / `Unnamed: 0`
# columns, which looks like a DataFrame index written out by earlier saves —
# confirm whether the cleaning step should export with index=False.
asset_name = 'supermarket-sales'
path_to_data = 'P2M3_zaky_ramdhani_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request for this asset
batch_request = asset.build_batch_request()

In [4]:
# Create (or update) the expectation suite that will hold the checks
expectation_suite_name = 'expectation-supermarket-dataset'
context.add_or_update_expectation_suite(
    expectation_suite_name=expectation_suite_name
)

# Get a validator for the batch request, tied to the suite created above
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

# Preview the first rows through the validator
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0.1,Unnamed: 0,invoice_id,branch,city,customer_type,gender,product_line,unit_price,quantity,tax_5%,total,date,time,payment,cogs,gross_margin_percentage,gross_income,rating
0,0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.7,7,26.1,549.0,1/5/2019,13:08,Ewallet,522.8,4.8,26.1,9.1
1,1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.3,5,3.8,80.2,3/8/2019,10:29,Cash,76.4,4.8,3.8,9.6
2,2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.3,7,16.2,340.5,3/3/2019,13:23,Credit card,324.3,4.8,16.2,7.4
3,3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.2,8,23.3,489.0,1/27/2019,20:33,Ewallet,465.8,4.8,23.3,8.4
4,4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.3,7,30.2,634.4,2/8/2019,10:37,Ewallet,604.2,4.8,30.2,5.3


In [7]:
# Expectation: every value in `invoice_id` must be unique
validator.expect_column_values_to_be_unique(column='invoice_id')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 1000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [8]:
# Expectation: values in `gross_income` must be between 0 and 100
validator.expect_column_values_to_be_between(
    column='gross_income',
    min_value=0,
    max_value=100,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 1000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [10]:
# Expectation: `branch` may only take the values A, B or C
validator.expect_column_values_to_be_in_set(
    column='branch',
    value_set=['A', 'B', 'C'],
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 1000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [12]:
# Expectation: column `quantity` must be stored as an integer or float type
# (the accepted type names passed below are 'int64' and 'float')

validator.expect_column_values_to_be_in_type_list('quantity', ['int64', 'float'])

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [18]:
# Expectation: the length of every value in `tax_5%` must be between 0 and 100
# (mostly=1 means all rows have to satisfy it).
# NOTE(review): the data preview shows numeric values in `tax_5%`; a length
# check is normally aimed at string columns — confirm this is the intended
# expectation rather than a value-range check.
validator.expect_column_value_lengths_to_be_between(
    column='tax_5%',
    min_value=0,
    max_value=100,
    mostly=1,
    result_format="BASIC",
    include_config=False,
)

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 1000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [20]:
# Expectation: the median of `total` must be between 0 and 1000
validator.expect_column_median_to_be_between(
    column='total',
    min_value=0,
    max_value=1000,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 253.85
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [22]:
# Expectation: the number of distinct `unit_price` values must be between 0 and 1000
validator.expect_column_unique_value_count_to_be_between(
    column='unit_price',
    min_value=0,
    max_value=1000,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 589
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [23]:
# Save the validator's expectations into the Expectation Suite.
# discard_failed_expectations=False asks for failing expectations to be kept too.

validator.save_expectation_suite(discard_failed_expectations=False)

In [24]:
# Create (or update) a checkpoint named 'checkpoint_1' from the validator

checkpoint_1 = context.add_or_update_checkpoint(name='checkpoint_1', validator=validator)

In [25]:
# Run the checkpoint and keep the returned result object.
# NOTE(review): checkpoint_result is not inspected in the visible cells —
# confirm whether its success status should be checked or displayed.
checkpoint_result = checkpoint_1.run()

Calculating Metrics:   0%|          | 0/63 [00:00<?, ?it/s]

In [26]:
# Build data docs. The value returned by the call (shown as the cell output)
# maps 'local_site' to the location of the generated index.html.

context.build_data_docs()

{'local_site': 'file://c:\\Users\\Zaky\\github-classroom\\FTDS-assignment-bay\\p2-ftds008-hck-m3-zakyramdhani\\gx\\uncommitted/data_docs/local_site/index.html'}