# Initialize a new Expectation Suite by profiling a batch of your data.
This process helps you avoid writing lots of boilerplate when authoring suites by allowing you to select columns and other factors that you care about and letting a profiler write some candidate expectations for you to adjust.

**Expectation Suite Name**: `demo_profiler`


In [2]:
import datetime

import pandas as pd

import great_expectations as ge
import great_expectations.jupyter_ux
from great_expectations.profile.user_configurable_profiler import (
    UserConfigurableProfiler,
)
from great_expectations.core.batch import BatchRequest
from great_expectations.checkpoint import SimpleCheckpoint
from great_expectations.exceptions import DataContextError

context = ge.data_context.DataContext()

2022-08-29T13:14:14-0400 - INFO - Great Expectations logging enabled at 20 level by JupyterUX module.


# Setup for the Runtime Batch Request

In [3]:
# Read in the parquet file to memory 
df_batches_month = pd.read_parquet(f"../../data/2020_22_liquor_month.parquet/partition_col=2020/")
df_batches_month = df_batches_month[~df_batches_month["Category Name"].isna()]
df_batches_month = df_batches_month[~df_batches_month["Vendor Name"].isna()]

  PANDAS_VERSION = LooseVersion(pd.__version__)


In [6]:
from great_expectations.core.batch import RuntimeBatchRequest

batch_request = RuntimeBatchRequest(
    datasource_name="demo_datasource",
    data_connector_name="default_runtime_data_connector_name",
    data_asset_name="my_runtime_asset_name",  # This can be anything that identifies this data_asset for you
    runtime_parameters={"batch_data": df_batches_month},  # Pass your DataFrame here.
    batch_identifiers={"runtime_batch_identifier_name": "liquor_data_ref_20_idfer"},
)

expectation_suite_name = "demo_profiler"

validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name
)
column_names = [f'"{column_name}"' for column_name in validator.columns()]
print(f"Columns: {', '.join(column_names)}.")
validator.head(n_rows=5, fetch_all=False)

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Columns: "Invoice/Item Number", "Store Number", "Store Name", "Address", "City", "Store Location", "County Number", "County", "Category", "Category Name", "Vendor Number", "Vendor Name", "Item Number", "Item Description", "Pack", "Bottle Volume (ml)", "State Bottle Cost", "State Bottle Retail", "Bottles Sold", "Sale (Dollars)", "Volume Sold (Liters)", "Volume Sold (Gallons)", "dir0", "dir1".


Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,Invoice/Item Number,Store Number,Store Name,Address,City,Store Location,County Number,County,Category,Category Name,Vendor Number,Vendor Name,Item Number,Item Description,Pack,Bottle Volume (ml),State Bottle Cost,State Bottle Retail,Bottles Sold,Sale (Dollars),Volume Sold (Liters),Volume Sold (Gallons),dir0,dir1
0,INV-24292600043,4404,KUM & GO #292 / Ankeny,1825 N Ankeny Blvd,Ankeny,,77.0,POLK,1012100.0,Canadian Whiskies,260.0,DIAGEO AMERICAS,10805,Crown Royal Regal Apple,24,375,8.0,12.0,2,24.0,0.75,0.19,1,2
1,INV-24306300051,5257,MAD Ave Quik Shop,"405, Madison Ave",Ottumwa,,90.0,WAPELLO,1031100.0,American Vodkas,434.0,LUXCO INC,36307,Hawkeye Vodka,12,1000,4.05,6.08,12,72.96,12.0,3.17,1,2
2,INV-24290400075,2502,Hy-Vee Wine and Spirits / Ankeny,410 North Ankeny Blvd,Ankeny,,77.0,POLK,1012200.0,Scotch Whiskies,260.0,DIAGEO AMERICAS,5326,Johnnie Walker Black,12,750,21.49,32.24,4,128.96,3.0,0.79,1,2
3,INV-24312800016,5709,JW Liquor,4518 Mortonsen Street Suite #109,Ames,,85.0,STORY,1081200.0,Cream Liqueurs,260.0,DIAGEO AMERICAS,68036,Baileys Original Irish Cream,12,750,16.49,24.74,6,121.5,4.5,1.18,1,2
4,INV-24294900180,2647,Hy-Vee #7 / Cedar Rapids,5050 Edgewood Rd,Cedar Rapids,POINT (-91.701581 42.030129),57.0,LINN,1081200.0,Cream Liqueurs,260.0,DIAGEO AMERICAS,74086,Godiva Dark Chocolate Liqueur,12,750,17.0,25.5,3,76.5,2.25,0.59,1,2


In [None]:
## Default config, we are not using it

batch_request = {'datasource_name': 'demo_datasource', 
                 'data_connector_name': 'default_inferred_data_connector_name', 
                 'data_asset_name': '2020_22_liquor_month.parquet', 
                 'limit': 1000}

expectation_suite_name = "demo_profiler"

validator = context.get_validator(
    batch_request=BatchRequest(**batch_request),
    expectation_suite_name=expectation_suite_name
)
column_names = [f'"{column_name}"' for column_name in validator.columns()]
print(f"Columns: {', '.join(column_names)}.")
validator.head(n_rows=5, fetch_all=False)

# Select columns

Select the columns on which you would like to set expectations and those which you would like to ignore.

Great Expectations will choose which expectations might make sense for a column based on the **data type** and **cardinality** of the data in each selected column.

Simply comment out columns that are important and should be included. You can select multiple lines and use a Jupyter
keyboard shortcut to toggle each line: **Linux/Windows**:
`Ctrl-/`, **macOS**: `Cmd-/`
        

In [7]:
exclude_column_names = [
    "Invoice/Item Number",
    "Store Number",
    "Store Name",
    "Address",
    "City",
    "Store Location",
    "County Number",
    "County",
    "Category",
    "Category Name",
    "Vendor Number",
    "Vendor Name",
    "Item Number",
    "Item Description",
    "Pack",
    "Bottle Volume (ml)",
    "State Bottle Cost",
    "State Bottle Retail",
    "Bottles Sold",
    "Sale (Dollars)",
    "Volume Sold (Liters)",
    "Volume Sold (Gallons)",
    "partition_col",
]

# Run the UserConfigurableProfiler

The suites generated here are **not meant to be production suites** -- they are **a starting point to build upon**.

**To get to a production-grade suite, you will definitely want to [edit this
suite](https://docs.greatexpectations.io/en/latest/guides/how_to_guides/creating_and_editing_expectations/how_to_edit_an_expectation_suite_using_a_disposable_notebook.html?utm_source=notebook&utm_medium=profile_based_expectations)
after this initial step gets you started on the path towards what you want.**

This is highly configurable depending on your goals.
You can ignore columns or exclude certain expectations, specify a threshold for creating value set expectations, or even specify semantic types for a given column.
You can find more information about [how to configure this profiler, including a list of the expectations that it uses, here.](https://docs.greatexpectations.io/en/latest/guides/how_to_guides/creating_and_editing_expectations/how_to_create_an_expectation_suite_with_the_user_configurable_profiler.html)



In [8]:
profiler = UserConfigurableProfiler(
    profile_dataset=validator,
    excluded_expectations=None,
    ignored_columns=exclude_column_names,
    not_null_only=False,
    primary_or_compound_key=None,
    semantic_types_dict=None,
    table_expectations_only=False,
    value_set_threshold="MANY",
)
suite = profiler.build_suite()
validator.expectation_suite = suite

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

2022-08-29T13:20:12-0400 - INFO - 	0 expectation(s) included in expectation_suite.


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling:   0%|          | 0/2 [00:00<?, ?it/s, Column=dir0]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

2022-08-29T13:20:14-0400 - INFO - Column type for column dir0 is unknown. Skipping expect_column_values_to_be_in_type_list for this column.


Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

2022-08-29T13:20:15-0400 - INFO - Column type for column dir1 is unknown. Skipping expect_column_values_to_be_in_type_list for this column.
2022-08-29T13:20:15-0400 - INFO - 	8 expectation(s) included in expectation_suite.
Creating an expectation suite with the following expectations:

Table-Level Expectations
expect_table_columns_to_match_ordered_list
expect_table_row_count_to_be_between

Expectations by Column
Column Name: dir0 | Column Data Type: UNKNOWN | Cardinality: VERY_FEW
expect_column_proportion_of_unique_values_to_be_between
expect_column_values_to_be_in_set
expect_column_values_to_not_be_null


Column Name: dir1 | Column Data Type: UNKNOWN | Cardinality: FEW
expect_column_proportion_of_unique_values_to_be_between
expect_column_values_to_be_in_set
expect_column_values_to_not_be_null




# Save & review your new Expectation Suite

Let's save the draft expectation suite as a JSON file in the
`great_expectations/expectations` directory of your project and rebuild the Data
 Docs site to make it easy to review your new suite.

In [9]:
print(validator.get_expectation_suite(discard_failed_expectations=False))
validator.save_expectation_suite(discard_failed_expectations=False)

checkpoint_config = {
    "class_name": "SimpleCheckpoint",
    "validations": [
        {
            "batch_request": batch_request,
            "expectation_suite_name": expectation_suite_name
        }
    ]
}
checkpoint = SimpleCheckpoint(
    f"{validator.active_batch_definition.data_asset_name}_{expectation_suite_name}",
    context,
    **checkpoint_config
)
checkpoint_result = checkpoint.run()

context.build_data_docs()

validation_result_identifier = checkpoint_result.list_validation_result_identifiers()[0]
context.open_data_docs(resource_identifier=validation_result_identifier)

2022-08-29T13:20:18-0400 - INFO - 	8 expectation(s) included in expectation_suite.
{
  "meta": {
    "citations": [
      {
        "batch_request": {
          "data_asset_name": "2020_22_liquor_month.parquet",
          "data_connector_name": "default_inferred_data_connector_name",
          "datasource_name": "demo_datasource",
          "limit": 1000
        },
        "citation_date": "2022-08-29T03:01:02.643561Z",
        "comment": "Created suite added via CLI"
      }
    ],
    "great_expectations_version": "0.15.18",
    "columns": {
      "Invoice/Item Number": {
        "description": ""
      },
      "Store Number": {
        "description": ""
      },
      "Store Name": {
        "description": ""
      },
      "Address": {
        "description": ""
      },
      "City": {
        "description": ""
      },
      "Store Location": {
        "description": ""
      },
      "County Number": {
        "description": ""
      },
      "County": {
        "description": "

ValueError: Error: batch_data found in batch_request -- only primitive types are allowed as Checkpoint constructor arguments.


## Next steps
After you review this initial Expectation Suite in Data Docs you
should edit this suite to make finer grained adjustments to the expectations.
This can be done by running `great_expectations suite edit demo_profiler`.