# Great expectation

Nama  : Talitha Salsabila

Batch : RMT-032

# Install Library

In [1]:
# Install the library

!pip install -q great-expectations

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.2/49.2 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.7/113.7 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.7/526.7 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25h

# Instantiate Data Context

In [2]:
# Create a data context

from great_expectations.data_context import FileDataContext

context = FileDataContext.create(project_root_dir='./')

# Connect to datasource

In [3]:
# Give a name to a Datasource. This name must be unique between Datasources.
datasource_name = 'expectation_data_clean'
datasource = context.sources.add_pandas(datasource_name)

# Give a name to a data asset
asset_name = 'clean_data'
path_to_data = 'P2M3_talitha_salsabila_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build batch request
batch_request = asset.build_batch_request()

In [4]:
# Creat an expectation suite
expectation_suite_name = 'expectation_dataset'
context.add_or_update_expectation_suite(expectation_suite_name)

# Create a validator using above expectation suite
validator = context.get_validator(
    batch_request = batch_request,
    expectation_suite_name = expectation_suite_name
)

# Check the validator
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,area_code,state,market,market_size,profit,margin,sales,cogs,total_expenses,marketing,inventory,budget_profit,budget_cogs,budget_margin,budget_sales,productid,date,product_type,product,type
0,203,Connecticut,East,Small Market,107.0,176.0,292.0,116.0,69.0,38.0,962.0,110.0,110.0,160.0,270.0,2,4/1/2010 0:00,Coffee,Columbian,Regular
1,203,Connecticut,East,Small Market,75.0,135.0,225.0,90.0,60.0,29.0,1148.0,90.0,80.0,130.0,210.0,2,7/1/2010 0:00,Coffee,Columbian,Regular
2,203,Connecticut,East,Small Market,122.0,195.0,325.0,130.0,73.0,42.0,1134.0,130.0,110.0,180.0,290.0,2,11/1/2010 0:00,Coffee,Columbian,Regular
3,203,Connecticut,East,Small Market,105.0,174.0,289.0,115.0,69.0,37.0,1166.0,110.0,100.0,160.0,260.0,2,12/1/2010 0:00,Coffee,Columbian,Regular
4,203,Connecticut,East,Small Market,104.0,135.0,223.0,90.0,56.0,29.0,1148.0,90.0,80.0,130.0,210.0,2,7/1/2011 0:00,Coffee,Columbian,Regular


# Expectation

In [5]:
# Expectation 1 : Column `productid` must be between 1 and 13

validator.expect_column_values_to_be_between(
    column='productid', min_value=1, max_value=13)




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_between",
    "kwargs": {
      "column": "productid",
      "min_value": 1,
      "max_value": 13,
      "batch_id": "expectation_data_clean-clean_data"
    },
    "meta": {}
  },
  "result": {
    "element_count": 4248,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [10]:
# Expectation 6 : Column `type` must contain one of the following 2 things :
# 1 = Regular
# 2 = Decaf

validator.expect_column_values_to_be_in_set('type', [1, 2])




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_in_set",
    "kwargs": {
      "column": "type",
      "value_set": [
        1,
        2
      ],
      "batch_id": "expectation_data_clean-clean_data"
    },
    "meta": {}
  },
  "result": {
    "element_count": 4248,
    "unexpected_count": 4248,
    "unexpected_percent": 100.0,
    "partial_unexpected_list": [
      "Regular",
      "Regular",
      "Regular",
      "Regular",
      "Regular",
      "Regular",
      "Regular",
      "Regular",
      "Regular",
      "Regular",
      "Regular",
      "Regular",
      "Regular",
      "Regular",
      "Regular",
      "Regular",
      "Regular",
      "Regular",
      "Regular",
      "Regular"
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 100.0,
    "unexpected_percent_nonmissing": 100.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": nul

In [9]:
# Expectation 3 : Column `sales` must in form of integer or float

validator.expect_column_values_to_be_in_type_list('sales', ['integer', 'float'])




Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_in_type_list",
    "kwargs": {
      "column": "sales",
      "type_list": [
        "integer",
        "float"
      ],
      "batch_id": "expectation_data_clean-clean_data"
    },
    "meta": {}
  },
  "result": {
    "observed_value": "float64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [15]:
# Expectation 4 : Column `date` must be unique

validator.expect_column_values_to_be_unique('date')




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_unique",
    "kwargs": {
      "column": "date",
      "batch_id": "expectation_data_clean-clean_data"
    },
    "meta": {}
  },
  "result": {
    "element_count": 4248,
    "unexpected_count": 4248,
    "unexpected_percent": 100.0,
    "partial_unexpected_list": [
      "4/1/2010 0:00",
      "7/1/2010 0:00",
      "11/1/2010 0:00",
      "12/1/2010 0:00",
      "7/1/2011 0:00",
      "8/1/2011 0:00",
      "9/1/2011 0:00",
      "10/1/2011 0:00",
      "11/1/2011 0:00",
      "6/1/2010 0:00",
      "11/1/2010 0:00",
      "8/1/2011 0:00",
      "10/1/2011 0:00",
      "11/1/2011 0:00",
      "6/1/2010 0:00",
      "10/1/2010 0:00",
      "11/1/2010 0:00",
      "1/1/2011 0:00",
      "2/1/2011 0:00",
      "3/1/2011 0:00"
    ],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 100.0,
    "unexpected_percent_nonmissing": 100.0
  },
  "meta": {},
  "ex

In [17]:
# Expectation 5 : Column `productid` stdev between 1 and 5

validator.expect_column_stdev_to_be_between(
    column="productid",
    min_value=1,
    max_value=5
)





Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_stdev_to_be_between",
    "kwargs": {
      "column": "productid",
      "min_value": 1,
      "max_value": 5,
      "batch_id": "expectation_data_clean-clean_data"
    },
    "meta": {}
  },
  "result": {
    "observed_value": 3.6640716334723153
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [20]:
# Expectation 6 : Data entries between 4000 and 5000

validator.expect_table_row_count_to_be_between(
    min_value=4000,
    max_value=5000
)





Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_table_row_count_to_be_between",
    "kwargs": {
      "min_value": 4000,
      "max_value": 5000,
      "batch_id": "expectation_data_clean-clean_data"
    },
    "meta": {}
  },
  "result": {
    "observed_value": 4248
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [21]:
# Expectation 7 : Number of column equal to 20

validator.expect_table_column_count_to_equal(value=20)

  and should_run_async(code)




Calculating Metrics:   0%|          | 0/3 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_table_column_count_to_equal",
    "kwargs": {
      "value": 20,
      "batch_id": "expectation_data_clean-clean_data"
    },
    "meta": {}
  },
  "result": {
    "observed_value": 20
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

# Checkpoint

In [22]:
# Create a checkpoint

checkpoint_1 = context.add_or_update_checkpoint(
    name = 'checkpoint_1',
    validator = validator,
)

In [24]:
# Run a checkpoint

checkpoint_result = checkpoint_1.run()

Calculating Metrics: 0it [00:00, ?it/s]

# Data Docs

In [25]:
# Build data docs

context.build_data_docs()

{'local_site': 'file:///content/gx/uncommitted/data_docs/local_site/index.html'}